You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by sc...@apache.org on 2012/11/06 19:22:04 UTC

svn commit: r1406261 [3/4] - in /uima/uimaj/trunk/uimaj-core/src: main/java/org/apache/uima/cas/ main/java/org/apache/uima/cas/impl/ main/java/org/apache/uima/util/ main/java/org/apache/uima/util/impl/ main/resources/org/apache/uima/ test/java/org/apac...

Propchange: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/BinaryCasSerDes4.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/CASImpl.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/CASImpl.java?rev=1406261&r1=1406260&r2=1406261&view=diff
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/CASImpl.java (original)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/CASImpl.java Tue Nov  6 18:22:03 2012
@@ -42,6 +42,7 @@ import java.util.ListIterator;
 import java.util.Map;
 import java.util.Set;
 
+import org.apache.uima.cas.AbstractCas;
 import org.apache.uima.cas.AbstractCas_ImplBase;
 import org.apache.uima.cas.AnnotationBaseFS;
 import org.apache.uima.cas.ArrayFS;
@@ -84,6 +85,7 @@ import org.apache.uima.cas.text.Language
 import org.apache.uima.internal.util.IntVector;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.impl.JCasImpl;
+import org.apache.uima.util.SerializationMeasures;
 
 /**
  * Implements the CAS interfaces. This class must be public because we need to
@@ -108,6 +110,8 @@ public class CASImpl extends AbstractCas
 
   public static final int FALSE = 0;
 
+  private static final int[] INT0 = new int[0];
+  
   public static final int DEFAULT_INITIAL_HEAP_SIZE = 500000;
 
   public static final int DEFAULT_RESET_HEAP_SIZE = 5000000;
@@ -241,6 +245,10 @@ public class CASImpl extends AbstractCas
      * element per component being journaled.
      */
     private List<Marker> trackingMarkList;
+    
+    // must be in svd part because has a field that is updated
+    // while serializing
+    private BinaryCasSerDes4 binaryCompressor;
 
     private SharedViewData(boolean useFSCache) {
       this.useFSCache = useFSCache;
@@ -1163,48 +1171,40 @@ public class CASImpl extends AbstractCas
       return;
     }
    
-    DataInputStream dis = new DataInputStream(istream);
+    final DataInputStream dis = (istream instanceof DataInputStream) ?  
+       (DataInputStream) istream : new DataInputStream(istream);
 
     try {
       // key
-      // deteremine if byte swap if needed based on key
+      // determine if byte swap is needed based on key
       byte[] bytebuf = new byte[4];
       bytebuf[0] = dis.readByte(); // U
       bytebuf[1] = dis.readByte(); // I
       bytebuf[2] = dis.readByte(); // M
       bytebuf[3] = dis.readByte(); // A
 
-      boolean swap = false;
-      // check if first byte is ascii char U
-      if (bytebuf[0] != 85) {
-        swap = true;
-      }
+      final boolean swap = (bytebuf[0] != 85);
 
       // version      
-      // version 2 indicates this is in delta format.
-      int version;
-      if (swap) {
-        version = swap4(dis, bytebuf);
-      } else {
-        version = dis.readInt();
-      }
+      // version bit in 2's place indicates this is in delta format.
+      final int version = readInt(dis, swap);      
+      final boolean delta = ((version & 2) == 2);
       
-      boolean delta = false;
-      if (version == 2)  {
-        delta = true;
-      }
       if (!delta) {
         this.resetNoQuestions();
       }
       
-      // main fsheap
-      int fsheapsz = 0;
-      if (swap) {
-        fsheapsz = swap4(dis, bytebuf);
-      } else {
-        fsheapsz = dis.readInt();
+      if (0 != (version & 4)) {
+        if (svd.binaryCompressor == null) {
+          svd.binaryCompressor = new BinaryCasSerDes4(this.getTypeSystemImpl(), false);
+        }
+        svd.binaryCompressor.deserialize(this, dis, delta);
+        return;
       }
       
+      // main fsheap
+      final int fsheapsz = readInt(dis, swap);
+      
       int startPos = 0;
       if (!delta) {
         this.getHeap().reinitSizeOnly(fsheapsz);
@@ -1212,32 +1212,19 @@ public class CASImpl extends AbstractCas
     	startPos = this.getHeap().getNextId();
     	this.getHeap().grow(fsheapsz);
       }
-      
+            
       for (int i = startPos; i < fsheapsz+startPos; i++) {
-        if (swap) {
-          this.getHeap().heap[i] = swap4(dis, bytebuf);
-        } else {
-          this.getHeap().heap[i] = dis.readInt();
-        }
+        this.getHeap().heap[i] = readInt(dis, swap);
       }
       
       // string heap
-      int stringheapsz = 0;
-      if (swap) {
-        stringheapsz = swap4(dis, bytebuf);
-      } else {
-        stringheapsz = dis.readInt();
-      }
+      int stringheapsz = readInt(dis, swap);
 
       final StringHeapDeserializationHelper shdh = new StringHeapDeserializationHelper();
       
       shdh.charHeap = new char[stringheapsz];
       for (int i = 0; i < stringheapsz; i++) {
-        if (swap) {
-          shdh.charHeap[i] = swap2(dis, bytebuf);
-        } else {
-          shdh.charHeap[i] = dis.readChar();
-        }
+        shdh.charHeap[i] = (char) readShort(dis, swap);
       }
       shdh.charHeapPos = stringheapsz;
 
@@ -1247,12 +1234,7 @@ public class CASImpl extends AbstractCas
       }
 
       // string ref heap
-      int refheapsz = 0;
-      if (swap) {
-        refheapsz = swap4(dis, bytebuf);
-      } else {
-        refheapsz = dis.readInt();
-      }
+      int refheapsz = readInt(dis, swap);
 
       refheapsz--;
       refheapsz = refheapsz / 2;
@@ -1264,14 +1246,8 @@ public class CASImpl extends AbstractCas
 
       dis.readInt(); // 0
       for (int i = shdh.refHeapPos; i < shdh.refHeap.length; i += StringHeapDeserializationHelper.REF_HEAP_CELL_SIZE) {
-        if (swap) {
-          shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_POINTER_OFFSET] = swap4(dis,
-              bytebuf);
-          shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_STRLEN_OFFSET] = swap4(dis, bytebuf);
-        } else {
-          shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_POINTER_OFFSET] = dis.readInt();
-          shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_STRLEN_OFFSET] = dis.readInt();
-        }
+        shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_POINTER_OFFSET] = readInt(dis, swap);
+        shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_STRLEN_OFFSET] = readInt(dis, swap);
         shdh.refHeap[i + StringHeapDeserializationHelper.STRING_LIST_ADDR_OFFSET] = 0;
       }
       shdh.refHeapPos = refheapsz + StringHeapDeserializationHelper.FIRST_CELL_REF;
@@ -1280,35 +1256,17 @@ public class CASImpl extends AbstractCas
       
       //if delta, handle modified fs heap cells
       if (delta) {
-        int fsmodssz = 0;
-        if (swap) {
-          fsmodssz = swap4(dis, bytebuf);
-        } else {
-          fsmodssz = dis.readInt();
-        }
+        int fsmodssz = readInt(dis, swap);
         for (int i = 0; i < fsmodssz; i++) {
-          if (swap) {
-            this.getHeap().heap[swap4(dis,bytebuf)] = swap4(dis, bytebuf);
-          } else {
-            this.getHeap().heap[dis.readInt()] = dis.readInt();
-          }
+          this.getHeap().heap[readInt(dis, swap)] = readInt(dis, swap);
         }
       }
 
       // indexed FSs
-      int fsindexsz = 0;
-      if (swap) {
-        fsindexsz = swap4(dis, bytebuf);
-      } else {
-        fsindexsz = dis.readInt();
-      }
+      int fsindexsz = readInt(dis, swap);
       int[] fsindexes = new int[fsindexsz];
       for (int i = 0; i < fsindexsz; i++) {
-        if (swap) {
-          fsindexes[i] = swap4(dis, bytebuf);
-        } else {
-          fsindexes[i] = dis.readInt();
-        }
+        fsindexes[i] = readInt(dis, swap);
       }
 
       // build the index
@@ -1319,218 +1277,156 @@ public class CASImpl extends AbstractCas
       }
       
       // byte heap
-      int byteheapsz = 0;
-      if (swap) {
-        byteheapsz = swap4(dis, bytebuf);
-      } else {
-        byteheapsz = dis.readInt();
-      }
+      int heapsz = readInt(dis, swap);
 
       if (!delta) {
-        this.getByteHeap().heap = new byte[Math.max(16, byteheapsz)]; // must
-        // be >
-        // 0
-        for (int i = 0; i < byteheapsz; i++) {
-          this.getByteHeap().heap[i] = dis.readByte();
-        }
-        this.getByteHeap().heapPos = byteheapsz;
+        this.getByteHeap().heap = new byte[Math.max(16, heapsz)]; // must be > 0
+        dis.read(this.getByteHeap().heap, 0, heapsz);
+        this.getByteHeap().heapPos = heapsz;
       }  else {
-        for (int i=0; i < byteheapsz; i++) {
-    	  this.getByteHeap().addByte(dis.readByte());
+        for (int i=0; i < heapsz; i++) {
+      	  this.getByteHeap().addByte(dis.readByte());
         }
       }
       // word alignment
-      int align = (4 - (byteheapsz % 4)) % 4;
-      for (int i = 0; i < align; i++) {
-        dis.readByte();
-      }
+      int align = (4 - (heapsz % 4)) % 4;
+      dis.skipBytes(align);
 
       // short heap
-      int shortheapsz = 0;
-      if (swap) {
-        shortheapsz = swap4(dis, bytebuf);
-      } else {
-        shortheapsz = dis.readInt();
-      }
+      heapsz = readInt(dis, swap);
       
       if (!delta) {
-        this.getShortHeap().heap = new short[Math.max(16, shortheapsz)]; // must
-        // be >
-        // 0
-        for (int i = 0; i < shortheapsz; i++) {
-          if (swap) {
-            this.getShortHeap().heap[i] = (short) swap2(dis, bytebuf);
-          } else {
-            this.getShortHeap().heap[i] = dis.readShort();
-          }
+        this.getShortHeap().heap = new short[Math.max(16, heapsz)]; // must be > 0
+        for (int i = 0; i < heapsz; i++) {
+          this.getShortHeap().heap[i] = readShort(dis, swap);
         }
-        this.getShortHeap().heapPos = shortheapsz;
+        this.getShortHeap().heapPos = heapsz;
       } else {
-    	for (int i = 0; i < shortheapsz; i++) {
-          if (swap) {
-            this.getShortHeap().addShort((short) swap2(dis, bytebuf));
-          } else {
-            this.getShortHeap().addShort(dis.readShort());
-          }
-         }
+      	for (int i = 0; i < heapsz; i++) {
+      	  this.getShortHeap().addShort(readShort(dis, swap));
+        }
       }
       // word alignment
-      if (shortheapsz % 2 != 0) {
+      if (heapsz % 2 != 0) {
         dis.readShort();
       }
 
       // long heap
-      int longheapsz = 0;
-      if (swap) {
-        longheapsz = swap4(dis, bytebuf);
-        bytebuf = new byte[8];
-      } else {
-        longheapsz = dis.readInt();
-      }
+      heapsz = readInt(dis, swap);
       
       if (!delta) {
-        this.getLongHeap().heap = new long[Math.max(16, longheapsz)]; // must
-        // be >
-        // 0
-        for (int i = 0; i < longheapsz; i++) {
-          if (swap) {
-            this.getLongHeap().heap[i] = swap8(dis, bytebuf);
-          } else {
-            this.getLongHeap().heap[i] = dis.readLong();
-          }
+        this.getLongHeap().heap = new long[Math.max(16, heapsz)]; // must be > 0
+        for (int i = 0; i < heapsz; i++) {
+          this.getLongHeap().heap[i] = readLong(dis, swap);
         }
-        this.getLongHeap().heapPos = longheapsz;
+        this.getLongHeap().heapPos = heapsz;
       } else {
-    	for (int i = 0; i < longheapsz; i++) {
-          if (swap) {
-            this.getLongHeap().addLong( swap8(dis, bytebuf));
-          } else {
-            this.getLongHeap().addLong(dis.readLong());
-          }
+      	for (int i = 0; i < heapsz; i++) {
+      	  this.getLongHeap().addLong(readLong(dis, swap));
         }
       }
       
       if (delta)  {
-        //modified Byte Heap
-    	if (swap) {
-    	  byteheapsz = swap4(dis, bytebuf);
-    	} else {
-    	  byteheapsz = dis.readInt();
-    	}
-    	if (byteheapsz > 0) {
-    	  int[] byteHeapAddrs = new int[byteheapsz];
-    	  for (int i=0; i < byteheapsz; i++) {
-    		if (swap) {
-    	      byteHeapAddrs[i] = swap4(dis, bytebuf);
-    	    } else {
-    	      byteHeapAddrs[i] = dis.readInt();
-    	    }
-    	  }
-    	  for (int i=0; i < byteheapsz; i++) {
-    	    this.getByteHeap().heap[byteHeapAddrs[i]] = dis.readByte();
-    	  }
-    	}
-    	// word alignment
-        align = (4 - (byteheapsz % 4)) % 4;
-        for (int i = 0; i < align; i++) {
-          dis.readByte();
-        }
-        
-        //modified Short Heap
-    	if (swap) {
-      	  shortheapsz = swap4(dis, bytebuf);
-      	} else {
-      	  shortheapsz = dis.readInt();
-      	}
-      	if (shortheapsz > 0) {
-      	  int[] shortHeapAddrs = new int[shortheapsz];
-      	  for (int i=0; i < shortheapsz; i++) {
-      		if (swap) {
-      	      shortHeapAddrs[i] = swap4(dis, bytebuf);
-      	    } else {
-      	      shortHeapAddrs[i] = dis.readInt();
-      	    }
+          //modified Byte Heap
+        heapsz = readInt(dis, swap);
+      	if (heapsz > 0) {
+      	  int[] heapAddrs = new int[heapsz];
+      	  for (int i = 0; i < heapsz; i++) {
+      	    heapAddrs[i] = readInt(dis, swap);
       	  }
-      	  for (int i=0; i < shortheapsz; i++) {
-      		if (swap) {
-              this.getShortHeap().heap[i] = (short) swap2(dis, bytebuf);
-            } else {
-              this.getShortHeap().heap[i] = dis.readShort();
-            }
+      	  for (int i = 0; i < heapsz; i++) {
+      	    this.getByteHeap().heap[heapAddrs[i]] = dis.readByte();
       	  }
       	}
+      	// word alignment
+        align = (4 - (heapsz % 4)) % 4;
+        dis.skipBytes(align);
+        
+        //modified Short Heap
+        heapsz = readInt(dis, swap);
+        if (heapsz > 0) {
+          int[] heapAddrs = new int[heapsz];
+      	  for (int i = 0; i < heapsz; i++) {
+            heapAddrs[i] = readInt(dis, swap);
+          }
+          for (int i = 0; i < heapsz; i++) {
+            this.getShortHeap().heap[heapAddrs[i]] = readShort(dis, swap);
+       	  }
+      	}
       	
         // word alignment
-        if (shortheapsz % 2 != 0) {
+        if (heapsz % 2 != 0) {
           dis.readShort();
         }
       
         //modified Long Heap
-      	if (swap) {
-          longheapsz = swap4(dis, bytebuf);
-        } else {
-          longheapsz = dis.readInt();
-        }
-        if (longheapsz > 0) {
-          int[] longHeapAddrs = new int[shortheapsz];
-          for (int i=0; i < shortheapsz; i++) {
-        	if (swap) {
-        	  longHeapAddrs[i] = swap4(dis, bytebuf);
-        	} else {
-        	  longHeapAddrs[i] = dis.readInt();
-        	}
+        heapsz = readInt(dis, swap);
+        if (heapsz > 0) {
+          int[] heapAddrs = new int[heapsz];
+          for (int i = 0; i < heapsz; i++) {
+            heapAddrs[i] = readInt(dis, swap);
           }
-          for (int i=0; i < longheapsz; i++) {
-        	if (swap) {
-              this.getLongHeap().heap[i] = (short) swap8(dis, bytebuf);
-            } else {
-              this.getLongHeap().heap[i] = dis.readLong();
-            }
+          for (int i = 0; i < heapsz; i++) {
+            this.getLongHeap().heap[heapAddrs[i]] = readLong(dis, swap);
           }
         }
-    	
-      }
+      } // of delta - modified processing
     } catch (IOException e) {
       CASRuntimeException exception = new CASRuntimeException(
           CASRuntimeException.BLOB_DESERIALIZATION, new String[] { e.getMessage() });
       throw exception;
     }
   }
-
-  private long swap8(DataInputStream dis, byte[] buf) throws IOException {
-
-    buf[7] = dis.readByte();
-    buf[6] = dis.readByte();
-    buf[5] = dis.readByte();
-    buf[4] = dis.readByte();
-    buf[3] = dis.readByte();
-    buf[2] = dis.readByte();
-    buf[1] = dis.readByte();
-    buf[0] = dis.readByte();
-    ByteBuffer bb = ByteBuffer.wrap(buf);
-    return bb.getLong();
-  }
-
-  private int swap4(DataInputStream dis, byte[] buf) throws IOException {
-    buf[3] = dis.readByte();
-    buf[2] = dis.readByte();
-    buf[1] = dis.readByte();
-    buf[0] = dis.readByte();
-    ByteBuffer bb = ByteBuffer.wrap(buf);
-    return bb.getInt();
-  }
-
-  private char swap2(DataInputStream dis, byte[] buf) throws IOException {
-    buf[1] = dis.readByte();
-    buf[0] = dis.readByte();
-    ByteBuffer bb = ByteBuffer.wrap(buf, 0, 2);
-    return bb.getChar();
+  
+  private long readLong(DataInputStream dis, boolean swap) throws IOException { // reads one 8-byte value
+    long v = dis.readLong(); // DataInputStream assembles the value high byte first
+    return swap ? Long.reverseBytes(v) : v; // swap == true: blob was written in the opposite byte order
+  }
+  
+  private int readInt(DataInputStream dis, boolean swap) throws IOException { // reads one 4-byte value
+    int v = dis.readInt(); // DataInputStream assembles the value high byte first
+    return swap ? Integer.reverseBytes(v) : v; // swap == true: blob was written in the opposite byte order
+  }
+  
+  private short readShort(DataInputStream dis, boolean swap) throws IOException { // reads one 2-byte value
+    short v = dis.readShort(); // callers cast the result to char when filling the string char heap
+    return swap ? Short.reverseBytes(v) : v; // swap == true: blob was written in the opposite byte order
+  }
+
+//  private long swap8(DataInputStream dis, byte[] buf) throws IOException {
+//
+//    buf[7] = dis.readByte();
+//    buf[6] = dis.readByte();
+//    buf[5] = dis.readByte();
+//    buf[4] = dis.readByte();
+//    buf[3] = dis.readByte();
+//    buf[2] = dis.readByte();
+//    buf[1] = dis.readByte();
+//    buf[0] = dis.readByte();
+//    ByteBuffer bb = ByteBuffer.wrap(buf);
+//    return bb.getLong();
+//  }
+//
+//  private int swap4(DataInputStream dis, byte[] buf) throws IOException {
+//    buf[3] = dis.readByte();
+//    buf[2] = dis.readByte();
+//    buf[1] = dis.readByte();
+//    buf[0] = dis.readByte();
+//    ByteBuffer bb = ByteBuffer.wrap(buf);
+//    return bb.getInt();
+//  }
+//
+//  private char swap2(DataInputStream dis, byte[] buf) throws IOException {
+//    buf[1] = dis.readByte();
+//    buf[0] = dis.readByte();
+//    ByteBuffer bb = ByteBuffer.wrap(buf, 0, 2);
+//    return bb.getChar();
+//  }
 
   // assumes:
   // indexes are empty on entry
   //   
-  private void reinitIndexedFSs(int[] fsIndex) {
+  void reinitIndexedFSs(int[] fsIndex) {
     // Add FSs to index repository for base CAS
     int numViews = fsIndex[0];
     int loopLen = fsIndex[1]; // number of sofas, not necessarily the same as
@@ -1578,7 +1474,7 @@ public class CASImpl extends AbstractCas
   }
   
   // fsIndex contains added, removed and reindexed FS per view
-  private void reinitDeltaIndexedFSs(int[] fsIndex) {
+  void reinitDeltaIndexedFSs(int[] fsIndex) {
 	// Add FSs to index repository for base CAS
 	int numViews = fsIndex[0]; //total number of views
 	int loopLen = fsIndex[1]; // number of sofas, not necessarily the same as
@@ -1664,7 +1560,7 @@ public class CASImpl extends AbstractCas
       if (loopIndexRep != null) {
         fsLoopIndex = loopIndexRep.getIndexedFSs();
       } else {
-        fsLoopIndex = (new IntVector()).toArray();
+        fsLoopIndex = INT0;
       }
       v.add(fsLoopIndex.length);
       for (int k = 0; k < fsLoopIndex.length; k++) {
@@ -1674,6 +1570,7 @@ public class CASImpl extends AbstractCas
     return v.toArray();
   }
   
+ 
   
   //Delta IndexedFSs format:
   // number of views
@@ -1719,9 +1616,9 @@ public class CASImpl extends AbstractCas
         fsDeletedFromIndex = loopIndexRep.getDeletedFSs();
         fsReindexed = loopIndexRep.getReindexedFSs();
       } else {
-        fsLoopIndex = (new IntVector()).toArray();
-        fsDeletedFromIndex = (new IntVector()).toArray();
-        fsReindexed = (new IntVector()).toArray();
+        fsLoopIndex = INT0;
+        fsDeletedFromIndex = INT0;
+        fsReindexed = INT0;
       }
       v.add(fsLoopIndex.length);
       for (int k = 0; k < fsLoopIndex.length; k++) {
@@ -3926,14 +3823,14 @@ public class CASImpl extends AbstractCas
 
   @SuppressWarnings("unchecked")
   public AnnotationIndex<AnnotationFS> getAnnotationIndex() {
-    return new AnnotationIndexImpl(
+    return new AnnotationIndexImpl<AnnotationFS>(
             (FSIndex<AnnotationFS>) (FSIndex<?>) getIndexRepository().getIndex(
              CAS.STD_ANNOTATION_INDEX));
   }
 
   @SuppressWarnings("unchecked")
   public AnnotationIndex<AnnotationFS> getAnnotationIndex(Type type) {
-    return new AnnotationIndexImpl(
+    return new AnnotationIndexImpl<AnnotationFS>(
             (FSIndex<AnnotationFS>) (FSIndex<?>) getIndexRepository().getIndex(
             CAS.STD_ANNOTATION_INDEX, type));
   }
@@ -4390,4 +4287,16 @@ public class CASImpl extends AbstractCas
 		return this.svd.modifiedLongHeapCells;  
   }
   
+  /**
+   * Serialize this CAS in compressed binary form by delegating to a BinaryCasSerDes4 instance kept in the shared view data.
+   * @param out - an OutputStream, a DataOutputStream, or a File
+   * @throws IOException presumably propagated from BinaryCasSerDes4.serialize - confirm against that class
+   */
+  public void serializeWithCompression(Object out) throws IOException {
+    if (svd.binaryCompressor == null) { // created on first use, then reused by later calls
+      svd.binaryCompressor = new BinaryCasSerDes4(this.getTypeSystemImpl(), false);
+    }
+    svd.binaryCompressor.serialize(this, out);
+  }
+  
 }

Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/CASSerializer.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/CASSerializer.java?rev=1406261&r1=1406260&r2=1406261&view=diff
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/CASSerializer.java (original)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/CASSerializer.java Tue Nov  6 18:22:03 2012
@@ -30,10 +30,38 @@ import org.apache.uima.cas.CASRuntimeExc
 import org.apache.uima.cas.Marker;
 
 /**
- * Serialization for CAS. This serializes the state of the CAS, assuming that the type and index
+ * Binary Serialization for CAS. This serializes the state of the CAS, assuming that the type and index
  * information remains constant. <code>CASSerializer</code> objects can be serialized with
- * standard Java serialization.
+ * standard Java serialization; many uses of this class follow this form:
  * 
+ * 1) create an instance of this class
+ * 2) add a Cas to it (via addCAS methods)
+ * 3) use the instance of this class as the argument to anObjectOutputStream.writeObject(anInstanceOfThisClass)
+ *    In UIMA this is done in the SerializationUtils class.
+ * 
+ * There are also custom serialization methods that serialize to outputStreams.
+ * 
+ * The format of the serialized data is in one of several formats:
+ *   normal Java object serialization / custom binary serialization
+ *
+ *   The custom binary serialization is in several formats:
+ *     full / delta:
+ *       full - the entire cas
+ *       delta - only differences from a previous "mark" are serialized
+ *     uncompressed / compressed / compressed (fast)
+ *       uncompressed
+ *       compressed - trades off time for space to give the most compression
+ *       compressed (fast) - less compression, but faster 
+ *     
+ * This class is for internal use.  Some of the serialized formats are readable by the C++
+ * implementation, and used for efficiently transferring CASes between Java frameworks and other ones.
+ * Others are used with Vinci or SOAP to communicate to remote annotators.
+ * 
+ * External interfaces to compressed forms of this serialization are provided by the 
+ * user class org.apache.uima.util.Compression
+ * 
+ * To serialize the shared common information among a group of CASes sharing the same
+ * type definition and index specifications, 
  * @see org.apache.uima.cas.impl.CASMgrSerializer
  * 
  * 
@@ -73,8 +101,7 @@ public class CASSerializer implements Se
   /**
    * Serialize CAS data without heap-internal meta data. Currently used for serialization to C++.
    * 
-   * @param casImpl
-   *                The CAS to be serialized.
+   * @param casImpl The CAS to be serialized.
    */
   public void addNoMetaData(CASImpl casImpl) {
     addCAS(casImpl, false);
@@ -84,8 +111,7 @@ public class CASSerializer implements Se
    * Add the CAS to be serialized. Note that we need the implementation here, the interface is not
    * enough.
    * 
-   * @param cas
-   *                The CAS to be serialized.
+   * @param cas The CAS to be serialized.
    */
   public void addCAS(CASImpl cas) {
     addCAS(cas, true);
@@ -95,8 +121,7 @@ public class CASSerializer implements Se
    * Add the CAS to be serialized. Note that we need the implementation here, the interface is not
    * enough.
    * 
-   * @param cas
-   *                The CAS to be serialized.
+   * @param cas The CAS to be serialized.
    */
   public void addCAS(CASImpl cas, boolean addMetaData) {
     this.fsIndex = cas.getIndexedFSs();
@@ -104,7 +129,12 @@ public class CASSerializer implements Se
     this.heapArray = new int[heapSize];
     System.arraycopy(cas.getHeap().heap, 0, this.heapArray, 0, heapSize);
     if (addMetaData) {
-      this.heapMetaData = cas.getHeap().getMetaData();
+      // some details about current main-heap specifications
+      // not required to deserialize
+      // not sent for C++
+      // is 7 words long
+      // not serialized by custom serializers, only by Java object serialization
+      this.heapMetaData = cas.getHeap().getMetaData();  
     }
     this.stringTable = stringArrayListToArray(cas.getStringTable());
 
@@ -120,29 +150,113 @@ public class CASSerializer implements Se
     this.longHeapArray = new long[longHeapSize];
     System.arraycopy(cas.getLongHeap().heap, 0, this.longHeapArray, 0, longHeapSize);
   }
+  
+  // version
+  // encode: bits 7 6 5 4 3 2 1 0
+  //                        0 0 1 = no delta, no compression
+  //                        0 1 - = delta, no compression
+  //                        1 d - = compression, w/wo delta
+
+  static void outputVersion(int version, DataOutputStream dos) throws IOException {
+    // Writes the 8-byte blob header: the 4-byte key "UIMA" followed by the 4-byte version word
+    // (see the bit encoding described in the comment above this method).
+    byte[] uima = new byte[4];
+    uima[0] = 85; // U
+    uima[1] = 73; // I
+    uima[2] = 77; // M
+    uima[3] = 65; // A
+
+    ByteBuffer buf = ByteBuffer.wrap(uima); // a freshly wrapped ByteBuffer is big-endian
+    int key = buf.asIntBuffer().get(); // the four key bytes packed into a single int, 'U' in the high byte
+
+    dos.writeInt(key); // writeInt emits the high byte first, so 'U' is the first byte in the stream
+    dos.writeInt(version);
+  }
+  
+  private void outputStringHeap(DataOutputStream dos, CASImpl cas, StringHeapDeserializationHelper shdh) throws IOException {
+    // Writes the string section of the blob: the total char count, the char heap contents
+    // (plus padding), then the string ref heap as (offset, length) pairs.
+    // compute the total size of the data in stringHeap
+    // total size = char buffer length + length of strings in the string list;
+    int stringHeapLength = shdh.charHeapPos;
+    int stringListLength = 0;
+    for (int i = 0; i < shdh.refHeap.length; i += 3) { // NOTE(review): stride 3 is presumably REF_HEAP_CELL_SIZE - confirm
+      int ref = shdh.refHeap[i + StringHeapDeserializationHelper.STRING_LIST_ADDR_OFFSET];
+      // this is a string in the string list
+      // get length and add to total string heap length
+      if (ref != 0) {
+        // terminate each string with a null
+        stringListLength += 1 + cas.getStringHeap().getStringForCode(ref).length();
+      }
+    }
+
+    int stringTotalLength = stringHeapLength + stringListLength; // NOTE(review): string-list chars are counted here but not written by this method - confirm
+    if (stringHeapLength == 0 && stringListLength > 0) {
+      // nothing from stringHeap
+      // add 1 for the null at the beginning
+      stringTotalLength += 1;
+    }
+    dos.writeInt(stringTotalLength);
+
+    // write the data in the stringheap, if there is any
+    if (stringTotalLength > 0) {
+      if (shdh.charHeapPos > 0) {
+        dos.writeChars(String.valueOf(shdh.charHeap, 0, shdh.charHeapPos)); // each char is written as 2 bytes, high byte first
+      } else {
+        // no stringheap data
+        // if there is data in the string lists, write a leading 0
+        if (stringListLength > 0) {
+          dos.writeChar(0);
+        }
+      }
+
+      // word alignment: an odd number of 2-byte chars gets one padding char
+      if (stringTotalLength % 2 != 0) {
+        dos.writeChar(0);
+      }
+    }
+
+    // write out the string ref heap
+    // each reference consists of an offset into the stringheap and a length
+    int refheapsz = ((shdh.refHeap.length - StringHeapDeserializationHelper.FIRST_CELL_REF) / StringHeapDeserializationHelper.REF_HEAP_CELL_SIZE) * 2;
+    refheapsz++; // presumably accounts for the single leading 0 written just below - confirm against the reader
+    dos.writeInt(refheapsz);
+    dos.writeInt(0);
+    for (int i = StringHeapDeserializationHelper.FIRST_CELL_REF; i < shdh.refHeap.length; i += 3) { // NOTE(review): stride 3 again, presumably REF_HEAP_CELL_SIZE
+      dos.writeInt(shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_POINTER_OFFSET]);
+      dos.writeInt(shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_STRLEN_OFFSET]);
+    }
+  }
 
   /**
    * Serializes the CAS data and writes it to the output stream.
-   * --------------------------------------------------------------------- Blob Format
-   * 
-   * Element Size Number of Description (bytes) Elements ------------ ---------
-   * -------------------------------- 4 1 Blob key = "UIMA" in utf-8 4 1 Version (currently = 1) 4 1
-   * size of 32-bit FS Heap array = s32H 4 s32H 32-bit FS heap array 4 1 size of 16-bit string Heap
-   * array = sSH 2 sSH 16-bit string heap array 4 1 size of string Ref Heap array = sSRH 4 2*sSRH
-   * string ref offsets and lengths 4 1 size of FS index array = sFSI 4 sFSI FS index array
-   * 
-   * 4 1 size of 8-bit Heap array = s8H 1 s8H 8-bit Heap array 4 1 size of 16-bit Heap array = s16H
-   * 2 s16H 16-bit Heap array 4 1 size of 64-bit Heap array = s64H 8 s64H 64-bit Heap array
+   * --------------------------------------------------------------------- 
+   * Blob Format
+   * Element Size Number of Description 
+   * (bytes)      Elements 
+   * ------------ --------- -------------------------------- 
+   * 4            1         Blob key = "UIMA" in utf-8 
+   * 4            1         Version (currently = 1) 
+   * 4            1         size of 32-bit FS Heap array = s32H 
+   * 4            s32H      32-bit FS heap array 
+   * 4            1         size of 16-bit string Heap array = sSH  
+   * 2            sSH       16-bit string heap array 
+   * 4            1         size of string Ref Heap array = sSRH 
+   * 4            2*sSRH    string ref offsets and lengths 
+   * 4            1         size of FS index array = sFSI 
+   * 4            sFSI      FS index array
+   * 4            1         size of 8-bit Heap array = s8H  
+   * 1            s8H       8-bit Heap array 
+   * 4            1         size of 16-bit Heap array = s16H 
+   * 2            s16H      16-bit Heap array 
+   * 4            1         size of 64-bit Heap array = s64H 
+   * 8            s64H      64-bit Heap array
    * ---------------------------------------------------------------------
    * 
-   * This reads in and deserializes CAS data from a stream. Byte swapping may be needed is the blob
-   * is from C++ -- C++ blob serialization writes data in native byte order.
-   * 
-   * @param cas
-   *                The CAS to be serialized. ostream The output stream.
+   * @param cas  The CAS to be serialized. ostream The output stream.
    */
   public void addCAS(CASImpl cas, OutputStream ostream) {
-
+ 
     try {
 
       DataOutputStream dos = new DataOutputStream(ostream);
@@ -151,20 +265,8 @@ public class CASSerializer implements Se
       this.fsIndex = cas.getIndexedFSs();
 
       // output the key and version number
-
-      byte[] uima = new byte[4];
-      uima[0] = 85; // U
-      uima[1] = 73; // I
-      uima[2] = 77; // M
-      uima[3] = 65; // A
-
-      ByteBuffer buf = ByteBuffer.wrap(uima);
-      int key = buf.asIntBuffer().get();
-
-      int version = 1;
-      dos.writeInt(key);
-      dos.writeInt(version);
-
+      outputVersion(1, dos);
+      
       // output the FS heap
       final int heapSize = cas.getHeap().getCellsUsed();
       dos.writeInt(heapSize);
@@ -175,56 +277,57 @@ public class CASSerializer implements Se
       // output the strings
       StringHeapDeserializationHelper shdh = cas.getStringHeap().serialize();
 
-      // compute the number of total size of data in stringHeap
-      // total size = char buffer length + length of strings in the string list;
-      int stringHeapLength = shdh.charHeapPos;
-      int stringListLength = 0;
-      for (int i = 0; i < shdh.refHeap.length; i += 3) {
-        int ref = shdh.refHeap[i + StringHeapDeserializationHelper.STRING_LIST_ADDR_OFFSET];
-        // this is a string in the string list
-        // get length and add to total string heap length
-        if (ref != 0) {
-          // terminate each string with a null
-          stringListLength += 1 + cas.getStringHeap().getStringForCode(ref).length();
-        }
-      }
-
-      int stringTotalLength = stringHeapLength + stringListLength;
-      if (stringHeapLength == 0 && stringListLength > 0) {
-        // nothing from stringHeap
-        // add 1 for the null at the beginning
-        stringTotalLength += 1;
-      }
-      dos.writeInt(stringTotalLength);
-
-      // write the data in the stringheap, if there is any
-      if (stringTotalLength > 0) {
-        if (shdh.charHeapPos > 0) {
-          dos.writeChars(String.valueOf(shdh.charHeap, 0, shdh.charHeapPos));
-        } else {
-          // no stringheap data
-          // if there is data in the string lists, write a leading 0
-          if (stringListLength > 0) {
-            dos.writeChar(0);
-          }
-        }
-
-        // word alignment
-        if (stringTotalLength % 2 != 0) {
-          dos.writeChar(0);
-        }
-      }
-
-      // write out the string ref heap
-      // each reference consist of a offset into stringheap and a length
-      int refheapsz = ((shdh.refHeap.length - StringHeapDeserializationHelper.FIRST_CELL_REF) / StringHeapDeserializationHelper.REF_HEAP_CELL_SIZE) * 2;
-      refheapsz++;
-      dos.writeInt(refheapsz);
-      dos.writeInt(0);
-      for (int i = StringHeapDeserializationHelper.FIRST_CELL_REF; i < shdh.refHeap.length; i += 3) {
-        dos.writeInt(shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_POINTER_OFFSET]);
-        dos.writeInt(shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_STRLEN_OFFSET]);
-      }
+      outputStringHeap(dos, cas, shdh);
+//      // compute the number of total size of data in stringHeap
+//      // total size = char buffer length + length of strings in the string list;
+//      int stringHeapLength = shdh.charHeapPos;
+//      int stringListLength = 0;
+//      for (int i = 0; i < shdh.refHeap.length; i += 3) {
+//        int ref = shdh.refHeap[i + StringHeapDeserializationHelper.STRING_LIST_ADDR_OFFSET];
+//        // this is a string in the string list
+//        // get length and add to total string heap length
+//        if (ref != 0) {
+//          // terminate each string with a null
+//          stringListLength += 1 + cas.getStringHeap().getStringForCode(ref).length();
+//        }
+//      }
+//
+//      int stringTotalLength = stringHeapLength + stringListLength;
+//      if (stringHeapLength == 0 && stringListLength > 0) {
+//        // nothing from stringHeap
+//        // add 1 for the null at the beginning
+//        stringTotalLength += 1;
+//      }
+//      dos.writeInt(stringTotalLength);
+//
+//      // write the data in the stringheap, if there is any
+//      if (stringTotalLength > 0) {
+//        if (shdh.charHeapPos > 0) {
+//          dos.writeChars(String.valueOf(shdh.charHeap, 0, shdh.charHeapPos));
+//        } else {
+//          // no stringheap data
+//          // if there is data in the string lists, write a leading 0
+//          if (stringListLength > 0) {
+//            dos.writeChar(0);
+//          }
+//        }
+//
+//        // word alignment
+//        if (stringTotalLength % 2 != 0) {
+//          dos.writeChar(0);
+//        }
+//      }
+//
+//      // write out the string ref heap
+//      // each reference consist of a offset into stringheap and a length
+//      int refheapsz = ((shdh.refHeap.length - StringHeapDeserializationHelper.FIRST_CELL_REF) / StringHeapDeserializationHelper.REF_HEAP_CELL_SIZE) * 2;
+//      refheapsz++;
+//      dos.writeInt(refheapsz);
+//      dos.writeInt(0);
+//      for (int i = StringHeapDeserializationHelper.FIRST_CELL_REF; i < shdh.refHeap.length; i += 3) {
+//        dos.writeInt(shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_POINTER_OFFSET]);
+//        dos.writeInt(shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_STRLEN_OFFSET]);
+//      }
 
       // output the index FSs
       dos.writeInt(this.fsIndex.length);
@@ -279,33 +382,33 @@ public class CASSerializer implements Se
    * 
    * ElementSize NumberOfElements Description
    * ----------- ---------------- ---------------------------------------------------------
-   * 4				1				Blob key = "UIMA" in utf-8 (byte order flag)
-   * 4				1				Version (1 = complete cas, 2 = delta cas)
-   * 4				1				size of 32-bit heap array = s32H
-   * 4            s32H              32-bit FS heap array (new elements) 
-   * 4              1 				size of 16-bit string Heap array = sSH 
-   * 2 			   sSH 				16-bit string heap array (new strings)
-   * 4 				1 				size of string Ref Heap array = sSRH 
-   * 4 			2*sSRH				string ref offsets and lengths (for new strings)
-   * 4              1				number of modified, preexisting 32-bit modified FS heap elements = sM32H
-   * 4			2*sM32H             32-bit heap offset and value (preexisting cells modified)	 
-   * 4 	            1 				size of FS index array = sFSI 
-   * 4			  sFSI 				FS index array in Delta format
-   * 4 				1 				size of 8-bit Heap array = s8H 
-   * 1 			  s8H 				8-bit Heap array (new elements)
-   * 4 				1 				size of 16-bit Heap array = s16H
-   * 2 			  s16H 				16-bit Heap array (new elements) 
-   * 4 				1 				size of 64-bit Heap array = s64H 
-   * 8 			  s64H 				64-bit Heap array (new elements)
-   * 4				1				number of modified, preexisting 8-bit heap elements = sM8H
-   * 4			  sM8H              8-bit heap offsets (preexisting cells modified)
-   * 1			  sM8H              8-bit heap values  (preexisting cells modified)
-   * 4				1				number of modified, preexisting 16-bit heap elements = sM16H
-   * 4			  sM16H             16-bit heap offsets (preexisting cells modified)
-   * 2			  sM16H             16-bit heap values  (preexisting cells modified)
-   * 4				1				number of modified, preexisting 64-bit heap elements = sM64H
-   * 4			  sM64H             64-bit heap offsets (preexisting cells modified)
-   * 2			  sM64H             64-bit heap values  (preexisting cells modified)
+   * 4				   1				        Blob key = "UIMA" in utf-8 (byte order flag)
+   * 4				   1				        Version (1 = complete cas, 2 = delta cas)
+   * 4				   1				        size of 32-bit heap array = s32H
+   * 4           s32H             32-bit FS heap array (new elements) 
+   * 4           1 				        size of 16-bit string Heap array = sSH 
+   * 2 			     sSH 				      16-bit string heap array (new strings)
+   * 4 				   1 				        size of string Ref Heap array = sSRH 
+   * 4 			     2*sSRH				    string ref offsets and lengths (for new strings)
+   * 4           1        				number of modified, preexisting 32-bit FS heap elements = sM32H
+   * 4			     2*sM32H          32-bit heap offset and value (preexisting cells modified)	 
+   * 4 	         1 	        			size of FS index array = sFSI 
+   * 4		       sFSI 	    			FS index array in Delta format
+   * 4 		 	  	 1 			        	size of 8-bit Heap array = s8H 
+   * 1 			     s8H 			      	8-bit Heap array (new elements)
+   * 4 			  	 1 			        	size of 16-bit Heap array = s16H
+   * 2 			     s16H 				    16-bit Heap array (new elements) 
+   * 4 			  	 1 			        	size of 64-bit Heap array = s64H 
+   * 8 			     s64H 				    64-bit Heap array (new elements)
+   * 4				   1			        	number of modified, preexisting 8-bit heap elements = sM8H
+   * 4			     sM8H             8-bit heap offsets (preexisting cells modified)
+   * 1			     sM8H             8-bit heap values  (preexisting cells modified)
+   * 4			  	 1				        number of modified, preexisting 16-bit heap elements = sM16H
+   * 4			     sM16H            16-bit heap offsets (preexisting cells modified)
+   * 2			     sM16H            16-bit heap values  (preexisting cells modified)
+   * 4			  	 1				        number of modified, preexisting 64-bit heap elements = sM64H
+   * 4			     sM64H            64-bit heap offsets (preexisting cells modified)
+   * 8			     sM64H            64-bit heap values  (preexisting cells modified)
    * 
    * 
    * @param cas
@@ -327,20 +430,9 @@ public class CASSerializer implements Se
       this.fsIndex = cas.getDeltaIndexedFSs(mark);
       
       // output the key and version number
-
-      byte[] uima = new byte[4];
-      uima[0] = 85; // U
-      uima[1] = 73; // I
-      uima[2] = 77; // M
-      uima[3] = 65; // A
-
-      ByteBuffer buf = ByteBuffer.wrap(uima);
-      int key = buf.asIntBuffer().get();
-
-      int version = 2;    //1 = current full serialization; 2 = delta format 
-                          //perhaps this should be split into 2 bytes for version and 2 bytes for format.
-      dos.writeInt(key);
-      dos.writeInt(version);
+      //1 = current full serialization; 2 = delta format 
+      //perhaps this should be split into 2 bytes for version and 2 bytes for format.
+      outputVersion(2, dos);
 
       // output the new FS heap cells
       final int heapSize = cas.getHeap().getCellsUsed() - mark.nextFSId;
@@ -353,56 +445,57 @@ public class CASSerializer implements Se
       // output the new strings
       StringHeapDeserializationHelper shdh = cas.getStringHeap().serialize(mark.nextStringHeapAddr);
 
-      // compute the number of total size of data in stringHeap
-      // total size = char buffer length + length of strings in the string list;
-      int stringHeapLength = shdh.charHeapPos;
-      int stringListLength = 0;
-      for (int i = 0; i < shdh.refHeap.length; i += 3) {
-        int ref = shdh.refHeap[i + StringHeapDeserializationHelper.STRING_LIST_ADDR_OFFSET];
-        // this is a string in the string list
-        // get length and add to total string heap length
-        if (ref != 0) {
-          // terminate each string with a null
-          stringListLength += 1 + cas.getStringHeap().getStringForCode(ref).length();
-        }
-      }
-
-      int stringTotalLength = stringHeapLength + stringListLength;
-      if (stringHeapLength == 0 && stringListLength > 0) {
-        // nothing from stringHeap
-        // add 1 for the null at the beginning
-        stringTotalLength += 1;
-      }
-      dos.writeInt(stringTotalLength);
-
-      // write the data in the stringheap, if there is any
-      if (stringTotalLength > 0) {
-        if (shdh.charHeapPos > 0) {
-          dos.writeChars(String.valueOf(shdh.charHeap, 0, shdh.charHeapPos));
-        } else {
-          // no stringheap data
-          // if there is data in the string lists, write a leading 0
-          if (stringListLength > 0) {
-            dos.writeChar(0);
-          }
-        }
-
-        // word alignment
-        if (stringTotalLength % 2 != 0) {
-          dos.writeChar(0);
-        }
-      }
-
-      // write out the string ref heap
-      // each reference consist of a offset into stringheap and a length
-      int refheapsz = ((shdh.refHeap.length - StringHeapDeserializationHelper.FIRST_CELL_REF) / StringHeapDeserializationHelper.REF_HEAP_CELL_SIZE) * 2;
-      refheapsz++;
-      dos.writeInt(refheapsz);
-      dos.writeInt(0);
-      for (int i = StringHeapDeserializationHelper.FIRST_CELL_REF; i < shdh.refHeap.length; i += 3) {
-        dos.writeInt(shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_POINTER_OFFSET]);
-        dos.writeInt(shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_STRLEN_OFFSET]);
-      }
+      outputStringHeap(dos, cas, shdh);
+//      // compute the number of total size of data in stringHeap
+//      // total size = char buffer length + length of strings in the string list;
+//      int stringHeapLength = shdh.charHeapPos;
+//      int stringListLength = 0;
+//      for (int i = 0; i < shdh.refHeap.length; i += 3) {
+//        int ref = shdh.refHeap[i + StringHeapDeserializationHelper.STRING_LIST_ADDR_OFFSET];
+//        // this is a string in the string list
+//        // get length and add to total string heap length
+//        if (ref != 0) {
+//          // terminate each string with a null
+//          stringListLength += 1 + cas.getStringHeap().getStringForCode(ref).length();
+//        }
+//      }
+//
+//      int stringTotalLength = stringHeapLength + stringListLength;
+//      if (stringHeapLength == 0 && stringListLength > 0) {
+//        // nothing from stringHeap
+//        // add 1 for the null at the beginning
+//        stringTotalLength += 1;
+//      }
+//      dos.writeInt(stringTotalLength);
+//
+//      // write the data in the stringheap, if there is any
+//      if (stringTotalLength > 0) {
+//        if (shdh.charHeapPos > 0) {
+//          dos.writeChars(String.valueOf(shdh.charHeap, 0, shdh.charHeapPos));
+//        } else {
+//          // no stringheap data
+//          // if there is data in the string lists, write a leading 0
+//          if (stringListLength > 0) {
+//            dos.writeChar(0);
+//          }
+//        }
+//
+//        // word alignment
+//        if (stringTotalLength % 2 != 0) {
+//          dos.writeChar(0);
+//        }
+//      }
+//
+//      // write out the string ref heap
+//      // each reference consist of a offset into stringheap and a length
+//      int refheapsz = ((shdh.refHeap.length - StringHeapDeserializationHelper.FIRST_CELL_REF) / StringHeapDeserializationHelper.REF_HEAP_CELL_SIZE) * 2;
+//      refheapsz++;
+//      dos.writeInt(refheapsz);
+//      dos.writeInt(0);
+//      for (int i = StringHeapDeserializationHelper.FIRST_CELL_REF; i < shdh.refHeap.length; i += 3) {
+//        dos.writeInt(shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_POINTER_OFFSET]);
+//        dos.writeInt(shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_STRLEN_OFFSET]);
+//      }
 
       //output modified FS Heap cells
       int[] fsHeapModifiedAddrs = cas.getModifiedFSHeapAddrs().toArray();
@@ -452,16 +545,16 @@ public class CASSerializer implements Se
       
       // 8 bit heap modified cells
       int[] byteHeapModifiedAddrs = cas.getModifiedByteHeapAddrs().toArray();
-      byte[] byteValues = new byte[byteHeapModifiedAddrs.length];
+//      byte[] byteValues = new byte[byteHeapModifiedAddrs.length];
       dos.writeInt(byteHeapModifiedAddrs.length);
       for (int i=0; i < byteHeapModifiedAddrs.length; i++) {
-    	dos.writeInt(byteHeapModifiedAddrs[i]);
-    	byteValues[i] = cas.getByteHeap().getHeapValue(byteHeapModifiedAddrs[i]);
+      	dos.writeInt(byteHeapModifiedAddrs[i]);
+//      	byteValues[i] = cas.getByteHeap().getHeapValue(byteHeapModifiedAddrs[i]);
       }
-      for (int i=0; i < byteValues.length; i++) {  
+      for (int i=0; i < byteHeapModifiedAddrs.length; i++) {  
     	  dos.writeByte(cas.getByteHeap().getHeapValue(byteHeapModifiedAddrs[i]));
-	  }
-      
+	    }
+
       // word alignment
       align = (4 - (byteheapsz % 4)) % 4;
       for (int i = 0; i < align; i++) {
@@ -504,7 +597,35 @@ public class CASSerializer implements Se
     }
 
   }
-
+  
+//  /**
+//   * Serialize with compression
+//   * Target is not constrained to the C++ format
+//   * For non delta serialization, pass marker with 0 as values
+//   * @throws IOException 
+//   */
+//
+//  public void serialize(CASImpl cas, OutputStream ostream, Marker marker) throws IOException {
+//    if (marker != null && !marker.isValid() ) {
+//      CASRuntimeException exception = new CASRuntimeException(
+//                CASRuntimeException.INVALID_MARKER, new String[] { "Invalid Marker." });
+//      throw exception;
+//    }
+//    MarkerImpl mark = (MarkerImpl) marker;
+//    DataOutputStream dos = new DataOutputStream(ostream);
+//
+//    this.fsIndex = cas.getDeltaIndexedFSs(mark);
+//    outputVersion(3 , dos);
+//    
+//    // output the new FS heap cells
+//    final int heapSize = cas.getHeap().getCellsUsed() - mark.nextFSId);
+//    compressHeapOut(dos, cas, heapSize, mark)
+//
+//    // output the new strings
+//
+//  }
+  
+  
   /**
    * Method stringArrayListToArray.
    * 

Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/MarkerImpl.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/MarkerImpl.java?rev=1406261&r1=1406260&r2=1406261&view=diff
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/MarkerImpl.java (original)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/MarkerImpl.java Tue Nov  6 18:22:03 2012
@@ -95,5 +95,25 @@ public class MarkerImpl implements Marke
   public boolean isValid() {
     return isValid;
   }
+
+  public int getNextFSId() {
+    return nextFSId;
+  }
+
+  public int getNextStringHeapAddr() {
+    return nextStringHeapAddr;
+  }
+
+  public int getNextByteHeapAddr() {
+    return nextByteHeapAddr;
+  }
+
+  public int getNextShortHeapAddr() {
+    return nextShortHeapAddr;
+  }
+
+  public int getNextLongHeapAddr() {
+    return nextLongHeapAddr;
+  }
   
 }

Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/StringHeap.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/StringHeap.java?rev=1406261&r1=1406260&r2=1406261&view=diff
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/StringHeap.java (original)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/StringHeap.java Tue Nov  6 18:22:03 2012
@@ -80,35 +80,7 @@ final class StringHeap {
    * @return Serialization helper that can be interpreted easier by serialization code.
    */
   StringHeapDeserializationHelper serialize() {
-    StringHeapDeserializationHelper shdh = new StringHeapDeserializationHelper();
-    // Ref heap is 3 times the size of the string list.
-    shdh.refHeap = new int[this.stringList.size()
-        * StringHeapDeserializationHelper.REF_HEAP_CELL_SIZE];
-    shdh.refHeapPos = shdh.refHeap.length;
-    // Compute required size of character heap.
-    int charHeapSize = 0;
-    for (int i = 0; i < this.stringList.size(); i++) {
-      String s = this.stringList.get(i);
-      if (s != null) {
-        charHeapSize += s.length();
-      }
-    }
-    shdh.charHeap = new char[charHeapSize];
-    shdh.charHeapPos = shdh.charHeap.length;
-
-    int charCount = 0;
-    // Now write out the actual data
-    for (int i = 1; i < this.stringList.size(); i++) {
-      String s = this.stringList.get(i);
-      int refHeapOffset = i * StringHeapDeserializationHelper.REF_HEAP_CELL_SIZE;
-      shdh.refHeap[refHeapOffset + StringHeapDeserializationHelper.CHAR_HEAP_POINTER_OFFSET] = charCount;
-      shdh.refHeap[refHeapOffset + StringHeapDeserializationHelper.CHAR_HEAP_STRLEN_OFFSET] = s
-          .length();
-      System.arraycopy(s.toCharArray(), 0, shdh.charHeap, charCount, s.length());
-      charCount += s.length();
-    }
-    assert (charCount == shdh.charHeap.length);
-    return shdh;
+    return serialize(1);  
   }
   
   StringHeapDeserializationHelper serialize(int startPos) {

Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/CasCopier.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/CasCopier.java?rev=1406261&r1=1406260&r2=1406261&view=diff
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/CasCopier.java (original)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/CasCopier.java Tue Nov  6 18:22:03 2012
@@ -101,11 +101,18 @@ public class CasCopier {
   public static void copyCas(CAS aSrcCas, CAS aDestCas, boolean aCopySofa) {
     CasCopier copier = new CasCopier(aSrcCas, aDestCas);
     
-    Iterator<SofaFS> sofaIter = aSrcCas.getSofaIterator();
-    while (sofaIter.hasNext()) {
-      SofaFS sofa = sofaIter.next();
-      CAS view = aSrcCas.getView(sofa);
-      copier.copyCasView(view, aCopySofa);
+    // oops, this misses the initial view if a sofa FS has not yet been created
+//    Iterator<SofaFS> sofaIter = aSrcCas.getSofaIterator();
+//    while (sofaIter.hasNext()) {
+//      SofaFS sofa = sofaIter.next();
+//      CAS view = aSrcCas.getView(sofa);
+//      copier.copyCasView(view, aCopySofa);
+//    }
+    
+    Iterator<CAS> viewIterator = aSrcCas.getViewIterator();
+    while (viewIterator.hasNext()) {
+      CAS view = viewIterator.next();
+      copier.copyCasView(view, aCopySofa);     
     }
   }
 
@@ -127,13 +134,16 @@ public class CasCopier {
     
     if (aCopySofa) {
       // can't copy the SofaFS - just copy the sofa data and mime type
-      String sofaMime = aSrcCasView.getSofa().getSofaMime();
-      if (aSrcCasView.getDocumentText() != null) {
-        targetView.setSofaDataString(aSrcCasView.getDocumentText(), sofaMime);
-      } else if (aSrcCasView.getSofaDataURI() != null) {
-        targetView.setSofaDataURI(aSrcCasView.getSofaDataURI(), sofaMime);
-      } else if (aSrcCasView.getSofaDataArray() != null) {
-        targetView.setSofaDataArray(copyFs(aSrcCasView.getSofaDataArray()), sofaMime);
+      SofaFS sofa = aSrcCasView.getSofa();
+      if (null != sofa) { 
+        String sofaMime = sofa.getSofaMime();
+        if (aSrcCasView.getDocumentText() != null) {
+          targetView.setSofaDataString(aSrcCasView.getDocumentText(), sofaMime);
+        } else if (aSrcCasView.getSofaDataURI() != null) {
+          targetView.setSofaDataURI(aSrcCasView.getSofaDataURI(), sofaMime);
+        } else if (aSrcCasView.getSofaDataArray() != null) {
+          targetView.setSofaDataArray(copyFs(aSrcCasView.getSofaDataArray()), sofaMime);
+        }
       }
     }
 

Added: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/SerializationMeasures.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/SerializationMeasures.java?rev=1406261&view=auto
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/SerializationMeasures.java (added)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/SerializationMeasures.java Tue Nov  6 18:22:03 2012
@@ -0,0 +1,411 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.uima.util;
+
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.CAN_BE_NEGATIVE;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.IN_MAIN_HEAP;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_ArrayLength;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_Byte;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_Double_Exponent;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_Double_Mantissa_Sign;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_Float_Exponent;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_Float_Mantissa_Sign;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_FsIndexes;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_HeapRef;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_Int;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_Long_High;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_Long_Low;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_MainHeap;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_Short;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_StrChars;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_StrLength;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_StrOffset;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_TypeCode;
+
+import org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind;
+
+
+/**
+ * Structure:
+
+ *   StatDetail        
+ *   
+ *   str         has neh for offset, length, dictionary hits/misses
+ *   
+ *   indexedFs   has neh for diffs
+ *   
+ *   modHeap     named, has neh for diffs, heap for values
+ */
+public class SerializationMeasures {
+  
+  public static final int MAX_NBR_ENCODE_LENGTH = 10; // for long values taking 64 bits at 7 bits per byte
+                                                      
+  /** 
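+   * Helper method to truncate printing of lots of trailing 0s.
+   * @param c the array of counts to scan from the end
+   * @return one past the index of the last non-zero entry, capped at the last valid index; 1 if every entry is 0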
+   */
+  private static int maxIndexToZeros(int[] c) {
+    for (int i = c.length - 1; i >= 0; i--) {
+      if (c[i] != 0) {
+        return Math.min(i + 1, c.length - 1);
+      }
+    }
+    return 1;
+  }  
+  
+  /**
+   * Statistical details
+   *   There are instances of this class for
+   *     - the main heap
+   *     - the aux heaps
+   *     - the string offsets, the string lengths
+   *     
+   * Heap: xxxx  [name-of-delta: [Total: <TotalBytes>(negative%)  Histo: a(neg%) b(neg%) c(neg%) d(neg%) e(neg%)]]
+    *   2 styles: one uses only one counter, no delta  - used for byte, short, and long heaps
+    *   other is for main heap, uses 4 deltas.
+   *
+   */
+  public static class StatDetail {
+    private final String name;
+    public long original = -1;    // if set, use this, otherwise use countTotal * bytesPerCount
+    final boolean canBeNegative;
+    public final int[] c = new int[MAX_NBR_ENCODE_LENGTH]; 
+    private final int[] cn; // negative counts
+    private final int bytesPerCount;  // # bytes in source, per entry
+    public int countTotal;  // plain count not weighted by number of bytes
+    public int lengthTotal;  // count weighted by encodedLength
+    
+    // encoding variants
+//    private long itemCount = 0;
+//    public long hits = 0;  // misses = itemCount - hits
+//    public long lt64 = 0;  // (non diff slot only) things coded outside of dictionary    
+    public long diffEncoded = 0; // things not diff encoded = totalCount - diffEncoded
+    public long valueLeDiff = 0; // things not diff encoded which could have been
+//    public long total = 0;  
+    
+    // zip info
+    public long beforeZip;   // should be same as lengthTotal;
+    public long afterZip = -1;  // -1 means not zipped
+    public long zipTime;
+    public long deserializationTime;  // excluding unzipping
+    
+    public StatDetail(String name, 
+                      boolean canBeNegative,
+                      boolean inMainHeap,
+                      int bytesPerCount) {
+      this.canBeNegative = canBeNegative;
+      this.bytesPerCount = bytesPerCount;
+      this.name = name;
+      if (canBeNegative) {
+        cn = new int[MAX_NBR_ENCODE_LENGTH];
+      } else {
+        cn = null;
+      }
+      if (inMainHeap) {
+        original = 0;  // main heap original computed outside of this mechanism
+      }
+    }
+    
+    public long getOriginal() {
+      if (original == -1) {
+        return countTotal * bytesPerCount;
+      }
+      else {
+        return original;
+      }
+    }
+    
+    public void accum(StatDetail o) {
+      for (int i = 0; i < c.length; i++) {
+        c[i] += o.c[i];
+        if (canBeNegative && (null != o.cn)) {
+          cn[i] += o.cn[i];
+        }
+      }
+
+      countTotal += o.countTotal;
+      lengthTotal += o.lengthTotal;
+      original = getOriginal();
+      original += o.getOriginal();
+      diffEncoded += o.diffEncoded;
+      valueLeDiff += o.valueLeDiff;
+      beforeZip += o.beforeZip;
+      if (afterZip == -1) {
+        afterZip = 0;
+      }
+      afterZip  += (o.afterZip == -1) ? o.beforeZip : o.afterZip;
+      zipTime += o.zipTime;
+      deserializationTime += o.deserializationTime;
+    }
+        
+    public void incr(int encodedLength, boolean isNegative) {
+      if (isNegative) {
+        cn[encodedLength - 1] ++;
+      }
+      incr(encodedLength);
+    }
+    
+    public void incr(int encodedLength) {
+      c[encodedLength - 1] ++;    
+      countTotal ++;
+      lengthTotal += encodedLength;
+    }
+    
+    /**
+     * Records v uncompressed entries: adds v to the counter for entries of size bytesPerCount and to countTotal, and v * bytesPerCount to lengthTotal.
+     * @param v
+     */
+    public void incrNoCompression(int v) {
+      c[bytesPerCount - 1] += v;
+      countTotal += v;
+      lengthTotal += v * bytesPerCount;
+    }
+    
+    public String toString() {
+      long tot = lengthTotal;
+      if (tot == 0) {
+        return String.format("Item: %25s%n", name);
+      }
+      String diff = (0 < diffEncoded) ?
+          String.format(
+              "%n                                                                  DiffEncoded(%%, %%v<diff): %,d(%.1f%% %.1f%%)", 
+              diffEncoded, percent(diffEncoded, countTotal), percent(valueLeDiff, diffEncoded)) :
+          "";
+      String zp = (afterZip == -1) ? "" :
+        String.format(" afterZip: %,7d(%4.1f%%), %,3d ms", afterZip, percent(afterZip, beforeZip), zipTime);
+      
+      String dt = (deserializationTime == 0) ? "" :
+        String.format(" Deserialization time: %f", deserializationTime/1000F);
+      
+      StringBuilder sb = new StringBuilder();
+      // find max index to include = first non-zero from end,  + 1
+      int maxToInclude = maxIndexToZeros(c);
+      for (int i = 0; i <= maxToInclude; i++) {
+        sb.append((canBeNegative) ? 
+            String.format(" %,d(%,d)", c[i], cn[i]) :
+            String.format(" %,d", c[i]));
+      }
+      String totPct = (original == 0) ? 
+          String.format("LengthTot: %,d", lengthTotal) :
+          String.format("LengthTot: %,d(%.1f%%)", lengthTotal, percentCompr(lengthTotal));
+      String histoDetails = String.format("[%s  Histo:%s]", totPct, sb);
+      return String.format("Item: %25s %s %s %s %s%n",
+            name, zp, dt, histoDetails, diff);
+    }
+    
+    private float percentCompr(long totCompr) {
+      return percent(totCompr, ((original == -1) || (original == 0)) ? countTotal * bytesPerCount : original);
+    }
+  }
+  
+  /**
+   * Remembers a fixed set of StatDetail instances so that bulk operations
+   * (accumulating, aggregating, printing) can be applied to the whole set.
+   */
+  public class AllStatDetails {
+    final StatDetail[] allStatDetails;
+    StatDetail aggr;
+    final String name;
+    
+    /**
+     * @param aggrName name given to the aggregate StatDetail
+     * @param someHeaps the members of this set; each is also accumulated into the aggregate
+     */
+    public AllStatDetails (String aggrName, StatDetail ... someHeaps) {
+      name = aggrName;
+      allStatDetails = new StatDetail[someHeaps.length];
+      aggr = new StatDetail(aggrName, CAN_BE_NEGATIVE, IN_MAIN_HEAP, 1);
+      for (int k = 0; k < someHeaps.length; k++) {
+        final StatDetail member = someHeaps[k];
+        allStatDetails[k] = member;
+        aggr.accum(member);
+      }
+    }
+    
+    /**
+     * @param aggrName name given to the aggregate StatDetail
+     * @param kinds slot kinds whose StatDetail instances (from the enclosing object) form this set
+     */
+    public AllStatDetails (String aggrName, SlotKind ... kinds) {
+      this(aggrName, toStatDetails(kinds));
+    }
+        
+    /**
+     * Adds each member of another set into the member at the same position of this set.
+     * The aggregate is not touched here; call aggregate() to recompute it.
+     * @param o the set to add in; it needs at least as many members as this one
+     */
+    public void accum(AllStatDetails o) {
+      int position = 0;
+      for (StatDetail mine : allStatDetails) {
+        mine.accum(o.allStatDetails[position++]);
+      }
+    }
+    
+    /**
+     * Replaces the aggregate with a fresh StatDetail and accumulates every member into it.
+     */
+    public void aggregate() {
+      aggr = new StatDetail(name, CAN_BE_NEGATIVE, ! IN_MAIN_HEAP, 1);
+      for (int k = 0; k < allStatDetails.length; k++) {
+        aggr.accum(allStatDetails[k]);
+      }
+    }
+
+    /**
+     * @return the concatenation of the toString() of every member, in order
+     */
+    public String toString() {
+      final StringBuilder text = new StringBuilder();
+      for (int k = 0; k < allStatDetails.length; k++) {
+        text.append(allStatDetails[k].toString());
+      }
+      return text.toString();
+    }
+  }
+       
+  /**
+   * Percentage that a is of b.
+   * @return 0 when a is 0; 100 when b is 0 and a is not; otherwise 100 * a / b
+   */
+  private static float percent(long a, long b) {
+    if (a != 0 && b != 0) {
+      return (100F * a) / b;
+    }
+    return (a == 0) ? 0F : 100F;
+  }
+        
+  // all measures in counts or bytes
+  // These are public mutable fields; nothing in this class assigns them other than
+  // accum(SerializationMeasures) - presumably the serializer fills them in (TODO confirm).
+  public int header = 0;
+  public long origAuxByteArrayRefs = 0;  // in bytes (incl boolean), 1 entry usually = 4 bytes
+  public long origAuxShortArrayRefs = 0;
+  public long origAuxLongArrayRefs = 0;
+  public long origAuxBytes = 0;  // includes booleans, in bytes
+  public long origAuxShorts = 0;  //in bytes
+  public long origAuxLongs = 0;  // includes doubles, in bytes
+  
+  public long mainHeapFSs = 0;      // count of all feature structures
+  
+  // string statistics; reported on the "StrCmnChars / StrSavedExact / StrSavedSubstr" line of toString()
+  public int stringsNbrCommon = 0;
+  public long stringsCommonChars = 0;
+  public long stringsSavedExact = 0;
+  public long stringsSavedSubstr = 0;
+  
+  // reported by toString() as "totalSerTime", labelled ms
+  public long totalTime = 0;
+
+    
+  // One StatDetail per SlotKind, stored at index kind.i and built from that kind's
+  // name, canBeNegative, inMainHeap and elementSize.
+  public final StatDetail[] statDetails = new StatDetail[SlotKind.values().length];
+  {
+    for (SlotKind kind : SlotKind.values()) {
+      statDetails[kind.i] = new StatDetail(kind.toString(), 
+                                           kind.canBeNegative,
+                                           kind.inMainHeap,
+                                           kind.elementSize);
+    }
+  }
+  
+  // Both sets below are built through toStatDetails(...), which reads statDetails;
+  // they must therefore stay declared after the statDetails initializer, because
+  // instance initializers run in textual order.
+  public final AllStatDetails allSlots =  
+      new AllStatDetails("AllSlotKinds", 
+          Slot_ArrayLength,
+          Slot_HeapRef,
+          Slot_Int,
+          Slot_Byte,        // used only for arrays
+          Slot_Short,       // used only for arrays
+          Slot_TypeCode,
+          Slot_StrOffset,
+          Slot_StrLength,
+          Slot_StrChars,
+          Slot_Long_High,
+          Slot_Long_Low,
+          Slot_Float_Mantissa_Sign,
+          Slot_Float_Exponent,
+          Slot_Double_Mantissa_Sign,
+          Slot_Double_Exponent,
+          Slot_FsIndexes); 
+  // the string-related subset of the slots above
+  public final AllStatDetails strSlots = 
+    new AllStatDetails("Strings",
+        Slot_StrOffset,
+        Slot_StrLength,
+        Slot_StrChars);
+  
+//  public final ModHeaps modHeaps = new ModHeaps(modMainHeap, modByteHeap, modShortHeap, modLongHeap);
+//  public final Str  strings = new Str(strOffsets, strLengths);
+//  public final IndexedFSs indexedFSs = new IndexedFSs();
+//  
+
+  /**
+   * Creates an instance whose counters are all 0; the per-slot StatDetail
+   * instances are created by the field and instance initializers.
+   */
+  public SerializationMeasures() {
+  }
+  
+  /**
+   * Looks up this instance's StatDetail for each of the given slot kinds.
+   * @param kinds the slot kinds to look up
+   * @return the matching entries of statDetails, in the same order as kinds
+   */
+  StatDetail[] toStatDetails(SlotKind[] kinds) {
+    final StatDetail[] selected = new StatDetail[kinds.length];
+    for (int pos = 0; pos < kinds.length; pos++) {
+      selected[pos] = statDetails[kinds[pos].i];
+    }
+    return selected;
+  }
+
+  /**
+   * Accumulate results for multiple files: adds the per-slot statistics and the
+   * counters of another instance into this one.
+   * 
+   * The original sizes of the aux heaps (origAuxBytes, origAuxShorts, origAuxLongs)
+   * are summed as well, because toString() adds them into the total original size
+   * against which the compression percentages are computed.
+   * 
+   * totalTime is not accumulated here.
+   * 
+   * @param o the measures to add into this instance
+   */
+  public void accum(SerializationMeasures o) {
+    int i = 0;
+    for (StatDetail sd : o.statDetails) {
+      statDetails[i++].accum(sd);
+    }
+    origAuxByteArrayRefs += o.origAuxByteArrayRefs;
+    origAuxShortArrayRefs += o.origAuxShortArrayRefs;
+    origAuxLongArrayRefs += o.origAuxLongArrayRefs;
+    origAuxBytes += o.origAuxBytes;
+    origAuxShorts += o.origAuxShorts;
+    origAuxLongs += o.origAuxLongs;
+    header += o.header;
+    mainHeapFSs += o.mainHeapFSs; 
+    
+    stringsNbrCommon += o.stringsNbrCommon;
+    stringsCommonChars += o.stringsCommonChars;
+    stringsSavedExact += o.stringsSavedExact;
+    stringsSavedSubstr += o.stringsSavedSubstr;
+  }
+  
+  /**
+   * Formats a multi-line summary: overall sizes with and without zip (each with a
+   * percentage of the original size), the same split into non-string and string
+   * parts, some string statistics, and finally the detail lines of every slot kind.
+   * 
+   * Side effect: re-computes the aggregates of allSlots and strSlots.
+   */
+  public String toString() {
+    // Strings
+    
+    long origStringChars = statDetails[Slot_StrChars.i].getOriginal();
+    long origStringObjs = statDetails[Slot_StrLength.i].getOriginal() * 2;
+    long origStringsTot = origStringChars +   // space for the chars 
+            origStringObjs +                  // space for the offset and length
+            (origStringObjs / 2);             // space for the refs to the string heap 
+    
+    
+    // refresh the aggregates so they reflect the current per-slot statistics
+    allSlots.aggregate();
+    strSlots.aggregate();
+    
+    // NOTE(review): unlike origStringsTot, this sum has no (origStringObjs / 2) term;
+    // presumably those refs are already counted in the main heap - confirm.
+    long allOrig = statDetails[Slot_MainHeap.i].original +
+                   origStringChars + origStringObjs +
+                   origAuxBytes + 
+                   origAuxShorts +
+                   origAuxLongs;
+                   
+    // sizes before zip
+    long allB4Z = allSlots.aggr.lengthTotal;
+    long strB4Z = strSlots.aggr.lengthTotal;
+    
+    // sizes after zip
+    long allTotZ = allSlots.aggr.afterZip;
+    long strTotZ = strSlots.aggr.afterZip;       
+    
+    // the arguments below are grouped one group per line of the format string
+    return String.format(
+        "Summary: withZip: %,d(%.1f%%), without: %,d(%.1f%%)  zipTime: %,d ms  totalSerTime: %,d ms%n" +
+        "  nonStrgs: withZip: %,d(%.1f%%), without: %,d(%.1f%%)%n" +
+        "  Strings:  withZip: %,d(%.1f%%), without: %,d(%.1f%%)%n" +
+    		"  MainHeap TotFS: %,d, StrCmnChars: %,d(%.1f%%), StrSavedExact: %,d  StrSavedSubstr: %,d%n" +
+        "%s%n",
+       allTotZ, percent(allTotZ, allOrig), allB4Z, percent(allB4Z, allOrig), allSlots.aggr.zipTime, totalTime,
+       
+       allTotZ - strTotZ, percent(allTotZ - strTotZ, allOrig - origStringsTot),
+       allB4Z - strB4Z,   percent(allB4Z - strB4Z,   allOrig - origStringsTot),
+       
+       strTotZ, percent(strTotZ, origStringsTot),
+       strB4Z,  percent(strB4Z,  origStringsTot),
+            
+       mainHeapFSs, stringsCommonChars, percent(stringsCommonChars, statDetails[Slot_StrChars.i].original),
+       stringsSavedExact, stringsSavedSubstr,
+       allSlots.toString()
+    );
+  }
+ 
+}

Propchange: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/SerializationMeasures.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/DataIO.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/DataIO.java?rev=1406261&view=auto
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/DataIO.java (added)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/DataIO.java Tue Nov  6 18:22:03 2012
@@ -0,0 +1,479 @@
+package org.apache.uima.util.impl;
+
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+
+/**
+ * Methods for working with Data during I/O
+ */
+public class DataIO {
+  
+  
+  public static final Charset UTF8 = Charset.forName("UTF-8");  // use with String is a java 6, not 5, feature
+  public static final String UTF8_FAST = "UTF-8"; // for faster impls
+  // continuation flag of a Vnumber byte, and the first value that no longer fits in one byte
+  private static final int SIGNED_INT_VALUE_0x80 = 0x80;
+  // the 7 payload bits of a Vnumber byte
+  private static final int MASK_LOW_7 = 0x7f;
+  private static final long MASK_LOW_7_LONG = 0x7fL;
+  // NOTE(review): not referenced by any method visible in this class
+  private static final long TOP_LONG_BIT = 0x8000000000000000L;
+ 
+  // per-thread UTF-8 decoder, created on first use by decodeUTF8
+  private static ThreadLocal<CharsetDecoder> DECODER = new ThreadLocal<CharsetDecoder>();
+  
+  
+  /**
+   * Decodes the next length bytes of a buffer as UTF-8 and advances the buffer's
+   * position past them.
+   * 
+   * A fast path handles array-backed buffers whose bytes are all in 0 - 127 by
+   * widening each byte to a char.  Anything else goes through a per-thread
+   * CharsetDecoder that replaces malformed or unmappable input.
+   * 
+   * @param in the buffer to read from, starting at its current position
+   * @param length the number of bytes (not chars) to decode
+   * @return the decoded string
+   */
+  public static String decodeUTF8(ByteBuffer in, final int length) {
+    // First try fast path - assume chars in 0-127
+    fastPath: do {
+      if (in.hasArray()) {
+        byte[] backingArray = in.array();
+        int offset = in.arrayOffset() + in.position();
+        // the requested range runs past the backing array: let the general path handle it
+        if (offset + length > backingArray.length) {
+          break fastPath;
+        }
+//        char[] ca = new char[length];
+        // string builder approach avoids copying the char array object
+        StringBuilder sb = new StringBuilder(length);
+        sb.setLength(length);
+        for (int i = 0; i < length; i++) {
+          byte b = backingArray[offset + i];
+          // a negative byte has its high bit set, i.e. it is not a 1-byte UTF-8 char;
+          // the position has not been moved yet, so the general path restarts cleanly
+          if (b < 0) { // give up and do it the other way
+            break fastPath;
+          }
+          sb.setCharAt(i, (char)b);
+        }
+        in.position(in.position() + length);
+        return sb.toString();  // doesn't copy the string char array
+      } 
+    } while (false); // not a real do loop - do only once
+  
+    // general path: decoder is created lazily, one per thread
+    CharsetDecoder decoder = DECODER.get();
+    if (null == decoder) {
+      decoder = UTF8.newDecoder()
+        .onMalformedInput(CodingErrorAction.REPLACE)
+        .onUnmappableCharacter(CodingErrorAction.REPLACE);
+      DECODER.set(decoder);       
+    }
+    // the slice starts at in's current position, so its limit is simply the length
+    ByteBuffer partToDecode = in.slice();
+    partToDecode.limit(length);
+    CharBuffer cb;
+    try {
+      cb = decoder.decode(partToDecode);
+      in.position(in.position() + length);
+    } catch (CharacterCodingException e) {
+      // should never happen
+      throw new RuntimeException(e);
+    }
+    return cb.toString();
+  }
+  
+  /***************************************************************************************
+   * For DataOutput, DataInput
+   ***************************************************************************************/
+  /**
+   * Writes a string as a Vnumber byte count followed by its UTF-8 bytes.
+   * The count written is the UTF-8 length + 1 so that 0 can stand for a null
+   * string; a null string is written as the single byte 0.
+   * Similar to writeUTF, but the length prefix is variable-sized, so long strings
+   * are allowed and short ones need only one length byte.
+   * The UTF-8 length must be <= Integer.MAX_VALUE - 1.
+   * @param string the string to write, may be null
+   * @param out where to write
+   * @throws IOException passed through from the output
+   */
+  public static void writeUTFv(String string, DataOutput out) throws IOException {
+    if (string != null) {
+      final byte[] utf8 = string.getBytes(UTF8_FAST);
+      final int byteCount = utf8.length;
+      if (byteCount > (Integer.MAX_VALUE - 1)) {
+        throw new RuntimeException(String.format("String UTF-8 representation too long, was %,d", byteCount));
+      }
+      writeVnumber(out, byteCount + 1);  // 0 reserved for null
+      out.write(utf8);
+    } else {
+      out.write(0);
+    }
+  }
+  
+  /**
+   * Reads a string written by writeUTFv.
+   * @param in where to read from
+   * @return the string, or null if the length prefix was 0
+   * @throws IOException passed through from the input
+   */
+  public static String readUTFv(DataInput in) throws IOException {
+    final int prefix = readVnumber(in);
+    if (prefix == 0) {   // 0 is reserved for null
+      return null;
+    }
+    final int byteCount = prefix - 1;
+    final byte[] utf8 = new byte[byteCount];
+    in.readFully(utf8);
+    return decodeUTF8(ByteBuffer.wrap(utf8), byteCount);
+  }
+
+  /**
+   * Computes the number of bytes writeUTFv would write for a string.
+   * @param string the string to measure, may be null
+   * @return 1 for null, otherwise the size of the length prefix plus the UTF-8 length
+   * @throws UnsupportedEncodingException declared by the charset-name based getBytes
+   */
+  public static long lengthUTFv(String string) throws UnsupportedEncodingException {
+    if (string == null) {
+      return 1;
+    }
+    final int byteCount = string.getBytes(UTF8_FAST).length;
+    if (byteCount > (Integer.MAX_VALUE - 1)) {
+      throw new RuntimeException(String.format("String UTF-8 representation too long, was %,d", byteCount));
+    }
+    final int prefixSize = lengthVnumber(byteCount + 1);
+    return prefixSize + byteCount;
+  }
+  
+  /**
+   * DataOutput writeShort preceded by a range check of the argument.
+   * @param out where to write
+   * @param v the value; must lie within Short.MIN_VALUE .. Short.MAX_VALUE
+   * @throws IOException passed through from the output
+   * @throws RuntimeException if v does not fit in a short
+   */
+  public static void writeShort(DataOutput out, int v) throws IOException {
+    final boolean fits = (Short.MIN_VALUE <= v) && (v <= Short.MAX_VALUE);
+    if (!fits) {
+      throw new RuntimeException(String.format(
+          "Trying to write int %,d as a short but it doesn't fit", v));
+    }
+    out.writeShort(v);
+  }
+ 
+  /**
+   * DataOutput single-byte write preceded by a range check of the argument.
+   * @param out where to write
+   * @param v the value; must lie within Byte.MIN_VALUE .. Byte.MAX_VALUE
+   * @throws IOException passed through from the output
+   * @throws RuntimeException if v does not fit in a byte
+   */
+  public static void writeByte(DataOutput out, int v) throws IOException {
+    final boolean fits = (Byte.MIN_VALUE <= v) && (v <= Byte.MAX_VALUE);
+    if (!fits) {
+      throw new RuntimeException(String.format(
+          "Trying to write int %,d as a byte but it doesn't fit", v));
+    }
+    out.write(v);
+  }
+
+  /**
+   * Write the lower 8 bits of a value as one byte, without any range check
+   * (compare writeByte, which rejects values outside the byte range).
+   * @param out where to write
+   * @param v the value whose low-order 8 bits are written
+   * @throws IOException passed through from the output
+   */
+  public static void writeUnsignedByte(DataOutput out, int v) throws IOException {
+    out.write(v);
+  }
+
+  /**
+   * Write a positive or negative int, optimized for fewer bytes near 0.
+   *   The sign is put in the low order bit; the rest of the number is converted
+   *   to positive and shifted left 1.
+   *   Integer.MIN_VALUE gets special handling because Math.abs of it "fails":
+   *   it is coded as "-0" (the single byte 1), a code point not otherwise in use.
+   * 
+   * The shift is done on a long for both signs, so that magnitudes of 2^30 and
+   * above do not overflow into the int sign bit (which writeVnumber rejects).
+   * 
+   * @param out where to write
+   * @param v the value to write
+   * @throws IOException passed through from the output
+   */
+  public static void writeVPNnumber(DataOutput out, int v) throws IOException {
+    if (v == Integer.MIN_VALUE) {
+      writeVnumber(out, 1);
+    } else if (v < 0) {
+      writeVnumber(out, (((long) Math.abs(v)) << 1) | 1);
+    } else {
+      // widen before shifting: (v << 1) as an int goes negative for v >= 2^30
+      writeVnumber(out, ((long) v) << 1);
+    }
+  }
+ 
+  /**
+   * Write a positive or negative long, optimized for fewer bytes near 0.
+   *   The sign is put in the low order bit; the rest of the number is converted
+   *   to positive and shifted left 1.
+   *   Long.MIN_VALUE gets special handling because Math.abs of it "fails":
+   *   it is coded as "-0" (the single byte 1), a code point not otherwise in use.
+   * 
+   * Magnitudes of 2^62 and above do not survive the shift: the shifted value is
+   * negative and is rejected by writeVnumber with a RuntimeException.
+   * 
+   * @param out where to write
+   * @param v the value to write
+   * @throws IOException passed through from the output
+   */
+  public static void writeVPNnumber(DataOutput out, long v) throws IOException {
+    if (v == Long.MIN_VALUE) {
+      writeVnumber(out, 1);
+    } else if (v < 0) {
+      final long magnitude = Math.abs(v);
+      writeVnumber(out, (magnitude << 1) | 1);
+    } else {
+      writeVnumber(out, v << 1);
+    }
+  }
+  /**
+   * Computes the number of bytes writeVPNnumber would write for an int.
+   * Integer.MIN_VALUE is coded as "-0", a single byte.
+   * 
+   * The shift is done on a long for both signs; as an int, (v << 1) goes negative
+   * for v >= 2^30 and would be measured as 1 byte.  The sign bit itself is left
+   * out because it never changes the length.
+   * 
+   * @param v the value to measure
+   * @return the encoded length in bytes
+   */
+  public static int lengthVPNnumber(int v) {
+    if (v == Integer.MIN_VALUE) {
+      return 1;
+    }
+    final long magnitude = (v < 0) ? (long) Math.abs(v) : (long) v;
+    return lengthVnumber(magnitude << 1);
+  }
+  /**
+   * Computes the encoded length of a long in the sign-in-low-bit format.
+   * Long.MIN_VALUE is coded as "-0", a single byte.  The sign bit itself is left
+   * out of the computation because it never changes the length.
+   * @param v the value to measure
+   * @return the encoded length in bytes
+   */
+  public static int lengthVPNnumber(long v) {
+    if (v == Long.MIN_VALUE) {
+      return 1;
+    }
+    final long magnitude = (v < 0) ? Math.abs(v) : v;
+    return lengthVnumber(magnitude << 1);
+  }
+
+  /**
+   * Write a non-negative int with the fewest bytes possible:
+   * values up to 127 are written as one byte;
+   * in longer encodings a high order bit that is on means "get another byte".
+   * 
+   * Negative values are handed to writeVnumber1, which rejects them.
+   * 
+   * @param out where to write
+   * @param v the value to write
+   * @throws IOException passed through from the output
+   */
+  public static void writeVnumber(final DataOutput out, final int v) throws IOException {
+    if (v < 0 || v >= 128) {
+      writeVnumber1(out, v);
+      return;
+    }
+    out.write(v);  // fast path
+  }
+  
+  /**
+   * General case of writeVnumber(DataOutput, int): emits 7 bits per byte, low
+   * order group first, with the high bit set on every byte except the last.
+   * A non-negative int needs at most 5 bytes.
+   * @throws RuntimeException if v is negative
+   */
+  private static void writeVnumber1(final DataOutput out, int v) throws IOException {
+    if (v < 0) {
+      throw new RuntimeException("never happen");
+    }
+    int remaining = v;
+    while (remaining >= SIGNED_INT_VALUE_0x80) {
+      out.write((remaining & MASK_LOW_7) | SIGNED_INT_VALUE_0x80);
+      remaining >>>= 7;
+    }
+    out.write(remaining);   // last byte, high bit off
+  }
+  
+  /**
+   * Computes the number of bytes the Vnumber encoding of an int takes:
+   * one byte per started group of 7 bits.
+   * A negative argument compares below 0x80 and therefore yields 1.
+   * @param v the value to measure
+   * @return the encoded length in bytes, 1 to 5
+   */
+  public static int lengthVnumber(int v) {
+    int byteCount = 1;
+    while (v >= SIGNED_INT_VALUE_0x80) {
+      v >>>= 7;
+      byteCount++;
+    }
+    return byteCount;
+  }
+
+  /**
+   * Reads an int in Vnumber format: 7 bits per byte, low order group first,
+   * high bit on meaning that another byte follows.  At most 5 bytes are read.
+   * @param in where to read from
+   * @return the decoded value
+   * @throws IOException passed through from the input
+   * @throws IllegalStateException if the 5th byte still has its high bit on
+   */
+  public static int readVnumber(final DataInput in) throws IOException {
+    final int first = in.readUnsignedByte();
+    if (first < 0x80) {   // fast path
+      return first;
+    }
+    int value = first & MASK_LOW_7;
+    for (int shift = 7; shift <= 28; shift += 7) {
+      final int next = in.readUnsignedByte();
+      value |= (next & MASK_LOW_7) << shift;
+      if (next < SIGNED_INT_VALUE_0x80) {
+        return value;
+      }
+    }
+    throw new IllegalStateException("Invalid input deserializing Vnumber");   
+  }
+
+  /**
+   * Write a non-negative long with the fewest bytes possible: values up to 127
+   * are written as one byte; in longer encodings a high order bit that is on
+   * means "get another byte".
+   * 
+   * @param out where to write
+   * @param v is never negative; a negative value is rejected by writeVnumber1
+   * @throws IOException passed through from the output
+   */
+  public static void writeVnumber(final DataOutput out, final long v) throws IOException {
+    if (v < 0 || v >= 128) {
+      writeVnumber1(out, v);
+      return;
+    }
+    out.write((int)v);  // fast path
+  }
+  
+  /**
+   * General case of writeVnumber(DataOutput, long): emits 7 bits per byte, low
+   * order group first, with the high bit set on every byte except the last.
+   * A non-negative long needs at most 9 bytes.
+   * @throws RuntimeException if v is negative
+   */
+  private static void writeVnumber1(final DataOutput out, long v) throws IOException {
+    if (v < 0) {
+      throw new RuntimeException("never happen");
+    }
+    long remaining = v;
+    while (remaining >= SIGNED_INT_VALUE_0x80) {
+      final int low7 = (int)(remaining & MASK_LOW_7_LONG);
+      out.write(low7 | SIGNED_INT_VALUE_0x80);
+      remaining >>>= 7;
+    }
+    out.write((int) remaining);   // last byte, high bit off
+  }
+  
+  /**
+   * Computes the number of bytes the Vnumber encoding of a long takes:
+   * one byte per started group of 7 bits.
+   * A negative argument compares below 0x80 and therefore yields 1.
+   * @param v the value to measure
+   * @return the encoded length in bytes, 1 to 9
+   */
+  public static int lengthVnumber(long v) {
+    int byteCount = 1;
+    while (v >= SIGNED_INT_VALUE_0x80) {
+      v >>>= 7;
+      byteCount++;
+    }
+    return byteCount;
+  }
+
+
+  /**
+   * Reads a long in Vnumber format: 7 bits per byte, low order group first,
+   * high bit on meaning that another byte follows.  At most 9 bytes are read.
+   * @param in where to read from
+   * @return the decoded value
+   * @throws IOException passed through from the input
+   * @throws IllegalStateException if the 9th byte still has its high bit on
+   */
+  public static long readVlong(final DataInput in) throws IOException {
+    final long first = in.readUnsignedByte();
+    if (first < 0x80) {   // fast path
+      return first;
+    }
+
+    long value = first & MASK_LOW_7_LONG;
+    for (int shift = 7; shift <= 56; shift += 7) {
+      final long next = in.readUnsignedByte();
+      value |= (next & MASK_LOW_7_LONG) << shift;
+      if (next < 128) {
+        return value;
+      }
+    }
+    throw new IllegalStateException("Invalid input deserializing Vlong");
+  }
+  
+  /**
+   * Finishes reading a Vnumber-format long whose first byte the caller has
+   * already read.
+   * @param in where to read any remaining bytes from
+   * @param firstByte the first byte of the encoding, already read
+   * @return firstByte itself if it is below 0x80, otherwise the decoded value
+   * @throws IOException passed through from the input
+   * @throws IllegalStateException if the 9th byte still has its high bit on
+   */
+  public static long readRestOfVlong(DataInput in, int firstByte) throws IOException {
+    if (firstByte < 0x80) {
+      return firstByte;
+    }
+    long value = firstByte ^ 0x80;  // turn off high bit
+    for (int shift = 7; shift <= 56; shift += 7) {
+      final long next = in.readUnsignedByte();
+      value |= (next & MASK_LOW_7_LONG) << shift;
+      if (next < 128) {
+        return value;
+      }
+    }
+    throw new IllegalStateException("Invalid input deserializing Vlong");
+  }
+
+  /**
+   * Write a byte array preceded by its length (as a Vnumber).
+   * @param out where to write
+   * @param v the array to write
+   * @throws IOException passed through from the output
+   */
+  public static void writeByteArray(DataOutput out, byte[] v) throws IOException {
+    writeVnumber(out, v.length);
+    out.write(v);
+  }
+
+  /**
+   * Reads a byte array written by writeByteArray: a Vnumber length followed by
+   * that many bytes.
+   * @param in where to read from
+   * @return the array read
+   * @throws IOException passed through from the input
+   */
+  public static byte[] readByteArray(DataInput in) throws IOException {
+    final byte[] bytes = new byte[readVnumber(in)];
+    in.readFully(bytes);
+    return bytes;
+  }
+
+  /**
+   * Write an int array preceded by its length; the length is a Vnumber, each
+   * element is written with writeInt.
+   * @param out where to write
+   * @param v the array to write
+   * @throws IOException passed through from the output
+   */
+  public static void writeIntArray(DataOutput out, int[] v) throws IOException {
+    final int count = v.length;
+    writeVnumber(out, count);
+    for (int k = 0; k < count; k++) {
+      out.writeInt(v[k]);
+    }
+  }
+  
+  /**
+   * Reads an int array written by writeIntArray.
+   * @param in where to read from
+   * @return the array read
+   * @throws IOException passed through from the input
+   */
+  public static int[] readIntArray(DataInput in) throws IOException {
+    final int[] values = new int[readVnumber(in)];
+    for (int k = 0; k < values.length; k++) {
+      values[k] = in.readInt();
+    }
+    return values;
+  }
+  
+  /**
+   * Write an int array delta encoded, for increasing values: the length, then
+   * for each element its difference from the previous element (from 0 for the
+   * first one), all as Vnumbers.  Each difference must be non-negative, since
+   * writeVnumber rejects negative values.
+   * @param out where to write
+   * @param v the array to write
+   * @throws IOException passed through from the output
+   */
+  public static void writeIntArrayDelta(DataOutput out, int[] v) throws IOException {
+    final int count = v.length;
+    writeVnumber(out, count);
+    int previous = 0;
+    for (int k = 0; k < count; k++) {
+      final int current = v[k];
+      writeVnumber(out, current - previous);
+      previous = current;
+    }
+  }
+  
+  /**
+   * Reads an int array written by writeIntArrayDelta, turning the stored
+   * differences back into values by keeping a running sum.
+   * @param in where to read from
+   * @return the array read
+   * @throws IOException passed through from the input
+   */
+  public static int[] readIntArrayDelta(DataInput in) throws IOException {
+    final int count = readVnumber(in);
+    final int[] values = new int[count];
+    int running = 0;
+    for (int k = 0; k < count; k++) {
+      running += readVnumber(in);
+      values[k] = running;
+    }
+    return values;
+  }
+
+  /**
+   * Write a long array preceded by its length; the length is a Vnumber, each
+   * element is written with writeLong.
+   * @param out where to write
+   * @param v the array to write
+   * @throws IOException passed through from the output
+   */
+  public static void writeLongArray(DataOutput out, long[] v) throws IOException {
+    // java doesn't support arrays longer than Integer.MAX_VALUE, even on 64-bit platforms
+    final int count = v.length;
+    writeVnumber(out, count);
+    for (int k = 0; k < count; k++) {
+      out.writeLong(v[k]);
+    }
+  }
+  
+  /**
+   * Reads a long array written by writeLongArray.
+   * @param in where to read from
+   * @return the array read
+   * @throws IOException passed through from the input
+   */
+  public static long[] readLongArray(DataInput in) throws IOException {
+    final long[] values = new long[readVnumber(in)];
+    for (int k = 0; k < values.length; k++) {
+      values[k] = in.readLong();
+    }
+    return values;
+  }
+  
+  /**
+   * Write a long array delta encoded: the length, then for each element its
+   * difference from the previous element (from 0 for the first one), all as
+   * Vnumbers.  Each difference must be non-negative, since writeVnumber rejects
+   * negative values.
+   * @param out where to write
+   * @param v the array to write
+   * @throws IOException passed through from the output
+   */
+  public static void writeLongArrayDelta(DataOutput out, long[] v) throws IOException {
+    // java doesn't support arrays longer than Integer.MAX_VALUE, even on 64-bit platforms
+    final int count = v.length;
+    writeVnumber(out, count);
+    long previous = 0;
+    for (int k = 0; k < count; k++) {
+      final long current = v[k];
+      writeVnumber(out, current - previous);
+      previous = current;
+    }
+  }
+  
+  /**
+   * Reads a long array written by writeLongArrayDelta, turning the stored
+   * differences back into values by keeping a running sum.
+   * @param in where to read from
+   * @return the array read
+   * @throws IOException passed through from the input
+   */
+  public static long[] readLongArrayDelta(DataInput in) throws IOException {
+    final int count = readVnumber(in);
+    final long[] values = new long[count];
+    long running = 0;
+    for (int k = 0; k < count; k++) {
+      running += readVlong(in);
+      values[k] = running;
+    }
+    return values;
+  }  
+  
+  /**
+   * Reads one unsigned byte, turning a negative result into an IOException.
+   * NOTE(review): a DataInput that follows its contract reports end of input by
+   * throwing from readUnsignedByte rather than by returning a negative value, so
+   * the check presumably only guards against non-conforming inputs - confirm.
+   * @param in where to read from
+   * @return the byte read, 0 to 255
+   * @throws IOException passed through from the input, or "Premature EOF"
+   */
+  public static int readUnsignedByte(DataInput in) throws IOException {
+    final int value = in.readUnsignedByte();
+    if (value >= 0) {
+      return value;
+    }
+    throw new IOException("Premature EOF");
+  }
+  
+}

Propchange: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/DataIO.java
------------------------------------------------------------------------------
    svn:eol-style = native