You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by sc...@apache.org on 2012/11/06 19:22:04 UTC
svn commit: r1406261 [3/4] - in /uima/uimaj/trunk/uimaj-core/src:
main/java/org/apache/uima/cas/ main/java/org/apache/uima/cas/impl/
main/java/org/apache/uima/util/ main/java/org/apache/uima/util/impl/
main/resources/org/apache/uima/ test/java/org/apac...
Propchange: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/BinaryCasSerDes4.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/CASImpl.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/CASImpl.java?rev=1406261&r1=1406260&r2=1406261&view=diff
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/CASImpl.java (original)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/CASImpl.java Tue Nov 6 18:22:03 2012
@@ -42,6 +42,7 @@ import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
+import org.apache.uima.cas.AbstractCas;
import org.apache.uima.cas.AbstractCas_ImplBase;
import org.apache.uima.cas.AnnotationBaseFS;
import org.apache.uima.cas.ArrayFS;
@@ -84,6 +85,7 @@ import org.apache.uima.cas.text.Language
import org.apache.uima.internal.util.IntVector;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.impl.JCasImpl;
+import org.apache.uima.util.SerializationMeasures;
/**
* Implements the CAS interfaces. This class must be public because we need to
@@ -108,6 +110,8 @@ public class CASImpl extends AbstractCas
public static final int FALSE = 0;
+ private static final int[] INT0 = new int[0];
+
public static final int DEFAULT_INITIAL_HEAP_SIZE = 500000;
public static final int DEFAULT_RESET_HEAP_SIZE = 5000000;
@@ -241,6 +245,10 @@ public class CASImpl extends AbstractCas
* element per component being journaled.
*/
private List<Marker> trackingMarkList;
+
+ // must be in the svd part because it has a field that is updated
+ // while serializing
+ private BinaryCasSerDes4 binaryCompressor;
private SharedViewData(boolean useFSCache) {
this.useFSCache = useFSCache;
@@ -1163,48 +1171,40 @@ public class CASImpl extends AbstractCas
return;
}
- DataInputStream dis = new DataInputStream(istream);
+ final DataInputStream dis = (istream instanceof DataInputStream) ?
+ (DataInputStream) istream : new DataInputStream(istream);
try {
// key
- // deteremine if byte swap if needed based on key
+ // determine if byte swap is needed based on key
byte[] bytebuf = new byte[4];
bytebuf[0] = dis.readByte(); // U
bytebuf[1] = dis.readByte(); // I
bytebuf[2] = dis.readByte(); // M
bytebuf[3] = dis.readByte(); // A
- boolean swap = false;
- // check if first byte is ascii char U
- if (bytebuf[0] != 85) {
- swap = true;
- }
+ final boolean swap = (bytebuf[0] != 85);
// version
- // version 2 indicates this is in delta format.
- int version;
- if (swap) {
- version = swap4(dis, bytebuf);
- } else {
- version = dis.readInt();
- }
+ // version bit in 2's place indicates this is in delta format.
+ final int version = readInt(dis, swap);
+ final boolean delta = ((version & 2) == 2);
- boolean delta = false;
- if (version == 2) {
- delta = true;
- }
if (!delta) {
this.resetNoQuestions();
}
- // main fsheap
- int fsheapsz = 0;
- if (swap) {
- fsheapsz = swap4(dis, bytebuf);
- } else {
- fsheapsz = dis.readInt();
+ if (0 != (version & 4)) {
+ if (svd.binaryCompressor == null) {
+ svd.binaryCompressor = new BinaryCasSerDes4(this.getTypeSystemImpl(), false);
+ }
+ svd.binaryCompressor.deserialize(this, dis, delta);
+ return;
}
+ // main fsheap
+ final int fsheapsz = readInt(dis, swap);
+
int startPos = 0;
if (!delta) {
this.getHeap().reinitSizeOnly(fsheapsz);
@@ -1212,32 +1212,19 @@ public class CASImpl extends AbstractCas
startPos = this.getHeap().getNextId();
this.getHeap().grow(fsheapsz);
}
-
+
for (int i = startPos; i < fsheapsz+startPos; i++) {
- if (swap) {
- this.getHeap().heap[i] = swap4(dis, bytebuf);
- } else {
- this.getHeap().heap[i] = dis.readInt();
- }
+ this.getHeap().heap[i] = readInt(dis, swap);
}
// string heap
- int stringheapsz = 0;
- if (swap) {
- stringheapsz = swap4(dis, bytebuf);
- } else {
- stringheapsz = dis.readInt();
- }
+ int stringheapsz = readInt(dis, swap);
final StringHeapDeserializationHelper shdh = new StringHeapDeserializationHelper();
shdh.charHeap = new char[stringheapsz];
for (int i = 0; i < stringheapsz; i++) {
- if (swap) {
- shdh.charHeap[i] = swap2(dis, bytebuf);
- } else {
- shdh.charHeap[i] = dis.readChar();
- }
+ shdh.charHeap[i] = (char) readShort(dis, swap);
}
shdh.charHeapPos = stringheapsz;
@@ -1247,12 +1234,7 @@ public class CASImpl extends AbstractCas
}
// string ref heap
- int refheapsz = 0;
- if (swap) {
- refheapsz = swap4(dis, bytebuf);
- } else {
- refheapsz = dis.readInt();
- }
+ int refheapsz = readInt(dis, swap);
refheapsz--;
refheapsz = refheapsz / 2;
@@ -1264,14 +1246,8 @@ public class CASImpl extends AbstractCas
dis.readInt(); // 0
for (int i = shdh.refHeapPos; i < shdh.refHeap.length; i += StringHeapDeserializationHelper.REF_HEAP_CELL_SIZE) {
- if (swap) {
- shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_POINTER_OFFSET] = swap4(dis,
- bytebuf);
- shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_STRLEN_OFFSET] = swap4(dis, bytebuf);
- } else {
- shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_POINTER_OFFSET] = dis.readInt();
- shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_STRLEN_OFFSET] = dis.readInt();
- }
+ shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_POINTER_OFFSET] = readInt(dis, swap);
+ shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_STRLEN_OFFSET] = readInt(dis, swap);
shdh.refHeap[i + StringHeapDeserializationHelper.STRING_LIST_ADDR_OFFSET] = 0;
}
shdh.refHeapPos = refheapsz + StringHeapDeserializationHelper.FIRST_CELL_REF;
@@ -1280,35 +1256,17 @@ public class CASImpl extends AbstractCas
//if delta, handle modified fs heap cells
if (delta) {
- int fsmodssz = 0;
- if (swap) {
- fsmodssz = swap4(dis, bytebuf);
- } else {
- fsmodssz = dis.readInt();
- }
+ int fsmodssz = readInt(dis, swap);
for (int i = 0; i < fsmodssz; i++) {
- if (swap) {
- this.getHeap().heap[swap4(dis,bytebuf)] = swap4(dis, bytebuf);
- } else {
- this.getHeap().heap[dis.readInt()] = dis.readInt();
- }
+ this.getHeap().heap[readInt(dis, swap)] = readInt(dis, swap);
}
}
// indexed FSs
- int fsindexsz = 0;
- if (swap) {
- fsindexsz = swap4(dis, bytebuf);
- } else {
- fsindexsz = dis.readInt();
- }
+ int fsindexsz = readInt(dis, swap);
int[] fsindexes = new int[fsindexsz];
for (int i = 0; i < fsindexsz; i++) {
- if (swap) {
- fsindexes[i] = swap4(dis, bytebuf);
- } else {
- fsindexes[i] = dis.readInt();
- }
+ fsindexes[i] = readInt(dis, swap);
}
// build the index
@@ -1319,218 +1277,156 @@ public class CASImpl extends AbstractCas
}
// byte heap
- int byteheapsz = 0;
- if (swap) {
- byteheapsz = swap4(dis, bytebuf);
- } else {
- byteheapsz = dis.readInt();
- }
+ int heapsz = readInt(dis, swap);
if (!delta) {
- this.getByteHeap().heap = new byte[Math.max(16, byteheapsz)]; // must
- // be >
- // 0
- for (int i = 0; i < byteheapsz; i++) {
- this.getByteHeap().heap[i] = dis.readByte();
- }
- this.getByteHeap().heapPos = byteheapsz;
+ this.getByteHeap().heap = new byte[Math.max(16, heapsz)]; // must be > 0
+ dis.read(this.getByteHeap().heap, 0, heapsz);
+ this.getByteHeap().heapPos = heapsz;
} else {
- for (int i=0; i < byteheapsz; i++) {
- this.getByteHeap().addByte(dis.readByte());
+ for (int i=0; i < heapsz; i++) {
+ this.getByteHeap().addByte(dis.readByte());
}
}
// word alignment
- int align = (4 - (byteheapsz % 4)) % 4;
- for (int i = 0; i < align; i++) {
- dis.readByte();
- }
+ int align = (4 - (heapsz % 4)) % 4;
+ dis.skipBytes(align);
// short heap
- int shortheapsz = 0;
- if (swap) {
- shortheapsz = swap4(dis, bytebuf);
- } else {
- shortheapsz = dis.readInt();
- }
+ heapsz = readInt(dis, swap);
if (!delta) {
- this.getShortHeap().heap = new short[Math.max(16, shortheapsz)]; // must
- // be >
- // 0
- for (int i = 0; i < shortheapsz; i++) {
- if (swap) {
- this.getShortHeap().heap[i] = (short) swap2(dis, bytebuf);
- } else {
- this.getShortHeap().heap[i] = dis.readShort();
- }
+ this.getShortHeap().heap = new short[Math.max(16, heapsz)]; // must be > 0
+ for (int i = 0; i < heapsz; i++) {
+ this.getShortHeap().heap[i] = readShort(dis, swap);
}
- this.getShortHeap().heapPos = shortheapsz;
+ this.getShortHeap().heapPos = heapsz;
} else {
- for (int i = 0; i < shortheapsz; i++) {
- if (swap) {
- this.getShortHeap().addShort((short) swap2(dis, bytebuf));
- } else {
- this.getShortHeap().addShort(dis.readShort());
- }
- }
+ for (int i = 0; i < heapsz; i++) {
+ this.getShortHeap().addShort(readShort(dis, swap));
+ }
}
// word alignment
- if (shortheapsz % 2 != 0) {
+ if (heapsz % 2 != 0) {
dis.readShort();
}
// long heap
- int longheapsz = 0;
- if (swap) {
- longheapsz = swap4(dis, bytebuf);
- bytebuf = new byte[8];
- } else {
- longheapsz = dis.readInt();
- }
+ heapsz = readInt(dis, swap);
if (!delta) {
- this.getLongHeap().heap = new long[Math.max(16, longheapsz)]; // must
- // be >
- // 0
- for (int i = 0; i < longheapsz; i++) {
- if (swap) {
- this.getLongHeap().heap[i] = swap8(dis, bytebuf);
- } else {
- this.getLongHeap().heap[i] = dis.readLong();
- }
+ this.getLongHeap().heap = new long[Math.max(16, heapsz)]; // must be > 0
+ for (int i = 0; i < heapsz; i++) {
+ this.getLongHeap().heap[i] = readLong(dis, swap);
}
- this.getLongHeap().heapPos = longheapsz;
+ this.getLongHeap().heapPos = heapsz;
} else {
- for (int i = 0; i < longheapsz; i++) {
- if (swap) {
- this.getLongHeap().addLong( swap8(dis, bytebuf));
- } else {
- this.getLongHeap().addLong(dis.readLong());
- }
+ for (int i = 0; i < heapsz; i++) {
+ this.getLongHeap().addLong(readLong(dis, swap));
}
}
if (delta) {
- //modified Byte Heap
- if (swap) {
- byteheapsz = swap4(dis, bytebuf);
- } else {
- byteheapsz = dis.readInt();
- }
- if (byteheapsz > 0) {
- int[] byteHeapAddrs = new int[byteheapsz];
- for (int i=0; i < byteheapsz; i++) {
- if (swap) {
- byteHeapAddrs[i] = swap4(dis, bytebuf);
- } else {
- byteHeapAddrs[i] = dis.readInt();
- }
- }
- for (int i=0; i < byteheapsz; i++) {
- this.getByteHeap().heap[byteHeapAddrs[i]] = dis.readByte();
- }
- }
- // word alignment
- align = (4 - (byteheapsz % 4)) % 4;
- for (int i = 0; i < align; i++) {
- dis.readByte();
- }
-
- //modified Short Heap
- if (swap) {
- shortheapsz = swap4(dis, bytebuf);
- } else {
- shortheapsz = dis.readInt();
- }
- if (shortheapsz > 0) {
- int[] shortHeapAddrs = new int[shortheapsz];
- for (int i=0; i < shortheapsz; i++) {
- if (swap) {
- shortHeapAddrs[i] = swap4(dis, bytebuf);
- } else {
- shortHeapAddrs[i] = dis.readInt();
- }
+ //modified Byte Heap
+ heapsz = readInt(dis, swap);
+ if (heapsz > 0) {
+ int[] heapAddrs = new int[heapsz];
+ for (int i = 0; i < heapsz; i++) {
+ heapAddrs[i] = readInt(dis, swap);
}
- for (int i=0; i < shortheapsz; i++) {
- if (swap) {
- this.getShortHeap().heap[i] = (short) swap2(dis, bytebuf);
- } else {
- this.getShortHeap().heap[i] = dis.readShort();
- }
+ for (int i = 0; i < heapsz; i++) {
+ this.getByteHeap().heap[heapAddrs[i]] = dis.readByte();
}
}
+ // word alignment
+ align = (4 - (heapsz % 4)) % 4;
+ dis.skipBytes(align);
+
+ //modified Short Heap
+ heapsz = readInt(dis, swap);
+ if (heapsz > 0) {
+ int[] heapAddrs = new int[heapsz];
+ for (int i = 0; i < heapsz; i++) {
+ heapAddrs[i] = readInt(dis, swap);
+ }
+ for (int i = 0; i < heapsz; i++) {
+ this.getShortHeap().heap[heapAddrs[i]] = readShort(dis, swap);
+ }
+ }
// word alignment
- if (shortheapsz % 2 != 0) {
+ if (heapsz % 2 != 0) {
dis.readShort();
}
//modified Long Heap
- if (swap) {
- longheapsz = swap4(dis, bytebuf);
- } else {
- longheapsz = dis.readInt();
- }
- if (longheapsz > 0) {
- int[] longHeapAddrs = new int[shortheapsz];
- for (int i=0; i < shortheapsz; i++) {
- if (swap) {
- longHeapAddrs[i] = swap4(dis, bytebuf);
- } else {
- longHeapAddrs[i] = dis.readInt();
- }
+ heapsz = readInt(dis, swap);
+ if (heapsz > 0) {
+ int[] heapAddrs = new int[heapsz];
+ for (int i = 0; i < heapsz; i++) {
+ heapAddrs[i] = readInt(dis, swap);
}
- for (int i=0; i < longheapsz; i++) {
- if (swap) {
- this.getLongHeap().heap[i] = (short) swap8(dis, bytebuf);
- } else {
- this.getLongHeap().heap[i] = dis.readLong();
- }
+ for (int i = 0; i < heapsz; i++) {
+ this.getLongHeap().heap[heapAddrs[i]] = readLong(dis, swap);
}
}
-
- }
+ } // of delta - modified processing
} catch (IOException e) {
CASRuntimeException exception = new CASRuntimeException(
CASRuntimeException.BLOB_DESERIALIZATION, new String[] { e.getMessage() });
throw exception;
}
}
-
- private long swap8(DataInputStream dis, byte[] buf) throws IOException {
-
- buf[7] = dis.readByte();
- buf[6] = dis.readByte();
- buf[5] = dis.readByte();
- buf[4] = dis.readByte();
- buf[3] = dis.readByte();
- buf[2] = dis.readByte();
- buf[1] = dis.readByte();
- buf[0] = dis.readByte();
- ByteBuffer bb = ByteBuffer.wrap(buf);
- return bb.getLong();
- }
-
- private int swap4(DataInputStream dis, byte[] buf) throws IOException {
- buf[3] = dis.readByte();
- buf[2] = dis.readByte();
- buf[1] = dis.readByte();
- buf[0] = dis.readByte();
- ByteBuffer bb = ByteBuffer.wrap(buf);
- return bb.getInt();
- }
-
- private char swap2(DataInputStream dis, byte[] buf) throws IOException {
- buf[1] = dis.readByte();
- buf[0] = dis.readByte();
- ByteBuffer bb = ByteBuffer.wrap(buf, 0, 2);
- return bb.getChar();
+
+ private long readLong(DataInputStream dis, boolean swap) throws IOException {
+ long v = dis.readLong();
+ return swap ? Long.reverseBytes(v) : v;
}
+
+ private int readInt(DataInputStream dis, boolean swap) throws IOException {
+ int v = dis.readInt();
+ return swap ? Integer.reverseBytes(v) : v;
+ }
+
+ private short readShort(DataInputStream dis, boolean swap) throws IOException {
+ short v = dis.readShort();
+ return swap ? Short.reverseBytes(v) : v;
+ }
+
+// private long swap8(DataInputStream dis, byte[] buf) throws IOException {
+//
+// buf[7] = dis.readByte();
+// buf[6] = dis.readByte();
+// buf[5] = dis.readByte();
+// buf[4] = dis.readByte();
+// buf[3] = dis.readByte();
+// buf[2] = dis.readByte();
+// buf[1] = dis.readByte();
+// buf[0] = dis.readByte();
+// ByteBuffer bb = ByteBuffer.wrap(buf);
+// return bb.getLong();
+// }
+//
+// private int swap4(DataInputStream dis, byte[] buf) throws IOException {
+// buf[3] = dis.readByte();
+// buf[2] = dis.readByte();
+// buf[1] = dis.readByte();
+// buf[0] = dis.readByte();
+// ByteBuffer bb = ByteBuffer.wrap(buf);
+// return bb.getInt();
+// }
+//
+// private char swap2(DataInputStream dis, byte[] buf) throws IOException {
+// buf[1] = dis.readByte();
+// buf[0] = dis.readByte();
+// ByteBuffer bb = ByteBuffer.wrap(buf, 0, 2);
+// return bb.getChar();
+// }
// assumes:
// indexes are empty on entry
//
- private void reinitIndexedFSs(int[] fsIndex) {
+ void reinitIndexedFSs(int[] fsIndex) {
// Add FSs to index repository for base CAS
int numViews = fsIndex[0];
int loopLen = fsIndex[1]; // number of sofas, not necessarily the same as
@@ -1578,7 +1474,7 @@ public class CASImpl extends AbstractCas
}
// fsIndex contains added, removed and reindexed FS per view
- private void reinitDeltaIndexedFSs(int[] fsIndex) {
+ void reinitDeltaIndexedFSs(int[] fsIndex) {
// Add FSs to index repository for base CAS
int numViews = fsIndex[0]; //total number of views
int loopLen = fsIndex[1]; // number of sofas, not necessarily the same as
@@ -1664,7 +1560,7 @@ public class CASImpl extends AbstractCas
if (loopIndexRep != null) {
fsLoopIndex = loopIndexRep.getIndexedFSs();
} else {
- fsLoopIndex = (new IntVector()).toArray();
+ fsLoopIndex = INT0;
}
v.add(fsLoopIndex.length);
for (int k = 0; k < fsLoopIndex.length; k++) {
@@ -1674,6 +1570,7 @@ public class CASImpl extends AbstractCas
return v.toArray();
}
+
//Delta IndexedFSs format:
// number of views
@@ -1719,9 +1616,9 @@ public class CASImpl extends AbstractCas
fsDeletedFromIndex = loopIndexRep.getDeletedFSs();
fsReindexed = loopIndexRep.getReindexedFSs();
} else {
- fsLoopIndex = (new IntVector()).toArray();
- fsDeletedFromIndex = (new IntVector()).toArray();
- fsReindexed = (new IntVector()).toArray();
+ fsLoopIndex = INT0;
+ fsDeletedFromIndex = INT0;
+ fsReindexed = INT0;
}
v.add(fsLoopIndex.length);
for (int k = 0; k < fsLoopIndex.length; k++) {
@@ -3926,14 +3823,14 @@ public class CASImpl extends AbstractCas
@SuppressWarnings("unchecked")
public AnnotationIndex<AnnotationFS> getAnnotationIndex() {
- return new AnnotationIndexImpl(
+ return new AnnotationIndexImpl<AnnotationFS>(
(FSIndex<AnnotationFS>) (FSIndex<?>) getIndexRepository().getIndex(
CAS.STD_ANNOTATION_INDEX));
}
@SuppressWarnings("unchecked")
public AnnotationIndex<AnnotationFS> getAnnotationIndex(Type type) {
- return new AnnotationIndexImpl(
+ return new AnnotationIndexImpl<AnnotationFS>(
(FSIndex<AnnotationFS>) (FSIndex<?>) getIndexRepository().getIndex(
CAS.STD_ANNOTATION_INDEX, type));
}
@@ -4390,4 +4287,16 @@ public class CASImpl extends AbstractCas
return this.svd.modifiedLongHeapCells;
}
+ /**
+ * Serialize in compressed binary form
+ * @param out - an OutputStream, a DataOutputStream, or a File
+ * @throws IOException
+ */
+ public void serializeWithCompression(Object out) throws IOException {
+ if (svd.binaryCompressor == null) {
+ svd.binaryCompressor = new BinaryCasSerDes4(this.getTypeSystemImpl(), false);
+ }
+ svd.binaryCompressor.serialize(this, out);
+ }
+
}
Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/CASSerializer.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/CASSerializer.java?rev=1406261&r1=1406260&r2=1406261&view=diff
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/CASSerializer.java (original)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/CASSerializer.java Tue Nov 6 18:22:03 2012
@@ -30,10 +30,38 @@ import org.apache.uima.cas.CASRuntimeExc
import org.apache.uima.cas.Marker;
/**
- * Serialization for CAS. This serializes the state of the CAS, assuming that the type and index
+ * Binary Serialization for CAS. This serializes the state of the CAS, assuming that the type and index
* information remains constant. <code>CASSerializer</code> objects can be serialized with
- * standard Java serialization.
+ * standard Java serialization; many uses of this class follow this form:
*
+ * 1) create an instance of this class
+ * 2) add a Cas to it (via addCAS methods)
+ * 3) use the instance of this class as the argument to anObjectOutputStream.writeObject(anInstanceOfThisClass)
+ * In UIMA this is done in the SerializationUtils class.
+ *
+ * There are also custom serialization methods that serialize to outputStreams.
+ *
+ * The serialized data is in one of several formats:
+ * normal Java object serialization / custom binary serialization
+ *
+ * The custom binary serialization is in several formats:
+ * full / delta:
+ * full - the entire cas
+ * delta - only differences from a previous "mark" are serialized
+ * uncompressed / compressed / compressed (fast)
+ * uncompressed
+ * compressed - trades off time for space to give the most compression
+ * compressed (fast) - less compression, but faster
+ *
+ * This class is for internal use. Some of the serialized formats are readable by the C++
+ * implementation, and used for efficiently transferring CASes between Java frameworks and other ones.
+ * Others are used with Vinci or SOAP to communicate to remote annotators.
+ *
+ * External interfaces to compressed forms of this serialization are provided by the
+ * user class org.apache.uima.util.Compression
+ *
+ * To serialize the shared common information among a group of CASes sharing the same
+ * type definition and index specifications, use <code>CASMgrSerializer</code>.
* @see org.apache.uima.cas.impl.CASMgrSerializer
*
*
@@ -73,8 +101,7 @@ public class CASSerializer implements Se
/**
* Serialize CAS data without heap-internal meta data. Currently used for serialization to C++.
*
- * @param casImpl
- * The CAS to be serialized.
+ * @param casImpl The CAS to be serialized.
*/
public void addNoMetaData(CASImpl casImpl) {
addCAS(casImpl, false);
@@ -84,8 +111,7 @@ public class CASSerializer implements Se
* Add the CAS to be serialized. Note that we need the implementation here, the interface is not
* enough.
*
- * @param cas
- * The CAS to be serialized.
+ * @param cas The CAS to be serialized.
*/
public void addCAS(CASImpl cas) {
addCAS(cas, true);
@@ -95,8 +121,7 @@ public class CASSerializer implements Se
* Add the CAS to be serialized. Note that we need the implementation here, the interface is not
* enough.
*
- * @param cas
- * The CAS to be serialized.
+ * @param cas The CAS to be serialized.
*/
public void addCAS(CASImpl cas, boolean addMetaData) {
this.fsIndex = cas.getIndexedFSs();
@@ -104,7 +129,12 @@ public class CASSerializer implements Se
this.heapArray = new int[heapSize];
System.arraycopy(cas.getHeap().heap, 0, this.heapArray, 0, heapSize);
if (addMetaData) {
- this.heapMetaData = cas.getHeap().getMetaData();
+ // some details about current main-heap specifications
+ // not required to deserialize
+ // not sent for C++
+ // is 7 words long
+ // not serialized by custom serializers, only by Java object serialization
+ this.heapMetaData = cas.getHeap().getMetaData();
}
this.stringTable = stringArrayListToArray(cas.getStringTable());
@@ -120,29 +150,113 @@ public class CASSerializer implements Se
this.longHeapArray = new long[longHeapSize];
System.arraycopy(cas.getLongHeap().heap, 0, this.longHeapArray, 0, longHeapSize);
}
+
+ // version
+ // encode: bits 7 6 5 4 3 2 1 0
+ // 0 0 1 = no delta, no compression
+ // 0 1 - = delta, no compression
+ // 1 d - = compression, w/wo delta
+
+ static void outputVersion(int version, DataOutputStream dos) throws IOException {
+ // output the key and version number
+
+ byte[] uima = new byte[4];
+ uima[0] = 85; // U
+ uima[1] = 73; // I
+ uima[2] = 77; // M
+ uima[3] = 65; // A
+
+ ByteBuffer buf = ByteBuffer.wrap(uima);
+ int key = buf.asIntBuffer().get();
+
+ dos.writeInt(key);
+ dos.writeInt(version);
+ }
+
+ private void outputStringHeap(DataOutputStream dos, CASImpl cas, StringHeapDeserializationHelper shdh) throws IOException {
+ // output the strings
+
+ // compute the total size of the data in stringHeap
+ // total size = char buffer length + length of strings in the string list;
+ int stringHeapLength = shdh.charHeapPos;
+ int stringListLength = 0;
+ for (int i = 0; i < shdh.refHeap.length; i += 3) {
+ int ref = shdh.refHeap[i + StringHeapDeserializationHelper.STRING_LIST_ADDR_OFFSET];
+ // this is a string in the string list
+ // get length and add to total string heap length
+ if (ref != 0) {
+ // terminate each string with a null
+ stringListLength += 1 + cas.getStringHeap().getStringForCode(ref).length();
+ }
+ }
+
+ int stringTotalLength = stringHeapLength + stringListLength;
+ if (stringHeapLength == 0 && stringListLength > 0) {
+ // nothing from stringHeap
+ // add 1 for the null at the beginning
+ stringTotalLength += 1;
+ }
+ dos.writeInt(stringTotalLength);
+
+ // write the data in the stringheap, if there is any
+ if (stringTotalLength > 0) {
+ if (shdh.charHeapPos > 0) {
+ dos.writeChars(String.valueOf(shdh.charHeap, 0, shdh.charHeapPos));
+ } else {
+ // no stringheap data
+ // if there is data in the string lists, write a leading 0
+ if (stringListLength > 0) {
+ dos.writeChar(0);
+ }
+ }
+
+ // word alignment
+ if (stringTotalLength % 2 != 0) {
+ dos.writeChar(0);
+ }
+ }
+
+ // write out the string ref heap
+ // each reference consists of an offset into stringheap and a length
+ int refheapsz = ((shdh.refHeap.length - StringHeapDeserializationHelper.FIRST_CELL_REF) / StringHeapDeserializationHelper.REF_HEAP_CELL_SIZE) * 2;
+ refheapsz++;
+ dos.writeInt(refheapsz);
+ dos.writeInt(0);
+ for (int i = StringHeapDeserializationHelper.FIRST_CELL_REF; i < shdh.refHeap.length; i += 3) {
+ dos.writeInt(shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_POINTER_OFFSET]);
+ dos.writeInt(shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_STRLEN_OFFSET]);
+ }
+ }
/**
* Serializes the CAS data and writes it to the output stream.
- * --------------------------------------------------------------------- Blob Format
- *
- * Element Size Number of Description (bytes) Elements ------------ ---------
- * -------------------------------- 4 1 Blob key = "UIMA" in utf-8 4 1 Version (currently = 1) 4 1
- * size of 32-bit FS Heap array = s32H 4 s32H 32-bit FS heap array 4 1 size of 16-bit string Heap
- * array = sSH 2 sSH 16-bit string heap array 4 1 size of string Ref Heap array = sSRH 4 2*sSRH
- * string ref offsets and lengths 4 1 size of FS index array = sFSI 4 sFSI FS index array
- *
- * 4 1 size of 8-bit Heap array = s8H 1 s8H 8-bit Heap array 4 1 size of 16-bit Heap array = s16H
- * 2 s16H 16-bit Heap array 4 1 size of 64-bit Heap array = s64H 8 s64H 64-bit Heap array
+ * ---------------------------------------------------------------------
+ * Blob Format Element
+ * Size Number of Description
+ * (bytes) Elements
+ * ------------ --------- --------------------------------
+ * 4 1 Blob key = "UIMA" in utf-8
+ * 4 1 Version (currently = 1)
+ * 4 1 size of 32-bit FS Heap array = s32H
+ * 4 s32H 32-bit FS heap array
+ * 4 1 size of 16-bit string Heap array = sSH
+ * 2 sSH 16-bit string heap array
+ * 4 1 size of string Ref Heap array = sSRH
+ * 4 2*sSRH string ref offsets and lengths
+ * 4 1 size of FS index array = sFSI
+ * 4 sFSI FS index array
+ * 4 1 size of 8-bit Heap array = s8H
+ * 1 s8H 8-bit Heap array
+ * 4 1 size of 16-bit Heap array = s16H
+ * 2 s16H 16-bit Heap array
+ * 4 1 size of 64-bit Heap array = s64H
+ * 8 s64H 64-bit Heap array
* ---------------------------------------------------------------------
*
- * This reads in and deserializes CAS data from a stream. Byte swapping may be needed is the blob
- * is from C++ -- C++ blob serialization writes data in native byte order.
- *
- * @param cas
- * The CAS to be serialized. ostream The output stream.
+ * @param cas The CAS to be serialized; the serialized form is written to <code>ostream</code>, the output stream.
*/
public void addCAS(CASImpl cas, OutputStream ostream) {
-
+
try {
DataOutputStream dos = new DataOutputStream(ostream);
@@ -151,20 +265,8 @@ public class CASSerializer implements Se
this.fsIndex = cas.getIndexedFSs();
// output the key and version number
-
- byte[] uima = new byte[4];
- uima[0] = 85; // U
- uima[1] = 73; // I
- uima[2] = 77; // M
- uima[3] = 65; // A
-
- ByteBuffer buf = ByteBuffer.wrap(uima);
- int key = buf.asIntBuffer().get();
-
- int version = 1;
- dos.writeInt(key);
- dos.writeInt(version);
-
+ outputVersion(1, dos);
+
// output the FS heap
final int heapSize = cas.getHeap().getCellsUsed();
dos.writeInt(heapSize);
@@ -175,56 +277,57 @@ public class CASSerializer implements Se
// output the strings
StringHeapDeserializationHelper shdh = cas.getStringHeap().serialize();
- // compute the number of total size of data in stringHeap
- // total size = char buffer length + length of strings in the string list;
- int stringHeapLength = shdh.charHeapPos;
- int stringListLength = 0;
- for (int i = 0; i < shdh.refHeap.length; i += 3) {
- int ref = shdh.refHeap[i + StringHeapDeserializationHelper.STRING_LIST_ADDR_OFFSET];
- // this is a string in the string list
- // get length and add to total string heap length
- if (ref != 0) {
- // terminate each string with a null
- stringListLength += 1 + cas.getStringHeap().getStringForCode(ref).length();
- }
- }
-
- int stringTotalLength = stringHeapLength + stringListLength;
- if (stringHeapLength == 0 && stringListLength > 0) {
- // nothing from stringHeap
- // add 1 for the null at the beginning
- stringTotalLength += 1;
- }
- dos.writeInt(stringTotalLength);
-
- // write the data in the stringheap, if there is any
- if (stringTotalLength > 0) {
- if (shdh.charHeapPos > 0) {
- dos.writeChars(String.valueOf(shdh.charHeap, 0, shdh.charHeapPos));
- } else {
- // no stringheap data
- // if there is data in the string lists, write a leading 0
- if (stringListLength > 0) {
- dos.writeChar(0);
- }
- }
-
- // word alignment
- if (stringTotalLength % 2 != 0) {
- dos.writeChar(0);
- }
- }
-
- // write out the string ref heap
- // each reference consist of a offset into stringheap and a length
- int refheapsz = ((shdh.refHeap.length - StringHeapDeserializationHelper.FIRST_CELL_REF) / StringHeapDeserializationHelper.REF_HEAP_CELL_SIZE) * 2;
- refheapsz++;
- dos.writeInt(refheapsz);
- dos.writeInt(0);
- for (int i = StringHeapDeserializationHelper.FIRST_CELL_REF; i < shdh.refHeap.length; i += 3) {
- dos.writeInt(shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_POINTER_OFFSET]);
- dos.writeInt(shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_STRLEN_OFFSET]);
- }
+ outputStringHeap(dos, cas, shdh);
+// // compute the number of total size of data in stringHeap
+// // total size = char buffer length + length of strings in the string list;
+// int stringHeapLength = shdh.charHeapPos;
+// int stringListLength = 0;
+// for (int i = 0; i < shdh.refHeap.length; i += 3) {
+// int ref = shdh.refHeap[i + StringHeapDeserializationHelper.STRING_LIST_ADDR_OFFSET];
+// // this is a string in the string list
+// // get length and add to total string heap length
+// if (ref != 0) {
+// // terminate each string with a null
+// stringListLength += 1 + cas.getStringHeap().getStringForCode(ref).length();
+// }
+// }
+//
+// int stringTotalLength = stringHeapLength + stringListLength;
+// if (stringHeapLength == 0 && stringListLength > 0) {
+// // nothing from stringHeap
+// // add 1 for the null at the beginning
+// stringTotalLength += 1;
+// }
+// dos.writeInt(stringTotalLength);
+//
+// // write the data in the stringheap, if there is any
+// if (stringTotalLength > 0) {
+// if (shdh.charHeapPos > 0) {
+// dos.writeChars(String.valueOf(shdh.charHeap, 0, shdh.charHeapPos));
+// } else {
+// // no stringheap data
+// // if there is data in the string lists, write a leading 0
+// if (stringListLength > 0) {
+// dos.writeChar(0);
+// }
+// }
+//
+// // word alignment
+// if (stringTotalLength % 2 != 0) {
+// dos.writeChar(0);
+// }
+// }
+//
+// // write out the string ref heap
+// // each reference consist of a offset into stringheap and a length
+// int refheapsz = ((shdh.refHeap.length - StringHeapDeserializationHelper.FIRST_CELL_REF) / StringHeapDeserializationHelper.REF_HEAP_CELL_SIZE) * 2;
+// refheapsz++;
+// dos.writeInt(refheapsz);
+// dos.writeInt(0);
+// for (int i = StringHeapDeserializationHelper.FIRST_CELL_REF; i < shdh.refHeap.length; i += 3) {
+// dos.writeInt(shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_POINTER_OFFSET]);
+// dos.writeInt(shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_STRLEN_OFFSET]);
+// }
// output the index FSs
dos.writeInt(this.fsIndex.length);
@@ -279,33 +382,33 @@ public class CASSerializer implements Se
*
* ElementSize NumberOfElements Description
* ----------- ---------------- ---------------------------------------------------------
- * 4 1 Blob key = "UIMA" in utf-8 (byte order flag)
- * 4 1 Version (1 = complete cas, 2 = delta cas)
- * 4 1 size of 32-bit heap array = s32H
- * 4 s32H 32-bit FS heap array (new elements)
- * 4 1 size of 16-bit string Heap array = sSH
- * 2 sSH 16-bit string heap array (new strings)
- * 4 1 size of string Ref Heap array = sSRH
- * 4 2*sSRH string ref offsets and lengths (for new strings)
- * 4 1 number of modified, preexisting 32-bit modified FS heap elements = sM32H
- * 4 2*sM32H 32-bit heap offset and value (preexisting cells modified)
- * 4 1 size of FS index array = sFSI
- * 4 sFSI FS index array in Delta format
- * 4 1 size of 8-bit Heap array = s8H
- * 1 s8H 8-bit Heap array (new elements)
- * 4 1 size of 16-bit Heap array = s16H
- * 2 s16H 16-bit Heap array (new elements)
- * 4 1 size of 64-bit Heap array = s64H
- * 8 s64H 64-bit Heap array (new elements)
- * 4 1 number of modified, preexisting 8-bit heap elements = sM8H
- * 4 sM8H 8-bit heap offsets (preexisting cells modified)
- * 1 sM8H 8-bit heap values (preexisting cells modified)
- * 4 1 number of modified, preexisting 16-bit heap elements = sM16H
- * 4 sM16H 16-bit heap offsets (preexisting cells modified)
- * 2 sM16H 16-bit heap values (preexisting cells modified)
- * 4 1 number of modified, preexisting 64-bit heap elements = sM64H
- * 4 sM64H 64-bit heap offsets (preexisting cells modified)
- * 2 sM64H 64-bit heap values (preexisting cells modified)
+ * 4 1 Blob key = "UIMA" in utf-8 (byte order flag)
+ * 4 1 Version (1 = complete cas, 2 = delta cas)
+ * 4 1 size of 32-bit heap array = s32H
+ * 4 s32H 32-bit FS heap array (new elements)
+ * 4 1 size of 16-bit string Heap array = sSH
+ * 2 sSH 16-bit string heap array (new strings)
+ * 4 1 size of string Ref Heap array = sSRH
+ * 4 2*sSRH string ref offsets and lengths (for new strings)
+ * 4 1 number of modified, preexisting 32-bit FS heap elements = sM32H
+ * 4 2*sM32H 32-bit heap offset and value (preexisting cells modified)
+ * 4 1 size of FS index array = sFSI
+ * 4 sFSI FS index array in Delta format
+ * 4 1 size of 8-bit Heap array = s8H
+ * 1 s8H 8-bit Heap array (new elements)
+ * 4 1 size of 16-bit Heap array = s16H
+ * 2 s16H 16-bit Heap array (new elements)
+ * 4 1 size of 64-bit Heap array = s64H
+ * 8 s64H 64-bit Heap array (new elements)
+ * 4 1 number of modified, preexisting 8-bit heap elements = sM8H
+ * 4 sM8H 8-bit heap offsets (preexisting cells modified)
+ * 1 sM8H 8-bit heap values (preexisting cells modified)
+ * 4 1 number of modified, preexisting 16-bit heap elements = sM16H
+ * 4 sM16H 16-bit heap offsets (preexisting cells modified)
+ * 2 sM16H 16-bit heap values (preexisting cells modified)
+ * 4 1 number of modified, preexisting 64-bit heap elements = sM64H
+ * 4 sM64H 64-bit heap offsets (preexisting cells modified)
+ * 8 sM64H 64-bit heap values (preexisting cells modified)
*
*
* @param cas
@@ -327,20 +430,9 @@ public class CASSerializer implements Se
this.fsIndex = cas.getDeltaIndexedFSs(mark);
// output the key and version number
-
- byte[] uima = new byte[4];
- uima[0] = 85; // U
- uima[1] = 73; // I
- uima[2] = 77; // M
- uima[3] = 65; // A
-
- ByteBuffer buf = ByteBuffer.wrap(uima);
- int key = buf.asIntBuffer().get();
-
- int version = 2; //1 = current full serialization; 2 = delta format
- //perhaps this should be split into 2 bytes for version and 2 bytes for format.
- dos.writeInt(key);
- dos.writeInt(version);
+ //1 = current full serialization; 2 = delta format
+ //perhaps this should be split into 2 bytes for version and 2 bytes for format.
+ outputVersion(2, dos);
// output the new FS heap cells
final int heapSize = cas.getHeap().getCellsUsed() - mark.nextFSId;
@@ -353,56 +445,57 @@ public class CASSerializer implements Se
// output the new strings
StringHeapDeserializationHelper shdh = cas.getStringHeap().serialize(mark.nextStringHeapAddr);
- // compute the number of total size of data in stringHeap
- // total size = char buffer length + length of strings in the string list;
- int stringHeapLength = shdh.charHeapPos;
- int stringListLength = 0;
- for (int i = 0; i < shdh.refHeap.length; i += 3) {
- int ref = shdh.refHeap[i + StringHeapDeserializationHelper.STRING_LIST_ADDR_OFFSET];
- // this is a string in the string list
- // get length and add to total string heap length
- if (ref != 0) {
- // terminate each string with a null
- stringListLength += 1 + cas.getStringHeap().getStringForCode(ref).length();
- }
- }
-
- int stringTotalLength = stringHeapLength + stringListLength;
- if (stringHeapLength == 0 && stringListLength > 0) {
- // nothing from stringHeap
- // add 1 for the null at the beginning
- stringTotalLength += 1;
- }
- dos.writeInt(stringTotalLength);
-
- // write the data in the stringheap, if there is any
- if (stringTotalLength > 0) {
- if (shdh.charHeapPos > 0) {
- dos.writeChars(String.valueOf(shdh.charHeap, 0, shdh.charHeapPos));
- } else {
- // no stringheap data
- // if there is data in the string lists, write a leading 0
- if (stringListLength > 0) {
- dos.writeChar(0);
- }
- }
-
- // word alignment
- if (stringTotalLength % 2 != 0) {
- dos.writeChar(0);
- }
- }
-
- // write out the string ref heap
- // each reference consist of a offset into stringheap and a length
- int refheapsz = ((shdh.refHeap.length - StringHeapDeserializationHelper.FIRST_CELL_REF) / StringHeapDeserializationHelper.REF_HEAP_CELL_SIZE) * 2;
- refheapsz++;
- dos.writeInt(refheapsz);
- dos.writeInt(0);
- for (int i = StringHeapDeserializationHelper.FIRST_CELL_REF; i < shdh.refHeap.length; i += 3) {
- dos.writeInt(shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_POINTER_OFFSET]);
- dos.writeInt(shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_STRLEN_OFFSET]);
- }
+ outputStringHeap(dos, cas, shdh);
+// // compute the number of total size of data in stringHeap
+// // total size = char buffer length + length of strings in the string list;
+// int stringHeapLength = shdh.charHeapPos;
+// int stringListLength = 0;
+// for (int i = 0; i < shdh.refHeap.length; i += 3) {
+// int ref = shdh.refHeap[i + StringHeapDeserializationHelper.STRING_LIST_ADDR_OFFSET];
+// // this is a string in the string list
+// // get length and add to total string heap length
+// if (ref != 0) {
+// // terminate each string with a null
+// stringListLength += 1 + cas.getStringHeap().getStringForCode(ref).length();
+// }
+// }
+//
+// int stringTotalLength = stringHeapLength + stringListLength;
+// if (stringHeapLength == 0 && stringListLength > 0) {
+// // nothing from stringHeap
+// // add 1 for the null at the beginning
+// stringTotalLength += 1;
+// }
+// dos.writeInt(stringTotalLength);
+//
+// // write the data in the stringheap, if there is any
+// if (stringTotalLength > 0) {
+// if (shdh.charHeapPos > 0) {
+// dos.writeChars(String.valueOf(shdh.charHeap, 0, shdh.charHeapPos));
+// } else {
+// // no stringheap data
+// // if there is data in the string lists, write a leading 0
+// if (stringListLength > 0) {
+// dos.writeChar(0);
+// }
+// }
+//
+// // word alignment
+// if (stringTotalLength % 2 != 0) {
+// dos.writeChar(0);
+// }
+// }
+//
+// // write out the string ref heap
+// // each reference consist of a offset into stringheap and a length
+// int refheapsz = ((shdh.refHeap.length - StringHeapDeserializationHelper.FIRST_CELL_REF) / StringHeapDeserializationHelper.REF_HEAP_CELL_SIZE) * 2;
+// refheapsz++;
+// dos.writeInt(refheapsz);
+// dos.writeInt(0);
+// for (int i = StringHeapDeserializationHelper.FIRST_CELL_REF; i < shdh.refHeap.length; i += 3) {
+// dos.writeInt(shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_POINTER_OFFSET]);
+// dos.writeInt(shdh.refHeap[i + StringHeapDeserializationHelper.CHAR_HEAP_STRLEN_OFFSET]);
+// }
//output modified FS Heap cells
int[] fsHeapModifiedAddrs = cas.getModifiedFSHeapAddrs().toArray();
@@ -452,16 +545,16 @@ public class CASSerializer implements Se
// 8 bit heap modified cells
int[] byteHeapModifiedAddrs = cas.getModifiedByteHeapAddrs().toArray();
- byte[] byteValues = new byte[byteHeapModifiedAddrs.length];
+// byte[] byteValues = new byte[byteHeapModifiedAddrs.length];
dos.writeInt(byteHeapModifiedAddrs.length);
for (int i=0; i < byteHeapModifiedAddrs.length; i++) {
- dos.writeInt(byteHeapModifiedAddrs[i]);
- byteValues[i] = cas.getByteHeap().getHeapValue(byteHeapModifiedAddrs[i]);
+ dos.writeInt(byteHeapModifiedAddrs[i]);
+// byteValues[i] = cas.getByteHeap().getHeapValue(byteHeapModifiedAddrs[i]);
}
- for (int i=0; i < byteValues.length; i++) {
+ for (int i=0; i < byteHeapModifiedAddrs.length; i++) {
dos.writeByte(cas.getByteHeap().getHeapValue(byteHeapModifiedAddrs[i]));
- }
-
+ }
+
// word alignment
align = (4 - (byteheapsz % 4)) % 4;
for (int i = 0; i < align; i++) {
@@ -504,7 +597,35 @@ public class CASSerializer implements Se
}
}
-
+
+// /**
+// * Serialize with compression
+// * Target is not constrained to the C++ format
+// * For non delta serialization, pass marker with 0 as values
+// * @throws IOException
+// */
+//
+// public void serialize(CASImpl cas, OutputStream ostream, Marker marker) throws IOException {
+// if (marker != null && !marker.isValid() ) {
+// CASRuntimeException exception = new CASRuntimeException(
+// CASRuntimeException.INVALID_MARKER, new String[] { "Invalid Marker." });
+// throw exception;
+// }
+// MarkerImpl mark = (MarkerImpl) marker;
+// DataOutputStream dos = new DataOutputStream(ostream);
+//
+// this.fsIndex = cas.getDeltaIndexedFSs(mark);
+// outputVersion(3 , dos);
+//
+// // output the new FS heap cells
+// final int heapSize = cas.getHeap().getCellsUsed() - mark.nextFSId);
+// compressHeapOut(dos, cas, heapSize, mark)
+//
+// // output the new strings
+//
+// }
+
+
/**
* Method stringArrayListToArray.
*
Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/MarkerImpl.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/MarkerImpl.java?rev=1406261&r1=1406260&r2=1406261&view=diff
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/MarkerImpl.java (original)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/MarkerImpl.java Tue Nov 6 18:22:03 2012
@@ -95,5 +95,25 @@ public class MarkerImpl implements Marke
public boolean isValid() {
return isValid;
}
+
+ public int getNextFSId() {
+ return nextFSId;
+ }
+
+ public int getNextStringHeapAddr() {
+ return nextStringHeapAddr;
+ }
+
+ public int getNextByteHeapAddr() {
+ return nextByteHeapAddr;
+ }
+
+ public int getNextShortHeapAddr() {
+ return nextShortHeapAddr;
+ }
+
+ public int getNextLongHeapAddr() {
+ return nextLongHeapAddr;
+ }
}
Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/StringHeap.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/StringHeap.java?rev=1406261&r1=1406260&r2=1406261&view=diff
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/StringHeap.java (original)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/StringHeap.java Tue Nov 6 18:22:03 2012
@@ -80,35 +80,7 @@ final class StringHeap {
* @return Serialization helper that can be interpreted easier by serialization code.
*/
StringHeapDeserializationHelper serialize() {
- StringHeapDeserializationHelper shdh = new StringHeapDeserializationHelper();
- // Ref heap is 3 times the size of the string list.
- shdh.refHeap = new int[this.stringList.size()
- * StringHeapDeserializationHelper.REF_HEAP_CELL_SIZE];
- shdh.refHeapPos = shdh.refHeap.length;
- // Compute required size of character heap.
- int charHeapSize = 0;
- for (int i = 0; i < this.stringList.size(); i++) {
- String s = this.stringList.get(i);
- if (s != null) {
- charHeapSize += s.length();
- }
- }
- shdh.charHeap = new char[charHeapSize];
- shdh.charHeapPos = shdh.charHeap.length;
-
- int charCount = 0;
- // Now write out the actual data
- for (int i = 1; i < this.stringList.size(); i++) {
- String s = this.stringList.get(i);
- int refHeapOffset = i * StringHeapDeserializationHelper.REF_HEAP_CELL_SIZE;
- shdh.refHeap[refHeapOffset + StringHeapDeserializationHelper.CHAR_HEAP_POINTER_OFFSET] = charCount;
- shdh.refHeap[refHeapOffset + StringHeapDeserializationHelper.CHAR_HEAP_STRLEN_OFFSET] = s
- .length();
- System.arraycopy(s.toCharArray(), 0, shdh.charHeap, charCount, s.length());
- charCount += s.length();
- }
- assert (charCount == shdh.charHeap.length);
- return shdh;
+ return serialize(1);
}
StringHeapDeserializationHelper serialize(int startPos) {
Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/CasCopier.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/CasCopier.java?rev=1406261&r1=1406260&r2=1406261&view=diff
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/CasCopier.java (original)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/CasCopier.java Tue Nov 6 18:22:03 2012
@@ -101,11 +101,18 @@ public class CasCopier {
public static void copyCas(CAS aSrcCas, CAS aDestCas, boolean aCopySofa) {
CasCopier copier = new CasCopier(aSrcCas, aDestCas);
- Iterator<SofaFS> sofaIter = aSrcCas.getSofaIterator();
- while (sofaIter.hasNext()) {
- SofaFS sofa = sofaIter.next();
- CAS view = aSrcCas.getView(sofa);
- copier.copyCasView(view, aCopySofa);
+ // Iterating over the sofas misses the initial view if a sofa FS has not yet been created, so iterate over the views instead
+// Iterator<SofaFS> sofaIter = aSrcCas.getSofaIterator();
+// while (sofaIter.hasNext()) {
+// SofaFS sofa = sofaIter.next();
+// CAS view = aSrcCas.getView(sofa);
+// copier.copyCasView(view, aCopySofa);
+// }
+
+ Iterator<CAS> viewIterator = aSrcCas.getViewIterator();
+ while (viewIterator.hasNext()) {
+ CAS view = viewIterator.next();
+ copier.copyCasView(view, aCopySofa);
}
}
@@ -127,13 +134,16 @@ public class CasCopier {
if (aCopySofa) {
// can't copy the SofaFS - just copy the sofa data and mime type
- String sofaMime = aSrcCasView.getSofa().getSofaMime();
- if (aSrcCasView.getDocumentText() != null) {
- targetView.setSofaDataString(aSrcCasView.getDocumentText(), sofaMime);
- } else if (aSrcCasView.getSofaDataURI() != null) {
- targetView.setSofaDataURI(aSrcCasView.getSofaDataURI(), sofaMime);
- } else if (aSrcCasView.getSofaDataArray() != null) {
- targetView.setSofaDataArray(copyFs(aSrcCasView.getSofaDataArray()), sofaMime);
+ SofaFS sofa = aSrcCasView.getSofa();
+ if (null != sofa) {
+ String sofaMime = sofa.getSofaMime();
+ if (aSrcCasView.getDocumentText() != null) {
+ targetView.setSofaDataString(aSrcCasView.getDocumentText(), sofaMime);
+ } else if (aSrcCasView.getSofaDataURI() != null) {
+ targetView.setSofaDataURI(aSrcCasView.getSofaDataURI(), sofaMime);
+ } else if (aSrcCasView.getSofaDataArray() != null) {
+ targetView.setSofaDataArray(copyFs(aSrcCasView.getSofaDataArray()), sofaMime);
+ }
}
}
Added: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/SerializationMeasures.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/SerializationMeasures.java?rev=1406261&view=auto
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/SerializationMeasures.java (added)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/SerializationMeasures.java Tue Nov 6 18:22:03 2012
@@ -0,0 +1,411 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.uima.util;
+
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.CAN_BE_NEGATIVE;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.IN_MAIN_HEAP;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_ArrayLength;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_Byte;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_Double_Exponent;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_Double_Mantissa_Sign;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_Float_Exponent;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_Float_Mantissa_Sign;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_FsIndexes;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_HeapRef;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_Int;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_Long_High;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_Long_Low;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_MainHeap;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_Short;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_StrChars;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_StrLength;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_StrOffset;
+import static org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind.Slot_TypeCode;
+
+import org.apache.uima.cas.impl.BinaryCasSerDes4.SlotKind;
+
+
+/**
+ * Structure:
+
+ * StatDetail
+ *
+ * str has neh for offset, length, dictionary hits/misses
+ *
+ * indexedFs has neh for diffs
+ *
+ * modHeap named, has neh for diffs, heap for values
+ */
+public class SerializationMeasures {
+
+ public static final int MAX_NBR_ENCODE_LENGTH = 10; // for long values taking 64 bits at 7 bits per byte
+
+ /**
+ * helper method to truncate printing of lots of trailing 0s
+ * @param c
+ * @return
+ */
+ private static int maxIndexToZeros(int[] c) {
+ for (int i = c.length - 1; i >= 0; i--) {
+ if (c[i] != 0) {
+ return Math.min(i + 1, c.length - 1);
+ }
+ }
+ return 1;
+ }
+
+ /**
+ * Statistical details
+ * There are instances of this class for
+ * - the main heap
+ * - the aux heaps
+ * - the string offsets, the string lengths
+ *
+ * Heap: xxxx [name-of-delta: [Total: <TotalBytes>(negative%) Histo: a(neg%) b(neg%) c(neg%) d(neg%) e(neg%)]]
+ * 2 styles: one uses only one counter, no delta - used for byte, short, and long heaps
+ * other is for main heap, uses 4 deltas.
+ *
+ */
+ public static class StatDetail {
+ private final String name;
+ public long original = -1; // if set, use this, otherwise use countTotal * bytesPerCount
+ final boolean canBeNegative;
+ public final int[] c = new int[MAX_NBR_ENCODE_LENGTH];
+ private final int[] cn; // negative counts
+ private final int bytesPerCount; // # bytes in source, per entry
+ public int countTotal; // plain count not weighted by number of bytes
+ public int lengthTotal; // count weighted by encodedLength
+
+ // encoding variants
+// private long itemCount = 0;
+// public long hits = 0; // misses = itemCount - hits
+// public long lt64 = 0; // (non diff slot only) things coded outside of dictionary
+ public long diffEncoded = 0; // things not diff encoded = totalCount - diffEncoded
+ public long valueLeDiff = 0; // things not diff encoded which could have been
+// public long total = 0;
+
+ // zip info
+ public long beforeZip; // should be same as lengthTotal;
+ public long afterZip = -1; // -1 means not zipped
+ public long zipTime;
+ public long deserializationTime; // excluding unzipping
+
+ public StatDetail(String name,
+ boolean canBeNegative,
+ boolean inMainHeap,
+ int bytesPerCount) {
+ this.canBeNegative = canBeNegative;
+ this.bytesPerCount = bytesPerCount;
+ this.name = name;
+ if (canBeNegative) {
+ cn = new int[MAX_NBR_ENCODE_LENGTH];
+ } else {
+ cn = null;
+ }
+ if (inMainHeap) {
+ original = 0; // main heap original computed outside of this mechanism
+ }
+ }
+
+ public long getOriginal() {
+ if (original == -1) {
+ return countTotal * bytesPerCount;
+ }
+ else {
+ return original;
+ }
+ }
+
+ public void accum(StatDetail o) {
+ for (int i = 0; i < c.length; i++) {
+ c[i] += o.c[i];
+ if (canBeNegative && (null != o.cn)) {
+ cn[i] += o.cn[i];
+ }
+ }
+
+ countTotal += o.countTotal;
+ lengthTotal += o.lengthTotal;
+ original = getOriginal();
+ original += o.getOriginal();
+ diffEncoded += o.diffEncoded;
+ valueLeDiff += o.valueLeDiff;
+ beforeZip += o.beforeZip;
+ if (afterZip == -1) {
+ afterZip = 0;
+ }
+ afterZip += (o.afterZip == -1) ? o.beforeZip : o.afterZip;
+ zipTime += o.zipTime;
+ deserializationTime += o.deserializationTime;
+ }
+
+ public void incr(int encodedLength, boolean isNegative) {
+ if (isNegative) {
+ cn[encodedLength - 1] ++;
+ }
+ incr(encodedLength);
+ }
+
+ public void incr(int encodedLength) {
+ c[encodedLength - 1] ++;
+ countTotal ++;
+ lengthTotal += encodedLength;
+ }
+
+ /**
+ * v is the number of entries to add to the counter at index bytesPerCount - 1 (and to countTotal)
+ * @param v
+ */
+ public void incrNoCompression(int v) {
+ c[bytesPerCount - 1] += v;
+ countTotal += v;
+ lengthTotal += v * bytesPerCount;
+ }
+
+ public String toString() {
+ long tot = lengthTotal;
+ if (tot == 0) {
+ return String.format("Item: %25s%n", name);
+ }
+ String diff = (0 < diffEncoded) ?
+ String.format(
+ "%n DiffEncoded(%%, %%v<diff): %,d(%.1f%% %.1f%%)",
+ diffEncoded, percent(diffEncoded, countTotal), percent(valueLeDiff, diffEncoded)) :
+ "";
+ String zp = (afterZip == -1) ? "" :
+ String.format(" afterZip: %,7d(%4.1f%%), %,3d ms", afterZip, percent(afterZip, beforeZip), zipTime);
+
+ String dt = (deserializationTime == 0) ? "" :
+ String.format(" Deserialization time: %f", deserializationTime/1000F);
+
+ StringBuilder sb = new StringBuilder();
+ // find max index to include = first non-zero from end, + 1
+ int maxToInclude = maxIndexToZeros(c);
+ for (int i = 0; i <= maxToInclude; i++) {
+ sb.append((canBeNegative) ?
+ String.format(" %,d(%,d)", c[i], cn[i]) :
+ String.format(" %,d", c[i]));
+ }
+ String totPct = (original == 0) ?
+ String.format("LengthTot: %,d", lengthTotal) :
+ String.format("LengthTot: %,d(%.1f%%)", lengthTotal, percentCompr(lengthTotal));
+ String histoDetails = String.format("[%s Histo:%s]", totPct, sb);
+ return String.format("Item: %25s %s %s %s %s%n",
+ name, zp, dt, histoDetails, diff);
+ }
+
+ private float percentCompr(long totCompr) {
+ return percent(totCompr, ((original == -1) || (original == 0)) ? countTotal * bytesPerCount : original);
+ }
+ }
+
+ /**
+ * Each instance of this class remembers a set of StatDetail instances so that
+ * bulk operations can be performed against that set of statistics.
+ */
+ public class AllStatDetails {
+ final StatDetail[] allStatDetails;
+ StatDetail aggr;
+ final String name;
+
+ public AllStatDetails (String aggrName, StatDetail ... someHeaps) {
+ name = aggrName;
+ allStatDetails = new StatDetail[someHeaps.length];
+ aggr = new StatDetail(aggrName, CAN_BE_NEGATIVE, IN_MAIN_HEAP, 1);
+ int i = 0;
+ for (StatDetail sd : someHeaps) {
+ allStatDetails[i++] = sd;
+ aggr.accum(sd);
+ }
+ }
+
+ public AllStatDetails (String aggrName, SlotKind ... kinds) {
+ this(aggrName, toStatDetails(kinds));
+ }
+
+ public void accum(AllStatDetails o) {
+ for (int i = 0; i < allStatDetails.length; i++) {
+ allStatDetails[i].accum(o.allStatDetails[i]);
+ }
+ }
+
+ public void aggregate() {
+ aggr = new StatDetail(name, CAN_BE_NEGATIVE, ! IN_MAIN_HEAP, 1);
+ for (StatDetail sd : allStatDetails) {
+ aggr.accum(sd);
+ }
+ }
+
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ for (StatDetail h : allStatDetails) {
+ sb.append(h.toString());
+ }
+ return sb.toString();
+ }
+ }
+
+ private static float percent(long a, long b) {
+ if (a == 0) {
+ return 0F;
+ }
+ if (b == 0) {
+ return 100F;
+ }
+
+ return ((100F * a)/ b);
+ }
+
+ // all measures in counts or bytes
+ public int header = 0;
+ public long origAuxByteArrayRefs = 0; // in bytes (incl boolean), 1 entry usually = 4 bytes
+ public long origAuxShortArrayRefs = 0;
+ public long origAuxLongArrayRefs = 0;
+ public long origAuxBytes = 0; // includes booleans, in bytes
+ public long origAuxShorts = 0; //in bytes
+ public long origAuxLongs = 0; // includes doubles, in bytes
+
+ public long mainHeapFSs = 0; // count of all feature structures
+
+ public int stringsNbrCommon = 0;
+ public long stringsCommonChars = 0;
+ public long stringsSavedExact = 0;
+ public long stringsSavedSubstr = 0;
+
+ public long totalTime = 0;
+
+
+ public final StatDetail[] statDetails = new StatDetail[SlotKind.values().length];
+ {
+ for (SlotKind kind : SlotKind.values()) {
+ statDetails[kind.i] = new StatDetail(kind.toString(),
+ kind.canBeNegative,
+ kind.inMainHeap,
+ kind.elementSize);
+ }
+ }
+
+ public final AllStatDetails allSlots =
+ new AllStatDetails("AllSlotKinds",
+ Slot_ArrayLength,
+ Slot_HeapRef,
+ Slot_Int,
+ Slot_Byte, // used only for arrays
+ Slot_Short, // used only for arrays
+ Slot_TypeCode,
+ Slot_StrOffset,
+ Slot_StrLength,
+ Slot_StrChars,
+ Slot_Long_High,
+ Slot_Long_Low,
+ Slot_Float_Mantissa_Sign,
+ Slot_Float_Exponent,
+ Slot_Double_Mantissa_Sign,
+ Slot_Double_Exponent,
+ Slot_FsIndexes);
+ public final AllStatDetails strSlots =
+ new AllStatDetails("Strings",
+ Slot_StrOffset,
+ Slot_StrLength,
+ Slot_StrChars);
+
+// public final ModHeaps modHeaps = new ModHeaps(modMainHeap, modByteHeap, modShortHeap, modLongHeap);
+// public final Str strings = new Str(strOffsets, strLengths);
+// public final IndexedFSs indexedFSs = new IndexedFSs();
+//
+
+ public SerializationMeasures() {
+ }
+
+ StatDetail[] toStatDetails(SlotKind[] kinds) {
+ StatDetail[] sds= new StatDetail[kinds.length];
+ int i = 0;
+ for(SlotKind k : kinds) {
+ sds[i++] = statDetails[k.i];
+ }
+ return sds;
+ }
+
+ /**
+ * accumulate results for multiple files
+ * @param o
+ */
+ public void accum(SerializationMeasures o) {
+ int i = 0;
+ for (StatDetail sd : o.statDetails) {
+ statDetails[i++].accum(sd);
+ }
+ origAuxByteArrayRefs += o.origAuxByteArrayRefs;
+ origAuxShortArrayRefs += o.origAuxShortArrayRefs;
+ origAuxLongArrayRefs += o.origAuxLongArrayRefs;
+ header += o.header;
+ mainHeapFSs += o.mainHeapFSs;
+
+ stringsNbrCommon += o.stringsNbrCommon;
+ stringsCommonChars += o.stringsCommonChars;
+ stringsSavedExact += o.stringsSavedExact;
+ stringsSavedSubstr += o.stringsSavedSubstr;
+ }
+
+ public String toString() {
+ // Strings
+
+ long origStringChars = statDetails[Slot_StrChars.i].getOriginal();
+ long origStringObjs = statDetails[Slot_StrLength.i].getOriginal() * 2;
+ long origStringsTot = origStringChars + // space for the chars
+ origStringObjs + // space for the offset and length
+ (origStringObjs / 2); // space for the refs to the string heap
+
+
+ allSlots.aggregate();
+ strSlots.aggregate();
+
+ long allOrig = statDetails[Slot_MainHeap.i].original +
+ origStringChars + origStringObjs +
+ origAuxBytes +
+ origAuxShorts +
+ origAuxLongs;
+
+ long allB4Z = allSlots.aggr.lengthTotal;
+ long strB4Z = strSlots.aggr.lengthTotal;
+
+ long allTotZ = allSlots.aggr.afterZip;
+ long strTotZ = strSlots.aggr.afterZip;
+
+ return String.format(
+ "Summary: withZip: %,d(%.1f%%), without: %,d(%.1f%%) zipTime: %,d ms totalSerTime: %,d ms%n" +
+ " nonStrgs: withZip: %,d(%.1f%%), without: %,d(%.1f%%)%n" +
+ " Strings: withZip: %,d(%.1f%%), without: %,d(%.1f%%)%n" +
+ " MainHeap TotFS: %,d, StrCmnChars: %,d(%.1f%%), StrSavedExact: %,d StrSavedSubstr: %,d%n" +
+ "%s%n",
+ allTotZ, percent(allTotZ, allOrig), allB4Z, percent(allB4Z, allOrig), allSlots.aggr.zipTime, totalTime,
+
+ allTotZ - strTotZ, percent(allTotZ - strTotZ, allOrig - origStringsTot),
+ allB4Z - strB4Z, percent(allB4Z - strB4Z, allOrig - origStringsTot),
+
+ strTotZ, percent(strTotZ, origStringsTot),
+ strB4Z, percent(strB4Z, origStringsTot),
+
+ mainHeapFSs, stringsCommonChars, percent(stringsCommonChars, statDetails[Slot_StrChars.i].original),
+ stringsSavedExact, stringsSavedSubstr,
+ allSlots.toString()
+ );
+ }
+
+}
Propchange: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/SerializationMeasures.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/DataIO.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/DataIO.java?rev=1406261&view=auto
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/DataIO.java (added)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/DataIO.java Tue Nov 6 18:22:03 2012
@@ -0,0 +1,479 @@
+package org.apache.uima.util.impl;
+
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+
+/**
+ * Methods for working with Data during I/O
+ */
+public class DataIO {
+
+
+ // UTF-8 as a Charset object; decodeUTF8 creates its decoder from this.
+ public static final Charset UTF8 = Charset.forName("UTF-8"); // use with String is a java 6, not 5, feature
+ // UTF-8 as a charset *name*, passed to String.getBytes(String) by writeUTFv / lengthUTFv.
+ public static final String UTF8_FAST = "UTF-8"; // for faster impls
+ // In the variable-length number format: values below this fit in a single final byte,
+ // and this bit is OR-ed into every byte that is followed by another byte.
+ private static final int SIGNED_INT_VALUE_0x80 = 0x80;
+ // Selects the low 7 payload bits of an int.
+ private static final int MASK_LOW_7 = 0x7f;
+ // Selects the low 7 payload bits of a long.
+ private static final long MASK_LOW_7_LONG = 0x7fL;
+ // Only the sign bit of a long set; not referenced by any method visible in this class.
+ private static final long TOP_LONG_BIT = 0x8000000000000000L;
+
+ // Per-thread CharsetDecoder, created lazily by decodeUTF8 on first use in each thread.
+ private static ThreadLocal<CharsetDecoder> DECODER = new ThreadLocal<CharsetDecoder>();
+
+
+ public static String decodeUTF8(ByteBuffer in, final int length) {
+ // First try fast path - assume chars in 0-127
+ fastPath: do {
+ if (in.hasArray()) {
+ byte[] backingArray = in.array();
+ int offset = in.arrayOffset() + in.position();
+ if (offset + length > backingArray.length) {
+ break fastPath;
+ }
+// char[] ca = new char[length];
+ // string builder approach avoids copying the char array object
+ StringBuilder sb = new StringBuilder(length);
+ sb.setLength(length);
+ for (int i = 0; i < length; i++) {
+ byte b = backingArray[offset + i];
+ if (b < 0) { // give up and do it the other way
+ break fastPath;
+ }
+ sb.setCharAt(i, (char)b);
+ }
+ in.position(in.position() + length);
+ return sb.toString(); // doesn't copy the string char array
+ }
+ } while (false); // not a real do loop - do only once
+
+ CharsetDecoder decoder = DECODER.get();
+ if (null == decoder) {
+ decoder = UTF8.newDecoder()
+ .onMalformedInput(CodingErrorAction.REPLACE)
+ .onUnmappableCharacter(CodingErrorAction.REPLACE);
+ DECODER.set(decoder);
+ }
+ ByteBuffer partToDecode = in.slice();
+ partToDecode.limit(length);
+ CharBuffer cb;
+ try {
+ cb = decoder.decode(partToDecode);
+ in.position(in.position() + length);
+ } catch (CharacterCodingException e) {
+ // should never happen
+ throw new RuntimeException(e);
+ }
+ return cb.toString();
+ }
+
+ /***************************************************************************************
+ * For DataOutput, DataInput
+ ***************************************************************************************/
+ /**
+  * Writes a string as a variable-length byte count followed by its UTF-8 bytes.
+  * The count written is the UTF-8 byte length plus one, because a count of 0 is
+  * reserved to represent a null string. Similar to writeUTF, but usable for strings
+  * longer than 32K bytes; the UTF-8 length must be <= Integer.MAX_VALUE - 1.
+  *
+  * @param string the string to write, may be null
+  * @param out where to write
+  * @throws IOException passed through from the underlying output
+  */
+ public static void writeUTFv(String string, DataOutput out) throws IOException {
+   if (string != null) {
+     final byte[] utf8 = string.getBytes(UTF8_FAST);
+     final int byteCount = utf8.length;
+     if (byteCount > (Integer.MAX_VALUE - 1)) {
+       throw new RuntimeException(String.format("String UTF-8 representation too long, was %,d", byteCount));
+     }
+     writeVnumber(out, byteCount + 1); // shifted by one: 0 means null
+     out.write(utf8);
+   } else {
+     out.write(0); // null marker
+   }
+ }
+
+ /**
+  * Reads a string written by writeUTFv: a variable-length count (byte length + 1,
+  * with 0 standing for null) followed by that many UTF-8 bytes.
+  *
+  * @param in where to read from
+  * @return the decoded string, or null if the count read was 0
+  * @throws IOException passed through from the underlying input
+  */
+ public static String readUTFv(DataInput in) throws IOException {
+   final int countPlusOne = readVnumber(in);
+   if (countPlusOne == 0) {
+     return null;
+   }
+   final byte[] utf8 = new byte[countPlusOne - 1];
+   in.readFully(utf8);
+   return decodeUTF8(ByteBuffer.wrap(utf8), utf8.length);
+ }
+
+ /**
+  * Computes how many bytes writeUTFv would emit for the given string: the size of the
+  * variable-length count plus the UTF-8 bytes themselves. A null string takes 1 byte.
+  *
+  * @param string the string to measure, may be null
+  * @return number of bytes in the written form
+  * @throws UnsupportedEncodingException declared by String.getBytes(String)
+  */
+ public static long lengthUTFv(String string) throws UnsupportedEncodingException {
+   if (string == null) {
+     return 1;
+   }
+   final int byteCount = string.getBytes(UTF8_FAST).length;
+   if (byteCount > (Integer.MAX_VALUE - 1)) {
+     throw new RuntimeException(String.format("String UTF-8 representation too long, was %,d", byteCount));
+   }
+   return lengthVnumber(byteCount + 1) + byteCount;
+ }
+
+ /**
+  * Range-checked variant of DataOutput.writeShort: refuses values that do not fit
+  * in a signed 16-bit short instead of silently truncating them.
+  *
+  * @param out where to write
+  * @param v value to write; must lie in Short.MIN_VALUE .. Short.MAX_VALUE
+  * @throws IOException passed through from the underlying output
+  */
+ public static void writeShort(DataOutput out, int v) throws IOException {
+   final boolean fits = (Short.MIN_VALUE <= v) && (v <= Short.MAX_VALUE);
+   if (!fits) {
+     throw new RuntimeException(String.format(
+         "Trying to write int %,d as a short but it doesn't fit", v));
+   }
+   out.writeShort(v);
+ }
+
+ /**
+  * Range-checked single-byte write: refuses values that do not fit in a signed
+  * 8-bit byte instead of silently truncating them.
+  *
+  * @param out where to write
+  * @param v value to write; must lie in Byte.MIN_VALUE .. Byte.MAX_VALUE
+  * @throws IOException passed through from the underlying output
+  */
+ public static void writeByte(DataOutput out, int v) throws IOException {
+   final boolean fits = (Byte.MIN_VALUE <= v) && (v <= Byte.MAX_VALUE);
+   if (!fits) {
+     throw new RuntimeException(String.format(
+         "Trying to write int %,d as a byte but it doesn't fit", v));
+   }
+   out.write(v);
+ }
+
+ /**
+ * Write the lower 8 bits of v as one byte, with no range check
+ * (contrast with writeByte, which rejects values outside the signed byte range).
+ * @param out where to write
+ * @param v value whose low-order 8 bits are written
+ * @throws IOException passed through from the underlying output
+ */
+ public static void writeUnsignedByte(DataOutput out, int v) throws IOException {
+ out.write(v);
+ }
+
+ /**
+  * Write a positive or negative int, optimized for fewer bytes near 0.
+  * The sign is put in the low order bit and the magnitude is shifted left by 1;
+  * the result is written with the variable-length long writer.
+  *
+  * Integer.MIN_VALUE gets special handling because Math.abs of it "fails" (it returns
+  * a negative value). It is instead written as "-0" (the value 1), a code point not
+  * otherwise in use.
+  *
+  * The shift is done in long arithmetic: an int shift would overflow into the sign
+  * bit for any value >= 2^30 and the writer would then reject the negative result.
+  *
+  * @param out where to write
+  * @param v the value to write; any int is accepted
+  * @throws IOException passed through from the underlying output
+  */
+ public static void writeVPNnumber(DataOutput out, int v) throws IOException {
+   if (v == Integer.MIN_VALUE) {
+     writeVnumber(out, 1); // "-0"
+   } else {
+     if (v < 0) {
+       writeVnumber(out, (((long)Math.abs(v)) << 1) | 1);
+     } else {
+       // widen before shifting so that values >= 2^30 do not overflow
+       writeVnumber(out, ((long)v) << 1);
+     }
+   }
+ }
+
+ /**
+  * Write a positive or negative long, optimized for fewer bytes near 0.
+  * The sign is put in the low order bit and the magnitude is shifted left by 1.
+  *
+  * Long.MIN_VALUE gets special handling because Math.abs of it "fails". It is
+  * instead written as "-0" (the value 1), a code point not otherwise in use.
+  *
+  * Note: the shift happens in 64-bit arithmetic, so a magnitude of 2^62 or more
+  * overflows into the sign bit; the resulting negative number is then passed on to
+  * writeVnumber unchanged.
+  *
+  * @param out where to write
+  * @param v the value to write
+  * @throws IOException passed through from the underlying output
+  */
+ public static void writeVPNnumber(DataOutput out, long v) throws IOException {
+   if (v == Long.MIN_VALUE) {
+     writeVnumber(out, 1); // "-0"
+   } else {
+     if (v < 0) {
+       writeVnumber(out, (Math.abs(v) << 1) | 1);
+     } else {
+       writeVnumber(out, v << 1);
+     }
+   }
+ }
+ /**
+  * Number of bytes writeVPNnumber(DataOutput, int) uses for the given value.
+  *
+  * Integer.MIN_VALUE gets special handling because Math.abs of it "fails". It is
+  * coded as "-0", a code point not otherwise in use, which occupies a single byte.
+  *
+  * The shift is done in long arithmetic: an int shift would overflow into the sign
+  * bit for any value >= 2^30, and the negative result would be measured as 1 byte.
+  * The sign bit itself is not OR-ed in because it cannot change the length.
+  *
+  * @param v the value to measure
+  * @return the encoded length in bytes
+  */
+ public static int lengthVPNnumber(int v) {
+   if (v == Integer.MIN_VALUE) {
+     return 1;
+   } else {
+     if (v < 0) {
+       return lengthVnumber(((long)(Math.abs(v)) << 1));
+     } else {
+       // widen before shifting so that values >= 2^30 do not overflow
+       return lengthVnumber(((long)v) << 1);
+     }
+   }
+ }
+ /**
+  * Measures the sign-in-low-bit encoding of a long: the magnitude shifted left by 1
+  * is handed to lengthVnumber. The sign bit is left out since it does not affect
+  * the length. Long.MIN_VALUE, whose Math.abs "fails", is coded as "-0" and
+  * therefore reported as 1 byte.
+  *
+  * @param v the value to measure
+  * @return the length reported by lengthVnumber for the shifted magnitude
+  */
+ public static int lengthVPNnumber(long v) {
+   // Math.abs(v) == v for non-negative v, so one expression covers both signs
+   return (v == Long.MIN_VALUE)
+       ? 1
+       : lengthVnumber(Math.abs(v) << 1);
+ }
+
+ /**
+  * Writes a non-negative int using as few bytes as possible: 7 payload bits per
+  * byte, low-order group first, with the high bit of a byte set when another
+  * byte follows. Values 0..127 are written directly as one byte; everything else
+  * (including negative values) is handed to writeVnumber1.
+  *
+  * @param out where to write
+  * @param v the value to write
+  * @throws IOException passed through from the underlying output
+  */
+ public static void writeVnumber(final DataOutput out, final int v) throws IOException {
+   // no bits above the low 7 set  <=>  0 <= v < 128
+   final boolean fitsInOneByte = (v >>> 7) == 0;
+   if (!fitsInOneByte) {
+     writeVnumber1(out, v);
+     return;
+   }
+   out.write(v); // fast path
+ }
+
+ /**
+  * General path of the int variable-length writer: emits 7-bit groups, low-order
+  * first, flagging every byte except the last with the 0x80 bit.
+  * A negative argument is rejected with a RuntimeException.
+  */
+ private static void writeVnumber1(final DataOutput out, int v) throws IOException {
+   if (v < 0) {
+     throw new RuntimeException("never happen");
+   }
+   // a non-negative int has at most 31 significant bits, so this emits at most
+   // 4 flagged bytes before the final one
+   while (v >= SIGNED_INT_VALUE_0x80) {
+     out.write((v & MASK_LOW_7) | SIGNED_INT_VALUE_0x80);
+     v >>>= 7;
+   }
+   out.write(v);
+ }
+
+ /**
+  * Counts the bytes of the variable-length form of an int: one byte per 7-bit
+  * group, up to 5. Any value below 0x80 (which includes negative values, as the
+  * comparison is signed) is reported as 1 byte.
+  *
+  * @param v the value to measure
+  * @return the number of bytes, 1..5
+  */
+ public static int lengthVnumber(int v) {
+   for (int byteCount = 1; byteCount <= 5; byteCount++) {
+     if (v < SIGNED_INT_VALUE_0x80) {
+       return byteCount;
+     }
+     v >>>= 7;
+   }
+   throw new RuntimeException("Never get here");
+ }
+
+ /**
+  * Reads an int in variable-length form: 7 payload bits per byte, low-order group
+  * first, where a byte below 0x80 ends the number. At most 5 bytes are consumed.
+  *
+  * @param in where to read from
+  * @return the decoded value
+  * @throws IOException passed through from the underlying input
+  * @throws IllegalStateException if the 5th byte still has its high bit set
+  */
+ public static int readVnumber(final DataInput in) throws IOException {
+   final int first = in.readUnsignedByte();
+   if (first < 0x80) { // fast path
+     return first;
+   }
+
+   int value = first & MASK_LOW_7;
+   // up to four more bytes, placed at bit offsets 7, 14, 21 and 28
+   for (int shift = 7; shift <= 28; shift += 7) {
+     final int next = in.readUnsignedByte();
+     value |= (next & MASK_LOW_7) << shift;
+     if (next < SIGNED_INT_VALUE_0x80) {
+       return value;
+     }
+   }
+   throw new IllegalStateException("Invalid input deserializing Vnumber");
+ }
+
+ /**
+  * Writes a non-negative long using as few bytes as possible: 7 payload bits per
+  * byte, low-order group first, with the high bit of a byte set when another
+  * byte follows. Values 0..127 are written directly as one byte; everything else
+  * is handed to writeVnumber1.
+  *
+  * @param out where to write
+  * @param v is never negative
+  * @throws IOException passed through from the underlying output
+  */
+ public static void writeVnumber(final DataOutput out, final long v) throws IOException {
+   // no bits above the low 7 set  <=>  0 <= v < 128
+   final boolean fitsInOneByte = (v >>> 7) == 0L;
+   if (!fitsInOneByte) {
+     writeVnumber1(out, v);
+     return;
+   }
+   out.write((int) v); // fast path
+ }
+
+ private static void writeVnumber1(final DataOutput out, long v) throws IOException {
+ if (v < 0) {
+ throw new RuntimeException("never happen");
+ }
+ for (int i = 0; i < 9; i++) {
+ if (v < SIGNED_INT_VALUE_0x80) {
+ out.write((int) v);
+ return;
+ }
+ int outByte = (int)(v & MASK_LOW_7_LONG);
+ out.write(outByte | SIGNED_INT_VALUE_0x80);
+ v = v >>> 7;
+ }
+ }
+
+ /**
+  * Counts the bytes of the variable-length form of a long: one byte per 7-bit
+  * group, up to 9. Any value below 0x80 (which includes negative values, as the
+  * comparison is signed) is reported as 1 byte.
+  *
+  * @param v the value to measure
+  * @return the number of bytes, 1..9
+  */
+ public static int lengthVnumber(long v) {
+   for (int byteCount = 1; byteCount <= 9; byteCount++) {
+     if (v < SIGNED_INT_VALUE_0x80) {
+       return byteCount;
+     }
+     v >>>= 7;
+   }
+   throw new RuntimeException("Never get here");
+ }
+
+
+ /**
+  * Reads a long in variable-length form: 7 payload bits per byte, low-order group
+  * first, where a byte below 0x80 ends the number. At most 9 bytes are consumed.
+  *
+  * @param in where to read from
+  * @return the decoded value
+  * @throws IOException passed through from the underlying input
+  * @throws IllegalStateException if the 9th byte still has its high bit set
+  */
+ public static long readVlong(final DataInput in) throws IOException {
+   final long first = in.readUnsignedByte();
+   if (first < 0x80) { // fast path
+     return first;
+   }
+
+   long value = first & MASK_LOW_7_LONG;
+   // up to eight more bytes, placed at bit offsets 7, 14, ... 56
+   for (int shift = 7; shift <= 56; shift += 7) {
+     final long next = in.readUnsignedByte();
+     value |= (next & MASK_LOW_7_LONG) << shift;
+     if (next < 128) {
+       return value;
+     }
+   }
+   throw new IllegalStateException("Invalid input deserializing Vlong");
+ }
+
+ /**
+  * Finishes reading a variable-length long whose first byte the caller has already
+  * consumed. If that byte is below 0x80 it is the whole value; otherwise up to
+  * eight further bytes are read, 7 payload bits each, low-order group first.
+  *
+  * @param in where to read the remaining bytes from
+  * @param firstByte the already-read first byte of the number
+  * @return the decoded value
+  * @throws IOException passed through from the underlying input
+  * @throws IllegalStateException if the 9th byte still has its high bit set
+  */
+ public static long readRestOfVlong(DataInput in, int firstByte) throws IOException {
+   if (firstByte < 0x80) {
+     return firstByte;
+   }
+
+   long value = firstByte ^ 0x80; // turn off high bit
+   for (int shift = 7; shift <= 56; shift += 7) {
+     final long next = in.readUnsignedByte();
+     value |= (next & MASK_LOW_7_LONG) << shift;
+     if (next < 128) {
+       return value;
+     }
+   }
+   throw new IllegalStateException("Invalid input deserializing Vlong");
+ }
+
+ /**
+  * Writes a byte array preceded by its length (as a variable-length number).
+  *
+  * @param out where to write
+  * @param v the bytes to write
+  * @throws IOException passed through from the underlying output
+  */
+ public static void writeByteArray(DataOutput out, byte[] v) throws IOException {
+   final int count = v.length;
+   writeVnumber(out, count);
+   out.write(v);
+ }
+
+ /**
+  * Reads a byte array written as a variable-length count followed by that many bytes.
+  *
+  * @param in where to read from
+  * @return a new array holding the bytes read
+  * @throws IOException passed through from the underlying input
+  */
+ public static byte[] readByteArray(DataInput in) throws IOException {
+   final byte[] payload = new byte[readVnumber(in)];
+   in.readFully(payload);
+   return payload;
+ }
+
+ /**
+  * Writes an int array: first its length as a variable-length number, then each
+  * element with DataOutput.writeInt.
+  *
+  * @param out where to write
+  * @param v the array to write
+  * @throws IOException passed through from the underlying output
+  */
+ public static void writeIntArray(DataOutput out, int[] v) throws IOException {
+   final int count = v.length;
+   writeVnumber(out, count);
+   for (int i = 0; i < count; i++) {
+     out.writeInt(v[i]);
+   }
+ }
+
+ /**
+  * Reads an int array written as a variable-length count followed by that many
+  * ints in DataInput.readInt form.
+  *
+  * @param in where to read from
+  * @return a new array holding the ints read
+  * @throws IOException passed through from the underlying input
+  */
+ public static int[] readIntArray(DataInput in) throws IOException {
+   final int[] values = new int[readVnumber(in)];
+   int i = 0;
+   while (i < values.length) {
+     values[i++] = in.readInt();
+   }
+   return values;
+ }
+
+ /**
+  * Writes an int array in delta-encoded form, intended for increasing values:
+  * the length, then for each element its difference from the previous element
+  * (the first element is taken relative to 0), all as variable-length numbers.
+  *
+  * @param out where to write
+  * @param v the array to write
+  * @throws IOException passed through from the underlying output
+  */
+ public static void writeIntArrayDelta(DataOutput out, int[] v) throws IOException {
+   writeVnumber(out, v.length);
+   int previous = 0;
+   for (int i = 0; i < v.length; i++) {
+     final int current = v[i];
+     writeVnumber(out, current - previous);
+     previous = current;
+   }
+ }
+
+ /**
+  * Reads an int array stored in delta-encoded form: a variable-length count, then
+  * one variable-length difference per element. Each element is the running sum of
+  * the differences read so far, starting from 0.
+  *
+  * @param in where to read from
+  * @return a new array holding the reconstructed values
+  * @throws IOException passed through from the underlying input
+  */
+ public static int[] readIntArrayDelta(DataInput in) throws IOException {
+   final int[] values = new int[readVnumber(in)];
+   int running = 0;
+   for (int i = 0; i < values.length; i++) {
+     running += readVnumber(in);
+     values[i] = running;
+   }
+   return values;
+ }
+
+ /**
+  * Writes a long array: first its length as a variable-length number, then each
+  * element with DataOutput.writeLong.
+  *
+  * @param out where to write
+  * @param v the array to write
+  * @throws IOException passed through from the underlying output
+  */
+ public static void writeLongArray(DataOutput out, long[] v) throws IOException {
+   // an int count suffices: java arrays cannot be longer than Integer.MAX_VALUE
+   final int count = v.length;
+   writeVnumber(out, count);
+   for (int i = 0; i < count; i++) {
+     out.writeLong(v[i]);
+   }
+ }
+
+ /**
+  * Reads a long array written as a variable-length count followed by that many
+  * longs in DataInput.readLong form.
+  *
+  * @param in where to read from
+  * @return a new array holding the longs read
+  * @throws IOException passed through from the underlying input
+  */
+ public static long[] readLongArray(DataInput in) throws IOException {
+   final long[] values = new long[readVnumber(in)];
+   int i = 0;
+   while (i < values.length) {
+     values[i++] = in.readLong();
+   }
+   return values;
+ }
+
+ /**
+  * Writes a long array in delta-encoded form: the length, then for each element
+  * its difference from the previous element (the first element is taken relative
+  * to 0), all as variable-length numbers.
+  *
+  * @param out where to write
+  * @param v the array to write
+  * @throws IOException passed through from the underlying output
+  */
+ public static void writeLongArrayDelta(DataOutput out, long[] v) throws IOException {
+   // an int count suffices: java arrays cannot be longer than Integer.MAX_VALUE
+   writeVnumber(out, v.length);
+   long previous = 0;
+   for (int i = 0; i < v.length; i++) {
+     final long current = v[i];
+     writeVnumber(out, current - previous);
+     previous = current;
+   }
+ }
+
+ /**
+  * Reads a long array stored in delta-encoded form: a variable-length count, then
+  * one variable-length difference per element. Each element is the running sum of
+  * the differences read so far, starting from 0.
+  *
+  * @param in where to read from
+  * @return a new array holding the reconstructed values
+  * @throws IOException passed through from the underlying input
+  */
+ public static long[] readLongArrayDelta(DataInput in) throws IOException {
+   final long[] values = new long[readVnumber(in)];
+   long running = 0;
+   for (int i = 0; i < values.length; i++) {
+     running += readVlong(in);
+     values[i] = running;
+   }
+   return values;
+ }
+
+ /**
+  * Reads one byte as an unsigned value via DataInput.readUnsignedByte, turning a
+  * negative result into an IOException.
+  *
+  * @param in where to read from
+  * @return the byte read, as a non-negative int
+  * @throws IOException if the value read is negative, or passed through from the input
+  */
+ public static int readUnsignedByte(DataInput in) throws IOException {
+   final int value = in.readUnsignedByte();
+   if (value >= 0) {
+     return value;
+   }
+   throw new IOException("Premature EOF");
+ }
+
+}
Propchange: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/util/impl/DataIO.java
------------------------------------------------------------------------------
svn:eol-style = native