You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/08/17 10:32:03 UTC

[22/56] [partial] incubator-joshua git commit: maven multi-module layout 1st commit: moving files into joshua-core

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/encoding/Analyzer.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/encoding/Analyzer.java b/joshua-core/src/main/java/org/apache/joshua/util/encoding/Analyzer.java
new file mode 100644
index 0000000..ad2910c
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/encoding/Analyzer.java
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.encoding;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.TreeMap;
+
+import org.apache.joshua.util.io.LineReader;
+
+/**
+ * Collects a histogram of observed feature values and uses it to choose a compact
+ * {@link FloatEncoder} for them (a primitive encoder or an {@link EightBitQuantizer}).
+ */
+public class Analyzer {
+
+  // First float value that no longer fits into an int (2^31).
+  private static final float INT_RANGE_END = 2147483648.0f;
+
+  // Number of observations per distinct value, iterated in ascending value order.
+  private TreeMap<Float, Integer> histogram;
+  // Number of add() calls since the last initialize().
+  private int total;
+
+  /** Creates an analyzer that holds only the (empty) zero bucket. */
+  public Analyzer() {
+    histogram = new TreeMap<Float, Integer>();
+    initialize();
+  }
+
+  /** Discards all observations, leaving a zero-count entry for 0.0f. */
+  public void initialize() {
+    histogram.clear();
+    // TODO: drop zero bucket; we won't encode zero-valued features anyway.
+    histogram.put(0.0f, 0);
+    total = 0;
+  }
+
+  /**
+   * Records one observation.
+   *
+   * @param key the observed value
+   */
+  public void add(float key) {
+    Integer seen = histogram.get(key);
+    histogram.put(key, seen == null ? 1 : seen + 1);
+    total++;
+  }
+
+  /**
+   * Derives quantization buckets from the histogram. Bucket 0 is always 0.0f; every other
+   * bucket holds the count-weighted mean of the values assigned to it.
+   *
+   * @param num_bits number of index bits; at most {@code 2^num_bits} buckets are produced.
+   *        Must be at least 1, because the initial bucket size divides by
+   *        {@code 2^num_bits - 1}.
+   * @return the bucket values, truncated to the buckets actually filled
+   */
+  public float[] quantize(int num_bits) {
+    float[] buckets = new float[1 << num_bits];
+
+    // We make sure that 0.0f always has its own bucket, so the bucket
+    // size is determined excluding the zero values.
+    int size = (total - histogram.get(0.0f)) / (buckets.length - 1);
+    buckets[0] = 0.0f;
+
+    // Re-estimate the target bucket size until it is stable: the zero value and every
+    // "spike" (a value whose own count reaches the target size) consume one bucket each,
+    // and the remaining observations are spread over the remaining buckets.
+    int old_size = -1;
+    while (old_size != size) {
+      int sum = 0;
+      int count = buckets.length - 1;
+      for (float key : histogram.keySet()) {
+        int entry_count = histogram.get(key);
+        if (entry_count < size && key != 0)
+          sum += entry_count;
+        else
+          count--;
+      }
+      old_size = size;
+      // When the spikes use up exactly all non-zero buckets there is nothing left to
+      // divide by; a target size of 0 gives every remaining value its own bucket.
+      size = (count == 0) ? 0 : sum / count;
+    }
+
+    float last_key = Float.MAX_VALUE;
+
+    int index = 1;
+    int count = 0;
+    float sum = 0.0f;
+
+    int value;
+    for (float key : histogram.keySet()) {
+      value = histogram.get(key);
+      // Special bucket termination cases: zero boundary and histogram spikes.
+      if (key == 0 || (last_key < 0 && key > 0) || (value >= size)) {
+        // If the count is not 0, i.e. there were negative values, we should
+        // not bucket them with the positive ones. Close out the bucket now.
+        if (count != 0 && index < buckets.length - 2) {
+          buckets[index++] = (float) sum / count;
+          count = 0;
+          sum = 0;
+        }
+        if (key == 0)
+          continue;
+      }
+      count += value;
+      sum += key * value;
+      // Check if the bucket is full.
+      if (count >= size && index < buckets.length - 2) {
+        buckets[index++] = (float) sum / count;
+        count = 0;
+        sum = 0;
+      }
+      last_key = key;
+    }
+    // Flush whatever is left into one final bucket.
+    if (count > 0 && index < buckets.length - 1)
+      buckets[index++] = (float) sum / count;
+
+    float[] shortened = new float[index];
+    System.arraycopy(buckets, 0, shortened, 0, index);
+    return shortened;
+  }
+
+  /** @return true if every histogram key is 0 or 1 */
+  public boolean isBoolean() {
+    for (float value : histogram.keySet())
+      if (value != 0 && value != 1)
+        return false;
+    return true;
+  }
+
+  /** @return true if every histogram key is integral and within the {@code byte} range */
+  public boolean isByte() {
+    for (float value : histogram.keySet())
+      if (Math.ceil(value) != value || value < Byte.MIN_VALUE || value > Byte.MAX_VALUE)
+        return false;
+    return true;
+  }
+
+  /** @return true if every histogram key is integral and within the {@code short} range */
+  public boolean isShort() {
+    for (float value : histogram.keySet())
+      if (Math.ceil(value) != value || value < Short.MIN_VALUE || value > Short.MAX_VALUE)
+        return false;
+    return true;
+  }
+
+  /** @return true if every histogram key is integral and within the {@code char} range */
+  public boolean isChar() {
+    for (float value : histogram.keySet())
+      if (Math.ceil(value) != value || value < Character.MIN_VALUE || value > Character.MAX_VALUE)
+        return false;
+    return true;
+  }
+
+  /**
+   * @return true if every histogram key is integral and within the {@code int} range.
+   *         Infinities and values of magnitude 2^31 or more are integral floats but cannot
+   *         be held by an int, so they are rejected like the out-of-range cases in
+   *         {@link #isByte()}, {@link #isShort()} and {@link #isChar()}.
+   */
+  public boolean isInt() {
+    for (float value : histogram.keySet())
+      if (Math.ceil(value) != value || value < Integer.MIN_VALUE || value >= INT_RANGE_END)
+        return false;
+    return true;
+  }
+
+  /** @return true if the histogram has at most 256 distinct keys (including 0.0f) */
+  public boolean is8Bit() {
+    return (histogram.keySet().size() <= 256);
+  }
+
+  /**
+   * Picks an encoder for the observed values. The checks run in a fixed order and the
+   * first match wins: boolean, byte, 8-bit quantizer (at most 256 distinct values), char,
+   * short, int, and finally float.
+   *
+   * @return the selected encoder
+   */
+  public FloatEncoder inferUncompressedType() {
+    if (isBoolean())
+      return PrimitiveFloatEncoder.BOOLEAN;
+    if (isByte())
+      return PrimitiveFloatEncoder.BYTE;
+    if (is8Bit())
+      return new EightBitQuantizer(this.quantize(8));
+    if (isChar())
+      return PrimitiveFloatEncoder.CHAR;
+    if (isShort())
+      return PrimitiveFloatEncoder.SHORT;
+    if (isInt())
+      return PrimitiveFloatEncoder.INT;
+    return PrimitiveFloatEncoder.FLOAT;
+  }
+
+  /**
+   * Like {@link #inferUncompressedType()}, except that {@code bits == 8} forces the 8-bit
+   * quantizer for anything that is neither boolean nor byte, regardless of how many
+   * distinct values were observed.
+   *
+   * @param bits requested quantization width; only the value 8 has an effect
+   * @return the selected encoder
+   */
+  public FloatEncoder inferType(int bits) {
+    if (isBoolean())
+      return PrimitiveFloatEncoder.BOOLEAN;
+    if (isByte())
+      return PrimitiveFloatEncoder.BYTE;
+    if (bits == 8 || is8Bit())
+      return new EightBitQuantizer(this.quantize(8));
+    // TODO: Could add sub-8-bit encoding here (or larger).
+    if (isChar())
+      return PrimitiveFloatEncoder.CHAR;
+    if (isShort())
+      return PrimitiveFloatEncoder.SHORT;
+    if (isInt())
+      return PrimitiveFloatEncoder.INT;
+    return PrimitiveFloatEncoder.FLOAT;
+  }
+
+  /**
+   * Renders the histogram as one line per key: label, key with five decimals, count,
+   * separated by tabs.
+   *
+   * @param label text placed in the first column of every line
+   * @return the formatted histogram
+   */
+  public String toString(String label) {
+    StringBuilder sb = new StringBuilder();
+    for (float val : histogram.keySet())
+      sb.append(label + "\t" + String.format("%.5f", val) + "\t" + histogram.get(val) + "\n");
+    return sb.toString();
+  }
+
+  /**
+   * Reads one float per line from the file named by {@code args[0]}, learns an 8-bit
+   * quantizer from the values, and prints the mean absolute quantization error over the
+   * non-zero samples.
+   *
+   * @param args {@code args[0]} is the input file
+   * @throws IOException if the input cannot be read
+   */
+  public static void main(String[] args) throws IOException {
+    LineReader reader = new LineReader(args[0]);
+    ArrayList<Float> s = new ArrayList<Float>();
+
+    System.out.println("Initialized.");
+    while (reader.hasNext())
+      s.add(Float.parseFloat(reader.next().trim()));
+    System.out.println("Data read.");
+    int n = s.size();
+    byte[] c = new byte[n];
+    ByteBuffer b = ByteBuffer.wrap(c);
+    Analyzer q = new Analyzer();
+
+    q.initialize();
+    for (int i = 0; i < n; i++)
+      q.add(s.get(i));
+    EightBitQuantizer eb = new EightBitQuantizer(q.quantize(8));
+    System.out.println("Quantizer learned.");
+
+    for (int i = 0; i < n; i++)
+      eb.write(b, s.get(i));
+    b.rewind();
+    System.out.println("Quantization complete.");
+
+    float avg_error = 0;
+    int count = 0;
+    for (int i = 0; i < n; i++) {
+      float original = s.get(i);
+      // EightBitQuantizer.read() adds EncoderConfiguration.ID_SIZE to the position it is
+      // given, but this buffer has no id prefix, so subtract it to land on byte i.
+      float coded = eb.read(b, i - EncoderConfiguration.ID_SIZE);
+      if (original != 0) {
+        avg_error += Math.abs(original - coded);
+        count++;
+      }
+    }
+    // Without any non-zero sample the average would be 0/0; report 0 instead of NaN.
+    if (count > 0)
+      avg_error /= count;
+    System.out.println("Evaluation complete.");
+
+    System.out.println("Average quantization error over " + n + " samples is: " + avg_error);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/encoding/EightBitQuantizer.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/encoding/EightBitQuantizer.java b/joshua-core/src/main/java/org/apache/joshua/util/encoding/EightBitQuantizer.java
new file mode 100644
index 0000000..5876d4f
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/encoding/EightBitQuantizer.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.encoding;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+/**
+ * {@link FloatEncoder} that stores a value as one byte indexing a table of at most 256
+ * bucket values. The byte written is the bucket index minus 128, so index 0 is stored as
+ * -128.
+ */
+public class EightBitQuantizer implements FloatEncoder {
+
+  // Largest table a single byte can index.
+  private static final int MAX_BUCKETS = 256;
+
+  private float[] buckets;
+
+  /** Creates a quantizer with 256 zero-valued buckets, to be filled by {@link #readState}. */
+  public EightBitQuantizer() {
+    this.buckets = new float[MAX_BUCKETS];
+  }
+
+  /**
+   * Creates a quantizer over the given bucket table; the array is used as is, not copied.
+   *
+   * @param buckets bucket values; index 0 is used for zero, and {@link #write} expects
+   *        the entries from index 1 on to be in ascending order
+   * @throws IllegalArgumentException if there are more than 256 buckets
+   */
+  public EightBitQuantizer(float[] buckets) {
+    if (buckets.length > MAX_BUCKETS)
+      throw new IllegalArgumentException("Incompatible number of buckets: " + buckets.length);
+    this.buckets = buckets;
+  }
+
+  /**
+   * Looks up the bucket value for the byte at {@code position + EncoderConfiguration.ID_SIZE}.
+   */
+  @Override
+  public final float read(ByteBuffer stream, int position) {
+    byte index = stream.get(position + EncoderConfiguration.ID_SIZE);
+    return buckets[index + 128];
+  }
+
+  /**
+   * Writes the one-byte index of the bucket closest to {@code val} at the buffer's current
+   * position. Zero (and every value when the table has a single bucket) maps to bucket 0.
+   */
+  @Override
+  public final void write(ByteBuffer stream, float val) {
+    byte index = -128;
+
+    // We search for the bucket best matching the value. Only zeroes will be
+    // mapped to the zero bucket.
+    if (val != 0 && buckets.length > 1) {
+      // Binary search over buckets[1..length-1] until t and b are equal or adjacent.
+      int t = 1;
+      int b = buckets.length - 1;
+      while ((b - t) > 1) {
+        int half = (t + b) / 2;
+        if (val >= buckets[half])
+          t = half;
+        if (val <= buckets[half])
+          b = half;
+      }
+      // Of the two candidates take the closer one; ties go to the lower index.
+      index = (byte) ((Math.abs(buckets[t] - val) > (Math.abs(buckets[b] - val)) ? b : t) - 128);
+    }
+    stream.put(index);
+  }
+
+  @Override
+  public String getKey() {
+    return "8bit";
+  }
+
+  /** Writes the key, the number of buckets, and then every bucket value. */
+  @Override
+  public void writeState(DataOutputStream out) throws IOException {
+    out.writeUTF(getKey());
+    out.writeInt(buckets.length);
+    for (int i = 0; i < buckets.length; i++)
+      out.writeFloat(buckets[i]);
+  }
+
+  /**
+   * Reads the bucket count and bucket values. The key written by {@link #writeState} is
+   * not read here; it must already have been consumed from the stream.
+   *
+   * @throws IOException if the stream fails or the stored bucket count is not in 0..256
+   */
+  @Override
+  public void readState(DataInputStream in) throws IOException {
+    int length = in.readInt();
+    // A corrupt count would otherwise surface as NegativeArraySizeException, or as a
+    // table larger than a one-byte index can address.
+    if (length < 0 || length > MAX_BUCKETS)
+      throw new IOException("Incompatible number of buckets: " + length);
+    buckets = new float[length];
+    for (int i = 0; i < buckets.length; i++)
+      buckets[i] = in.readFloat();
+  }
+
+  /** @return 1, the number of bytes {@link #write} puts into the buffer */
+  @Override
+  public final int size() {
+    return 1;
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/encoding/EncoderConfiguration.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/encoding/EncoderConfiguration.java b/joshua-core/src/main/java/org/apache/joshua/util/encoding/EncoderConfiguration.java
new file mode 100644
index 0000000..28b013f
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/encoding/EncoderConfiguration.java
@@ -0,0 +1,160 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.encoding;
+
+import java.io.BufferedInputStream;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.joshua.corpus.Vocabulary;
+
+/**
+ * In-memory form of an "encoding" file: the id encoder, the float encoders, and the mapping
+ * between outer feature ids, inner feature ids and encoders.
+ */
+public class EncoderConfiguration {
+
+  // Width of an encoded feature id. Global, mutable state: load() overwrites it with the
+  // size of the id encoder it read, and the float encoders add it to their read positions.
+  public static int ID_SIZE = 4;
+
+  private IntEncoder idEncoder;
+  private int[] innerToOuter;
+  private FloatEncoder[] encoderById;
+  private FloatEncoder[] encoders;
+
+  private Map<Integer, Integer> outerToInner;
+
+  private boolean labeled;
+
+  private int numDenseFeatures = 0;
+
+  public EncoderConfiguration() {
+    this.outerToInner = new HashMap<Integer, Integer>();
+  }
+
+  /** @return how many labeled feature names in the loaded file parsed as integers */
+  public int getNumDenseFeatures() {
+    return numDenseFeatures;
+  }
+
+  /** @return the number of encoders read by {@link #load}; fails before a load */
+  public int getNumFeatures() {
+    return encoders.length;
+  }
+
+  /**
+   * Reads an encoder configuration and replaces the current state with it. Also sets the
+   * static {@link #ID_SIZE} to the size of the id encoder that was read.
+   *
+   * @param file_name path of the encoding file
+   * @throws IOException if the file cannot be opened or read
+   * @throws RuntimeException if a feature refers to an encoder index or inner id outside
+   *         the ranges declared in the file
+   */
+  public void load(String file_name) throws IOException {
+    File encoding_file = new File(file_name);
+    BufferedInputStream buf_stream = new BufferedInputStream(new FileInputStream(encoding_file));
+    DataInputStream in_stream = new DataInputStream(buf_stream);
+
+    // Counted locally so that a second load() does not add to the previous total.
+    int dense_features = 0;
+    // try/finally so the file handle is released even when parsing fails.
+    try {
+      String id_key = in_stream.readUTF();
+      idEncoder = EncoderFactory.getIntEncoder(id_key);
+      idEncoder.readState(in_stream);
+      ID_SIZE = idEncoder.size();
+      labeled = in_stream.readBoolean();
+
+      int num_encoders = in_stream.readInt();
+      encoders = new FloatEncoder[num_encoders];
+      for (int i = 0; i < num_encoders; i++) {
+        String key = in_stream.readUTF();
+        FloatEncoder e = EncoderFactory.getFloatEncoder(key);
+        e.readState(in_stream);
+        encoders[i] = e;
+      }
+      int num_features = in_stream.readInt();
+      encoderById = new FloatEncoder[num_features];
+      innerToOuter = new int[num_features];
+      for (int i = 0; i < num_features; i++) {
+        int outer_id;
+        if (labeled) {
+          String feature_name = in_stream.readUTF();
+          outer_id = Vocabulary.id(feature_name);
+          try {
+            Integer.parseInt(feature_name);
+            dense_features++;
+          } catch (NumberFormatException ignored) {
+            // Not a numeric name, so it does not count as a dense feature.
+          }
+        } else {
+          outer_id = in_stream.readInt();
+        }
+        int inner_id = in_stream.readInt();
+        int encoder_index = in_stream.readInt();
+        if (encoder_index < 0 || encoder_index >= num_encoders) {
+          throw new RuntimeException("Error deserializing EncoderConfig. " + "Feature "
+              + (labeled ? Vocabulary.word(outer_id) : outer_id) + " referring to encoder "
+              + encoder_index + " when only " + num_encoders + " known.");
+        }
+        if (inner_id < 0 || inner_id >= num_features) {
+          throw new RuntimeException("Error deserializing EncoderConfig. " + "Feature "
+              + (labeled ? Vocabulary.word(outer_id) : outer_id) + " has inner id "
+              + inner_id + " when only " + num_features + " features declared.");
+        }
+        encoderById[inner_id] = encoders[encoder_index];
+        innerToOuter[inner_id] = outer_id;
+      }
+    } finally {
+      in_stream.close();
+    }
+    numDenseFeatures = dense_features;
+
+    outerToInner.clear();
+    for (int i = 0; i < innerToOuter.length; ++i)
+      outerToInner.put(innerToOuter[i], i);
+  }
+
+  /** @return the encoder assigned to the given inner feature id */
+  public FloatEncoder encoder(int inner_id) {
+    return encoderById[inner_id];
+  }
+
+  /** Decodes a feature id from {@code buffer} at {@code pos} using the loaded id encoder. */
+  public int readId(ByteBuffer buffer, int pos) {
+    return idEncoder.read(buffer, pos);
+  }
+
+  /** @return the outer id recorded for the given inner feature id */
+  public int outerId(int inner_id) {
+    return innerToOuter[inner_id];
+  }
+
+  /**
+   * @return the inner id for the given outer feature id
+   * @throws NullPointerException if the outer id is not part of the loaded configuration
+   *         (the map lookup yields null, which cannot be unboxed)
+   */
+  public int innerId(int outer_id) {
+    return outerToInner.get(outer_id);
+  }
+
+  public boolean isLabeled() {
+    return labeled;
+  }
+
+  /**
+   * For now, this just loads a configuration and prints out the number of features,
+   * followed by one line per feature.
+   *
+   * @param args an input configuration file
+   */
+  public static void main(String[] args) {
+    // Checked explicitly so that index errors from anywhere else are not misreported as
+    // a usage problem.
+    if (args.length < 1)
+      throw new RuntimeException("Usage: EncoderConfiguration <packed_directory>");
+    String grammar_dir = args[0];
+    try {
+      EncoderConfiguration encoding = new EncoderConfiguration();
+      encoding.load(grammar_dir + File.separator + "encoding");
+      int num_features = encoding.getNumFeatures();
+      System.out.println(String.format("num_features = %d", num_features));
+
+      for (int feature_id = 0; feature_id < num_features; feature_id++) {
+        if (Vocabulary.size() == 1) {
+          System.out.println(String.format("feature: %d", feature_id));
+        } else {
+          String name = Vocabulary.word(encoding.outerId(feature_id));
+          System.out.println(String.format("feature: %s", name));
+        }
+      }
+
+    } catch (IOException e) {
+      // Keep the cause: the failure may be a read error rather than a missing file.
+      throw new RuntimeException(
+          String.format("* FATAL: can't find file %s/encoding", grammar_dir), e);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/encoding/EncoderFactory.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/encoding/EncoderFactory.java b/joshua-core/src/main/java/org/apache/joshua/util/encoding/EncoderFactory.java
new file mode 100644
index 0000000..a1f93d0
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/encoding/EncoderFactory.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.encoding;
+
+/** Resolves the encoder keys stored in encoding files to encoder instances. */
+public class EncoderFactory {
+
+  /**
+   * Returns the float encoder registered under {@code key}.
+   *
+   * @param key encoder key; primitive encoders are looked up by the upper-cased key, the
+   *        quantizer by the exact string {@code "8bit"}
+   * @return the matching primitive encoder, or a new {@link EightBitQuantizer}
+   * @throws RuntimeException if the key matches no encoder
+   */
+  public static FloatEncoder getFloatEncoder(String key) {
+    // Locale.ROOT keeps the mapping stable everywhere; with the default locale "int"
+    // upper-cases to "\u0130NT" under Turkish rules and the lookup would fail.
+    String upper_key = key.toUpperCase(java.util.Locale.ROOT);
+    FloatEncoder encoder = PrimitiveFloatEncoder.get(upper_key);
+    if (encoder != null) {
+      return encoder;
+    } else if ("8bit".equals(key)) {
+      return new EightBitQuantizer();
+    } else {
+      throw new RuntimeException("Unknown FloatEncoder type: " + upper_key);
+    }
+  }
+
+  /**
+   * Returns the int encoder registered under {@code key}.
+   *
+   * @param key encoder key, looked up by its upper-cased form
+   * @return the matching primitive encoder
+   * @throws RuntimeException if the key matches no encoder
+   */
+  public static IntEncoder getIntEncoder(String key) {
+    // See getFloatEncoder: upper-casing must not depend on the default locale.
+    String upper_key = key.toUpperCase(java.util.Locale.ROOT);
+    IntEncoder encoder = PrimitiveIntEncoder.get(upper_key);
+    if (encoder != null) {
+      return encoder;
+    } else {
+      throw new RuntimeException("Unknown IntEncoder type: " + upper_key);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/encoding/FeatureTypeAnalyzer.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/encoding/FeatureTypeAnalyzer.java b/joshua-core/src/main/java/org/apache/joshua/util/encoding/FeatureTypeAnalyzer.java
new file mode 100644
index 0000000..504859f
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/encoding/FeatureTypeAnalyzer.java
@@ -0,0 +1,254 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.encoding;
+
+import java.io.BufferedOutputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.util.io.LineReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Tracks which encoder each feature uses. Encoders are either fixed by a configuration file
+ * or, for features seen while the analyzer is "open", inferred from the observed values.
+ * The result can be written out as an encoding file.
+ */
+public class FeatureTypeAnalyzer {
+
+  private static final Logger LOG = LoggerFactory.getLogger(FeatureTypeAnalyzer.class);
+
+  private ArrayList<FeatureType> types;
+
+  // Feature id -> index into types.
+  private Map<Integer, Integer> featureToType;
+
+  // Feature id -> rank among the known feature ids, filled by buildFeatureMap().
+  private Map<Integer, Integer> featureIdMap;
+
+  // Is the feature setup labeled.
+  private boolean labeled;
+
+  // Is the encoder configuration open for new features (that are not assumed boolean)?
+  private boolean open;
+
+  public FeatureTypeAnalyzer() {
+    this(false);
+  }
+
+  public FeatureTypeAnalyzer(boolean open) {
+    this.open = open;
+    this.types = new ArrayList<FeatureType>();
+    this.featureToType = new HashMap<Integer, Integer>();
+    this.featureIdMap = new HashMap<Integer, Integer>();
+  }
+
+  /**
+   * Reads encoder assignments from a configuration file. Text after {@code #} is ignored;
+   * lines of the form {@code encoder <key> <feature> [<feature> ...]} assign the encoder
+   * to the listed features; all other lines are skipped.
+   *
+   * @param config_filename path of the configuration file
+   * @throws IOException if the file cannot be read
+   * @throws RuntimeException if an encoder line names no feature or an unknown encoder key
+   */
+  public void readConfig(String config_filename) throws IOException {
+    // NOTE(review): the reader is never closed here; confirm whether LineReader needs an
+    // explicit close.
+    LineReader reader = new LineReader(config_filename);
+    while (reader.hasNext()) {
+      // Clean up line, chop comments off and skip if the result is empty.
+      String line = reader.next().trim();
+      if (line.indexOf('#') != -1)
+        line = line.substring(0, line.indexOf('#'));
+      if (line.isEmpty())
+        continue;
+      String[] fields = line.split("[\\s]+");
+
+      if ("encoder".equals(fields[0])) {
+        // Adding an encoder to the mix.
+        if (fields.length < 3) {
+          throw new RuntimeException("Incomplete encoder line in config.");
+        }
+        String encoder_key = fields[1];
+        ArrayList<Integer> feature_ids = new ArrayList<Integer>();
+        for (int i = 2; i < fields.length; i++)
+          feature_ids.add(Vocabulary.id(fields[i]));
+        addFeatures(encoder_key, feature_ids);
+      }
+    }
+  }
+
+  /** Assigns the encoder named by {@code encoder_key} to every listed feature id. */
+  public void addFeatures(String encoder_key, List<Integer> feature_ids) {
+    int index = addType(encoder_key);
+    for (int feature_id : feature_ids)
+      featureToType.put(feature_id, index);
+  }
+
+  // Returns the index of a type equal to the one for encoder_key, adding it if necessary.
+  private int addType(String encoder_key) {
+    FeatureType ft = new FeatureType(encoder_key);
+    int index = types.indexOf(ft);
+    if (index < 0) {
+      types.add(ft);
+      return types.size() - 1;
+    }
+    return index;
+  }
+
+  // Adds a fresh type whose encoder is still to be inferred and returns its index.
+  private int addType() {
+    types.add(new FeatureType());
+    return types.size() - 1;
+  }
+
+  /**
+   * Feeds one observed value to the feature's type. A feature that is not yet known gets
+   * its own new type when the analyzer is open and is ignored otherwise.
+   */
+  public void observe(int feature_id, float value) {
+    Integer type_id = featureToType.get(feature_id);
+    if (type_id == null && open) {
+      type_id = addType();
+      featureToType.put(feature_id, type_id);
+    }
+    if (type_id != null)
+      types.get(type_id).observe(value);
+  }
+
+  // Inspects the collected histograms, inferring actual type of feature. Then replaces the
+  // analyzer, if present, with the most compact applicable type.
+  // The labeled parameter only selects how features are named in the log output; it does
+  // not change the labeled field.
+  public void inferTypes(boolean labeled) {
+    for (FeatureType ft : types) {
+      ft.inferUncompressedType();
+    }
+    if (LOG.isInfoEnabled()) {
+      for (int id : featureToType.keySet()) {
+        LOG.info("Type inferred: {} is {}", (labeled ? Vocabulary.word(id) : "Feature " + id),
+            types.get(featureToType.get(id)).encoder.getKey());
+      }
+    }
+  }
+
+  /** Rebuilds the map from feature id to its rank among all known ids, in ascending order. */
+  public void buildFeatureMap() {
+    int[] known_features = new int[featureToType.keySet().size()];
+    int i = 0;
+    for (int f : featureToType.keySet())
+      known_features[i++] = f;
+    Arrays.sort(known_features);
+
+    featureIdMap.clear();
+    for (i = 0; i < known_features.length; ++i)
+      featureIdMap.put(known_features[i], i);
+  }
+
+  /**
+   * @return the rank assigned by {@link #buildFeatureMap()}
+   * @throws NullPointerException if the feature id is not in the map (null is unboxed)
+   */
+  public int getRank(int feature_id) {
+    return featureIdMap.get(feature_id);
+  }
+
+  /** @return the smallest int encoder whose range covers the number of mapped features */
+  public IntEncoder getIdEncoder() {
+    int num_features = featureIdMap.size();
+    if (num_features <= Byte.MAX_VALUE)
+      return PrimitiveIntEncoder.BYTE;
+    else if (num_features <= Character.MAX_VALUE)
+      return PrimitiveIntEncoder.CHAR;
+    else
+      return PrimitiveIntEncoder.INT;
+  }
+
+  /**
+   * Writes the configuration: id encoder state, labeled flag, the state of every type's
+   * encoder, then for each feature its name or id, its rank and its type index. Every
+   * type must have an encoder by now, i.e. {@link #inferTypes} must have run for types
+   * created by {@link #observe}.
+   *
+   * @param file_name path of the file to create
+   * @throws IOException if the file cannot be written
+   */
+  public void write(String file_name) throws IOException {
+    File out_file = new File(file_name);
+    BufferedOutputStream buf_stream = new BufferedOutputStream(new FileOutputStream(out_file));
+    DataOutputStream out_stream = new DataOutputStream(buf_stream);
+
+    // try/finally so the file handle is released even when writing fails part-way.
+    try {
+      buildFeatureMap();
+
+      getIdEncoder().writeState(out_stream);
+      out_stream.writeBoolean(labeled);
+      out_stream.writeInt(types.size());
+      for (int index = 0; index < types.size(); index++)
+        types.get(index).encoder.writeState(out_stream);
+
+      out_stream.writeInt(featureToType.size());
+      for (int feature_id : featureToType.keySet()) {
+        if (labeled)
+          out_stream.writeUTF(Vocabulary.word(feature_id));
+        else
+          out_stream.writeInt(feature_id);
+        out_stream.writeInt(featureIdMap.get(feature_id));
+        out_stream.writeInt(featureToType.get(feature_id));
+      }
+    } finally {
+      out_stream.close();
+    }
+  }
+
+  /**
+   * @return the histograms of all features whose type still has an analyzer; types with a
+   *         fixed or already inferred encoder have none and are left out
+   */
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    for (int feature_id : featureToType.keySet()) {
+      Analyzer analyzer = types.get(featureToType.get(feature_id)).analyzer;
+      // The analyzer is null for configured types and after type inference.
+      if (analyzer != null)
+        sb.append(analyzer.toString(Vocabulary.word(feature_id)));
+    }
+    return sb.toString();
+  }
+
+  public boolean isLabeled() {
+    return labeled;
+  }
+
+  public void setLabeled(boolean labeled) {
+    this.labeled = labeled;
+  }
+
+  /**
+   * One encoder slot: either a known encoder, or an analyzer that is still collecting
+   * values from which the encoder will be inferred.
+   */
+  class FeatureType {
+    FloatEncoder encoder;
+    Analyzer analyzer;
+    int bits;
+
+    FeatureType() {
+      encoder = null;
+      analyzer = new Analyzer();
+      bits = -1;
+    }
+
+    FeatureType(String key) {
+      // either throws or returns non-null
+      FloatEncoder e = EncoderFactory.getFloatEncoder(key);
+      encoder = e;
+      analyzer = null;
+      bits = -1;
+    }
+
+    // Fixes the encoder from the collected values and drops the analyzer; no-op when an
+    // encoder is already set.
+    void inferUncompressedType() {
+      if (encoder != null)
+        return;
+      encoder = analyzer.inferUncompressedType();
+      analyzer = null;
+    }
+
+    // Same as inferUncompressedType(), but passes the bits field to Analyzer.inferType.
+    void inferType() {
+      if (encoder != null)
+        return;
+      encoder = analyzer.inferType(bits);
+      analyzer = null;
+    }
+
+    void observe(float value) {
+      if (analyzer != null)
+        analyzer.add(value);
+    }
+
+    // Types are equal when their encoders are equal or, with no encoder on either side,
+    // when their analyzers are equal.
+    @Override
+    public boolean equals(Object t) {
+      if (t instanceof FeatureType) {
+        FeatureType that = (FeatureType) t;
+        if (this.encoder != null) {
+          return this.encoder.equals(that.encoder);
+        } else {
+          if (that.encoder != null)
+            return false;
+          if (this.analyzer != null)
+            return this.analyzer.equals(that.analyzer);
+        }
+      }
+      return false;
+    }
+
+    // Hashes the same member that equals() compares, so equal types hash alike.
+    @Override
+    public int hashCode() {
+      if (encoder != null)
+        return encoder.hashCode();
+      return analyzer != null ? analyzer.hashCode() : 0;
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/encoding/FloatEncoder.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/encoding/FloatEncoder.java b/joshua-core/src/main/java/org/apache/joshua/util/encoding/FloatEncoder.java
new file mode 100644
index 0000000..5121ea2
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/encoding/FloatEncoder.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.encoding;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+/** Converts float feature values to and from their byte representation in a buffer. */
+public interface FloatEncoder {
+
+  /**
+   * Decodes one value. The implementations in this package read at
+   * {@code position + EncoderConfiguration.ID_SIZE}.
+   */
+  float read(ByteBuffer stream, int position);
+
+  /** Encodes {@code value} into {@code stream}. */
+  void write(ByteBuffer stream, float value);
+
+  /** @return the key identifying this encoder in a stored configuration */
+  String getKey();
+
+  /** Serializes this encoder to {@code out}. */
+  void writeState(DataOutputStream out) throws IOException;
+
+  /** Restores this encoder's state from {@code in}. */
+  void readState(DataInputStream in) throws IOException;
+
+  /** @return the size of one encoded value (presumably in bytes; confirm with implementations) */
+  int size();
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/encoding/IntEncoder.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/encoding/IntEncoder.java b/joshua-core/src/main/java/org/apache/joshua/util/encoding/IntEncoder.java
new file mode 100644
index 0000000..a8917f7
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/encoding/IntEncoder.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.encoding;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+/** Converts int values to and from their byte representation in a buffer. */
+public interface IntEncoder {
+
+  /** Decodes one value from {@code stream} for the given {@code position}. */
+  int read(ByteBuffer stream, int position);
+
+  /** Encodes {@code value} into {@code stream}. */
+  void write(ByteBuffer stream, int value);
+
+  /** @return the key identifying this encoder in a stored configuration */
+  String getKey();
+
+  /** Serializes this encoder to {@code out}. */
+  void writeState(DataOutputStream out) throws IOException;
+
+  /** Restores this encoder's state from {@code in}. */
+  void readState(DataInputStream in) throws IOException;
+
+  /** @return the size of one encoded value (presumably in bytes; confirm with implementations) */
+  int size();
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/encoding/PrimitiveFloatEncoder.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/encoding/PrimitiveFloatEncoder.java b/joshua-core/src/main/java/org/apache/joshua/util/encoding/PrimitiveFloatEncoder.java
new file mode 100644
index 0000000..d5015f2
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/encoding/PrimitiveFloatEncoder.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.encoding;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+/**
+ * {@link FloatEncoder} implementations that store a float as one of the Java primitive types.
+ * <p>
+ * Every constant reads with an absolute get at
+ * <code>position + EncoderConfiguration.ID_SIZE</code> and writes with a relative put at the
+ * buffer's current position. Writing narrows the float with a plain cast to the target type, so
+ * fractional parts (and values outside the target range) are not preserved.
+ */
+public enum PrimitiveFloatEncoder implements FloatEncoder {
+
+  /** One signed byte per value. */
+  BYTE("byte", 1) {
+    @Override
+    public final float read(ByteBuffer stream, int position) {
+      return stream.get(position + EncoderConfiguration.ID_SIZE);
+    }
+
+    @Override
+    public final void write(ByteBuffer stream, float value) {
+      stream.put((byte) value);
+    }
+  },
+
+  /** Occupies no bytes: nothing is written and every read yields <code>1.0f</code>. */
+  BOOLEAN("boolean", 0) {
+    @Override
+    public final float read(ByteBuffer stream, int position) {
+      return 1.0f;
+    }
+
+    @Override
+    public final void write(ByteBuffer stream, float value) {
+      // Intentionally empty: the encoding has size 0.
+    }
+  },
+
+  /** One unsigned 16-bit char per value. */
+  CHAR("char", 2) {
+    @Override
+    public final float read(ByteBuffer stream, int position) {
+      return stream.getChar(position + EncoderConfiguration.ID_SIZE);
+    }
+
+    @Override
+    public final void write(ByteBuffer stream, float value) {
+      stream.putChar((char) value);
+    }
+  },
+
+  /** The float itself, stored in four bytes. */
+  FLOAT("float", 4) {
+    @Override
+    public final float read(ByteBuffer stream, int position) {
+      return stream.getFloat(position + EncoderConfiguration.ID_SIZE);
+    }
+
+    @Override
+    public final void write(ByteBuffer stream, float value) {
+      stream.putFloat(value);
+    }
+  },
+
+  /** One signed 32-bit int per value. */
+  INT("int", 4) {
+    @Override
+    public final float read(ByteBuffer stream, int position) {
+      return stream.getInt(position + EncoderConfiguration.ID_SIZE);
+    }
+
+    @Override
+    public final void write(ByteBuffer stream, float value) {
+      stream.putInt((int) value);
+    }
+  },
+
+  /** One signed 16-bit short per value. */
+  SHORT("short", 2) {
+    @Override
+    public final float read(ByteBuffer stream, int position) {
+      return stream.getShort(position + EncoderConfiguration.ID_SIZE);
+    }
+
+    @Override
+    public final void write(ByteBuffer stream, float value) {
+      stream.putShort((short) value);
+    }
+  };
+
+  /** Lower-case identifier returned by {@link #getKey()} and written by writeState. */
+  private final String key;
+
+  /** Number of bytes one encoded value occupies. */
+  private final int size;
+
+  private PrimitiveFloatEncoder(String k, int s) {
+    key = k;
+    size = s;
+  }
+
+  @Override
+  public String getKey() {
+    return key;
+  }
+
+  @Override
+  public int size() {
+    return size;
+  }
+
+  /**
+   * Looks up an encoder by constant name (e.g. <code>"BYTE"</code>) or by key (e.g.
+   * <code>"byte"</code>), ignoring case.
+   * <p>
+   * Accepting the key as well as the constant name means the string emitted by
+   * {@link #writeState(DataOutputStream)} resolves back to the encoder that wrote it.
+   *
+   * @param k constant name or key of the wanted encoder; may be <code>null</code>
+   * @return the matching encoder, or <code>null</code> if <code>k</code> is <code>null</code> or
+   *         matches no encoder
+   */
+  public static PrimitiveFloatEncoder get(String k) {
+    if (k == null) {
+      return null;
+    }
+    for (PrimitiveFloatEncoder encoder : values()) {
+      if (encoder.name().equalsIgnoreCase(k) || encoder.getKey().equalsIgnoreCase(k)) {
+        return encoder;
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Does nothing: this method reads no state from <code>in</code>.
+   *
+   * @param in ignored
+   * @throws IOException never thrown by this implementation
+   */
+  @Override
+  public void readState(DataInputStream in) throws IOException {
+  }
+
+  /**
+   * Writes this encoder's key to <code>out</code> using <code>writeUTF</code>.
+   *
+   * @param out destination for the key
+   * @throws IOException if writing to <code>out</code> fails
+   */
+  @Override
+  public void writeState(DataOutputStream out) throws IOException {
+    out.writeUTF(getKey());
+  }
+
+  @Override
+  public abstract float read(ByteBuffer stream, int position);
+
+  @Override
+  public abstract void write(ByteBuffer stream, float value);
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/encoding/PrimitiveIntEncoder.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/encoding/PrimitiveIntEncoder.java b/joshua-core/src/main/java/org/apache/joshua/util/encoding/PrimitiveIntEncoder.java
new file mode 100644
index 0000000..42f6053
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/encoding/PrimitiveIntEncoder.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.encoding;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+/**
+ * {@link IntEncoder} implementations that store an int as one of the Java primitive types.
+ * <p>
+ * Every constant reads with an absolute get at <code>position</code> and writes with a relative
+ * put at the buffer's current position. Writing narrows the int with a plain cast to the target
+ * type, so values outside the target range are not preserved.
+ */
+public enum PrimitiveIntEncoder implements IntEncoder {
+
+  // TODO: the inconsistency with FloatEncoders is dangerous. PrimitiveFloatEncoder reads at
+  // position + EncoderConfiguration.ID_SIZE, whereas these encoders read at position itself.
+
+  /** One signed byte per value. */
+  BYTE("byte", 1) {
+    @Override
+    public final int read(ByteBuffer stream, int position) {
+      return stream.get(position);
+    }
+
+    @Override
+    public final void write(ByteBuffer stream, int value) {
+      stream.put((byte) value);
+    }
+  },
+
+  /** One unsigned 16-bit char per value. */
+  CHAR("char", 2) {
+    @Override
+    public final int read(ByteBuffer stream, int position) {
+      return stream.getChar(position);
+    }
+
+    @Override
+    public final void write(ByteBuffer stream, int value) {
+      stream.putChar((char) value);
+    }
+  },
+
+  /** The int itself, stored in four bytes. */
+  INT("int", 4) {
+    @Override
+    public final int read(ByteBuffer stream, int position) {
+      return stream.getInt(position);
+    }
+
+    @Override
+    public final void write(ByteBuffer stream, int value) {
+      stream.putInt(value);
+    }
+  },
+
+  /** One signed 16-bit short per value. */
+  SHORT("short", 2) {
+    @Override
+    public final int read(ByteBuffer stream, int position) {
+      return stream.getShort(position);
+    }
+
+    @Override
+    public final void write(ByteBuffer stream, int value) {
+      stream.putShort((short) value);
+    }
+  };
+
+  /** Lower-case identifier returned by {@link #getKey()} and written by writeState. */
+  private final String key;
+
+  /** Number of bytes one encoded value occupies. */
+  private final int size;
+
+  private PrimitiveIntEncoder(String k, int s) {
+    key = k;
+    size = s;
+  }
+
+  @Override
+  public String getKey() {
+    return key;
+  }
+
+  @Override
+  public int size() {
+    return size;
+  }
+
+  /**
+   * Looks up an encoder by constant name (e.g. <code>"BYTE"</code>) or by key (e.g.
+   * <code>"byte"</code>), ignoring case.
+   * <p>
+   * Accepting the key as well as the constant name means the string emitted by
+   * {@link #writeState(DataOutputStream)} resolves back to the encoder that wrote it.
+   *
+   * @param k constant name or key of the wanted encoder; may be <code>null</code>
+   * @return the matching encoder, or <code>null</code> if <code>k</code> is <code>null</code> or
+   *         matches no encoder
+   */
+  public static PrimitiveIntEncoder get(String k) {
+    if (k == null) {
+      return null;
+    }
+    for (PrimitiveIntEncoder encoder : values()) {
+      if (encoder.name().equalsIgnoreCase(k) || encoder.getKey().equalsIgnoreCase(k)) {
+        return encoder;
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Does nothing: this method reads no state from <code>in</code>.
+   *
+   * @param in ignored
+   * @throws IOException never thrown by this implementation
+   */
+  @Override
+  public void readState(DataInputStream in) throws IOException {
+  }
+
+  /**
+   * Writes this encoder's key to <code>out</code> using <code>writeUTF</code>.
+   *
+   * @param out destination for the key
+   * @throws IOException if writing to <code>out</code> fails
+   */
+  @Override
+  public void writeState(DataOutputStream out) throws IOException {
+    out.writeUTF(getKey());
+  }
+
+  @Override
+  public abstract int read(ByteBuffer stream, int position);
+
+  @Override
+  public abstract void write(ByteBuffer stream, int value);
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/encoding/VariableQuantizer.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/encoding/VariableQuantizer.java b/joshua-core/src/main/java/org/apache/joshua/util/encoding/VariableQuantizer.java
new file mode 100644
index 0000000..afa3f69
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/encoding/VariableQuantizer.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.encoding;
+
+/**
+ * Sequential reader of bit fields (1 to 32 bits wide) from a byte array. Bytes are consumed in
+ * array order and, within a byte, bits are consumed from most-significant to least-significant.
+ */
+public class VariableQuantizer {
+
+  private final byte[] bytes;
+  private int byteOffset;
+  private int bitOffset;
+
+  /**
+   * @param bytes bytes from which this will read bits. Bits will be read from the first byte first.
+   *          Bits are read within a byte from most-significant to least-significant bit.
+   */
+  public VariableQuantizer(byte[] bytes) {
+    this.bytes = bytes;
+  }
+
+  /**
+   * @return position (0-7, counted from the most-significant bit) inside the current byte of the
+   *         bit the next {@link #readBits(int)} call starts with
+   */
+  public int getBitOffset() {
+    return bitOffset;
+  }
+
+  /**
+   * @return index into the input array of the byte the next {@link #readBits(int)} call starts
+   *         with
+   */
+  public int getByteOffset() {
+    return byteOffset;
+  }
+
+  /**
+   * Reads the next <code>numBits</code> bits and advances the read position past them.
+   *
+   * @param numBits number of bits to read
+   * @return the bits read, right-aligned in the returned int (first bit read is the most
+   *         significant of the group)
+   * @throws IllegalArgumentException if numBits isn't in [1,32] or more than is available
+   */
+  public int readBits(int numBits) {
+    boolean validCount = numBits >= 1 && numBits <= 32 && numBits <= available();
+    if (!validCount) {
+      throw new IllegalArgumentException(Integer.toString(numBits));
+    }
+
+    int remaining = numBits;
+    int value = 0;
+
+    // Step 1: finish off a byte that an earlier call consumed only partially.
+    if (bitOffset != 0) {
+      int unreadInByte = 8 - bitOffset;
+      int take = Math.min(remaining, unreadInByte);
+      int trailing = unreadInByte - take;
+      // Drop the bits that stay unread, then keep only the 'take' bits wanted.
+      value = ((bytes[byteOffset] & 0xFF) >>> trailing) & ((1 << take) - 1);
+      remaining -= take;
+      bitOffset += take;
+      if (bitOffset == 8) {
+        bitOffset = 0;
+        byteOffset++;
+      }
+    }
+
+    // Step 2: consume whole bytes while at least eight bits are still wanted.
+    for (; remaining >= 8; remaining -= 8) {
+      value = (value << 8) | (bytes[byteOffset] & 0xFF);
+      byteOffset++;
+    }
+
+    // Step 3: take the leading bits of the next byte for whatever is left over.
+    if (remaining > 0) {
+      int trailing = 8 - remaining;
+      value = (value << remaining) | ((bytes[byteOffset] & 0xFF) >>> trailing);
+      bitOffset += remaining;
+    }
+
+    return value;
+  }
+
+  /**
+   * @return number of bits that can be read successfully
+   */
+  public int available() {
+    int unreadBytes = bytes.length - byteOffset;
+    return unreadBytes * 8 - bitOffset;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/io/BinaryIn.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/io/BinaryIn.java b/joshua-core/src/main/java/org/apache/joshua/util/io/BinaryIn.java
new file mode 100644
index 0000000..9483e3e
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/BinaryIn.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.io;
+
+import java.io.DataInput;
+import java.io.Externalizable;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.ObjectInput;
+import java.io.ObjectStreamConstants;
+import java.io.RandomAccessFile;
+
+/**
+ * A read-only {@link RandomAccessFile} that can additionally materialize {@link Externalizable}
+ * objects of a single, fixed type from the file.
+ *
+ * @param <E> the type of object produced by {@link #readObject()}
+ */
+public class BinaryIn<E extends Externalizable> extends RandomAccessFile implements DataInput, ObjectInput {
+
+  /** Class instantiated for every non-null object read. */
+  private final Class<E> type;
+
+  /**
+   * Opens <code>filename</code> for reading.
+   *
+   * @param filename path of the file to read
+   * @param type class to instantiate in {@link #readObject()}
+   * @throws FileNotFoundException if the file cannot be opened for reading
+   */
+  public BinaryIn(String filename, Class<E> type) throws FileNotFoundException {
+    super(filename, "r");
+    this.type = type;
+  }
+
+  /**
+   * Reports how many bytes lie between the current file pointer and the end of the file.
+   *
+   * @return the number of remaining bytes, capped at <code>Integer.MAX_VALUE</code>
+   * @throws IOException if the file pointer or length cannot be determined
+   */
+  @Override
+  public int available() throws IOException {
+    long bytesAvailable = length() - getFilePointer();
+    if (bytesAvailable > Integer.MAX_VALUE) {
+      return Integer.MAX_VALUE;
+    }
+    return (int) bytesAvailable;
+  }
+
+  /**
+   * Reads the next object from the file.
+   * <p>
+   * The next byte is inspected without being consumed. If it equals
+   * {@link ObjectStreamConstants#TC_NULL}, <code>null</code> is returned and the file pointer is
+   * left where it was (the marker byte is NOT skipped). Otherwise a new instance of the configured
+   * type is created and asked to populate itself through <code>readExternal</code>.
+   *
+   * @return the object read, or <code>null</code> if the next byte is the null marker
+   * @throws ClassNotFoundException if <code>readExternal</code> reports a missing class
+   * @throws IOException if reading from the file fails
+   * @throws RuntimeException if the configured type cannot be instantiated
+   */
+  @Override
+  public E readObject() throws ClassNotFoundException, IOException {
+
+    if (peek() == ObjectStreamConstants.TC_NULL) {
+      return null;
+    }
+
+    try {
+      E obj = type.newInstance();
+      obj.readExternal(this);
+      return obj;
+    } catch (InstantiationException e) {
+      throw new RuntimeException(e);
+    } catch (IllegalAccessException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Skips over up to <code>n</code> bytes, in chunks of at most <code>Integer.MAX_VALUE</code>.
+   *
+   * @param n the number of bytes to skip; values that are zero or negative skip nothing
+   * @return the total number of bytes actually skipped, summed over all chunks
+   * @throws IOException if skipping fails
+   */
+  @Override
+  public long skip(long n) throws IOException {
+
+    long bytesSkipped = 0;
+    long remaining = n;
+
+    while (remaining > 0) {
+      int request = (int) Math.min(remaining, (long) Integer.MAX_VALUE);
+      int skipped = skipBytes(request);
+
+      // Accumulate (never overwrite) so that earlier chunks are included in the result.
+      bytesSkipped += skipped;
+
+      if (skipped < request) {
+        // Short skip: nothing more can be skipped right now, so stop instead of retrying.
+        break;
+      }
+      remaining -= request;
+    }
+
+    return bytesSkipped;
+  }
+
+  /**
+   * Returns the next byte without moving the file pointer.
+   *
+   * @return the next byte as returned by <code>read()</code> (-1 at end of file)
+   * @throws IOException if reading or seeking fails
+   */
+  private int peek() throws IOException {
+    long pos = getFilePointer();
+    int b = read();
+    seek(pos);
+    return b;
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/io/BinaryOut.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/io/BinaryOut.java b/joshua-core/src/main/java/org/apache/joshua/util/io/BinaryOut.java
new file mode 100644
index 0000000..8383053
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/BinaryOut.java
@@ -0,0 +1,505 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.io;
+
+import java.io.Closeable;
+import java.io.DataOutput;
+import java.io.Externalizable;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.Flushable;
+import java.io.IOException;
+import java.io.ObjectOutput;
+import java.io.ObjectOutputStream;
+import java.io.ObjectStreamConstants;
+import java.io.OutputStream;
+import java.io.UTFDataFormatException;
+
+/**
+ * A BinaryOut writes data to an output stream in raw binary form. Each primitive is converted to
+ * its big-endian byte representation.
+ * <p>
+ * Unlike ObjectOutputStream, no extra Java meta-data is written to the stream.
+ * <p>
+ * Primitive and string writes are staged in an internal buffer that is handed to the underlying
+ * stream when it fills up, on {@link #flush()} and on {@link #close()}. The raw
+ * <code>write</code> methods first flush that buffer and then write directly to the stream, so
+ * the order of the output is preserved.
+ *
+ * @author Lane Schwartz
+ * @see ObjectOutputStream
+ * @see Externalizable
+ */
+public class BinaryOut implements DataOutput, ObjectOutput, Flushable, Closeable {
+
+  public static final int BITS_PER_BYTE = 8;
+
+  public static final int BOOLEAN_SIZE = 1;
+  public static final int BYTE_SIZE = 1;
+  public static final int CHAR_SIZE = 2;
+  public static final int SHORT_SIZE = 2;
+  public static final int FLOAT_SIZE = 4;
+  public static final int INT_SIZE = 4;
+  public static final int DOUBLE_SIZE = 8;
+  public static final int LONG_SIZE = 8;
+
+  /** Capacity of the staging buffer, in bytes. */
+  private static final int BUFFER_SIZE = 1024;
+
+  /** Stream that ultimately receives every byte. */
+  private final OutputStream out;
+
+  /** Staging buffer; bytes [0, bufferPosition) are pending. */
+  private final byte[] buffer;
+
+  /** When false, {@link #writeObject(Object)} is a no-op. */
+  private final boolean writeObjects;
+
+  /** Number of pending bytes in {@link #buffer}. */
+  private int bufferPosition;
+
+  /**
+   * Creates a writer for <code>file</code> with object writing enabled.
+   *
+   * @param file the file to write to
+   * @throws FileNotFoundException if the file cannot be opened for writing
+   * @throws IOException declared for interface compatibility
+   */
+  public BinaryOut(File file) throws FileNotFoundException, IOException {
+    this(new FileOutputStream(file), true);
+  }
+
+  /**
+   * Creates a writer for the named file with object writing enabled.
+   *
+   * @param filename path of the file to write to
+   * @throws FileNotFoundException if the file cannot be opened for writing
+   * @throws IOException declared for interface compatibility
+   */
+  public BinaryOut(String filename) throws FileNotFoundException, IOException {
+    this(new File(filename));
+  }
+
+  /**
+   * Creates a writer on top of an existing stream.
+   *
+   * @param out stream that receives the bytes
+   * @param writeObjects whether {@link #writeObject(Object)} should write anything at all
+   * @throws IOException declared for interface compatibility
+   */
+  public BinaryOut(OutputStream out, boolean writeObjects) throws IOException {
+    this.out = out;
+    this.buffer = new byte[BUFFER_SIZE];
+    this.bufferPosition = 0;
+    this.writeObjects = writeObjects;
+  }
+
+  /**
+   * Flushes pending bytes and closes the underlying stream.
+   *
+   * @throws IOException if flushing or closing fails
+   */
+  @Override
+  public void close() throws IOException {
+    flush();
+    out.close();
+  }
+
+  /**
+   * Ensures that the buffer has at least enough space available to hold <code>size</code>
+   * additional bytes.
+   * <p>
+   * If necessary, the current contents of the buffer will be written to the underlying output
+   * stream.
+   *
+   * @param size the number of bytes about to be placed in the buffer
+   * @throws IOException if writing the buffer to the underlying stream fails
+   */
+  protected void prepareBuffer(int size) throws IOException {
+    if (bufferPosition + size > BUFFER_SIZE) {
+      writeBuffer();
+    }
+  }
+
+  /**
+   * Hands all pending bytes to the underlying stream and empties the buffer.
+   *
+   * @throws IOException if the underlying stream rejects the write
+   */
+  protected void writeBuffer() throws IOException {
+    if (bufferPosition > 0) {
+      out.write(buffer, 0, bufferPosition);
+      bufferPosition = 0;
+    }
+  }
+
+  /**
+   * Writes pending bytes to the underlying stream and flushes it.
+   *
+   * @throws IOException if writing or flushing fails
+   */
+  @Override
+  public void flush() throws IOException {
+    writeBuffer();
+    out.flush();
+  }
+
+  /**
+   * Writes one byte directly to the underlying stream, after any pending buffered bytes.
+   *
+   * @param b the byte to write (low-order eight bits)
+   * @throws IOException if writing fails
+   */
+  @Override
+  public void write(int b) throws IOException {
+    writeBuffer();
+    out.write(b);
+  }
+
+  /**
+   * Writes an array directly to the underlying stream, after any pending buffered bytes.
+   *
+   * @param b the bytes to write
+   * @throws IOException if writing fails
+   */
+  @Override
+  public void write(byte[] b) throws IOException {
+    writeBuffer();
+    out.write(b);
+  }
+
+  /**
+   * Writes part of an array directly to the underlying stream, after any pending buffered bytes.
+   *
+   * @param b the source array
+   * @param off index of the first byte to write
+   * @param len number of bytes to write
+   * @throws IOException if writing fails
+   */
+  @Override
+  public void write(byte[] b, int off, int len) throws IOException {
+    writeBuffer();
+    out.write(b, off, len);
+  }
+
+  /**
+   * Writes an object, provided object writing was enabled at construction time.
+   * <ul>
+   * <li><code>null</code> is written as the single byte {@link ObjectStreamConstants#TC_NULL}.</li>
+   * <li>A String is written in modified UTF-8, preceded by a two-byte length, or by an eight-byte
+   * length when the encoding needs more than <code>Short.MAX_VALUE</code> bytes.</li>
+   * <li>An {@link Externalizable} writes itself through <code>writeExternal</code>.</li>
+   * </ul>
+   *
+   * @param obj the object to write
+   * @throws IOException if writing fails
+   * @throws RuntimeException if <code>obj</code> is none of the supported kinds
+   */
+  @Override
+  public void writeObject(Object obj) throws IOException {
+
+    if (!writeObjects) {
+      return;
+    }
+
+    if (obj == null) {
+
+      write(ObjectStreamConstants.TC_NULL);
+
+    } else if (obj instanceof String) {
+
+      String s = (String) obj;
+      long bytesRequired = utfBytesRequired(s);
+      boolean forceLongHeader = (bytesRequired > Short.MAX_VALUE);
+
+      writeUTF(s, bytesRequired, forceLongHeader);
+
+    } else if (obj instanceof Externalizable) {
+
+      ((Externalizable) obj).writeExternal(this);
+
+    } else {
+
+      throw new RuntimeException("Object is not Externalizable: " + obj.toString());
+
+    }
+  }
+
+  /**
+   * Writes a boolean as one byte: 1 for true, 0 for false.
+   *
+   * @param v the value to write
+   * @throws IOException if the buffer has to be written out and that fails
+   */
+  @Override
+  public void writeBoolean(boolean v) throws IOException {
+    prepareBuffer(BOOLEAN_SIZE);
+    buffer[bufferPosition++] = (byte) (v ? 0x01 : 0x00);
+  }
+
+  /**
+   * Writes the low-order byte of <code>v</code>.
+   *
+   * @param v the value to write
+   * @throws IOException if the buffer has to be written out and that fails
+   */
+  @Override
+  public void writeByte(int v) throws IOException {
+    prepareBuffer(BYTE_SIZE);
+    buffer[bufferPosition++] = (byte) v;
+  }
+
+  /**
+   * Writes the low-order byte of every character of <code>s</code>, in order.
+   *
+   * @param s the string to write
+   * @throws IOException if the buffer has to be written out and that fails
+   */
+  @Override
+  public void writeBytes(String s) throws IOException {
+    for (int i = 0, length = s.length(); i < length; i++) {
+      prepareBuffer(BYTE_SIZE);
+      buffer[bufferPosition++] = (byte) s.charAt(i);
+    }
+  }
+
+  /**
+   * Writes the low-order two bytes of <code>v</code>, high byte first.
+   *
+   * @param v the value to write
+   * @throws IOException if the buffer has to be written out and that fails
+   */
+  @Override
+  public void writeChar(int v) throws IOException {
+    putBigEndian(v, CHAR_SIZE);
+  }
+
+  /**
+   * Writes every character of <code>s</code> as two bytes, high byte first.
+   *
+   * @param s the string to write
+   * @throws IOException if the buffer has to be written out and that fails
+   */
+  @Override
+  public void writeChars(String s) throws IOException {
+    for (int i = 0, length = s.length(); i < length; i++) {
+      putBigEndian(s.charAt(i), CHAR_SIZE);
+    }
+  }
+
+  /**
+   * Writes the eight bytes of <code>Double.doubleToLongBits(v)</code>, high byte first.
+   *
+   * @param v the value to write
+   * @throws IOException if the buffer has to be written out and that fails
+   */
+  @Override
+  public void writeDouble(double v) throws IOException {
+    putBigEndian(Double.doubleToLongBits(v), DOUBLE_SIZE);
+  }
+
+  /**
+   * Writes the four bytes of <code>Float.floatToIntBits(v)</code>, high byte first.
+   *
+   * @param v the value to write
+   * @throws IOException if the buffer has to be written out and that fails
+   */
+  @Override
+  public void writeFloat(float v) throws IOException {
+    putBigEndian(Float.floatToIntBits(v), FLOAT_SIZE);
+  }
+
+  /**
+   * Writes the four bytes of <code>v</code>, high byte first.
+   *
+   * @param v the value to write
+   * @throws IOException if the buffer has to be written out and that fails
+   */
+  @Override
+  public void writeInt(int v) throws IOException {
+    putBigEndian(v, INT_SIZE);
+  }
+
+  /**
+   * Writes the eight bytes of <code>v</code>, high byte first.
+   *
+   * @param v the value to write
+   * @throws IOException if the buffer has to be written out and that fails
+   */
+  @Override
+  public void writeLong(long v) throws IOException {
+    putBigEndian(v, LONG_SIZE);
+  }
+
+  /**
+   * Writes the low-order two bytes of <code>v</code>, high byte first.
+   *
+   * @param v the value to write
+   * @throws IOException if the buffer has to be written out and that fails
+   */
+  @Override
+  public void writeShort(int v) throws IOException {
+    putBigEndian(v, SHORT_SIZE);
+  }
+
+  /**
+   * Writes a string in modified UTF-8, preceded by the encoded length as a two-byte value.
+   *
+   * @param str the string to write
+   * @throws UTFDataFormatException if the encoding needs more than <code>Short.MAX_VALUE</code>
+   *           bytes
+   * @throws IOException if the buffer has to be written out and that fails
+   */
+  @Override
+  public void writeUTF(String str) throws IOException {
+    writeUTF(str, utfBytesRequired(str), false);
+  }
+
+  /**
+   * Buffers the low-order <code>size</code> bytes of <code>value</code>, most-significant first.
+   *
+   * @param value the bits to write (an int argument is sign-extended, which leaves its low bytes
+   *          unchanged)
+   * @param size number of bytes to write
+   * @throws IOException if the buffer has to be written out and that fails
+   */
+  private void putBigEndian(long value, int size) throws IOException {
+    prepareBuffer(size);
+    for (int shift = (size - 1) * BITS_PER_BYTE; shift >= 0; shift -= BITS_PER_BYTE) {
+      buffer[bufferPosition++] = (byte) (value >>> shift);
+    }
+  }
+
+  /**
+   * Number of bytes one char occupies in modified UTF-8: one for U+0001 to U+007F, two for
+   * U+0000 and U+0080 to U+07FF, three for everything else.
+   */
+  private static int utfSize(char c) {
+    if (c >= 0x0001 && c <= 0x007F) {
+      return 1;
+    } else if (c < 0x0800) {
+      return 2;
+    } else {
+      return 3;
+    }
+  }
+
+  /** Total number of bytes needed to encode <code>str</code> in modified UTF-8. */
+  private static long utfBytesRequired(String str) {
+    long bytesRequired = 0;
+    for (int i = 0, length = str.length(); i < length; i++) {
+      bytesRequired += utfSize(str.charAt(i));
+    }
+    return bytesRequired;
+  }
+
+  /**
+   * Writes the length header followed by the modified UTF-8 encoding of <code>str</code>.
+   *
+   * @param str the string to write
+   * @param bytesRequired encoded size of <code>str</code>, as computed by utfBytesRequired
+   * @param forceLongHeader if true the length is written as eight bytes, otherwise as two
+   * @throws UTFDataFormatException if a two-byte header is requested but the encoding needs more
+   *           than <code>Short.MAX_VALUE</code> bytes
+   * @throws IOException if the buffer has to be written out and that fails
+   */
+  private void writeUTF(String str, long bytesRequired, boolean forceLongHeader) throws IOException {
+
+    if (forceLongHeader) {
+      putBigEndian(bytesRequired, LONG_SIZE);
+    } else if (bytesRequired > Short.MAX_VALUE) {
+      // Because the size of the string is encoded as a short,
+      // only strings that require no more than Short.MAX_VALUE bytes can be encoded.
+      throw new UTFDataFormatException(
+          "Unable to successfully encode strings that require more than " + Short.MAX_VALUE
+              + " bytes. Encoding the provided string would require " + bytesRequired + " bytes.");
+    } else {
+      putBigEndian(bytesRequired, SHORT_SIZE);
+    }
+
+    for (int i = 0, length = str.length(); i < length; i++) {
+
+      char c = str.charAt(i);
+      int size = utfSize(c);
+
+      // Make room for the whole character so its bytes are never split across a buffer write.
+      prepareBuffer(size);
+
+      switch (size) {
+
+        case 1:
+          buffer[bufferPosition++] = (byte) c;
+          break;
+
+        case 2:
+          buffer[bufferPosition++] = (byte) (0xc0 | (0x1f & (c >> 6)));
+          buffer[bufferPosition++] = (byte) (0x80 | (0x3f & c));
+          break;
+
+        default:
+          buffer[bufferPosition++] = (byte) (0xe0 | (0x0f & (c >> 12)));
+          buffer[bufferPosition++] = (byte) (0x80 | (0x3f & (c >> 6)));
+          buffer[bufferPosition++] = (byte) (0x80 | (0x3f & c));
+          break;
+      }
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/io/IndexedReader.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/io/IndexedReader.java b/joshua-core/src/main/java/org/apache/joshua/util/io/IndexedReader.java
new file mode 100644
index 0000000..f357e55
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/IndexedReader.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.io;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+
+/**
+ * Wraps a reader with "line" index information: it counts the elements handed to the client and
+ * prefixes that count onto the message of any {@link java.io.IOException} raised by the wrapped
+ * reader's <code>ready()</code>, <code>close()</code> or <code>readLine()</code>.
+ *
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $
+ */
+public class IndexedReader<E> implements Reader<E> {
+
+  /** A name for the type of elements the reader produces. */
+  private final String elementName;
+
+  /** The number of elements the reader has delivered so far. */
+  private int lineNumber;
+
+  /** The underlying reader. */
+  private final Reader<E> reader;
+
+  /**
+   * Wrap the given reader, starting the element count at zero.
+   * @param elementName name used for the elements in wrapped error messages
+   * @param reader the reader every operation is delegated to
+   */
+  public IndexedReader(String elementName, Reader<E> reader) {
+    this.elementName = elementName;
+    this.lineNumber = 0;
+    this.reader = reader;
+  }
+
+  /**
+   * Return the number of elements delivered so far.
+   * @return integer representing the number of elements delivered so far
+   */
+  public int index() {
+    return this.lineNumber;
+  }
+
+
+  /**
+   * Wrap an IOException's message with the index when it occurred.
+   * @param oldError the old {@link java.io.IOException} we wish to wrap
+   * @return the new wrapped {@link java.io.IOException}, with <code>oldError</code> as its cause
+   */
+  public IOException wrapIOException(IOException oldError) {
+    String message =
+        "At " + this.elementName + " " + this.lineNumber + ": " + oldError.getMessage();
+    return new IOException(message, oldError);
+  }
+
+  // ===============================================================
+  // Reader
+  // ===============================================================
+
+  /**
+   * Delegated to the underlying reader.
+   * @return true if the reader is ready
+   * @throws IOException if there is an error determining readiness
+   */
+  @Override
+  public boolean ready() throws IOException {
+    try {
+      return this.reader.ready();
+    } catch (IOException oldError) {
+      throw wrapIOException(oldError);
+    }
+  }
+
+
+  /**
+   * Delegated to the underlying reader. Note that we do not have a <code>finalize()</code> method;
+   * however, when we fall out of scope, the underlying reader will too, so its finalizer may be
+   * called. For correctness, be sure to manually close all readers.
+   * @throws IOException the underlying reader's error, wrapped with the current index
+   */
+  @Override
+  public void close() throws IOException {
+    try {
+      this.reader.close();
+    } catch (IOException oldError) {
+      throw wrapIOException(oldError);
+    }
+  }
+
+
+  /**
+   * Delegated to the underlying reader. The element count only advances when an element is
+   * actually returned; a <code>null</code> result (nothing left to read) is passed through
+   * without being counted.
+   * @return the next element, or <code>null</code> if the underlying reader returned none
+   * @throws IOException the underlying reader's error, wrapped with the current index
+   */
+  @Override
+  public E readLine() throws IOException {
+    E line;
+    try {
+      line = this.reader.readLine();
+    } catch (IOException oldError) {
+      throw wrapIOException(oldError);
+    }
+    if (null != line) {
+      // Only a real element counts as "delivered"; null marks the end of the input.
+      ++this.lineNumber;
+    }
+    return line;
+  }
+
+
+  // ===============================================================
+  // Iterable -- because sometimes Java can be very stupid
+  // ===============================================================
+
+  /** Return self as an iterator. */
+  @Override
+  public Iterator<E> iterator() {
+    return this;
+  }
+
+
+  // ===============================================================
+  // Iterator
+  // ===============================================================
+
+  /** Delegated to the underlying reader. */
+  @Override
+  public boolean hasNext() {
+    return this.reader.hasNext();
+  }
+
+
+  /**
+   * Delegated to the underlying reader. Exceptions are let out unwrapped and the element count is
+   * left untouched when one is thrown.
+   */
+  @Override
+  public E next() throws NoSuchElementException {
+    E line = this.reader.next();
+    ++this.lineNumber;
+    return line;
+  }
+
+
+  /**
+   * If the underlying reader supports removal, then so do we. Note that the {@link #index()} method
+   * returns the number of elements delivered to the client, so removing an element from the
+   * underlying collection does not affect that number.
+   */
+  @Override
+  public void remove() throws UnsupportedOperationException {
+    this.reader.remove();
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/io/LineReader.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/io/LineReader.java b/joshua-core/src/main/java/org/apache/joshua/util/io/LineReader.java
new file mode 100644
index 0000000..5122994
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/LineReader.java
@@ -0,0 +1,368 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.io;
+
+import java.io.BufferedReader;
+import java.io.FileDescriptor;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.File;
+import java.nio.charset.Charset;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.joshua.decoder.Decoder;
+
+/**
+ * This class provides an Iterator interface to a BufferedReader. This covers the most common
+ * use-cases for reading from files without ugly code to check whether we got a line or not.
+ * Lines are decoded as UTF-8. One line is read ahead into an internal buffer by
+ * {@link #hasNext()}; an IOException met while reading ahead is stored and thrown later by
+ * {@link #readLine()} or {@link #close()}.
+ *
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @author Matt Post post@cs.jhu.edu
+ */
+public class LineReader implements Reader<String> {
+
+  /*
+   * Note: charset name is case-agnostic "UTF-8" is the canonical name "UTF8", "unicode-1-1-utf-8"
+   * are aliases Java doesn't distinguish utf8 vs UTF-8 like Perl does
+   */
+  private static final Charset FILE_ENCODING = Charset.forName("UTF-8");
+
+  /*
+   * The reader and its underlying input stream. We need to keep a hold of the underlying
+   * input stream so that we can query how many raw bytes it's read (for a generic progress
+   * meter that works across GZIP'ed and plain text files).
+   */
+  private BufferedReader reader;
+  private ProgressInputStream rawStream;
+
+  /** The line read ahead by hasNext() and not yet handed out; null when there is none. */
+  private String buffer;
+
+  /** An error met while reading ahead, kept until readLine() or close() can throw it. */
+  private IOException error;
+
+  /** Number of lines handed out so far. */
+  private int lineno = 0;
+
+  /** Whether next() draws the progress meter on System.err. */
+  private boolean display_progress = false;
+
+  /** Last percentage drawn by the progress meter. */
+  private int progress = 0;
+
+  // ===============================================================
+  // Constructors and destructors
+  // ===============================================================
+
+  /**
+   * Opens a file for iterating line by line. The special "-" filename can be used to specify
+   * STDIN. GZIP'd files are tested for automatically.
+   *
+   * @param filename the file to be opened ("-" for STDIN)
+   * @throws IOException if there is an error reading the input file
+   */
+  public LineReader(String filename) throws IOException {
+
+    display_progress = (Decoder.VERBOSE >= 1);
+    progress = 0;
+
+    InputStream stream;
+    if (filename.equals("-")) {
+      rawStream = null;
+      stream = new FileInputStream(FileDescriptor.in);
+    } else {
+      long totalBytes = new File(filename).length();
+      rawStream = new ProgressInputStream(new FileInputStream(filename), totalBytes);
+
+      try {
+        stream = new GZIPInputStream(rawStream);
+      } catch (IOException notGzip) {
+        // Probing the GZIP header consumed bytes, so reopen the file and read it as plain text.
+        rawStream.close();
+        stream = rawStream = new ProgressInputStream(new FileInputStream(filename), totalBytes);
+      }
+    }
+
+    this.reader = new BufferedReader(new InputStreamReader(stream, FILE_ENCODING));
+  }
+
+  /**
+   * Opens a file like {@link #LineReader(String)}, additionally letting the caller switch the
+   * progress meter off.
+   *
+   * @param filename the file to be opened ("-" for STDIN)
+   * @param show_progress false to suppress the progress meter regardless of verbosity
+   * @throws IOException if there is an error reading the input file
+   */
+  public LineReader(String filename, boolean show_progress) throws IOException {
+    this(filename);
+    display_progress = (Decoder.VERBOSE >= 1 && show_progress);
+  }
+
+
+  /**
+   * Wraps an InputStream for iterating line by line. Stream encoding is assumed to be UTF-8.
+   * @param in an {@link java.io.InputStream} to wrap and iterate over line by line
+   */
+  public LineReader(InputStream in) {
+    this.reader = new BufferedReader(new InputStreamReader(in, FILE_ENCODING));
+    display_progress = false;
+  }
+
+  /**
+   * Chain to the underlying {@link ProgressInputStream}.
+   *
+   * @return an integer from 0..100, indicating how much of the file has been read; always 0 when
+   *         there is no underlying {@link ProgressInputStream} (STDIN or a wrapped stream)
+   */
+  public int progress() {
+    return rawStream == null ? 0 : rawStream.progress();
+  }
+
+  /**
+   * This method will close the file handle, and will raise any exceptions that occurred during
+   * iteration. The method is idempotent, and all calls after the first are no-ops (unless the
+   * thread was interrupted or killed). For correctness, you <b>must</b> call this method before the
+   * object falls out of scope.
+   * @throws IOException if there is an error closing the file handler
+   */
+  @Override
+  public void close() throws IOException {
+
+    this.buffer = null; // Just in case it's a large string
+
+    if (null != this.reader) {
+      try {
+        // We assume the wrappers will percolate this down.
+        this.reader.close();
+
+      } catch (IOException e) {
+        // We need to trash our cached error for idempotence.
+        // Presumably the closing error is the more important
+        // one to throw.
+        this.error = null;
+        throw e;
+
+      } finally {
+        this.reader = null;
+      }
+    }
+
+    if (null != this.error) {
+      IOException e = this.error;
+      this.error = null;
+      throw e;
+    }
+  }
+
+
+  /**
+   * We attempt to avoid leaking file descriptors if you fail to call close before the object falls
+   * out of scope. However, the language spec makes <b>no guarantees</b> about timeliness of garbage
+   * collection. It is a bug to rely on this method to release the resources. Also, the garbage
+   * collector will discard any exceptions that have queued up, without notifying the application in
+   * any way.
+   *
+   * Having a finalizer means the JVM can't do "fast allocation" of LineReader objects (or
+   * subclasses). This isn't too important due to disk latency, but may be worth noting.
+   *
+   * @see <a
+   *      href="http://java2go.blogspot.com/2007/09/javaone-2007-performance-tips-2-finish.html">Performance
+   *      Tips</a>
+   * @see <a
+   *      href="http://www.javaworld.com/javaworld/jw-06-1998/jw-06-techniques.html?page=1">Techniques</a>
+   */
+  @Override
+  protected void finalize() throws Throwable {
+    try {
+      this.close();
+    } catch (IOException ignored) {
+      // Do nothing. The GC will discard the exception
+      // anyways, but it may cause us to linger on the heap.
+    } finally {
+      super.finalize();
+    }
+  }
+
+
+
+  // ===============================================================
+  // Reader
+  // ===============================================================
+
+  /**
+   * Determine if the reader is ready to read a line. A line that has already been read ahead
+   * counts as ready; a closed reader is never ready.
+   * @return true if a line is buffered or the underlying reader reports itself ready
+   * @throws IOException if the underlying reader fails while checking
+   */
+  @Override
+  public boolean ready() throws IOException {
+    if (null != this.buffer) {
+      return true;
+    }
+    if (null == this.reader) {
+      return false;
+    }
+    return this.reader.ready();
+  }
+
+
+  /**
+   * This method is like next() except that it throws the IOException directly. If there are no
+   * lines to be read then null is returned.
+   * @return the next line, or null if none is left
+   * @throws IOException the error, if any, that stopped the read-ahead
+   */
+  @Override
+  public String readLine() throws IOException {
+    if (this.hasNext()) {
+      String line = this.buffer;
+      this.buffer = null;
+      // Keep lineno() meaningful for callers that mix readLine() and next().
+      this.lineno++;
+      return line;
+    }
+
+    if (null != this.error) {
+      IOException e = this.error;
+      this.error = null;
+      throw e;
+    }
+    return null;
+  }
+
+
+  // ===============================================================
+  // Iterable -- because sometimes Java can be very stupid
+  // ===============================================================
+
+  /** Return self as an iterator. */
+  @Override
+  public Iterator<String> iterator() {
+    return this;
+  }
+
+
+  // ===============================================================
+  // Iterator
+  // ===============================================================
+
+  /**
+   * Returns <code>true</code> if the iteration has more elements. (In other words, returns
+   * <code>true</code> if <code>next</code> would return an element rather than throwing an
+   * exception.) Returns <code>false</code> once an error is pending or the reader has been closed.
+   */
+  @Override
+  public boolean hasNext() {
+    if (null != this.buffer) {
+      return true;
+    }
+    if (null != this.error) {
+      return false;
+    }
+    if (null == this.reader) {
+      // Already closed (e.g. by countLines()): there is nothing left to read.
+      return false;
+    }
+
+    // We're not allowed to throw IOException from within Iterator
+    try {
+      this.buffer = this.reader.readLine();
+    } catch (IOException e) {
+      this.buffer = null;
+      this.error = e;
+      return false;
+    }
+    return (null != this.buffer);
+  }
+
+
+  /**
+   * Return the next line of the file. If an error is encountered, NoSuchElementException is thrown.
+   * The actual IOException encountered will be thrown later, when the LineReader is closed. Also if
+   * there is no line to be read then NoSuchElementException is thrown.
+   */
+  @Override
+  public String next() throws NoSuchElementException {
+    if (!this.hasNext()) {
+      throw new NoSuchElementException();
+    }
+
+    if (display_progress) {
+      showProgress();
+    }
+
+    String line = this.buffer;
+    this.lineno++;
+    this.buffer = null;
+    return line;
+  }
+
+  /**
+   * Advance the progress meter on System.err from the last drawn percentage up to the current
+   * one. Multiples of ten are printed as numbers, the slot right after each is skipped because
+   * the number took two columns, 97..99 spell out "100", 100 ends the line with "%", and every
+   * other step is a dot.
+   */
+  private void showProgress() {
+    // Never draw past 100, whatever the underlying stream reports.
+    int newProgress = Math.min(100, (reader != null) ? progress() : 100);
+    if (newProgress <= progress) {
+      return;
+    }
+
+    for (int i = progress + 1; i <= newProgress; i++) {
+      if (i == 97) {
+        System.err.print("1");
+      } else if (i == 98) {
+        System.err.print("0");
+      } else if (i == 99) {
+        System.err.print("0");
+      } else if (i == 100) {
+        System.err.println("%");
+      } else if (i % 10 == 0) {
+        System.err.print(String.format("%d", i));
+        System.err.flush();
+      } else if ((i - 1) % 10 != 0) {
+        // (i - 1) % 10 == 0 is skipped: 10, 20, etc. already took two columns.
+        System.err.print(".");
+        System.err.flush();
+      }
+    }
+    progress = newProgress;
+  }
+
+  /**
+   * Get the line number of the last line that was returned.
+   * @return the number of lines handed out so far
+   */
+  public int lineno() {
+    return this.lineno;
+  }
+
+  /** Unsupported. */
+  @Override
+  public void remove() throws UnsupportedOperationException {
+    throw new UnsupportedOperationException();
+  }
+
+
+  /**
+   * Iterates over all lines, ignoring their contents, and returns the count of lines. If some lines
+   * have already been read, this will return the count of remaining lines. Because no lines will
+   * remain after calling this method, we implicitly call close.
+   *
+   * @return the number of lines read
+   * @throws IOException if there is an error reading lines
+   */
+  public int countLines() throws IOException {
+    int lines = 0;
+
+    while (this.hasNext()) {
+      this.next();
+      lines++;
+    }
+    this.close();
+
+    return lines;
+  }
+
+  /**
+   * Example usage code.
+   * @param args an input file
+   */
+  public static void main(String[] args) {
+    if (1 != args.length) {
+      System.out.println("Usage: java LineReader filename");
+      System.exit(1);
+    }
+
+    try {
+
+      LineReader in = new LineReader(args[0]);
+      try {
+        for (String line : in) {
+
+          System.out.println(line);
+
+        }
+      } finally {
+        in.close();
+      }
+
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/io/NullReader.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/io/NullReader.java b/joshua-core/src/main/java/org/apache/joshua/util/io/NullReader.java
new file mode 100644
index 0000000..f833f00
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/NullReader.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.io;
+
+import java.io.IOException;
+
+import org.apache.joshua.util.NullIterator;
+
+
+/**
+ * A null-object {@link Reader}: stands in where a reader may or may not be available, so callers
+ * need not test for null. Closing does nothing, reading yields null, and the iterator side is
+ * inherited from {@link NullIterator}.
+ *
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $
+ */
+public class NullReader<E> extends NullIterator<E> implements Reader<E> {
+
+  // ===============================================================
+  // Constructors and destructors
+  // ===============================================================
+
+  // TODO: use static factory method and singleton?
+  public NullReader() {}
+
+  /** Does nothing; there is no resource to release. */
+  @Override
+  public void close() throws IOException {}
+
+
+  // ===============================================================
+  // Reader
+  // ===============================================================
+
+  /**
+   * Reports readiness unconditionally. Whether that is the right answer is open: this reader can
+   * always deliver "nothing", yet can never deliver an actual element.
+   *
+   * @return true, always
+   */
+  @Override
+  public boolean ready() {
+    return true;
+  }
+
+  /**
+   * There is never a line to read.
+   *
+   * @return null, always
+   */
+  @Override
+  public E readLine() throws IOException {
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/io/ProgressInputStream.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/io/ProgressInputStream.java b/joshua-core/src/main/java/org/apache/joshua/util/io/ProgressInputStream.java
new file mode 100644
index 0000000..075c0b3
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/ProgressInputStream.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.io;
+
+import java.io.FilterInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Generic progress meter for reading files (compressed or not). Pass it the raw input file stream
+ * and it will keep track for you: every byte read or skipped through this wrapper is counted and
+ * compared with the expected total.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ */
+public class ProgressInputStream extends FilterInputStream {
+
+  /** Expected size of the wrapped stream in bytes; non-positive when unknown. */
+  private final long totalBytes;
+
+  /** Number of bytes consumed (read or skipped) so far. */
+  private long bytesRead = 0;
+
+  /** Value of {@link #bytesRead} at the last {@link #mark(int)}; 0 if mark was never called. */
+  private long bytesReadAtMark = 0;
+
+  /**
+   * @param in the raw stream to count bytes on
+   * @param totalBytes the number of bytes <code>in</code> is expected to deliver
+   */
+  protected ProgressInputStream(InputStream in, long totalBytes) {
+    super(in);
+
+    this.totalBytes = totalBytes;
+  }
+
+  /** Reads one byte, counting it unless the end of the stream was reached. */
+  @Override
+  public int read() throws IOException {
+    int value = super.read();
+    if (value >= 0) {
+      // -1 signals end of stream: nothing was consumed.
+      bytesRead += 1;
+    }
+    return value;
+  }
+
+  /** Reads into the whole array; the counting happens in {@link #read(byte[], int, int)}. */
+  @Override
+  public int read(byte[] b) throws IOException {
+    // Delegate to the counting overload directly. Going through super.read(b) ends up there as
+    // well, so counting here too would register every byte twice.
+    return read(b, 0, b.length);
+  }
+
+  /** Reads up to <code>len</code> bytes, counting only the bytes actually delivered. */
+  @Override
+  public int read(byte[] b, int off, int len) throws IOException {
+    int count = super.read(b, off, len);
+    if (count > 0) {
+      // A return of -1 (end of stream) must not decrease the counter.
+      bytesRead += count;
+    }
+    return count;
+  }
+
+  /** Marks the current position and remembers the byte count that belongs to it. */
+  @Override
+  public void mark(int readlimit) {
+    super.mark(readlimit);
+    bytesReadAtMark = bytesRead;
+  }
+
+  /** Returns to the marked position and restores the byte count recorded there. */
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    bytesRead = bytesReadAtMark;
+  }
+
+  /**
+   * Skips bytes and counts them as consumed.
+   * @param n the number of bytes to skip
+   * @return the number of bytes actually skipped
+   */
+  @Override
+  public long skip(long n) throws IOException {
+    long skipped = super.skip(n);
+    bytesRead += skipped;
+    return skipped;
+  }
+
+  /**
+   * @return progress through the file, as an integer (0..100); 0 when the total size is unknown
+   *         or zero.
+   */
+  public int progress() {
+    if (totalBytes <= 0) {
+      return 0;
+    }
+    long percent = (long) (100.0 * bytesRead / totalBytes);
+    return (int) Math.max(0L, Math.min(100L, percent));
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/io/Reader.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/io/Reader.java b/joshua-core/src/main/java/org/apache/joshua/util/io/Reader.java
new file mode 100644
index 0000000..cab6d74
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/Reader.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.io;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+/**
+ * Common interface for Reader type objects. A reader is both an {@link java.lang.Iterable} and
+ * an {@link java.util.Iterator} over its elements, and additionally offers explicit
+ * <code>readLine()</code>, <code>ready()</code> and <code>close()</code> operations that may
+ * throw {@link java.io.IOException}.
+ * 
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $
+ */
+public interface Reader<E> extends Iterable<E>, Iterator<E> {
+
+  /**
+   * Close the reader, freeing all resources.
+   * @throws IOException if there is an error closing the reader instance
+   */
+  void close() throws IOException;
+
+  /**
+   * Determine if the reader is ready to read a line.
+   * @return true if it is ready
+   * @throws IOException if there is an error whilst determining if the reader is ready
+   */
+  boolean ready() throws IOException;
+
+  /**
+   * Read a "line" and return an object representing it. Unlike <code>next()</code>, this method
+   * is allowed to throw the {@link java.io.IOException} directly.
+   * @return an object representing a single line
+   * @throws IOException if there is an error reading lines
+   */
+  E readLine() throws IOException;
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/io/package-info.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/io/package-info.java b/joshua-core/src/main/java/org/apache/joshua/util/io/package-info.java
new file mode 100644
index 0000000..d7ea475
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/** 
+ * Provides common utility classes for IO.
+ */
+package org.apache.joshua.util.io;

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/package-info.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/package-info.java b/joshua-core/src/main/java/org/apache/joshua/util/package-info.java
new file mode 100644
index 0000000..2dedb37
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/**
+ * Provides common utility classes.
+ */
+package org.apache.joshua.util;