You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/08/17 10:32:03 UTC
[22/56] [partial] incubator-joshua git commit: maven multi-module
layout 1st commit: moving files into joshua-core
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/encoding/Analyzer.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/encoding/Analyzer.java b/joshua-core/src/main/java/org/apache/joshua/util/encoding/Analyzer.java
new file mode 100644
index 0000000..ad2910c
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/encoding/Analyzer.java
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.encoding;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.TreeMap;
+
+import org.apache.joshua.util.io.LineReader;
+
+public class Analyzer {
+
+ private TreeMap<Float, Integer> histogram;
+ private int total;
+
+ /** Creates an analyzer whose histogram initially holds only the (empty) zero bucket. */
+ public Analyzer() {
+ this.histogram = new TreeMap<Float, Integer>();
+ this.initialize();
+ }
+
+ /** Resets the analyzer: no observations, and a histogram holding only the zero bucket. */
+ public void initialize() {
+ total = 0;
+ histogram.clear();
+ // TODO: drop zero bucket; we won't encode zero-valued features anyway.
+ histogram.put(0.0f, 0);
+ }
+
+ public void add(float key) {
+ if (histogram.containsKey(key))
+ histogram.put(key, histogram.get(key) + 1);
+ else
+ histogram.put(key, 1);
+ total++;
+ }
+
+ /**
+ * Computes quantization bucket values from the observed histogram.
+ * <p>
+ * Bucket 0 is always 0.0f. The remaining buckets are filled in ascending key order; each one
+ * holds the count-weighted mean of the values assigned to it. A bucket is closed when it
+ * reaches the target size, when the keys cross zero, or right before a spike (a single value
+ * observed at least target-size times), as long as free buckets remain.
+ *
+ * @param num_bits number of bits per quantized value; at most {@code 1 << num_bits} buckets
+ * are produced
+ * @return the bucket values actually used (possibly fewer than {@code 1 << num_bits})
+ * @throws IllegalArgumentException if {@code num_bits} is not in the range 1..30
+ */
+ public float[] quantize(int num_bits) {
+ // Fewer than 1 bit leaves no non-zero bucket (division by zero below); more than 30
+ // overflows the int shift.
+ if (num_bits < 1 || num_bits > 30)
+ throw new IllegalArgumentException("num_bits must be between 1 and 30: " + num_bits);
+ float[] buckets = new float[1 << num_bits];
+
+ // We make sure that 0.0f always has its own bucket, so the bucket
+ // size is determined excluding the zero values.
+ int size = (total - histogram.get(0.0f)) / (buckets.length - 1);
+ buckets[0] = 0.0f;
+
+ // Iterate to a fixed point: the zero key and spikes are taken out, and the target size
+ // is recomputed over the remaining observations and buckets.
+ int old_size = -1;
+ while (old_size != size) {
+ int sum = 0;
+ int count = buckets.length - 1;
+ for (float key : histogram.keySet()) {
+ int entry_count = histogram.get(key);
+ if (entry_count < size && key != 0)
+ sum += entry_count;
+ else
+ count--;
+ }
+ old_size = size;
+ // With many distinct keys, the zero key and the spikes may claim every bucket;
+ // count is then zero (division by zero) or negative. A target size of 0 is the
+ // fixed point in that situation.
+ size = (count > 0) ? sum / count : 0;
+ }
+
+ float last_key = Float.MAX_VALUE;
+
+ int index = 1;
+ int count = 0;
+ float sum = 0.0f;
+
+ int value;
+ for (float key : histogram.keySet()) {
+ value = histogram.get(key);
+ // Special bucket termination cases: zero boundary and histogram spikes.
+ if (key == 0 || (last_key < 0 && key > 0) || (value >= size)) {
+ // If the count is not 0, i.e. there were negative values, we should
+ // not bucket them with the positive ones. Close out the bucket now.
+ if (count != 0 && index < buckets.length - 2) {
+ buckets[index++] = sum / count;
+ count = 0;
+ sum = 0;
+ }
+ if (key == 0)
+ continue;
+ }
+ count += value;
+ sum += key * value;
+ // Check if the bucket is full.
+ if (count >= size && index < buckets.length - 2) {
+ buckets[index++] = sum / count;
+ count = 0;
+ sum = 0;
+ }
+ last_key = key;
+ }
+ // Flush whatever is left into one final bucket.
+ if (count > 0 && index < buckets.length - 1)
+ buckets[index++] = sum / count;
+
+ // Return only the buckets that were actually filled.
+ float[] shortened = new float[index];
+ System.arraycopy(buckets, 0, shortened, 0, index);
+ return shortened;
+ }
+
+ public boolean isBoolean() {
+ for (float value : histogram.keySet())
+ if (value != 0 && value != 1)
+ return false;
+ return true;
+ }
+
+ public boolean isByte() {
+ for (float value : histogram.keySet())
+ if (Math.ceil(value) != value || value < Byte.MIN_VALUE || value > Byte.MAX_VALUE)
+ return false;
+ return true;
+ }
+
+ public boolean isShort() {
+ for (float value : histogram.keySet())
+ if (Math.ceil(value) != value || value < Short.MIN_VALUE || value > Short.MAX_VALUE)
+ return false;
+ return true;
+ }
+
+ public boolean isChar() {
+ for (float value : histogram.keySet())
+ if (Math.ceil(value) != value || value < Character.MIN_VALUE || value > Character.MAX_VALUE)
+ return false;
+ return true;
+ }
+
+ /**
+ * Checks whether every observed value can be stored losslessly as an int.
+ *
+ * @return true if every observed value is integral and within the range of an int
+ */
+ public boolean isInt() {
+ for (float value : histogram.keySet()) {
+ // Infinities and large floats are "integral" too, so the range must be checked.
+ // The upper bound is compared as double: (float) Integer.MAX_VALUE rounds up to
+ // 2^31, which itself does not fit into an int.
+ if (Math.ceil(value) != value || value < Integer.MIN_VALUE
+ || (double) value > Integer.MAX_VALUE)
+ return false;
+ }
+ return true;
+ }
+
+ public boolean is8Bit() {
+ return (histogram.keySet().size() <= 256);
+ }
+
+ public FloatEncoder inferUncompressedType() {
+ if (isBoolean())
+ return PrimitiveFloatEncoder.BOOLEAN;
+ if (isByte())
+ return PrimitiveFloatEncoder.BYTE;
+ if (is8Bit())
+ return new EightBitQuantizer(this.quantize(8));
+ if (isChar())
+ return PrimitiveFloatEncoder.CHAR;
+ if (isShort())
+ return PrimitiveFloatEncoder.SHORT;
+ if (isInt())
+ return PrimitiveFloatEncoder.INT;
+ return PrimitiveFloatEncoder.FLOAT;
+ }
+
+ public FloatEncoder inferType(int bits) {
+ if (isBoolean())
+ return PrimitiveFloatEncoder.BOOLEAN;
+ if (isByte())
+ return PrimitiveFloatEncoder.BYTE;
+ if (bits == 8 || is8Bit())
+ return new EightBitQuantizer(this.quantize(8));
+ // TODO: Could add sub-8-bit encoding here (or larger).
+ if (isChar())
+ return PrimitiveFloatEncoder.CHAR;
+ if (isShort())
+ return PrimitiveFloatEncoder.SHORT;
+ if (isInt())
+ return PrimitiveFloatEncoder.INT;
+ return PrimitiveFloatEncoder.FLOAT;
+ }
+
+ /**
+ * Renders the histogram as tab-separated lines of the form {@code label, value, count}.
+ *
+ * @param label text placed in the first column of every line
+ * @return one line per distinct observed value, in ascending value order
+ */
+ public String toString(String label) {
+ StringBuilder out = new StringBuilder();
+ for (float observed : histogram.keySet()) {
+ out.append(label).append("\t");
+ out.append(String.format("%.5f", observed)).append("\t");
+ out.append(histogram.get(observed)).append("\n");
+ }
+ return out.toString();
+ }
+
+ public static void main(String[] args) throws IOException {
+ LineReader reader = new LineReader(args[0]);
+ ArrayList<Float> s = new ArrayList<Float>();
+
+ System.out.println("Initialized.");
+ while (reader.hasNext())
+ s.add(Float.parseFloat(reader.next().trim()));
+ System.out.println("Data read.");
+ int n = s.size();
+ byte[] c = new byte[n];
+ ByteBuffer b = ByteBuffer.wrap(c);
+ Analyzer q = new Analyzer();
+
+ q.initialize();
+ for (int i = 0; i < n; i++)
+ q.add(s.get(i));
+ EightBitQuantizer eb = new EightBitQuantizer(q.quantize(8));
+ System.out.println("Quantizer learned.");
+
+ for (int i = 0; i < n; i++)
+ eb.write(b, s.get(i));
+ b.rewind();
+ System.out.println("Quantization complete.");
+
+ float avg_error = 0;
+ float error = 0;
+ int count = 0;
+ for (int i = -4; i < n - 4; i++) {
+ float coded = eb.read(b, i);
+ if (s.get(i + 4) != 0) {
+ error = Math.abs(s.get(i + 4) - coded);
+ avg_error += error;
+ count++;
+ }
+ }
+ avg_error /= count;
+ System.out.println("Evaluation complete.");
+
+ System.out.println("Average quanitization error over " + n + " samples is: " + avg_error);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/encoding/EightBitQuantizer.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/encoding/EightBitQuantizer.java b/joshua-core/src/main/java/org/apache/joshua/util/encoding/EightBitQuantizer.java
new file mode 100644
index 0000000..5876d4f
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/encoding/EightBitQuantizer.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.encoding;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+public class EightBitQuantizer implements FloatEncoder {
+
+ private float[] buckets;
+
+ /** Creates a quantizer with 256 buckets, all initially 0.0f. */
+ public EightBitQuantizer() {
+ buckets = new float[256];
+ }
+
+ /**
+ * Creates a quantizer over the given bucket values. The array is used as-is, not copied.
+ *
+ * @param buckets bucket values; at most 256 entries
+ * @throws RuntimeException if more than 256 buckets are given
+ */
+ public EightBitQuantizer(float[] buckets) {
+ int given = buckets.length;
+ if (given > 256) {
+ throw new RuntimeException("Incompatible number of buckets: " + given);
+ }
+ this.buckets = buckets;
+ }
+
+ /**
+ * Decodes the bucket index stored {@code EncoderConfiguration.ID_SIZE} bytes after
+ * {@code position} and returns the corresponding bucket value.
+ */
+ @Override
+ public final float read(ByteBuffer stream, int position) {
+ // Indices are stored shifted by -128 so that all 256 of them fit into a signed byte.
+ int slot = stream.get(position + EncoderConfiguration.ID_SIZE) + 128;
+ return buckets[slot];
+ }
+
+ /**
+ * Writes the index of the bucket closest to {@code val} as a single byte at the buffer's
+ * current position. Indices are stored shifted by -128.
+ *
+ * @throws IllegalArgumentException if {@code val} is NaN
+ */
+ @Override
+ public final void write(ByteBuffer stream, float val) {
+ // NaN compares false against every bucket, so the search window below would never
+ // shrink and the loop would never terminate.
+ if (Float.isNaN(val))
+ throw new IllegalArgumentException("Cannot quantize NaN");
+
+ byte index = -128;
+
+ // We search for the bucket best matching the value. Only zeroes will be
+ // mapped to the zero bucket.
+ if (val != 0 && buckets.length > 1) {
+ int t = 1;
+ int b = buckets.length - 1;
+ // Binary search narrowing [t, b] until the two ends are adjacent (or equal).
+ while ((b - t) > 1) {
+ int half = (t + b) / 2;
+ if (val >= buckets[half])
+ t = half;
+ if (val <= buckets[half])
+ b = half;
+ }
+ // Of the two remaining candidates take the closer one; ties go to the lower bucket.
+ index = (byte) ((Math.abs(buckets[t] - val) > (Math.abs(buckets[b] - val)) ? b : t) - 128);
+ }
+ stream.put(index);
+ }
+
+ /** @return the key identifying this encoder type, {@code "8bit"} */
+ @Override
+ public String getKey() {
+ final String key = "8bit";
+ return key;
+ }
+
+ /** Writes the encoder key, the number of buckets, and then every bucket value. */
+ @Override
+ public void writeState(DataOutputStream out) throws IOException {
+ out.writeUTF(getKey());
+ out.writeInt(buckets.length);
+ for (float bucket : buckets) {
+ out.writeFloat(bucket);
+ }
+ }
+
+ /**
+ * Restores the buckets from the stream: the bucket count followed by that many floats. The
+ * encoder key written by {@code writeState} is not read here; the stream must be positioned
+ * just after it.
+ *
+ * @throws IOException if reading fails or the stored bucket count is not in 0..256
+ */
+ @Override
+ public void readState(DataInputStream in) throws IOException {
+ int length = in.readInt();
+ // Same limit as the constructor: a byte-sized index addresses at most 256 buckets.
+ if (length < 0 || length > 256)
+ throw new IOException("Incompatible number of buckets: " + length);
+ float[] restored = new float[length];
+ for (int i = 0; i < restored.length; i++)
+ restored[i] = in.readFloat();
+ // Only replace the current buckets once the whole state was read successfully.
+ buckets = restored;
+ }
+
+ /** @return 1; {@code write} emits exactly one byte per value */
+ @Override
+ public final int size() {
+ final int bytesPerValue = 1;
+ return bytesPerValue;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/encoding/EncoderConfiguration.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/encoding/EncoderConfiguration.java b/joshua-core/src/main/java/org/apache/joshua/util/encoding/EncoderConfiguration.java
new file mode 100644
index 0000000..28b013f
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/encoding/EncoderConfiguration.java
@@ -0,0 +1,160 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.encoding;
+
+import java.io.BufferedInputStream;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.joshua.corpus.Vocabulary;
+
+public class EncoderConfiguration {
+
+ public static int ID_SIZE = 4;
+
+ private IntEncoder idEncoder;
+ private int[] innerToOuter;
+ private FloatEncoder[] encoderById;
+ private FloatEncoder[] encoders;
+
+ private Map<Integer, Integer> outerToInner;
+
+ private boolean labeled;
+
+ private int numDenseFeatures = 0;
+
+ /** Creates an empty configuration; call {@code load} to populate it. */
+ public EncoderConfiguration() {
+ outerToInner = new HashMap<Integer, Integer>();
+ }
+
+ /** @return the number of loaded feature names that parse as integers */
+ public int getNumDenseFeatures() {
+ return this.numDenseFeatures;
+ }
+
+ /** @return the length of the loaded encoder array */
+ public int getNumFeatures() {
+ return this.encoders.length;
+ }
+
+ /**
+ * Loads an encoder configuration from a file. The layout read is: the id encoder (key and
+ * state), the labeled flag, the number of float encoders followed by each encoder's key and
+ * state, and the number of features followed by one record per feature (feature name if
+ * labeled, numeric id otherwise; then inner id and encoder index).
+ * <p>
+ * As a side effect this sets the static {@code ID_SIZE} to the size of the loaded id encoder.
+ *
+ * @param file_name path of the encoding file
+ * @throws IOException if the file cannot be read
+ * @throws RuntimeException if a feature record refers to an unknown encoder or inner id
+ */
+ public void load(String file_name) throws IOException {
+ File encoding_file = new File(file_name);
+ BufferedInputStream buf_stream = new BufferedInputStream(new FileInputStream(encoding_file));
+ DataInputStream in_stream = new DataInputStream(buf_stream);
+
+ // Close the stream on every path, including failed deserialization.
+ try {
+ String id_key = in_stream.readUTF();
+ idEncoder = EncoderFactory.getIntEncoder(id_key);
+ idEncoder.readState(in_stream);
+ ID_SIZE = idEncoder.size();
+ labeled = in_stream.readBoolean();
+
+ int num_encoders = in_stream.readInt();
+ encoders = new FloatEncoder[num_encoders];
+ for (int i = 0; i < num_encoders; i++) {
+ String key = in_stream.readUTF();
+ FloatEncoder e = EncoderFactory.getFloatEncoder(key);
+ e.readState(in_stream);
+ encoders[i] = e;
+ }
+ int num_features = in_stream.readInt();
+ encoderById = new FloatEncoder[num_features];
+ innerToOuter = new int[num_features];
+ // Start counting afresh so that loading twice does not accumulate.
+ numDenseFeatures = 0;
+ for (int i = 0; i < num_features; i++) {
+ int outer_id;
+ if (labeled) {
+ String feature_name = in_stream.readUTF();
+ outer_id = Vocabulary.id(feature_name);
+ // Feature names that are plain integers are counted as dense features.
+ try {
+ Integer.parseInt(feature_name);
+ numDenseFeatures++;
+ } catch (NumberFormatException ignored) {
+ // Not an integer name, hence not a dense feature.
+ }
+ } else {
+ outer_id = in_stream.readInt();
+ }
+ int inner_id = in_stream.readInt();
+ int encoder_index = in_stream.readInt();
+ if (encoder_index < 0 || encoder_index >= num_encoders) {
+ throw new RuntimeException("Error deserializing EncoderConfig. " + "Feature "
+ + (labeled ? Vocabulary.word(outer_id) : outer_id) + " referring to encoder "
+ + encoder_index + " when only " + num_encoders + " known.");
+ }
+ if (inner_id < 0 || inner_id >= num_features) {
+ throw new RuntimeException("Error deserializing EncoderConfig. " + "Feature "
+ + (labeled ? Vocabulary.word(outer_id) : outer_id) + " has inner id "
+ + inner_id + " when only " + num_features + " features known.");
+ }
+ encoderById[inner_id] = encoders[encoder_index];
+ innerToOuter[inner_id] = outer_id;
+ }
+ } finally {
+ in_stream.close();
+ }
+
+ // Build the reverse (outer id to inner id) lookup.
+ outerToInner.clear();
+ for (int i = 0; i < innerToOuter.length; ++i)
+ outerToInner.put(innerToOuter[i], i);
+ }
+
+ /**
+ * @param inner_id inner feature id
+ * @return the encoder registered for that inner id
+ */
+ public FloatEncoder encoder(int inner_id) {
+ return this.encoderById[inner_id];
+ }
+
+ /**
+ * Reads a feature id from the buffer using the loaded id encoder.
+ *
+ * @param buffer buffer to read from
+ * @param pos position handed to the id encoder
+ * @return the decoded id
+ */
+ public int readId(ByteBuffer buffer, int pos) {
+ return this.idEncoder.read(buffer, pos);
+ }
+
+ /**
+ * @param inner_id inner feature id
+ * @return the outer id mapped to that inner id
+ */
+ public int outerId(int inner_id) {
+ return this.innerToOuter[inner_id];
+ }
+
+ public int innerId(int outer_id) {
+ return outerToInner.get(outer_id);
+ }
+
+ /** @return the labeled flag read from the configuration file */
+ public boolean isLabeled() {
+ return this.labeled;
+ }
+
+ /**
+ * For now, this just loads a configuration and prints out the number of features.
+ *
+ * @param args an input configuration file
+ */
+ public static void main(String[] args) {
+ // Check the arguments explicitly rather than catching ArrayIndexOutOfBoundsException,
+ // which would also hide index errors raised while loading.
+ if (args.length < 1) {
+ throw new RuntimeException("Usage: EncoderConfiguration <packed_directory>");
+ }
+ String grammar_dir = args[0];
+ try {
+ EncoderConfiguration encoding = new EncoderConfiguration();
+ encoding.load(grammar_dir + File.separator + "encoding");
+ int num_features = encoding.getNumFeatures();
+ System.out.println(String.format("num_features = %d", num_features));
+
+ for (int feature_id = 0; feature_id < num_features; feature_id++) {
+ if (Vocabulary.size() == 1) {
+ System.out.println(String.format("feature: %d", feature_id));
+ } else {
+ String name = Vocabulary.word(encoding.outerId(feature_id));
+ System.out.println(String.format("feature: %s", name));
+ }
+ }
+
+ } catch (IOException e) {
+ // Keep the cause: the failure may be something other than a missing file.
+ throw new RuntimeException(
+ String.format("* FATAL: can't find file %s/encoding", grammar_dir), e);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/encoding/EncoderFactory.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/encoding/EncoderFactory.java b/joshua-core/src/main/java/org/apache/joshua/util/encoding/EncoderFactory.java
new file mode 100644
index 0000000..a1f93d0
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/encoding/EncoderFactory.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.encoding;
+
+/**
+ * Maps encoder keys, as stored in encoder configurations, to encoder instances.
+ */
+public class EncoderFactory {
+
+ /**
+ * Looks up a float encoder by key. The upper-cased key is first tried as a primitive
+ * encoder; the key {@code "8bit"} yields a fresh {@link EightBitQuantizer}.
+ *
+ * @param key the encoder key
+ * @return the matching encoder, never null
+ * @throws RuntimeException if the key is unknown
+ */
+ public static FloatEncoder getFloatEncoder(String key) {
+ // Locale.ROOT keeps the lookup independent of the default locale (e.g. Turkish
+ // upper-cases "i" to a dotted capital I, which would break "int").
+ String upper = key.toUpperCase(java.util.Locale.ROOT);
+ FloatEncoder encoder = PrimitiveFloatEncoder.get(upper);
+ if (encoder != null) {
+ return encoder;
+ } else if ("8bit".equals(key)) {
+ return new EightBitQuantizer();
+ } else {
+ throw new RuntimeException("Unknown FloatEncoder type: " + upper);
+ }
+ }
+
+ /**
+ * Looks up an int encoder by key; the upper-cased key is resolved as a primitive encoder.
+ *
+ * @param key the encoder key
+ * @return the matching encoder, never null
+ * @throws RuntimeException if the key is unknown
+ */
+ public static IntEncoder getIntEncoder(String key) {
+ // See getFloatEncoder for why Locale.ROOT is used.
+ String upper = key.toUpperCase(java.util.Locale.ROOT);
+ IntEncoder encoder = PrimitiveIntEncoder.get(upper);
+ if (encoder != null) {
+ return encoder;
+ } else {
+ throw new RuntimeException("Unknown IntEncoder type: " + upper);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/encoding/FeatureTypeAnalyzer.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/encoding/FeatureTypeAnalyzer.java b/joshua-core/src/main/java/org/apache/joshua/util/encoding/FeatureTypeAnalyzer.java
new file mode 100644
index 0000000..504859f
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/encoding/FeatureTypeAnalyzer.java
@@ -0,0 +1,254 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.encoding;
+
+import java.io.BufferedOutputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.util.io.LineReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class FeatureTypeAnalyzer {
+
+ private static final Logger LOG = LoggerFactory.getLogger(FeatureTypeAnalyzer.class);
+
+ private ArrayList<FeatureType> types;
+
+ private Map<Integer, Integer> featureToType;
+
+ private Map<Integer, Integer> featureIdMap;
+
+ // Is the feature setup labeled.
+ private boolean labeled;
+
+ // Is the encoder configuration open for new features (that are not assumed boolean)?
+ private boolean open;
+
+ /** Creates a closed analyzer: features without a configured encoder are not observed. */
+ public FeatureTypeAnalyzer() {
+ this(false);
+ }
+
+ /**
+ * Creates an analyzer.
+ *
+ * @param open whether features without a configured encoder get a type of their own when
+ * first observed
+ */
+ public FeatureTypeAnalyzer(boolean open) {
+ this.featureIdMap = new HashMap<Integer, Integer>();
+ this.featureToType = new HashMap<Integer, Integer>();
+ this.types = new ArrayList<FeatureType>();
+ this.open = open;
+ }
+
+ /**
+ * Reads encoder assignments from a config file. Text after {@code #} is ignored. Lines of the
+ * form {@code encoder <key> <feature> [<feature> ...]} assign the encoder to the listed
+ * features; all other lines are skipped.
+ *
+ * @param config_filename path of the config file
+ * @throws IOException if the file cannot be read
+ * @throws RuntimeException if an encoder line has fewer than three fields
+ */
+ public void readConfig(String config_filename) throws IOException {
+ LineReader lines = new LineReader(config_filename);
+ while (lines.hasNext()) {
+ // Clean up line, chop comments off and skip if the result is empty.
+ String text = lines.next().trim();
+ int hash = text.indexOf('#');
+ if (hash != -1)
+ text = text.substring(0, hash);
+ if (text.isEmpty())
+ continue;
+ String[] tokens = text.split("[\\s]+");
+
+ // Only encoder lines carry information.
+ if (!"encoder".equals(tokens[0]))
+ continue;
+ if (tokens.length < 3) {
+ throw new RuntimeException("Incomplete encoder line in config.");
+ }
+ // Adding an encoder to the mix.
+ ArrayList<Integer> ids = new ArrayList<Integer>();
+ for (int i = 2; i < tokens.length; i++)
+ ids.add(Vocabulary.id(tokens[i]));
+ addFeatures(tokens[1], ids);
+ }
+ }
+
+ /**
+ * Assigns the encoder identified by the key to all given features.
+ *
+ * @param encoder_key key of the encoder to use
+ * @param feature_ids ids of the features to assign
+ */
+ public void addFeatures(String encoder_key, List<Integer> feature_ids) {
+ final int type_index = addType(encoder_key);
+ for (Integer id : feature_ids) {
+ featureToType.put(id, type_index);
+ }
+ }
+
+ // Returns the index of the type for the given encoder key, registering a new type unless an
+ // equal one is already known.
+ private int addType(String encoder_key) {
+ FeatureType candidate = new FeatureType(encoder_key);
+ int existing = types.indexOf(candidate);
+ if (existing >= 0)
+ return existing;
+ types.add(candidate);
+ return types.size() - 1;
+ }
+
+ // Registers a new type whose encoder is yet to be inferred and returns its index.
+ private int addType() {
+ int index = types.size();
+ types.add(new FeatureType());
+ return index;
+ }
+
+ /**
+ * Feeds one feature value to the type of the feature. Features without a type are ignored
+ * unless the analyzer is open, in which case a new type is created for them.
+ *
+ * @param feature_id id of the feature
+ * @param value the observed value
+ */
+ public void observe(int feature_id, float value) {
+ Integer type_id = featureToType.get(feature_id);
+ if (type_id == null) {
+ if (!open)
+ return;
+ type_id = addType();
+ featureToType.put(feature_id, type_id);
+ }
+ types.get(type_id).observe(value);
+ }
+
+ // Inspects the collected histograms, inferring actual type of feature. Then replaces the
+ // analyzer, if present, with the most compact applicable type.
+ public void inferTypes(boolean labeled) {
+ for (FeatureType ft : types) {
+ ft.inferUncompressedType();
+ }
+ if (LOG.isInfoEnabled()) {
+ for (int id : featureToType.keySet()) {
+ LOG.info("Type inferred: {} is {}", (labeled ? Vocabulary.word(id) : "Feature " + id),
+ types.get(featureToType.get(id)).encoder.getKey());
+ }
+ }
+ }
+
+ /** Rebuilds the map from feature id to its rank among all known feature ids, ascending. */
+ public void buildFeatureMap() {
+ int[] sorted = new int[featureToType.size()];
+ int next = 0;
+ for (Integer feature : featureToType.keySet()) {
+ sorted[next] = feature;
+ next++;
+ }
+ Arrays.sort(sorted);
+
+ featureIdMap.clear();
+ for (int rank = 0; rank < sorted.length; rank++) {
+ featureIdMap.put(sorted[rank], rank);
+ }
+ }
+
+ public int getRank(int feature_id) {
+ return featureIdMap.get(feature_id);
+ }
+
+ public IntEncoder getIdEncoder() {
+ int num_features = featureIdMap.size();
+ if (num_features <= Byte.MAX_VALUE)
+ return PrimitiveIntEncoder.BYTE;
+ else if (num_features <= Character.MAX_VALUE)
+ return PrimitiveIntEncoder.CHAR;
+ else
+ return PrimitiveIntEncoder.INT;
+ }
+
+ /**
+ * Writes the encoder configuration to a file: the id encoder state, the labeled flag, the
+ * number of types followed by each type's encoder state, and the number of features followed
+ * by one record per feature (feature name if labeled, numeric id otherwise; then rank and
+ * type index). The feature map is rebuilt first.
+ *
+ * @param file_name path of the file to write
+ * @throws IOException if writing fails
+ */
+ public void write(String file_name) throws IOException {
+ File out_file = new File(file_name);
+ BufferedOutputStream buf_stream = new BufferedOutputStream(new FileOutputStream(out_file));
+ DataOutputStream out_stream = new DataOutputStream(buf_stream);
+
+ // Close the stream on every path so that a failed write does not leak the file handle.
+ try {
+ buildFeatureMap();
+
+ getIdEncoder().writeState(out_stream);
+ out_stream.writeBoolean(labeled);
+ out_stream.writeInt(types.size());
+ for (int index = 0; index < types.size(); index++)
+ types.get(index).encoder.writeState(out_stream);
+
+ out_stream.writeInt(featureToType.size());
+ for (int feature_id : featureToType.keySet()) {
+ if (labeled)
+ out_stream.writeUTF(Vocabulary.word(feature_id));
+ else
+ out_stream.writeInt(feature_id);
+ out_stream.writeInt(featureIdMap.get(feature_id));
+ out_stream.writeInt(featureToType.get(feature_id));
+ }
+ } finally {
+ out_stream.close();
+ }
+ }
+
+ /**
+ * Renders the histograms of all features that are still being analyzed, one block per
+ * feature, labeled with the feature name.
+ *
+ * @return the concatenated histogram dumps; empty if no feature has a histogram
+ */
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ for (int feature_id : featureToType.keySet()) {
+ Analyzer analyzer = types.get(featureToType.get(feature_id)).analyzer;
+ // Types with a configured or already inferred encoder have no analyzer; skip them
+ // rather than failing with a NullPointerException.
+ if (analyzer != null)
+ sb.append(analyzer.toString(Vocabulary.word(feature_id)));
+ }
+ return sb.toString();
+ }
+
+ /** @return whether the feature setup is labeled */
+ public boolean isLabeled() {
+ return this.labeled;
+ }
+
+ /** @param labeled whether the feature setup is labeled */
+ public void setLabeled(boolean labeled) {
+ final boolean flag = labeled;
+ this.labeled = flag;
+ }
+
+ /**
+ * A feature type: either a fixed encoder, or an analyzer that collects values until an encoder
+ * is inferred from them. Once an encoder is set, the analyzer is dropped.
+ */
+ class FeatureType {
+ FloatEncoder encoder;
+ Analyzer analyzer;
+ int bits;
+
+ /** Creates a type whose encoder is to be inferred from observed values. */
+ FeatureType() {
+ encoder = null;
+ analyzer = new Analyzer();
+ bits = -1;
+ }
+
+ /** Creates a type with the encoder identified by the given key; no values are analyzed. */
+ FeatureType(String key) {
+ // either throws or returns non-null
+ FloatEncoder e = EncoderFactory.getFloatEncoder(key);
+ encoder = e;
+ analyzer = null;
+ bits = -1;
+ }
+
+ /** Infers the encoder from the analyzer unless an encoder is already set. */
+ void inferUncompressedType() {
+ if (encoder != null)
+ return;
+ encoder = analyzer.inferUncompressedType();
+ analyzer = null;
+ }
+
+ /** Like {@link #inferUncompressedType()}, but passes {@code bits} to the analyzer. */
+ void inferType() {
+ if (encoder != null)
+ return;
+ encoder = analyzer.inferType(bits);
+ analyzer = null;
+ }
+
+ /** Records a value; a no-op once the encoder is fixed. */
+ void observe(float value) {
+ if (analyzer != null)
+ analyzer.add(value);
+ }
+
+ /**
+ * Two types are equal if their encoders are equal or, when neither has an encoder, if their
+ * analyzers are equal.
+ */
+ @Override
+ public boolean equals(Object t) {
+ // Guarantees reflexivity even if both encoder and analyzer are null.
+ if (this == t)
+ return true;
+ if (!(t instanceof FeatureType))
+ return false;
+ FeatureType that = (FeatureType) t;
+ if (this.encoder != null)
+ return this.encoder.equals(that.encoder);
+ if (that.encoder != null)
+ return false;
+ return this.analyzer != null && this.analyzer.equals(that.analyzer);
+ }
+
+ /** Consistent with {@link #equals(Object)}: derived from the same field equals compares. */
+ @Override
+ public int hashCode() {
+ if (encoder != null)
+ return encoder.hashCode();
+ return (analyzer != null) ? analyzer.hashCode() : 0;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/encoding/FloatEncoder.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/encoding/FloatEncoder.java b/joshua-core/src/main/java/org/apache/joshua/util/encoding/FloatEncoder.java
new file mode 100644
index 0000000..5121ea2
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/encoding/FloatEncoder.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.encoding;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+/**
+ * Encodes float values into, and decodes them from, a {@link ByteBuffer}.
+ */
+public interface FloatEncoder {
+
+ /**
+ * Decodes a value from the buffer.
+ * NOTE(review): the implementations visible alongside this interface read at
+ * {@code position + EncoderConfiguration.ID_SIZE}, so {@code position} appears to address
+ * the id preceding the value - confirm.
+ *
+ * @param stream the buffer to read from
+ * @param position the position to decode at
+ * @return the decoded value
+ */
+ public float read(ByteBuffer stream, int position);
+
+ /**
+ * Encodes a value into the buffer.
+ *
+ * @param stream the buffer to write to
+ * @param value the value to encode
+ */
+ public void write(ByteBuffer stream, float value);
+
+ /**
+ * @return the key identifying this encoder type
+ */
+ public String getKey();
+
+ /**
+ * Writes this encoder's state to the stream.
+ *
+ * @param out the stream to write to
+ * @throws IOException if writing fails
+ */
+ public void writeState(DataOutputStream out) throws IOException;
+
+ /**
+ * Restores this encoder's state from the stream.
+ *
+ * @param in the stream to read from
+ * @throws IOException if reading fails
+ */
+ public void readState(DataInputStream in) throws IOException;
+
+ /**
+ * @return the size of an encoded value; presumably in bytes - TODO confirm
+ */
+ public int size();
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/encoding/IntEncoder.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/encoding/IntEncoder.java b/joshua-core/src/main/java/org/apache/joshua/util/encoding/IntEncoder.java
new file mode 100644
index 0000000..a8917f7
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/encoding/IntEncoder.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.encoding;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+/**
+ * Encodes int values into, and decodes them from, a {@link ByteBuffer}.
+ */
+public interface IntEncoder {
+
+ /**
+ * Decodes a value from the buffer.
+ *
+ * @param stream the buffer to read from
+ * @param position the position to decode at
+ * @return the decoded value
+ */
+ public int read(ByteBuffer stream, int position);
+
+ /**
+ * Encodes a value into the buffer.
+ *
+ * @param stream the buffer to write to
+ * @param value the value to encode
+ */
+ public void write(ByteBuffer stream, int value);
+
+ /**
+ * @return the key identifying this encoder type
+ */
+ public String getKey();
+
+ /**
+ * Writes this encoder's state to the stream.
+ *
+ * @param out the stream to write to
+ * @throws IOException if writing fails
+ */
+ public void writeState(DataOutputStream out) throws IOException;
+
+ /**
+ * Restores this encoder's state from the stream.
+ *
+ * @param in the stream to read from
+ * @throws IOException if reading fails
+ */
+ public void readState(DataInputStream in) throws IOException;
+
+ /**
+ * @return the size of an encoded value; presumably in bytes - TODO confirm
+ */
+ public int size();
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/encoding/PrimitiveFloatEncoder.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/encoding/PrimitiveFloatEncoder.java b/joshua-core/src/main/java/org/apache/joshua/util/encoding/PrimitiveFloatEncoder.java
new file mode 100644
index 0000000..d5015f2
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/encoding/PrimitiveFloatEncoder.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.encoding;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+/**
+ * {@link FloatEncoder} implementations that store a float by converting it to a Java primitive.
+ * <p>
+ * Every {@code read} adds {@code EncoderConfiguration.ID_SIZE} to the requested position before
+ * doing an absolute read; every {@code write} is a relative put at the buffer's current position.
+ */
+public enum PrimitiveFloatEncoder implements FloatEncoder {
+
+  /** Stores the value narrowed to one byte. */
+  BYTE("byte", 1) {
+    @Override
+    public final float read(ByteBuffer stream, int position) {
+      return stream.get(position + EncoderConfiguration.ID_SIZE);
+    }
+
+    @Override
+    public final void write(ByteBuffer stream, float value) {
+      stream.put((byte) value);
+    }
+  },
+
+  /** Occupies zero bytes: nothing is written and every read yields {@code 1.0f}. */
+  BOOLEAN("boolean", 0) {
+    @Override
+    public final float read(ByteBuffer stream, int position) {
+      return 1.0f;
+    }
+
+    @Override
+    public final void write(ByteBuffer stream, float value) {
+      // Intentionally empty: this encoder stores no payload.
+    }
+  },
+
+  /** Stores the value narrowed to a 16-bit {@code char}. */
+  CHAR("char", 2) {
+    @Override
+    public final float read(ByteBuffer stream, int position) {
+      return stream.getChar(position + EncoderConfiguration.ID_SIZE);
+    }
+
+    @Override
+    public final void write(ByteBuffer stream, float value) {
+      stream.putChar((char) value);
+    }
+  },
+
+  /** Stores the value unchanged as a 32-bit float. */
+  FLOAT("float", 4) {
+    @Override
+    public final float read(ByteBuffer stream, int position) {
+      return stream.getFloat(position + EncoderConfiguration.ID_SIZE);
+    }
+
+    @Override
+    public final void write(ByteBuffer stream, float value) {
+      stream.putFloat(value);
+    }
+  },
+
+  /** Stores the value narrowed to a 32-bit {@code int}. */
+  INT("int", 4) {
+    @Override
+    public final float read(ByteBuffer stream, int position) {
+      return stream.getInt(position + EncoderConfiguration.ID_SIZE);
+    }
+
+    @Override
+    public final void write(ByteBuffer stream, float value) {
+      stream.putInt((int) value);
+    }
+  },
+
+  /** Stores the value narrowed to a 16-bit {@code short}. */
+  SHORT("short", 2) {
+    @Override
+    public final float read(ByteBuffer stream, int position) {
+      return stream.getShort(position + EncoderConfiguration.ID_SIZE);
+    }
+
+    @Override
+    public final void write(ByteBuffer stream, float value) {
+      stream.putShort((short) value);
+    }
+  };
+
+  /** Lower-case key reported by {@link #getKey()} and written by {@link #writeState}. */
+  private final String key;
+
+  /** Value reported by {@link #size()}; matches the byte width of the backing primitive. */
+  private final int size;
+
+  private PrimitiveFloatEncoder(String k, int s) {
+    key = k;
+    size = s;
+  }
+
+  @Override
+  public String getKey() {
+    return key;
+  }
+
+  @Override
+  public int size() {
+    return size;
+  }
+
+  /**
+   * Looks up an encoder by its enum constant name (for example {@code "BYTE"}). Note that this is
+   * the constant name, not the lower-case value returned by {@link #getKey()}.
+   *
+   * @param k the enum constant name
+   * @return the matching encoder, or {@code null} if no constant has that name
+   */
+  public static PrimitiveFloatEncoder get(String k) {
+    try {
+      return valueOf(k);
+    } catch (IllegalArgumentException unknownName) {
+      return null;
+    }
+  }
+
+  /** Nothing to restore: these encoders carry no state beyond their identity. */
+  @Override
+  public void readState(DataInputStream in) throws IOException {
+  }
+
+  /** Writes this encoder's key to {@code out} using {@code writeUTF}. */
+  @Override
+  public void writeState(DataOutputStream out) throws IOException {
+    out.writeUTF(getKey());
+  }
+
+  @Override
+  public abstract float read(ByteBuffer stream, int position);
+
+  @Override
+  public abstract void write(ByteBuffer stream, float value);
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/encoding/PrimitiveIntEncoder.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/encoding/PrimitiveIntEncoder.java b/joshua-core/src/main/java/org/apache/joshua/util/encoding/PrimitiveIntEncoder.java
new file mode 100644
index 0000000..42f6053
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/encoding/PrimitiveIntEncoder.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.encoding;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+/**
+ * {@link IntEncoder} implementations that store an int by converting it to a Java primitive.
+ * <p>
+ * Every {@code read} is an absolute read at exactly the given position; every {@code write} is a
+ * relative put at the buffer's current position.
+ */
+public enum PrimitiveIntEncoder implements IntEncoder {
+
+  // TODO: the inconsistency with FloatEncoders is dangerous.
+  /** Stores the value narrowed to one byte. */
+  BYTE("byte", 1) {
+    @Override
+    public final int read(ByteBuffer stream, int position) {
+      return stream.get(position);
+    }
+
+    @Override
+    public final void write(ByteBuffer stream, int value) {
+      stream.put((byte) value);
+    }
+  },
+
+  /** Stores the value narrowed to a 16-bit {@code char}. */
+  CHAR("char", 2) {
+    @Override
+    public final int read(ByteBuffer stream, int position) {
+      return stream.getChar(position);
+    }
+
+    @Override
+    public final void write(ByteBuffer stream, int value) {
+      stream.putChar((char) value);
+    }
+  },
+
+  /** Stores the value unchanged as a 32-bit int. */
+  INT("int", 4) {
+    @Override
+    public final int read(ByteBuffer stream, int position) {
+      return stream.getInt(position);
+    }
+
+    @Override
+    public final void write(ByteBuffer stream, int value) {
+      stream.putInt(value);
+    }
+  },
+
+  /** Stores the value narrowed to a 16-bit {@code short}. */
+  SHORT("short", 2) {
+    @Override
+    public final int read(ByteBuffer stream, int position) {
+      return stream.getShort(position);
+    }
+
+    @Override
+    public final void write(ByteBuffer stream, int value) {
+      stream.putShort((short) value);
+    }
+  };
+
+  /** Lower-case key reported by {@link #getKey()} and written by {@link #writeState}. */
+  private final String key;
+
+  /** Value reported by {@link #size()}; matches the byte width of the backing primitive. */
+  private final int size;
+
+  private PrimitiveIntEncoder(String k, int s) {
+    key = k;
+    size = s;
+  }
+
+  @Override
+  public String getKey() {
+    return key;
+  }
+
+  @Override
+  public int size() {
+    return size;
+  }
+
+  /**
+   * Looks up an encoder by its enum constant name (for example {@code "INT"}). Note that this is
+   * the constant name, not the lower-case value returned by {@link #getKey()}.
+   *
+   * @param k the enum constant name
+   * @return the matching encoder, or {@code null} if no constant has that name
+   */
+  public static PrimitiveIntEncoder get(String k) {
+    try {
+      return valueOf(k);
+    } catch (IllegalArgumentException unknownName) {
+      return null;
+    }
+  }
+
+  /** Nothing to restore: these encoders carry no state beyond their identity. */
+  @Override
+  public void readState(DataInputStream in) throws IOException {
+  }
+
+  /** Writes this encoder's key to {@code out} using {@code writeUTF}. */
+  @Override
+  public void writeState(DataOutputStream out) throws IOException {
+    out.writeUTF(getKey());
+  }
+
+  @Override
+  public abstract int read(ByteBuffer stream, int position);
+
+  @Override
+  public abstract void write(ByteBuffer stream, int value);
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/encoding/VariableQuantizer.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/encoding/VariableQuantizer.java b/joshua-core/src/main/java/org/apache/joshua/util/encoding/VariableQuantizer.java
new file mode 100644
index 0000000..afa3f69
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/encoding/VariableQuantizer.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.encoding;
+
+/**
+ * Sequential bit reader over a byte array. Bytes are consumed front to back and, within each
+ * byte, bits are consumed from the most-significant to the least-significant.
+ */
+public class VariableQuantizer {
+
+  private final byte[] bytes;
+
+  /** Index of the byte the next read starts in. */
+  private int byteOffset;
+
+  /** Number of bits of the current byte that have already been consumed (0..7). */
+  private int bitOffset;
+
+  /**
+   * @param bytes bytes from which this will read bits. Bits will be read from the first byte first.
+   *          Bits are read within a byte from most-significant to least-significant bit.
+   */
+  public VariableQuantizer(byte[] bytes) {
+    this.bytes = bytes;
+  }
+
+  /**
+   * @return index of next bit in current byte which would be read by the next call to
+   *         {@link #readBits(int)}.
+   */
+  public int getBitOffset() {
+    return bitOffset;
+  }
+
+  /**
+   * @return index of next byte in input byte array which would be read by the next call to
+   *         {@link #readBits(int)}.
+   */
+  public int getByteOffset() {
+    return byteOffset;
+  }
+
+  /**
+   * Reads the next {@code numBits} bits and advances the read position past them.
+   *
+   * @param numBits number of bits to read
+   * @return int representing the bits read. The bits will appear as the least-significant bits of
+   *         the int
+   * @throws IllegalArgumentException if numBits isn't in [1,32] or more than is available
+   */
+  public int readBits(int numBits) {
+    boolean outOfRange = numBits < 1 || numBits > 32;
+    if (outOfRange || numBits > available()) {
+      throw new IllegalArgumentException(String.valueOf(numBits));
+    }
+
+    int remaining = numBits;
+    int value = 0;
+
+    // Step 1: finish off the byte that an earlier read left partially consumed.
+    if (bitOffset > 0) {
+      int unreadInByte = 8 - bitOffset;
+      int take = Math.min(remaining, unreadInByte);
+      int skipLow = unreadInByte - take;
+      int mask = (0xFF >> (8 - take)) << skipLow;
+      value = (bytes[byteOffset] & mask) >> skipLow;
+      remaining -= take;
+      bitOffset += take;
+      if (bitOffset == 8) {
+        bitOffset = 0;
+        byteOffset++;
+      }
+    }
+
+    // Step 2: take as many whole bytes as the request still covers.
+    for (; remaining >= 8; remaining -= 8) {
+      value = (value << 8) | (bytes[byteOffset] & 0xFF);
+      byteOffset++;
+    }
+
+    // Step 3: take the leading bits of the next byte for whatever is left over.
+    if (remaining > 0) {
+      int skipLow = 8 - remaining;
+      int mask = (0xFF >> skipLow) << skipLow;
+      value = (value << remaining) | ((bytes[byteOffset] & mask) >> skipLow);
+      bitOffset += remaining;
+    }
+
+    return value;
+  }
+
+  /**
+   * @return number of bits that can be read successfully
+   */
+  public int available() {
+    int unreadBytes = bytes.length - byteOffset;
+    return 8 * unreadBytes - bitOffset;
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/io/BinaryIn.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/io/BinaryIn.java b/joshua-core/src/main/java/org/apache/joshua/util/io/BinaryIn.java
new file mode 100644
index 0000000..9483e3e
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/BinaryIn.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.io;
+
+import java.io.DataInput;
+import java.io.Externalizable;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.ObjectInput;
+import java.io.ObjectStreamConstants;
+import java.io.RandomAccessFile;
+
+/**
+ * Read-only random-access binary input that can also materialize {@link Externalizable} objects
+ * of a single, fixed type.
+ *
+ * @param <E> the type of object produced by {@link #readObject()}
+ */
+public class BinaryIn<E extends Externalizable> extends RandomAccessFile implements DataInput, ObjectInput {
+
+  /** Class instantiated (via its no-arg constructor) by {@link #readObject()}. */
+  private final Class<E> type;
+
+  /**
+   * Opens the named file for reading.
+   *
+   * @param filename path of the file to read
+   * @param type class of the objects {@link #readObject()} should create
+   * @throws FileNotFoundException if the file cannot be opened for reading
+   */
+  public BinaryIn(String filename, Class<E> type) throws FileNotFoundException {
+    super(filename, "r");
+    this.type = type;
+  }
+
+  /**
+   * Returns the number of bytes between the current file pointer and the end of the file.
+   *
+   * @return the remaining byte count, capped at {@link Integer#MAX_VALUE} and never negative
+   * @throws IOException if the file position or length cannot be determined
+   */
+  @Override
+  public int available() throws IOException {
+    long bytesAvailable = length() - getFilePointer();
+    if (bytesAvailable > Integer.MAX_VALUE) {
+      return Integer.MAX_VALUE;
+    }
+    // The file pointer may legally sit past EOF; report that as "nothing available".
+    return (int) Math.max(0L, bytesAvailable);
+  }
+
+  /**
+   * Reads the next object from the file.
+   * <p>
+   * If the next byte is {@link ObjectStreamConstants#TC_NULL}, that marker byte is consumed and
+   * {@code null} is returned. Otherwise a new instance of the configured type is created and
+   * asked to populate itself via {@link Externalizable#readExternal}.
+   *
+   * @return the object read, or {@code null} if a null marker was stored
+   * @throws ClassNotFoundException if thrown by the object's {@code readExternal}
+   * @throws IOException if reading fails
+   * @throws RuntimeException if the configured type cannot be instantiated
+   */
+  @Override
+  public E readObject() throws ClassNotFoundException, IOException {
+
+    if (peek() == ObjectStreamConstants.TC_NULL) {
+      // Consume the marker; leaving it in place would make every subsequent read see the
+      // same null marker again.
+      read();
+      return null;
+    }
+
+    try {
+      E obj = type.newInstance();
+      obj.readExternal(this);
+      return obj;
+    } catch (InstantiationException e) {
+      throw new RuntimeException(e);
+    } catch (IllegalAccessException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Skips over up to {@code n} bytes, stopping early at end of file.
+   *
+   * @param n the number of bytes to skip; values less than or equal to zero skip nothing
+   * @return the total number of bytes actually skipped
+   * @throws IOException if skipping fails
+   */
+  @Override
+  public long skip(long n) throws IOException {
+
+    long bytesSkipped = 0;
+    long remaining = n;
+
+    // skipBytes only accepts an int, so large requests are processed in int-sized chunks.
+    while (remaining > 0) {
+      int chunk = (int) Math.min(remaining, (long) Integer.MAX_VALUE);
+      int skipped = skipBytes(chunk);
+      bytesSkipped += skipped;
+      remaining -= chunk;
+      if (skipped < chunk) {
+        // End of file reached; further attempts would skip nothing.
+        break;
+      }
+    }
+
+    return bytesSkipped;
+  }
+
+  /**
+   * Returns the next byte without advancing the file pointer.
+   *
+   * @return the next byte (0-255), or -1 at end of file
+   * @throws IOException if reading or seeking fails
+   */
+  private int peek() throws IOException {
+    long pos = getFilePointer();
+    int b = read();
+    seek(pos);
+    return b;
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/io/BinaryOut.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/io/BinaryOut.java b/joshua-core/src/main/java/org/apache/joshua/util/io/BinaryOut.java
new file mode 100644
index 0000000..8383053
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/BinaryOut.java
@@ -0,0 +1,505 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.io;
+
+import java.io.Closeable;
+import java.io.DataOutput;
+import java.io.Externalizable;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.Flushable;
+import java.io.IOException;
+import java.io.ObjectOutput;
+import java.io.ObjectOutputStream;
+import java.io.ObjectStreamConstants;
+import java.io.OutputStream;
+import java.io.UTFDataFormatException;
+
+/**
+ * A BinaryOut writes data to an output stream in raw binary form. Each data type is converted to
+ * byte representation.
+ * <p>
+ * Unlike ObjectOutputStream, no extra Java meta-data is written to the stream.
+ * <p>
+ * Primitive and string writes are staged in an internal fixed-size byte buffer that is handed to
+ * the underlying stream whenever it fills up, on {@link #flush()}, and on {@link #close()}. The
+ * raw {@code write(...)} methods first drain that buffer and then write straight through.
+ *
+ * @author Lane Schwartz
+ * @see ObjectOutputStream
+ * @see Externalizable
+ */
+public class BinaryOut implements DataOutput, ObjectOutput, Flushable, Closeable {
+
+  public final int BITS_PER_BYTE = 8;
+
+  public final int BOOLEAN_SIZE = 1;
+  public final int BYTE_SIZE = 1;
+  public final int CHAR_SIZE = 2;
+  public final int SHORT_SIZE = 2;
+  public final int FLOAT_SIZE = 4;
+  public final int INT_SIZE = 4;
+  public final int DOUBLE_SIZE = 8;
+  public final int LONG_SIZE = 8;
+
+  /** Capacity of the staging buffer, in bytes. */
+  private static final int BUFFER_SIZE = 1024;
+
+  /** Largest number of bytes a single char occupies in modified UTF-8. */
+  private static final int MAX_UTF_BYTES_PER_CHAR = 3;
+
+  private final OutputStream out;
+
+  /** Staging buffer; bytes {@code [0, bufferPosition)} are pending. */
+  private final byte[] buffer;
+  private int bufferPosition;
+
+  /** When false, {@link #writeObject(Object)} silently does nothing. */
+  private final boolean writeObjects;
+
+  /**
+   * Creates a writer over the given file with object writing enabled.
+   *
+   * @param file the file to write to
+   * @throws FileNotFoundException if the file cannot be opened for writing
+   * @throws IOException declared for caller compatibility
+   */
+  public BinaryOut(File file) throws FileNotFoundException, IOException {
+    this(new FileOutputStream(file), true);
+  }
+
+  /**
+   * Creates a writer over the named file with object writing enabled.
+   *
+   * @param filename path of the file to write to
+   * @throws FileNotFoundException if the file cannot be opened for writing
+   * @throws IOException declared for caller compatibility
+   */
+  public BinaryOut(String filename) throws FileNotFoundException, IOException {
+    this(new File(filename));
+  }
+
+  /**
+   * Creates a writer over an arbitrary stream.
+   *
+   * @param out the stream that receives the bytes
+   * @param writeObjects whether {@link #writeObject(Object)} should emit anything
+   * @throws IOException declared for caller compatibility
+   */
+  public BinaryOut(OutputStream out, boolean writeObjects) throws IOException {
+    this.out = out;
+    this.buffer = new byte[BUFFER_SIZE];
+    this.bufferPosition = 0;
+    this.writeObjects = writeObjects;
+  }
+
+  /**
+   * Flushes pending bytes and closes the underlying stream. The stream is closed even if the
+   * flush fails.
+   *
+   * @throws IOException if flushing or closing fails
+   */
+  @Override
+  public void close() throws IOException {
+    try {
+      flush();
+    } finally {
+      out.close();
+    }
+  }
+
+  /**
+   * Ensures that the buffer has at least enough space available to hold <code>size</code>
+   * additional bytes.
+   * <p>
+   * If necessary, the current contents of the buffer will be written to the underlying output
+   * stream.
+   *
+   * @param size the number of bytes about to be appended to the buffer
+   * @throws IOException if writing the buffer to the underlying stream fails
+   */
+  protected void prepareBuffer(int size) throws IOException {
+    if (bufferPosition > 0 && bufferPosition >= BUFFER_SIZE - size) {
+      writeBuffer();
+    }
+  }
+
+  /**
+   * Hands all pending buffered bytes to the underlying stream and empties the buffer.
+   *
+   * @throws IOException if the underlying write fails
+   */
+  protected void writeBuffer() throws IOException {
+    if (bufferPosition > 0) {
+      out.write(buffer, 0, bufferPosition);
+      bufferPosition = 0;
+    }
+  }
+
+  /**
+   * Writes pending buffered bytes and flushes the underlying stream.
+   *
+   * @throws IOException if writing or flushing fails
+   */
+  @Override
+  public void flush() throws IOException {
+    writeBuffer();
+    out.flush();
+  }
+
+  /** Drains the buffer, then writes one byte directly to the underlying stream. */
+  @Override
+  public void write(int b) throws IOException {
+    writeBuffer();
+    out.write(b);
+  }
+
+  /** Drains the buffer, then writes the array directly to the underlying stream. */
+  @Override
+  public void write(byte[] b) throws IOException {
+    writeBuffer();
+    out.write(b);
+  }
+
+  /** Drains the buffer, then writes the array slice directly to the underlying stream. */
+  @Override
+  public void write(byte[] b, int off, int len) throws IOException {
+    writeBuffer();
+    out.write(b, off, len);
+  }
+
+  /**
+   * Writes an object, if object writing was enabled at construction time.
+   * <ul>
+   * <li>{@code null} is written as the single byte {@link ObjectStreamConstants#TC_NULL}.</li>
+   * <li>A {@link String} is written in modified UTF-8; its length header is a {@code short}
+   * unless the encoding needs more than {@link Short#MAX_VALUE} bytes, in which case the header
+   * is a {@code long}.</li>
+   * <li>An {@link Externalizable} writes itself via {@code writeExternal}.</li>
+   * </ul>
+   *
+   * @param obj the object to write
+   * @throws IOException if writing fails
+   * @throws RuntimeException if the object is of any other type
+   */
+  @Override
+  public void writeObject(Object obj) throws IOException {
+    if (!writeObjects) {
+      return;
+    }
+
+    if (obj == null) {
+      write(ObjectStreamConstants.TC_NULL);
+    } else if (obj instanceof String) {
+      String s = (String) obj;
+      long bytesRequired = utfBytesRequired(s);
+      boolean forceLongHeader = (bytesRequired > Short.MAX_VALUE);
+      writeUTF(s, bytesRequired, forceLongHeader);
+    } else if (obj instanceof Externalizable) {
+      ((Externalizable) obj).writeExternal(this);
+    } else {
+      throw new RuntimeException("Object is not Externalizable: " + obj.toString());
+    }
+  }
+
+  /** Writes {@code 0x01} for true and {@code 0x00} for false. */
+  @Override
+  public void writeBoolean(boolean v) throws IOException {
+    prepareBuffer(BOOLEAN_SIZE);
+    buffer[bufferPosition] = (byte) (v ? 0x01 : 0x00);
+    bufferPosition += BOOLEAN_SIZE;
+  }
+
+  /** Writes the low-order byte of {@code v}. */
+  @Override
+  public void writeByte(int v) throws IOException {
+    prepareBuffer(BYTE_SIZE);
+    buffer[bufferPosition] = (byte) v;
+    bufferPosition += BYTE_SIZE;
+  }
+
+  /**
+   * Writes the low-order byte of every character of {@code s}, in order.
+   *
+   * @param s the string to write
+   * @throws IOException if writing fails
+   */
+  @Override
+  public void writeBytes(String s) throws IOException {
+    for (int i = 0, n = s.length(); i < n; i++) {
+      prepareBuffer(BYTE_SIZE);
+      buffer[bufferPosition] = (byte) s.charAt(i);
+      bufferPosition += BYTE_SIZE;
+    }
+  }
+
+  /** Writes the low-order two bytes of {@code v}, high byte first. */
+  @Override
+  public void writeChar(int v) throws IOException {
+    prepareBuffer(CHAR_SIZE);
+    putBigEndian(v, CHAR_SIZE);
+  }
+
+  /**
+   * Writes every character of {@code s} as two bytes, high byte first.
+   *
+   * @param s the string to write
+   * @throws IOException if writing fails
+   */
+  @Override
+  public void writeChars(String s) throws IOException {
+    for (int i = 0, n = s.length(); i < n; i++) {
+      prepareBuffer(CHAR_SIZE);
+      putBigEndian(s.charAt(i), CHAR_SIZE);
+    }
+  }
+
+  /** Writes the 8-byte {@link Double#doubleToLongBits} form of {@code v}, high byte first. */
+  @Override
+  public void writeDouble(double v) throws IOException {
+    prepareBuffer(DOUBLE_SIZE);
+    putBigEndian(Double.doubleToLongBits(v), DOUBLE_SIZE);
+  }
+
+  /** Writes the 4-byte {@link Float#floatToIntBits} form of {@code v}, high byte first. */
+  @Override
+  public void writeFloat(float v) throws IOException {
+    prepareBuffer(FLOAT_SIZE);
+    putBigEndian(Float.floatToIntBits(v), FLOAT_SIZE);
+  }
+
+  /** Writes {@code v} as four bytes, high byte first. */
+  @Override
+  public void writeInt(int v) throws IOException {
+    prepareBuffer(INT_SIZE);
+    putBigEndian(v, INT_SIZE);
+  }
+
+  /** Writes {@code v} as eight bytes, high byte first. */
+  @Override
+  public void writeLong(long v) throws IOException {
+    prepareBuffer(LONG_SIZE);
+    putBigEndian(v, LONG_SIZE);
+  }
+
+  /** Writes the low-order two bytes of {@code v}, high byte first. */
+  @Override
+  public void writeShort(int v) throws IOException {
+    prepareBuffer(SHORT_SIZE);
+    putBigEndian(v, SHORT_SIZE);
+  }
+
+  /**
+   * Writes a string in modified UTF-8, preceded by a two-byte length header.
+   *
+   * @param str the string to write
+   * @throws UTFDataFormatException if the encoding needs more than {@link Short#MAX_VALUE} bytes
+   * @throws IOException if writing fails
+   */
+  @Override
+  public void writeUTF(String str) throws IOException {
+    writeUTF(str, utfBytesRequired(str), false);
+  }
+
+  /**
+   * Appends the {@code numBytes} low-order bytes of {@code value} to the buffer, most-significant
+   * byte first, and advances the buffer position. The caller must already have reserved room
+   * with {@link #prepareBuffer(int)}.
+   */
+  private void putBigEndian(long value, int numBytes) {
+    for (int shift = (numBytes - 1) * BITS_PER_BYTE; shift >= 0; shift -= BITS_PER_BYTE) {
+      buffer[bufferPosition++] = (byte) (value >>> shift);
+    }
+  }
+
+  /**
+   * Returns how many bytes one char occupies in modified UTF-8: one for U+0001..U+007F, two for
+   * U+0000 and U+0080..U+07FF, and three for everything else.
+   */
+  private static int utfLength(char c) {
+    if (c >= 0x0001 && c <= 0x007F) {
+      return 1;
+    }
+    // U+0000 deliberately falls through to the two-byte form.
+    return (c < 0x0800) ? 2 : MAX_UTF_BYTES_PER_CHAR;
+  }
+
+  /** Returns the number of bytes needed to encode {@code str} in modified UTF-8. */
+  private long utfBytesRequired(String str) {
+    long bytesRequired = 0;
+    for (int i = 0, n = str.length(); i < n; i++) {
+      bytesRequired += utfLength(str.charAt(i));
+    }
+    return bytesRequired;
+  }
+
+  /**
+   * Writes a length header followed by the modified UTF-8 encoding of {@code str}.
+   *
+   * @param str the string to write
+   * @param bytesRequired the encoded length of {@code str}, as computed by
+   *          {@link #utfBytesRequired(String)}
+   * @param forceLongHeader if true the header is an 8-byte long; otherwise it is a 2-byte short
+   *          and strings needing more than {@link Short#MAX_VALUE} bytes are rejected
+   * @throws UTFDataFormatException if a short header was requested but the string is too long
+   * @throws IOException if writing fails
+   */
+  private void writeUTF(String str, long bytesRequired, boolean forceLongHeader) throws IOException {
+
+    if (forceLongHeader) {
+      writeLong(bytesRequired);
+    } else if (bytesRequired > Short.MAX_VALUE) {
+      // Because the size of the string is encoded as a short,
+      // only strings that require no more than Short.MAX_VALUE bytes can be encoded.
+      throw new UTFDataFormatException(
+          "Unable to successfully encode strings that require more than " + Short.MAX_VALUE
+              + " bytes. Encoding the provided string would require " + bytesRequired + " bytes.");
+    } else {
+      writeShort((short) bytesRequired);
+    }
+
+    // Each char is classified as it is written, so the encoding never depends on state left
+    // behind by an earlier pass and works for strings of any length.
+    for (int i = 0, n = str.length(); i < n; i++) {
+      char c = str.charAt(i);
+      int length = utfLength(c);
+      prepareBuffer(length);
+
+      if (length == 1) {
+        buffer[bufferPosition++] = (byte) c;
+      } else if (length == 2) {
+        buffer[bufferPosition++] = (byte) (0xc0 | (0x1f & (c >> 6)));
+        buffer[bufferPosition++] = (byte) (0x80 | (0x3f & c));
+      } else {
+        buffer[bufferPosition++] = (byte) (0xe0 | (0x0f & (c >> 12)));
+        buffer[bufferPosition++] = (byte) (0x80 | (0x3f & (c >> 6)));
+        buffer[bufferPosition++] = (byte) (0x80 | (0x3f & c));
+      }
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/io/IndexedReader.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/io/IndexedReader.java b/joshua-core/src/main/java/org/apache/joshua/util/io/IndexedReader.java
new file mode 100644
index 0000000..f357e55
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/IndexedReader.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.io;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+
+/**
+ * Decorates a {@link Reader} with a running count of the elements ("lines") it has delivered,
+ * and uses that count to give I/O errors a location.
+ *
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $
+ */
+public class IndexedReader<E> implements Reader<E> {
+
+  /** A name for the type of elements the reader produces. */
+  private final String elementName;
+
+  /** The underlying reader. */
+  private final Reader<E> reader;
+
+  /** The number of elements the reader has delivered so far. */
+  private int lineNumber;
+
+  /**
+   * @param elementName name of the kind of element being read; used in error messages
+   * @param reader the reader to delegate to
+   */
+  public IndexedReader(String elementName, Reader<E> reader) {
+    this.elementName = elementName;
+    this.reader = reader;
+    this.lineNumber = 0;
+  }
+
+  /**
+   * Return the number of elements delivered so far.
+   * @return integer representing the number of elements delivered so far
+   */
+  public int index() {
+    return this.lineNumber;
+  }
+
+
+  /**
+   * Wrap an IOException's message with the index when it occurred.
+   * @param oldError the old {@link java.io.IOException} we wish to wrap
+   * @return the new wrapped {@link java.io.IOException}, with {@code oldError} as its cause
+   */
+  public IOException wrapIOException(IOException oldError) {
+    String message =
+        "At " + this.elementName + " " + this.lineNumber + ": " + oldError.getMessage();
+    return new IOException(message, oldError);
+  }
+
+  // ===============================================================
+  // Reader
+  // ===============================================================
+
+  /**
+   * Delegated to the underlying reader.
+   * @return true if the reader is ready
+   * @throws IOException if there is an error determining readiness
+   */
+  @Override
+  public boolean ready() throws IOException {
+    try {
+      return this.reader.ready();
+    } catch (IOException cause) {
+      throw wrapIOException(cause);
+    }
+  }
+
+
+  /**
+   * Delegated to the underlying reader. Note that we do not have a <code>finalize()</code> method;
+   * however, when we fall out of scope, the underlying reader will too, so its finalizer may be
+   * called. For correctness, be sure to manually close all readers.
+   */
+  public void close() throws IOException {
+    try {
+      this.reader.close();
+    } catch (IOException cause) {
+      throw wrapIOException(cause);
+    }
+  }
+
+
+  /**
+   * Delegated to the underlying reader. The element count is advanced after every call that
+   * completes without an exception, whatever value the underlying reader returned.
+   */
+  public E readLine() throws IOException {
+    try {
+      E line = this.reader.readLine();
+      this.lineNumber++;
+      return line;
+    } catch (IOException cause) {
+      throw wrapIOException(cause);
+    }
+  }
+
+
+  // ===============================================================
+  // Iterable -- because sometimes Java can be very stupid
+  // ===============================================================
+
+  /** Return self as an iterator. */
+  public Iterator<E> iterator() {
+    return this;
+  }
+
+
+  // ===============================================================
+  // Iterator
+  // ===============================================================
+
+  /** Delegated to the underlying reader. */
+  public boolean hasNext() {
+    return this.reader.hasNext();
+  }
+
+
+  /**
+   * Delegated to the underlying reader. Exceptions are let through unwrapped; on success the
+   * element count is advanced.
+   */
+  public E next() throws NoSuchElementException {
+    E element = this.reader.next();
+    this.lineNumber++;
+    return element;
+  }
+
+
+  /**
+   * If the underlying reader supports removal, then so do we. Note that the {@link #index()} method
+   * returns the number of elements delivered to the client, so removing an element from the
+   * underlying collection does not affect that number.
+   */
+  public void remove() throws UnsupportedOperationException {
+    this.reader.remove();
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/io/LineReader.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/io/LineReader.java b/joshua-core/src/main/java/org/apache/joshua/util/io/LineReader.java
new file mode 100644
index 0000000..5122994
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/LineReader.java
@@ -0,0 +1,368 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.io;
+
+import java.io.BufferedReader;
+import java.io.FileDescriptor;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.File;
+import java.nio.charset.Charset;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.joshua.decoder.Decoder;
+
+/**
+ * This class provides an Iterator interface to a BufferedReader. This covers the most common
+ * use-cases for reading from files without ugly code to check whether we got a line or not.
+ *
+ * <p>Lines are pulled one at a time into a single-line lookahead buffer. Because the
+ * {@link java.util.Iterator} methods cannot throw checked exceptions, an IOException hit while
+ * iterating is cached and rethrown later by {@link #readLine()} or {@link #close()}.
+ *
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @author Matt Post post@cs.jhu.edu
+ */
+public class LineReader implements Reader<String> {
+
+  /*
+   * Note: charset name is case-agnostic "UTF-8" is the canonical name "UTF8", "unicode-1-1-utf-8"
+   * are aliases Java doesn't distinguish utf8 vs UTF-8 like Perl does
+   */
+  private static final Charset FILE_ENCODING = Charset.forName("UTF-8");
+
+  /*
+   * The reader and its underlying input stream. We need to keep a hold of the underlying
+   * input stream so that we can query how many raw bytes it's read (for a generic progress
+   * meter that works across GZIP'ed and plain text files).
+   */
+  private BufferedReader reader;
+  private ProgressInputStream rawStream;
+
+  /** One-line lookahead filled by {@link #hasNext()}; null when nothing is buffered. */
+  private String buffer;
+
+  /** I/O error caught during iteration, waiting to be rethrown by readLine() or close(). */
+  private IOException error;
+
+  /** Number of lines handed out so far. */
+  private int lineno = 0;
+
+  /** Whether a textual progress bar is written to STDERR while iterating. */
+  private boolean displayProgress = false;
+
+  /** Last percentage that has already been drawn on the progress bar. */
+  private int progress = 0;
+
+  // ===============================================================
+  // Constructors and destructors
+  // ===============================================================
+
+  /**
+   * Opens a file for iterating line by line. The special "-" filename can be used to specify
+   * STDIN. GZIP'd files are tested for automatically.
+   *
+   * @param filename the file to be opened ("-" for STDIN)
+   * @throws IOException if there is an error reading the input file
+   */
+  public LineReader(String filename) throws IOException {
+    displayProgress = (Decoder.VERBOSE >= 1);
+
+    InputStream stream;
+    if (filename.equals("-")) {
+      // STDIN has no known length, so there is no raw stream to measure progress on.
+      rawStream = null;
+      stream = new FileInputStream(FileDescriptor.in);
+    } else {
+      long totalBytes = new File(filename).length();
+      rawStream = new ProgressInputStream(new FileInputStream(filename), totalBytes);
+
+      try {
+        stream = new GZIPInputStream(rawStream);
+      } catch (IOException notGzip) {
+        // Not a GZIP file. The header probe consumed bytes, so reopen the file from the start.
+        rawStream.close();
+        stream = rawStream = new ProgressInputStream(new FileInputStream(filename), totalBytes);
+      }
+    }
+
+    this.reader = new BufferedReader(new InputStreamReader(stream, FILE_ENCODING));
+  }
+
+  /**
+   * Opens a file like {@link #LineReader(String)}, additionally letting the caller switch the
+   * progress bar off.
+   *
+   * @param filename the file to be opened ("-" for STDIN)
+   * @param show_progress false to suppress the progress bar even when the decoder is verbose
+   * @throws IOException if there is an error reading the input file
+   */
+  public LineReader(String filename, boolean show_progress) throws IOException {
+    this(filename);
+    displayProgress = (Decoder.VERBOSE >= 1 && show_progress);
+  }
+
+
+  /**
+   * Wraps an InputStream for iterating line by line. Stream encoding is assumed to be UTF-8.
+   * No progress is displayed for readers created this way.
+   *
+   * @param in an {@link java.io.InputStream} to wrap and iterate over line by line
+   */
+  public LineReader(InputStream in) {
+    this.reader = new BufferedReader(new InputStreamReader(in, FILE_ENCODING));
+    displayProgress = false;
+  }
+
+  /**
+   * Chain to the underlying {@link ProgressInputStream}.
+   *
+   * @return an integer from 0..100, indicating how much of the file has been read; always 0 when
+   *         there is no underlying progress stream (STDIN or a wrapped InputStream)
+   */
+  public int progress() {
+    return rawStream == null ? 0 : rawStream.progress();
+  }
+
+  /**
+   * This method will close the file handle, and will raise any exceptions that occurred during
+   * iteration. The method is idempotent, and all calls after the first are no-ops (unless the
+   * thread was interrupted or killed). For correctness, you <b>must</b> call this method before the
+   * object falls out of scope.
+   *
+   * @throws IOException if there is an error closing the file handler, or if an error was cached
+   *         during iteration
+   */
+  @Override
+  public void close() throws IOException {
+
+    this.buffer = null; // Just in case it's a large string
+
+    if (null != this.reader) {
+      try {
+        // We assume the wrappers will percolate this down.
+        this.reader.close();
+
+      } catch (IOException e) {
+        // We need to trash our cached error for idempotence.
+        // Presumably the closing error is the more important
+        // one to throw.
+        this.error = null;
+        throw e;
+
+      } finally {
+        this.reader = null;
+      }
+    }
+
+    if (null != this.error) {
+      IOException e = this.error;
+      this.error = null;
+      throw e;
+    }
+  }
+
+
+  /**
+   * We attempt to avoid leaking file descriptors if you fail to call close before the object falls
+   * out of scope. However, the language spec makes <b>no guarantees</b> about timeliness of garbage
+   * collection. It is a bug to rely on this method to release the resources. Also, the garbage
+   * collector will discard any exceptions that have queued up, without notifying the application in
+   * any way.
+   *
+   * Having a finalizer means the JVM can't do "fast allocation" of LineReader objects (or
+   * subclasses). This isn't too important due to disk latency, but may be worth noting.
+   *
+   * @see <a
+   *      href="http://java2go.blogspot.com/2007/09/javaone-2007-performance-tips-2-finish.html">Performance
+   *      Tips</a>
+   * @see <a
+   *      href="http://www.javaworld.com/javaworld/jw-06-1998/jw-06-techniques.html?page=1">Techniques</a>
+   */
+  @Override
+  protected void finalize() throws Throwable {
+    try {
+      this.close();
+    } catch (IOException e) {
+      // Do nothing. The GC will discard the exception
+      // anyways, but it may cause us to linger on the heap.
+    } finally {
+      super.finalize();
+    }
+  }
+
+
+
+  // ===============================================================
+  // Reader
+  // ===============================================================
+
+  /**
+   * Determine if the reader is ready to read a line.
+   *
+   * @return true if a line is already buffered or the underlying reader reports it is ready;
+   *         false once the reader has been closed
+   * @throws IOException if the underlying reader fails while being queried
+   */
+  @Override
+  public boolean ready() throws IOException {
+    if (null != this.buffer) {
+      // A line fetched by hasNext() can be delivered without touching the stream.
+      return true;
+    }
+    return null != this.reader && this.reader.ready();
+  }
+
+
+  /**
+   * This method is like next() except that it throws the IOException directly. If there are no
+   * lines to be read then null is returned.
+   *
+   * @return the next line, or null if there are no more lines
+   * @throws IOException if an error was encountered while reading
+   */
+  @Override
+  public String readLine() throws IOException {
+    if (this.hasNext()) {
+      String line = this.buffer;
+      this.buffer = null;
+      // Keep lineno() accurate no matter which of readLine()/next() delivered the line.
+      this.lineno++;
+      return line;
+    }
+
+    if (null != this.error) {
+      IOException e = this.error;
+      this.error = null;
+      throw e;
+    }
+    return null;
+  }
+
+
+  // ===============================================================
+  // Iterable -- because sometimes Java can be very stupid
+  // ===============================================================
+
+  /** Return self as an iterator. */
+  @Override
+  public Iterator<String> iterator() {
+    return this;
+  }
+
+
+  // ===============================================================
+  // Iterator
+  // ===============================================================
+
+  /**
+   * Returns <code>true</code> if the iteration has more elements. (In other words, returns
+   * <code>true</code> if <code>next</code> would return an element rather than throwing an
+   * exception.) Returns <code>false</code> after an I/O error has been cached or after the reader
+   * has been closed.
+   */
+  @Override
+  public boolean hasNext() {
+    if (null != this.buffer) {
+      return true;
+    }
+    if (null != this.error || null == this.reader) {
+      // Either a failure is pending, or close() has already released the reader.
+      return false;
+    }
+
+    // We're not allowed to throw IOException from within Iterator
+    try {
+      this.buffer = this.reader.readLine();
+    } catch (IOException e) {
+      this.buffer = null;
+      this.error = e;
+      return false;
+    }
+    return (null != this.buffer);
+  }
+
+
+  /**
+   * Return the next line of the file. If an error is encountered, NoSuchElementException is thrown.
+   * The actual IOException encountered will be thrown later, when the LineReader is closed. Also if
+   * there is no line to be read then NoSuchElementException is thrown.
+   */
+  @Override
+  public String next() throws NoSuchElementException {
+    if (!this.hasNext()) {
+      throw new NoSuchElementException();
+    }
+
+    if (displayProgress) {
+      reportProgress();
+    }
+
+    String line = this.buffer;
+    this.lineno++;
+    this.buffer = null;
+    return line;
+  }
+
+  /**
+   * Advances the progress bar on STDERR from the last drawn percentage up to the current one.
+   * The bar reads "....10.......20 ... 90......100%": a dot per percent, with each multiple of
+   * ten spelled out in place of the dots it covers.
+   */
+  private void reportProgress() {
+    // Never draw past 100, even if the stream reports more bytes than the initial file size.
+    int newProgress = Math.min(100, (reader != null) ? progress() : 100);
+    if (newProgress <= progress) {
+      return;
+    }
+
+    for (int i = progress + 1; i <= newProgress; i++) {
+      if (i == 97) {
+        System.err.print("1");
+      } else if (i == 98) {
+        System.err.print("0");
+      } else if (i == 99) {
+        System.err.print("0");
+      } else if (i == 100) {
+        System.err.println("%");
+      } else if (i % 10 == 0) {
+        System.err.print(String.format("%d", i));
+        System.err.flush();
+      } else if ((i - 1) % 10 == 0) {
+        // skip at 11 since 10, 20, etc take two digits
+        continue;
+      } else {
+        System.err.print(".");
+        System.err.flush();
+      }
+    }
+    progress = newProgress;
+  }
+
+  /**
+   * Get the line number of the last line that was returned.
+   *
+   * @return the number of lines returned so far
+   */
+  public int lineno() {
+    return this.lineno;
+  }
+
+  /** Unsupported. */
+  @Override
+  public void remove() throws UnsupportedOperationException {
+    throw new UnsupportedOperationException();
+  }
+
+
+  /**
+   * Iterates over all lines, ignoring their contents, and returns the count of lines. If some lines
+   * have already been read, this will return the count of remaining lines. Because no lines will
+   * remain after calling this method, we implicitly call close.
+   *
+   * @return the number of lines read
+   * @throws IOException if there is an error reading lines
+   */
+  public int countLines() throws IOException {
+    int lines = 0;
+
+    while (this.hasNext()) {
+      this.next();
+      lines++;
+    }
+    this.close();
+
+    return lines;
+  }
+
+  /**
+   * Example usage code.
+   * @param args an input file
+   */
+  public static void main(String[] args) {
+    if (1 != args.length) {
+      System.out.println("Usage: java LineReader filename");
+      System.exit(1);
+    }
+
+    try {
+
+      LineReader in = new LineReader(args[0]);
+      try {
+        for (String line : in) {
+
+          System.out.println(line);
+
+        }
+      } finally {
+        in.close();
+      }
+
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/io/NullReader.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/io/NullReader.java b/joshua-core/src/main/java/org/apache/joshua/util/io/NullReader.java
new file mode 100644
index 0000000..f833f00
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/NullReader.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.io;
+
+import java.io.IOException;
+
+import org.apache.joshua.util.NullIterator;
+
+
+/**
+ * This class provides a null-object Reader. This is primarily useful for when you may or may not
+ * have a {@link Reader}, and you don't want to check for null all the time. All operations are
+ * no-ops.
+ *
+ * @param <E> the element type this reader nominally produces (no element is ever produced)
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $
+ */
+public class NullReader<E> extends NullIterator<E> implements Reader<E> {
+
+ // ===============================================================
+ // Constructors and destructors
+ // ===============================================================
+
+ // TODO: use static factory method and singleton?
+ /** Creates a reader that holds no state and no resources. */
+ public NullReader() {}
+
+ /** A no-op: there is nothing to release, so this never actually throws. */
+ public void close() throws IOException {}
+
+
+ // ===============================================================
+ // Reader
+ // ===============================================================
+
+ /**
+ * Always returns true. Is this correct? What are the semantics of ready()? We're always capable
+ * of delivering nothing, but we're never capable of delivering anything...
+ *
+ * @return true, unconditionally
+ */
+ public boolean ready() {
+ return true;
+ }
+
+ /**
+ * Always returns null.
+ *
+ * @return null, unconditionally
+ */
+ public E readLine() throws IOException {
+ return null;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/io/ProgressInputStream.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/io/ProgressInputStream.java b/joshua-core/src/main/java/org/apache/joshua/util/io/ProgressInputStream.java
new file mode 100644
index 0000000..075c0b3
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/ProgressInputStream.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.io;
+
+import java.io.FilterInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Generic progress meter for reading files (compressed or not). Pass it the raw input file stream
+ * and it will keep track for you: every byte read or skipped through this wrapper is counted, and
+ * {@link #progress()} reports the count as a percentage of the expected total.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ */
+public class ProgressInputStream extends FilterInputStream {
+
+  /** Expected length of the wrapped stream in bytes; non-positive when unknown. */
+  private final long totalBytes;
+
+  /** Number of bytes consumed (read or skipped) so far. */
+  private long bytesRead = 0;
+
+  /** Value of {@link #bytesRead} at the most recent {@link #mark(int)}; 0 if never marked. */
+  private long markedBytesRead = 0;
+
+  /**
+   * @param in the raw stream to measure
+   * @param totalBytes the number of bytes the stream is expected to deliver
+   */
+  protected ProgressInputStream(InputStream in, long totalBytes) {
+    super(in);
+
+    this.totalBytes = totalBytes;
+  }
+
+  @Override
+  public int read() throws IOException {
+    int value = super.read();
+    // -1 signals end of stream; no byte was consumed in that case.
+    if (value >= 0) {
+      bytesRead += 1;
+    }
+    return value;
+  }
+
+  @Override
+  public int read(byte[] b) throws IOException {
+    // Go through the three-argument overload so that the bytes are counted exactly once
+    // (FilterInputStream.read(byte[]) itself dispatches to read(byte[], int, int)).
+    return read(b, 0, b.length);
+  }
+
+  @Override
+  public int read(byte[] b, int off, int len) throws IOException {
+    int count = super.read(b, off, len);
+    // count is -1 at end of stream and must not be subtracted from the tally.
+    if (count > 0) {
+      bytesRead += count;
+    }
+    return count;
+  }
+
+  @Override
+  public void mark(int readlimit) {
+    super.mark(readlimit);
+    markedBytesRead = bytesRead;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    // The stream is back at the marked position (the start, if mark was never called).
+    bytesRead = markedBytesRead;
+  }
+
+  @Override
+  public long skip(long n) throws IOException {
+    long skipped = super.skip(n);
+    if (skipped > 0) {
+      bytesRead += skipped;
+    }
+    return skipped;
+  }
+
+  /**
+   * @return progress through the file, as an integer (0..100). Returns 0 when the total size is
+   *         unknown or zero.
+   */
+  public int progress() {
+    if (totalBytes <= 0) {
+      return 0;
+    }
+    int percent = (int) (100.0 * bytesRead / totalBytes);
+    // Clamp: the source may deliver more bytes than the size measured when it was opened.
+    return Math.max(0, Math.min(100, percent));
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/io/Reader.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/io/Reader.java b/joshua-core/src/main/java/org/apache/joshua/util/io/Reader.java
new file mode 100644
index 0000000..cab6d74
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/Reader.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.io;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+/**
+ * Common interface for Reader type objects. A reader is both an {@link Iterable} and its own
+ * {@link Iterator}, and it is {@link AutoCloseable} so that it can be managed by a
+ * try-with-resources statement.
+ *
+ * @param <E> the type of the "lines" this reader produces
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $
+ */
+public interface Reader<E> extends Iterable<E>, Iterator<E>, AutoCloseable {
+
+  /**
+   * Close the reader, freeing all resources.
+   * @throws IOException if there is an error closing the reader instance
+   */
+  @Override
+  void close() throws IOException;
+
+  /**
+   * Determine if the reader is ready to read a line.
+   * @return true if it is ready
+   * @throws IOException if there is an error whilst determining if the reader is ready
+   */
+  boolean ready() throws IOException;
+
+  /**
+   * Read a "line" and return an object representing it.
+   * @return an object representing a single line
+   * @throws IOException if there is an error reading lines
+   */
+  E readLine() throws IOException;
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/io/package-info.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/io/package-info.java b/joshua-core/src/main/java/org/apache/joshua/util/io/package-info.java
new file mode 100644
index 0000000..d7ea475
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/**
+ * Provides common utility classes for IO.
+ */
+package org.apache.joshua.util.io;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/util/package-info.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/util/package-info.java b/joshua-core/src/main/java/org/apache/joshua/util/package-info.java
new file mode 100644
index 0000000..2dedb37
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/**
+ * Provides common utility classes.
+ */
+package org.apache.joshua.util;