You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/08/30 21:04:52 UTC
[07/17] incubator-joshua git commit: Merge branch 'master' into 7-with-master

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b0b70627/joshua-core/src/main/java/org/apache/joshua/util/encoding/FeatureTypeAnalyzer.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/util/encoding/FeatureTypeAnalyzer.java
index 0aa41af,0000000..5226b0a
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/util/encoding/FeatureTypeAnalyzer.java
+++ b/joshua-core/src/main/java/org/apache/joshua/util/encoding/FeatureTypeAnalyzer.java
@@@ -1,256 -1,0 +1,264 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.util.encoding;
 +
 +import java.io.BufferedOutputStream;
 +import java.io.DataOutputStream;
 +import java.io.File;
 +import java.io.FileOutputStream;
 +import java.io.IOException;
 +import java.util.ArrayList;
 +import java.util.Arrays;
 +import java.util.HashMap;
 +import java.util.List;
 +import java.util.Map;
 +
 +import org.apache.joshua.corpus.Vocabulary;
 +import org.apache.joshua.decoder.ff.FeatureMap;
 +import org.apache.joshua.util.io.LineReader;
 +import org.slf4j.Logger;
 +import org.slf4j.LoggerFactory;
 +
 +public class FeatureTypeAnalyzer {
 +
 +  private static final Logger LOG = LoggerFactory.getLogger(FeatureTypeAnalyzer.class);
 +
 +  private ArrayList<FeatureType> types;
 +
 +  private Map<Integer, Integer> featureToType;
 +
 +  private Map<Integer, Integer> featureIdMap;
 +
 +  // Is the feature setup labeled?
 +  private boolean labeled;
 +
 +  // Is the encoder configuration open for new features (that are not assumed boolean)?
 +  private boolean open;
 +
 +  public FeatureTypeAnalyzer() {
 +    this(false);
 +  }
 +
 +  public FeatureTypeAnalyzer(boolean open) {
 +    this.open = open;
 +    this.types = new ArrayList<FeatureType>();
 +    this.featureToType = new HashMap<Integer, Integer>();
 +    this.featureIdMap = new HashMap<Integer, Integer>();
 +  }
 +
 +  public void readConfig(String config_filename) throws IOException {
-     LineReader reader = new LineReader(config_filename);
-     while (reader.hasNext()) {
-       // Clean up line, chop comments off and skip if the result is empty.
-       String line = reader.next().trim();
-       if (line.indexOf('#') != -1)
-         line = line.substring(0, line.indexOf('#'));
-       if (line.isEmpty())
-         continue;
-       String[] fields = line.split("[\\s]+");
- 
-       if ("encoder".equals(fields[0])) {
-         // Adding an encoder to the mix.
-         if (fields.length < 3) {
-           throw new RuntimeException("Incomplete encoder line in config.");
++    try(LineReader reader = new LineReader(config_filename);) {
++      while (reader.hasNext()) {
++        // Clean up line, chop comments off and skip if the result is empty.
++        String line = reader.next().trim();
++        if (line.indexOf('#') != -1)
++          line = line.substring(0, line.indexOf('#'));
++        if (line.isEmpty())
++          continue;
++        String[] fields = line.split("[\\s]+");
++
++        if ("encoder".equals(fields[0])) {
++          // Adding an encoder to the mix.
++          if (fields.length < 3) {
++            throw new RuntimeException("Incomplete encoder line in config.");
++          }
++          String encoder_key = fields[1];
++          List<Integer> feature_ids = new ArrayList<Integer>();
++          for (int i = 2; i < fields.length; i++)
++            feature_ids.add(Vocabulary.id(fields[i]));
++          addFeatures(encoder_key, feature_ids);
 +        }
 +        String encoder_key = fields[1];
-         ArrayList<Integer> feature_ids = new ArrayList<Integer>();
++        List<Integer> feature_ids = new ArrayList<Integer>();
 +        for (int i = 2; i < fields.length; i++)
 +          feature_ids.add(FeatureMap.hashFeature(fields[i]));
 +        addFeatures(encoder_key, feature_ids);
 +      }
 +    }
 +  }
 +
 +  public void addFeatures(String encoder_key, List<Integer> feature_ids) {
 +    int index = addType(encoder_key);
 +    for (int feature_id : feature_ids)
 +      featureToType.put(feature_id, index);
 +  }
 +
 +  private int addType(String encoder_key) {
 +    FeatureType ft = new FeatureType(encoder_key);
 +    int index = types.indexOf(ft);
 +    if (index < 0) {
 +      types.add(ft);
 +      return types.size() - 1;
 +    }
 +    return index;
 +  }
 +
 +  private int addType() {
 +    types.add(new FeatureType());
 +    return types.size() - 1;
 +  }
 +
 +  public void observe(int feature_id, float value) {
 +    Integer type_id = featureToType.get(feature_id);
 +    if (type_id == null && open) {
 +      type_id = addType();
 +      featureToType.put(feature_id, type_id);
 +    }
 +    if (type_id != null)
 +      types.get(type_id).observe(value);
 +  }
 +
 +  // Inspects the collected histograms, inferring actual type of feature. Then replaces the
 +  // analyzer, if present, with the most compact applicable type.
 +  public void inferTypes(boolean labeled) {
 +    for (FeatureType ft : types) {
 +      ft.inferUncompressedType();
 +    }
 +    if (LOG.isInfoEnabled()) {
 +      for (int id : featureToType.keySet()) {
 +        LOG.info("Type inferred: {} is {}", (labeled ? FeatureMap.getFeature(id) : "Feature " + id),
 +            types.get(featureToType.get(id)).encoder.getKey());
 +      }
 +    }
 +  }
 +
 +  public void buildFeatureMap() {
 +    int[] known_features = new int[featureToType.keySet().size()];
 +    int i = 0;
 +    for (int f : featureToType.keySet())
 +      known_features[i++] = f;
 +    Arrays.sort(known_features);
 +
 +    featureIdMap.clear();
 +    for (i = 0; i < known_features.length; ++i)
 +      featureIdMap.put(known_features[i], i);
 +  }
 +
 +  public int getRank(int feature_id) {
 +    return featureIdMap.get(feature_id);
 +  }
 +
 +  public IntEncoder getIdEncoder() {
 +    int num_features = featureIdMap.size();
 +    if (num_features <= Byte.MAX_VALUE)
 +      return PrimitiveIntEncoder.BYTE;
 +    else if (num_features <= Character.MAX_VALUE)
 +      return PrimitiveIntEncoder.CHAR;
 +    else
 +      return PrimitiveIntEncoder.INT;
 +  }
 +
 +  public void write(String file_name) throws IOException {
 +    File out_file = new File(file_name);
 +    BufferedOutputStream buf_stream = new BufferedOutputStream(new FileOutputStream(out_file));
 +    DataOutputStream out_stream = new DataOutputStream(buf_stream);
 +
 +    buildFeatureMap();
 +
 +    getIdEncoder().writeState(out_stream);
 +    out_stream.writeBoolean(labeled);
 +    out_stream.writeInt(types.size());
 +    for (int index = 0; index < types.size(); index++)
 +      types.get(index).encoder.writeState(out_stream);
 +
 +    out_stream.writeInt(featureToType.size());
 +    for (int feature_id : featureToType.keySet()) {
 +      if (labeled)
 +        out_stream.writeUTF(FeatureMap.getFeature(feature_id));
 +      else
 +        out_stream.writeInt(feature_id);
 +      out_stream.writeInt(featureIdMap.get(feature_id));
 +      out_stream.writeInt(featureToType.get(feature_id));
 +    }
 +    out_stream.close();
 +  }
 +
++  @Override
 +  public String toString() {
 +    StringBuilder sb = new StringBuilder();
 +    for (int feature_id : featureToType.keySet()) {
 +      sb.append(types.get(featureToType.get(feature_id)).analyzer.toString(FeatureMap.getFeature(feature_id)));
 +    }
 +    System.out.println(sb.toString());
 +    return sb.toString();
 +  }
 +
 +  public boolean isLabeled() {
 +    return labeled;
 +  }
 +
 +  public void setLabeled(boolean labeled) {
 +    this.labeled = labeled;
 +  }
 +
-   class FeatureType {
++  static class FeatureType {
 +    FloatEncoder encoder;
 +    Analyzer analyzer;
 +    int bits;
 +
 +    FeatureType() {
 +      encoder = null;
 +      analyzer = new Analyzer();
 +      bits = -1;
 +    }
 +
 +    FeatureType(String key) {
 +      // either throws or returns non-null
 +      FloatEncoder e = EncoderFactory.getFloatEncoder(key);
 +      encoder = e;
 +      analyzer = null;
 +      bits = -1;
 +    }
 +
 +    void inferUncompressedType() {
 +      if (encoder != null)
 +        return;
 +      encoder = analyzer.inferUncompressedType();
 +      analyzer = null;
 +    }
 +
 +    void inferType() {
 +      if (encoder != null)
 +        return;
 +      encoder = analyzer.inferType(bits);
 +      analyzer = null;
 +    }
 +
 +    void observe(float value) {
 +      if (analyzer != null)
 +        analyzer.add(value);
 +    }
 +
++    @Override
 +    public boolean equals(Object t) {
 +      if (t != null && t instanceof FeatureType) {
 +        FeatureType that = (FeatureType) t;
 +        if (this.encoder != null) {
 +          return this.encoder.equals(that.encoder);
 +        } else {
 +          if (that.encoder != null)
 +            return false;
 +          if (this.analyzer != null)
 +            return this.analyzer.equals(that.analyzer);
 +        }
 +      }
 +      return false;
 +    }
 +  }
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b0b70627/joshua-core/src/main/java/org/apache/joshua/util/io/ExistingUTF8EncodedTextFile.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/util/io/ExistingUTF8EncodedTextFile.java
index 0000000,0000000..42dd236
new file mode 100644
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/ExistingUTF8EncodedTextFile.java
@@@ -1,0 -1,0 +1,77 @@@
++/*
++ * Licensed to the Apache Software Foundation (ASF) under one
++ * or more contributor license agreements.  See the NOTICE file
++ * distributed with this work for additional information
++ * regarding copyright ownership.  The ASF licenses this file
++ * to you under the Apache License, Version 2.0 (the
++ * "License"); you may not use this file except in compliance
++ * with the License.  You may obtain a copy of the License at
++ *
++ *  http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing,
++ * software distributed under the License is distributed on an
++ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
++ * KIND, either express or implied.  See the License for the
++ * specific language governing permissions and limitations
++ * under the License.
++ */
++package org.apache.joshua.util.io;
++
++import java.io.FileNotFoundException;
++import java.io.IOException;
++import java.nio.charset.StandardCharsets;
++import java.nio.file.Files;
++import java.nio.file.Path;
++import java.nio.file.Paths;
++import java.util.function.Predicate;
++import java.util.stream.Stream;
++
++/**
++ * A class that represents a {@link StandardCharsets#UTF_8} text file. Will
++ * throw a {@link FileNotFoundException} upon instantiation if the underlying
++ * {@link Path}, or {@link String} representing a Path, is not found.
++ */
++public class ExistingUTF8EncodedTextFile {
++  private static final Predicate<String> emptyStringPredicate = s -> s.isEmpty();
++
++  private final Path p;
++
++  public ExistingUTF8EncodedTextFile(String pathStr) throws FileNotFoundException {
++    this(Paths.get(pathStr));
++  }
++
++  public ExistingUTF8EncodedTextFile(Path p) throws FileNotFoundException {
++    this.p = p;
++    if (!Files.exists(p))
++      throw new FileNotFoundException("Did not find the file at path: " + p.toString());
++  }
++
++  /**
++   * @return the {@link Path} representing this object
++   */
++  public Path getPath() {
++    return this.p;
++  }
++
++  /**
++   * @return the number of lines in the file represented by this object
++   * @throws IOException on inability to read file (maybe it's not a text file)
++   */
++  public int getNumberOfLines() throws IOException {
++    try(Stream<String> ls = Files.lines(this.p, StandardCharsets.UTF_8);) {
++      return (int) ls.count();
++    }
++  }
++
++  /**
++   * @return the number of non-empty lines in the file represented by this object
++   * @throws IOException on inability to read file (maybe it's not a text file)
++   */
++  public int getNumberOfNonEmptyLines() throws IOException {
++    try(Stream<String> ls = Files.lines(this.p, StandardCharsets.UTF_8);) {
++      return (int) ls.filter(emptyStringPredicate.negate())
++          .count();
++    }
++  }
++}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b0b70627/joshua-core/src/main/java/org/apache/joshua/util/io/IndexedReader.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/util/io/IndexedReader.java
index f357e55,0000000..d206544
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/util/io/IndexedReader.java
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/IndexedReader.java
@@@ -1,155 -1,0 +1,160 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.util.io;
 +
 +import java.io.IOException;
 +import java.util.Iterator;
 +import java.util.NoSuchElementException;
 +
 +
 +/**
 + * Wraps a reader with "line" index information.
-  * 
++ *
 + * @author wren ng thornton wren@users.sourceforge.net
 + * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $
 + */
 +public class IndexedReader<E> implements Reader<E> {
- 
 +  /** A name for the type of elements the reader produces. */
 +  private final String elementName;
 +
 +  /** The number of elements the reader has delivered so far. */
 +  private int lineNumber;
 +
 +  /** The underlying reader. */
 +  private final Reader<E> reader;
 +
 +  public IndexedReader(String elementName, Reader<E> reader) {
 +    this.elementName = elementName;
 +    this.lineNumber = 0;
 +    this.reader = reader;
 +  }
 +
-   /** 
++  /**
 +   * Return the number of elements delivered so far.
 +   * @return integer representing the number of elements delivered so far
 +   */
 +  public int index() {
 +    return this.lineNumber;
 +  }
 +
 +
 +  /**
 +   * Wrap an IOException's message with the index when it occurred.
 +   * @param oldError the old {@link java.io.IOException} we wish to wrap
 +   * @return the new wrapped {@link java.io.IOException}
 +   */
 +  public IOException wrapIOException(IOException oldError) {
 +    IOException newError =
 +        new IOException("At " + this.elementName + " " + this.lineNumber + ": "
 +            + oldError.getMessage());
 +    newError.initCause(oldError);
 +    return newError;
 +  }
 +
 +  // ===============================================================
 +  // Reader
 +  // ===============================================================
 +
-   /** 
++  /**
 +   * Delegated to the underlying reader.
 +   * @return true if the reader is ready
 +   * @throws IOException if there is an error determining readiness
 +   */
 +  @Override
 +  public boolean ready() throws IOException {
 +    try {
 +      return this.reader.ready();
 +    } catch (IOException oldError) {
 +      throw wrapIOException(oldError);
 +    }
 +  }
 +
 +
 +  /**
 +   * Delegated to the underlying reader. Note that we do not have a <code>finalize()</code> method;
 +   * however, when we fall out of scope, the underlying reader will too, so its finalizer may be
 +   * called. For correctness, be sure to manually close all readers.
 +   */
++  @Override
 +  public void close() throws IOException {
 +    try {
 +      this.reader.close();
 +    } catch (IOException oldError) {
 +      throw wrapIOException(oldError);
 +    }
 +  }
 +
 +
 +  /** Delegated to the underlying reader. */
++  @Override
 +  public E readLine() throws IOException {
 +    E line;
 +    try {
 +      line = this.reader.readLine();
 +    } catch (IOException oldError) {
 +      throw wrapIOException(oldError);
 +    }
 +    ++this.lineNumber;
 +    return line;
 +  }
 +
 +
 +  // ===============================================================
 +  // Iterable -- because sometimes Java can be very stupid
 +  // ===============================================================
 +
 +  /** Return self as an iterator. */
++  @Override
 +  public Iterator<E> iterator() {
 +    return this;
 +  }
 +
 +
 +  // ===============================================================
 +  // Iterator
 +  // ===============================================================
 +
 +  /** Delegated to the underlying reader. */
++  @Override
 +  public boolean hasNext() {
 +    return this.reader.hasNext();
 +  }
 +
 +
 +  /** Delegated to the underlying reader. */
++  @Override
 +  public E next() throws NoSuchElementException {
 +    E line = this.reader.next();
 +    // Let exceptions out, we'll wrap any errors at closing time.
 +
 +    ++this.lineNumber;
 +    return line;
 +  }
 +
 +
 +  /**
 +   * If the underlying reader supports removal, then so do we. Note that the {@link #index()} method
 +   * returns the number of elements delivered to the client, so removing an element from the
 +   * underlying collection does not affect that number.
 +   */
++  @Override
 +  public void remove() throws UnsupportedOperationException {
 +    this.reader.remove();
 +  }
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b0b70627/joshua-core/src/main/java/org/apache/joshua/util/io/LineReader.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/util/io/LineReader.java
index d63763d,0000000..ea5d8f1
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/util/io/LineReader.java
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/LineReader.java
@@@ -1,368 -1,0 +1,309 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.util.io;
 +
 +import java.io.BufferedReader;
++import java.io.File;
 +import java.io.FileDescriptor;
 +import java.io.FileInputStream;
 +import java.io.IOException;
 +import java.io.InputStream;
 +import java.io.InputStreamReader;
- import java.io.File;
- import java.nio.charset.Charset;
++import java.nio.charset.StandardCharsets;
 +import java.util.Iterator;
 +import java.util.NoSuchElementException;
 +import java.util.zip.GZIPInputStream;
 +
 +import org.apache.joshua.decoder.Decoder;
 +
 +/**
 + * This class provides an Iterator interface to a BufferedReader. This covers the most common
 + * use-cases for reading from files without ugly code to check whether we got a line or not.
-  * 
++ *
 + * @author wren ng thornton wren@users.sourceforge.net
 + * @author Matt Post post@cs.jhu.edu
 + */
 +public class LineReader implements Reader<String>, AutoCloseable {
 +
 +  /*
-    * Note: charset name is case-agnostic "UTF-8" is the canonical name "UTF8", "unicode-1-1-utf-8"
-    * are aliases Java doesn't distinguish utf8 vs UTF-8 like Perl does
-    */
-   private static final Charset FILE_ENCODING = Charset.forName("UTF-8");
- 
-   /*
 +   * The reader and its underlying input stream. We need to keep a hold of the underlying
 +   * input stream so that we can query how many raw bytes it's read (for a generic progress
 +   * meter that works across GZIP'ed and plain text files).
 +   */
 +  private BufferedReader reader;
 +  private ProgressInputStream rawStream;
 +
 +  private String buffer;
 +  private IOException error;
 +
 +  private int lineno = 0;
-   
++
 +  private boolean display_progress = false;
-   
++
 +  private int progress = 0;
 +
 +  // ===============================================================
 +  // Constructors and destructors
 +  // ===============================================================
 +
 +  /**
 +   * Opens a file for iterating line by line. The special "-" filename can be used to specify
 +   * STDIN. GZIP'd files are tested for automatically.
-    * 
++   *
 +   * @param filename the file to be opened ("-" for STDIN)
 +   * @throws IOException if there is an error reading the input file
 +   */
 +  public LineReader(String filename) throws IOException {
-     
++
 +    display_progress = (Decoder.VERBOSE >= 1);
-     
++
 +    progress = 0;
-     
-     InputStream stream = null; 
++
++    InputStream stream = null;
 +    long totalBytes = -1;
 +    if (filename.equals("-")) {
 +      rawStream = null;
 +      stream = new FileInputStream(FileDescriptor.in);
 +    } else {
 +      totalBytes = new File(filename).length();
 +      rawStream = new ProgressInputStream(new FileInputStream(filename), totalBytes);
 +
 +      try {
 +        stream = new GZIPInputStream(rawStream);
 +      } catch (Exception e) {
 +        // GZIP ate a byte, so reset
 +        rawStream.close();
 +        stream = rawStream = new ProgressInputStream(new FileInputStream(filename), totalBytes);
 +      }
-     } 
-     
-     this.reader = new BufferedReader(new InputStreamReader(stream, FILE_ENCODING));
++    }
++
++    this.reader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
 +  }
-   
++
 +  public LineReader(String filename, boolean show_progress) throws IOException {
 +    this(filename);
 +    display_progress = (Decoder.VERBOSE >= 1 && show_progress);
 +  }
 +
 +
 +  /**
 +   * Wraps an InputStream for iterating line by line. Stream encoding is assumed to be UTF-8.
 +   * @param in an {@link java.io.InputStream} to wrap and iterate over line by line
 +   */
 +  public LineReader(InputStream in) {
-     this.reader = new BufferedReader(new InputStreamReader(in, FILE_ENCODING));
++    this.reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
 +    display_progress = false;
 +  }
-   
++
 +  /**
-    * Chain to the underlying {@link ProgressInputStream}. 
-    * 
++   * Chain to the underlying {@link ProgressInputStream}.
++   *
 +   * @return an integer from 0..100, indicating how much of the file has been read.
 +   */
 +  public int progress() {
 +    return rawStream == null ? 0 : rawStream.progress();
 +  }
-   
++
 +  /**
 +   * This method will close the file handle, and will raise any exceptions that occurred during
 +   * iteration. The method is idempotent, and all calls after the first are no-ops (unless the
 +   * thread was interrupted or killed). For correctness, you <b>must</b> call this method before the
 +   * object falls out of scope.
 +   * @throws IOException if there is an error closing the file handler
 +   */
++  @Override
 +  public void close() throws IOException {
 +
 +    this.buffer = null; // Just in case it's a large string
 +
 +    if (null != this.reader) {
 +      try {
 +        // We assume the wrappers will percolate this down.
 +        this.reader.close();
 +
 +      } catch (IOException e) {
 +        // We need to trash our cached error for idempotence.
 +        // Presumably the closing error is the more important
 +        // one to throw.
 +        this.error = null;
 +        throw e;
 +
 +      } finally {
 +        this.reader = null;
 +      }
 +    }
 +
 +    if (null != this.error) {
 +      IOException e = this.error;
 +      this.error = null;
 +      throw e;
 +    }
 +  }
 +
- 
-   /**
-    * We attempt to avoid leaking file descriptors if you fail to call close before the object falls
-    * out of scope. However, the language spec makes <b>no guarantees</b> about timeliness of garbage
-    * collection. It is a bug to rely on this method to release the resources. Also, the garbage
-    * collector will discard any exceptions that have queued up, without notifying the application in
-    * any way.
-    * 
-    * Having a finalizer means the JVM can't do "fast allocation" of LineReader objects (or
-    * subclasses). This isn't too important due to disk latency, but may be worth noting.
-    * 
-    * @see <a
-    *      href="http://java2go.blogspot.com/2007/09/javaone-2007-performance-tips-2-finish.html">Performance
-    *      Tips</a>
-    * @see <a
-    *      href="http://www.javaworld.com/javaworld/jw-06-1998/jw-06-techniques.html?page=1">Techniques</a>
-    */
-   protected void finalize() throws Throwable {
-     try {
-       this.close();
-     } catch (IOException e) {
-       // Do nothing. The GC will discard the exception
-       // anyways, but it may cause us to linger on the heap.
-     } finally {
-       super.finalize();
-     }
-   }
- 
- 
- 
 +  // ===============================================================
 +  // Reader
 +  // ===============================================================
 +
 +  // Copied from interface documentation.
 +  /** Determine if the reader is ready to read a line. */
++  @Override
 +  public boolean ready() throws IOException {
 +    return this.reader.ready();
 +  }
 +
 +
 +  /**
 +   * This method is like next() except that it throws the IOException directly. If there are no
 +   * lines to be read then null is returned.
 +   */
++  @Override
 +  public String readLine() throws IOException {
 +    if (this.hasNext()) {
 +      String line = this.buffer;
 +      this.buffer = null;
 +      return line;
 +
 +    } else {
 +      if (null != this.error) {
 +        IOException e = this.error;
 +        this.error = null;
 +        throw e;
 +      }
 +      return null;
 +    }
 +  }
 +
 +
 +  // ===============================================================
 +  // Iterable -- because sometimes Java can be very stupid
 +  // ===============================================================
 +
 +  /** Return self as an iterator. */
++  @Override
 +  public Iterator<String> iterator() {
 +    return this;
 +  }
 +
 +
 +  // ===============================================================
 +  // Iterator
 +  // ===============================================================
 +
 +  // Copied from interface documentation.
 +  /**
 +   * Returns <code>true</code> if the iteration has more elements. (In other words, returns
 +   * <code>true</code> if <code>next</code> would return an element rather than throwing an
 +   * exception.)
 +   */
++  @Override
 +  public boolean hasNext() {
 +    if (null != this.buffer) {
 +      return true;
 +
 +    } else if (null != this.error) {
 +      return false;
 +
 +    } else {
 +      // We're not allowed to throw IOException from within Iterator
 +      try {
 +        this.buffer = this.reader.readLine();
 +      } catch (IOException e) {
 +        this.buffer = null;
 +        this.error = e;
 +        return false;
 +      }
 +      return (null != this.buffer);
 +    }
 +  }
 +
 +
 +  /**
 +   * Return the next line of the file. If an error is encountered, NoSuchElementException is thrown.
 +   * The actual IOException encountered will be thrown later, when the LineReader is closed. Also if
 +   * there is no line to be read then NoSuchElementException is thrown.
 +   */
++  @Override
 +  public String next() throws NoSuchElementException {
 +    if (this.hasNext()) {
 +      if (display_progress) {
 +        int newProgress = (reader != null) ? progress() : 100;
 +//        System.err.println(String.format("OLD %d NEW %d", progress, newProgress));
-         
++
 +        if (newProgress > progress) {
 +          for (int i = progress + 1; i <= newProgress; i++)
 +            if (i == 97) {
 +              System.err.print("1");
 +            } else if (i == 98) {
 +              System.err.print("0");
 +            } else if (i == 99) {
 +              System.err.print("0");
 +            } else if (i == 100) {
 +              System.err.println("%");
 +            } else if (i % 10 == 0) {
 +              System.err.print(String.format("%d", i));
 +              System.err.flush();
 +            } else if ((i - 1) % 10 == 0)
 +              ; // skip at 11 since 10, 20, etc take two digits
 +            else {
 +              System.err.print(".");
 +              System.err.flush();
 +            }
 +          progress = newProgress;
 +        }
 +      }
-       
++
 +      String line = this.buffer;
 +      this.lineno++;
 +      this.buffer = null;
 +      return line;
 +    } else {
 +      throw new NoSuchElementException();
 +    }
 +  }
-   
++
 +  /* Get the line number of the last line that was returned */
 +  public int lineno() {
 +    return this.lineno;
 +  }
 +
 +  /** Unsupported. */
++  @Override
 +  public void remove() throws UnsupportedOperationException {
 +    throw new UnsupportedOperationException();
 +  }
 +
- 
 +  /**
-    * Iterates over all lines, ignoring their contents, and returns the count of lines. If some lines
-    * have already been read, this will return the count of remaining lines. Because no lines will
-    * remain after calling this method, we implicitly call close.
-    * 
-    * @return the number of lines read
-    * @throws IOException if there is an error reading lines
-    */
-   public int countLines() throws IOException {
-     int lines = 0;
- 
-     while (this.hasNext()) {
-       this.next();
-       lines++;
-     }
-     this.close();
- 
-     return lines;
-   }
- 
-   /** 
 +   * Example usage code.
 +   * @param args an input file
 +   */
 +  public static void main(String[] args) {
 +    if (1 != args.length) {
 +      System.out.println("Usage: java LineReader filename");
 +      System.exit(1);
 +    }
 +
-     try {
- 
-       LineReader in = new LineReader(args[0]);
-       try {
-         for (String line : in) {
- 
-           System.out.println(line);
- 
-         }
-       } finally {
-         in.close();
++    try (LineReader in = new LineReader(args[0]);) {
++      for (String line : in) {
++        System.out.println(line);
 +      }
- 
 +    } catch (IOException e) {
 +      e.printStackTrace();
 +    }
 +  }
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b0b70627/joshua-core/src/main/java/org/apache/joshua/util/io/Reader.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/util/io/Reader.java
index cab6d74,0000000..e3a150e
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/util/io/Reader.java
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/Reader.java
@@@ -1,51 -1,0 +1,52 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.util.io;
 +
 +import java.io.IOException;
 +import java.util.Iterator;
 +
 +/**
 + * Common interface for Reader type objects.
-  * 
++ *
 + * @author wren ng thornton wren@users.sourceforge.net
 + * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $
 + */
- public interface Reader<E> extends Iterable<E>, Iterator<E> {
++public interface Reader<E> extends Iterable<E>, Iterator<E>, AutoCloseable {
 +
-   /** 
++  /**
 +   * Close the reader, freeing all resources.
 +   * @throws IOException if there is an error closing the reader instance
 +   */
++  @Override
 +  void close() throws IOException;
 +
-   /** 
++  /**
 +   * Determine if the reader is ready to read a line.
 +   * @return true if it is ready
 +   * @throws IOException if there is an error whilst determining if the reader is ready
 +   */
 +  boolean ready() throws IOException;
 +
-   /** 
++  /**
 +   * Read a "line" and return an object representing it.
 +   * @return an object representing a single line
 +   * @throws IOException if there is an error reading lines
 +   */
 +  E readLine() throws IOException;
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b0b70627/joshua-core/src/main/java/org/apache/joshua/util/quantization/Quantizer.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/util/quantization/Quantizer.java
index 33a4e9a,0000000..ab291be
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/util/quantization/Quantizer.java
+++ b/joshua-core/src/main/java/org/apache/joshua/util/quantization/Quantizer.java
@@@ -1,45 -1,0 +1,43 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.util.quantization;
-  
- import java.io.DataInputStream; 
- import java.io.DataOutputStream; 
- import java.io.IOException; 
- import java.nio.ByteBuffer; 
-  
- public interface Quantizer { 
-  
-   public float read(ByteBuffer stream, int position); 
-  
-   public void write(ByteBuffer stream, float value); 
-  
-   public void initialize(); 
-  
-   public void add(float key); 
-  
-   public void finalize(); 
-  
-   public String getKey(); 
-  
-   public void writeState(DataOutputStream out) throws IOException; 
-  
-   public void readState(DataInputStream in) throws IOException; 
-  
-   public int size(); 
++
++import java.io.DataInputStream;
++import java.io.DataOutputStream;
++import java.io.IOException;
++import java.nio.ByteBuffer;
++
++public interface Quantizer {
++
++  public float read(ByteBuffer stream, int position);
++
++  public void write(ByteBuffer stream, float value);
++
++  public void initialize();
++
++  public void add(float key);
++
++  public String getKey();
++
++  public void writeState(DataOutputStream out) throws IOException;
++
++  public void readState(DataInputStream in) throws IOException;
++
++  public int size();
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b0b70627/joshua-core/src/main/java/org/apache/joshua/util/quantization/QuantizerConfiguration.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/util/quantization/QuantizerConfiguration.java
index f4765f9,0000000..39aef36
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/util/quantization/QuantizerConfiguration.java
+++ b/joshua-core/src/main/java/org/apache/joshua/util/quantization/QuantizerConfiguration.java
@@@ -1,119 -1,0 +1,114 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.util.quantization;
 +
- import java.io.BufferedInputStream; 
- import java.io.BufferedOutputStream; 
- import java.io.DataInputStream; 
- import java.io.DataOutputStream; 
- import java.io.File; 
- import java.io.FileInputStream; 
- import java.io.FileOutputStream; 
- import java.io.IOException; 
- import java.util.ArrayList; 
- import java.util.HashMap; 
- import java.util.List; 
- import java.util.Map; 
++import java.io.BufferedInputStream;
++import java.io.BufferedOutputStream;
++import java.io.DataInputStream;
++import java.io.DataOutputStream;
++import java.io.File;
++import java.io.FileInputStream;
++import java.io.FileOutputStream;
++import java.io.IOException;
++import java.util.ArrayList;
++import java.util.HashMap;
++import java.util.List;
++import java.util.Map;
 +
- import org.apache.joshua.corpus.Vocabulary; 
++import org.apache.joshua.corpus.Vocabulary;
 +
- public class QuantizerConfiguration { 
++public class QuantizerConfiguration {
 +
-   private static final Quantizer DEFAULT; 
++  private static final Quantizer DEFAULT;
 +
-   private ArrayList<Quantizer> quantizers; 
-   private Map<Integer, Integer> quantizerByFeatureId; 
++  private ArrayList<Quantizer> quantizers;
++  private Map<Integer, Integer> quantizerByFeatureId;
 +
-   static { 
-     DEFAULT = new BooleanQuantizer(); 
-   } 
++  static {
++    DEFAULT = new BooleanQuantizer();
++  }
 +
-   public QuantizerConfiguration() { 
-     quantizers = new ArrayList<Quantizer>(); 
-     quantizerByFeatureId = new HashMap<Integer, Integer>(); 
-   } 
++  public QuantizerConfiguration() {
++    quantizers = new ArrayList<Quantizer>();
++    quantizerByFeatureId = new HashMap<Integer, Integer>();
++  }
 +
-   public void add(String quantizer_key, List<Integer> feature_ids) { 
-     Quantizer q = QuantizerFactory.get(quantizer_key); 
-     quantizers.add(q); 
-     int index = quantizers.size() - 1; 
-     for (int feature_id : feature_ids) 
-       quantizerByFeatureId.put(feature_id, index); 
-   } 
++  public void add(String quantizer_key, List<Integer> feature_ids) {
++    Quantizer q = QuantizerFactory.get(quantizer_key);
++    quantizers.add(q);
++    int index = quantizers.size() - 1;
++    for (int feature_id : feature_ids)
++      quantizerByFeatureId.put(feature_id, index);
++  }
 +
-   public void initialize() { 
-     for (Quantizer q : quantizers) 
-       q.initialize(); 
-   } 
++  public void initialize() {
++    for (Quantizer q : quantizers)
++      q.initialize();
++  }
 +
-   public void finalize() { 
-     for (Quantizer q : quantizers) 
-       q.finalize(); 
-   } 
++  public final Quantizer get(int feature_id) {
++    Integer index = quantizerByFeatureId.get(feature_id);
++    return (index != null ? quantizers.get(index) : DEFAULT);
++  }
 +
-   public final Quantizer get(int feature_id) { 
-     Integer index = quantizerByFeatureId.get(feature_id); 
-     return (index != null ? quantizers.get(index) : DEFAULT); 
-   } 
++  public void read(String file_name) throws IOException {
++    quantizers.clear();
++    quantizerByFeatureId.clear();
 +
-   public void read(String file_name) throws IOException { 
-     quantizers.clear(); 
-     quantizerByFeatureId.clear(); 
++    File quantizer_file = new File(file_name);
++    DataInputStream in_stream =
++        new DataInputStream(new BufferedInputStream(new FileInputStream(quantizer_file)));
++    int num_quantizers = in_stream.readInt();
++    quantizers.ensureCapacity(num_quantizers);
++    for (int i = 0; i < num_quantizers; i++) {
++      String key = in_stream.readUTF();
++      Quantizer q = QuantizerFactory.get(key);
++      q.readState(in_stream);
++      quantizers.add(q);
++    }
++    int num_mappings = in_stream.readInt();
++    for (int i = 0; i < num_mappings; i++) {
++      String feature_name = in_stream.readUTF();
++      int feature_id = Vocabulary.id(feature_name);
++      int quantizer_index = in_stream.readInt();
++      if (quantizer_index >= num_quantizers) {
++        throw new RuntimeException("Error deserializing QuanitzerConfig. " + "Feature "
++            + feature_name + " referring to quantizer " + quantizer_index + " when only "
++            + num_quantizers + " known.");
++      }
++      this.quantizerByFeatureId.put(feature_id, quantizer_index);
++    }
++    in_stream.close();
++  }
 +
-     File quantizer_file = new File(file_name); 
-     DataInputStream in_stream = 
-         new DataInputStream(new BufferedInputStream(new FileInputStream(quantizer_file))); 
-     int num_quantizers = in_stream.readInt(); 
-     quantizers.ensureCapacity(num_quantizers); 
-     for (int i = 0; i < num_quantizers; i++) { 
-       String key = in_stream.readUTF(); 
-       Quantizer q = QuantizerFactory.get(key); 
-       q.readState(in_stream); 
-       quantizers.add(q); 
-     } 
-     int num_mappings = in_stream.readInt(); 
-     for (int i = 0; i < num_mappings; i++) { 
-       String feature_name = in_stream.readUTF(); 
-       int feature_id = Vocabulary.id(feature_name); 
-       int quantizer_index = in_stream.readInt(); 
-       if (quantizer_index >= num_quantizers) { 
-         throw new RuntimeException("Error deserializing QuanitzerConfig. " + "Feature " 
-             + feature_name + " referring to quantizer " + quantizer_index + " when only " 
-             + num_quantizers + " known."); 
-       } 
-       this.quantizerByFeatureId.put(feature_id, quantizer_index); 
-     } 
-     in_stream.close(); 
-   } 
- 
-   public void write(String file_name) throws IOException { 
-     File vocab_file = new File(file_name); 
-     DataOutputStream out_stream = 
-         new DataOutputStream(new BufferedOutputStream(new FileOutputStream(vocab_file))); 
-     out_stream.writeInt(quantizers.size()); 
-     for (int index = 0; index < quantizers.size(); index++) 
-       quantizers.get(index).writeState(out_stream); 
-     out_stream.writeInt(quantizerByFeatureId.size()); 
-     for (int feature_id : quantizerByFeatureId.keySet()) { 
-       out_stream.writeUTF(Vocabulary.word(feature_id)); 
-       out_stream.writeInt(quantizerByFeatureId.get(feature_id)); 
-     } 
-     out_stream.close(); 
-   } 
++  public void write(String file_name) throws IOException {
++    File vocab_file = new File(file_name);
++    DataOutputStream out_stream =
++        new DataOutputStream(new BufferedOutputStream(new FileOutputStream(vocab_file)));
++    out_stream.writeInt(quantizers.size());
++    for (int index = 0; index < quantizers.size(); index++)
++      quantizers.get(index).writeState(out_stream);
++    out_stream.writeInt(quantizerByFeatureId.size());
++    for (int feature_id : quantizerByFeatureId.keySet()) {
++      out_stream.writeUTF(Vocabulary.word(feature_id));
++      out_stream.writeInt(quantizerByFeatureId.get(feature_id));
++    }
++    out_stream.close();
++  }
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b0b70627/joshua-core/src/main/java/org/apache/joshua/util/quantization/StatelessQuantizer.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/util/quantization/StatelessQuantizer.java
index e81e945,0000000..a241cdf
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/util/quantization/StatelessQuantizer.java
+++ b/joshua-core/src/main/java/org/apache/joshua/util/quantization/StatelessQuantizer.java
@@@ -1,38 -1,0 +1,40 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.util.quantization;
 +
- import java.io.DataInputStream; 
- import java.io.DataOutputStream; 
- import java.io.IOException; 
++import java.io.DataInputStream;
++import java.io.DataOutputStream;
++import java.io.IOException;
 +
- abstract class StatelessQuantizer implements Quantizer { 
++abstract class StatelessQuantizer implements Quantizer {
 +
-   public void initialize() {} 
++  @Override
++  public void initialize() {}
 +
-   public void add(float key) {} 
++  @Override
++  public void add(float key) {}
 +
-   public void finalize() {} 
++  @Override
++  public void writeState(DataOutputStream out) throws IOException {
++    out.writeUTF(getKey());
++  }
 +
-   public void writeState(DataOutputStream out) throws IOException { 
-     out.writeUTF(getKey()); 
-   } 
- 
-   public void readState(DataInputStream in) throws IOException {} 
++  @Override
++  public void readState(DataInputStream in) throws IOException {}
 +}