You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/08/30 21:04:52 UTC
[07/17] incubator-joshua git commit: Merge branch 'master' into
7-with-master
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b0b70627/joshua-core/src/main/java/org/apache/joshua/util/encoding/FeatureTypeAnalyzer.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/util/encoding/FeatureTypeAnalyzer.java
index 0aa41af,0000000..5226b0a
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/util/encoding/FeatureTypeAnalyzer.java
+++ b/joshua-core/src/main/java/org/apache/joshua/util/encoding/FeatureTypeAnalyzer.java
@@@ -1,256 -1,0 +1,264 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.encoding;
+
+import java.io.BufferedOutputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.ff.FeatureMap;
+import org.apache.joshua.util.io.LineReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class FeatureTypeAnalyzer {
+
+ private static final Logger LOG = LoggerFactory.getLogger(FeatureTypeAnalyzer.class);
+
+ private ArrayList<FeatureType> types;
+
+ private Map<Integer, Integer> featureToType;
+
+ private Map<Integer, Integer> featureIdMap;
+
+ // Is the feature setup labeled?
+ private boolean labeled;
+
+ // Is the encoder configuration open for new features (that are not assumed boolean)?
+ private boolean open;
+
+ public FeatureTypeAnalyzer() {
+ this(false);
+ }
+
+ public FeatureTypeAnalyzer(boolean open) {
+ this.open = open;
+ this.types = new ArrayList<FeatureType>();
+ this.featureToType = new HashMap<Integer, Integer>();
+ this.featureIdMap = new HashMap<Integer, Integer>();
+ }
+
+ public void readConfig(String config_filename) throws IOException {
- LineReader reader = new LineReader(config_filename);
- while (reader.hasNext()) {
- // Clean up line, chop comments off and skip if the result is empty.
- String line = reader.next().trim();
- if (line.indexOf('#') != -1)
- line = line.substring(0, line.indexOf('#'));
- if (line.isEmpty())
- continue;
- String[] fields = line.split("[\\s]+");
-
- if ("encoder".equals(fields[0])) {
- // Adding an encoder to the mix.
- if (fields.length < 3) {
- throw new RuntimeException("Incomplete encoder line in config.");
++ try(LineReader reader = new LineReader(config_filename);) {
++ while (reader.hasNext()) {
++ // Clean up line, chop comments off and skip if the result is empty.
++ String line = reader.next().trim();
++ if (line.indexOf('#') != -1)
++ line = line.substring(0, line.indexOf('#'));
++ if (line.isEmpty())
++ continue;
++ String[] fields = line.split("[\\s]+");
++
++ if ("encoder".equals(fields[0])) {
++ // Adding an encoder to the mix.
++ if (fields.length < 3) {
++ throw new RuntimeException("Incomplete encoder line in config.");
++ }
++ String encoder_key = fields[1];
++ List<Integer> feature_ids = new ArrayList<Integer>();
++ for (int i = 2; i < fields.length; i++)
++ feature_ids.add(Vocabulary.id(fields[i]));
++ addFeatures(encoder_key, feature_ids);
+ }
+ String encoder_key = fields[1];
- ArrayList<Integer> feature_ids = new ArrayList<Integer>();
++ List<Integer> feature_ids = new ArrayList<Integer>();
+ for (int i = 2; i < fields.length; i++)
+ feature_ids.add(FeatureMap.hashFeature(fields[i]));
+ addFeatures(encoder_key, feature_ids);
+ }
+ }
+ }
+
+ public void addFeatures(String encoder_key, List<Integer> feature_ids) {
+ int index = addType(encoder_key);
+ for (int feature_id : feature_ids)
+ featureToType.put(feature_id, index);
+ }
+
+ private int addType(String encoder_key) {
+ FeatureType ft = new FeatureType(encoder_key);
+ int index = types.indexOf(ft);
+ if (index < 0) {
+ types.add(ft);
+ return types.size() - 1;
+ }
+ return index;
+ }
+
+ private int addType() {
+ types.add(new FeatureType());
+ return types.size() - 1;
+ }
+
+ public void observe(int feature_id, float value) {
+ Integer type_id = featureToType.get(feature_id);
+ if (type_id == null && open) {
+ type_id = addType();
+ featureToType.put(feature_id, type_id);
+ }
+ if (type_id != null)
+ types.get(type_id).observe(value);
+ }
+
+ // Inspects the collected histograms, inferring actual type of feature. Then replaces the
+ // analyzer, if present, with the most compact applicable type.
+ public void inferTypes(boolean labeled) {
+ for (FeatureType ft : types) {
+ ft.inferUncompressedType();
+ }
+ if (LOG.isInfoEnabled()) {
+ for (int id : featureToType.keySet()) {
+ LOG.info("Type inferred: {} is {}", (labeled ? FeatureMap.getFeature(id) : "Feature " + id),
+ types.get(featureToType.get(id)).encoder.getKey());
+ }
+ }
+ }
+
+ public void buildFeatureMap() {
+ int[] known_features = new int[featureToType.keySet().size()];
+ int i = 0;
+ for (int f : featureToType.keySet())
+ known_features[i++] = f;
+ Arrays.sort(known_features);
+
+ featureIdMap.clear();
+ for (i = 0; i < known_features.length; ++i)
+ featureIdMap.put(known_features[i], i);
+ }
+
+ public int getRank(int feature_id) {
+ return featureIdMap.get(feature_id);
+ }
+
+ public IntEncoder getIdEncoder() {
+ int num_features = featureIdMap.size();
+ if (num_features <= Byte.MAX_VALUE)
+ return PrimitiveIntEncoder.BYTE;
+ else if (num_features <= Character.MAX_VALUE)
+ return PrimitiveIntEncoder.CHAR;
+ else
+ return PrimitiveIntEncoder.INT;
+ }
+
+ public void write(String file_name) throws IOException {
+ File out_file = new File(file_name);
+ BufferedOutputStream buf_stream = new BufferedOutputStream(new FileOutputStream(out_file));
+ DataOutputStream out_stream = new DataOutputStream(buf_stream);
+
+ buildFeatureMap();
+
+ getIdEncoder().writeState(out_stream);
+ out_stream.writeBoolean(labeled);
+ out_stream.writeInt(types.size());
+ for (int index = 0; index < types.size(); index++)
+ types.get(index).encoder.writeState(out_stream);
+
+ out_stream.writeInt(featureToType.size());
+ for (int feature_id : featureToType.keySet()) {
+ if (labeled)
+ out_stream.writeUTF(FeatureMap.getFeature(feature_id));
+ else
+ out_stream.writeInt(feature_id);
+ out_stream.writeInt(featureIdMap.get(feature_id));
+ out_stream.writeInt(featureToType.get(feature_id));
+ }
+ out_stream.close();
+ }
+
++ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ for (int feature_id : featureToType.keySet()) {
+ sb.append(types.get(featureToType.get(feature_id)).analyzer.toString(FeatureMap.getFeature(feature_id)));
+ }
+ System.out.println(sb.toString());
+ return sb.toString();
+ }
+
+ public boolean isLabeled() {
+ return labeled;
+ }
+
+ public void setLabeled(boolean labeled) {
+ this.labeled = labeled;
+ }
+
- class FeatureType {
++ static class FeatureType {
+ FloatEncoder encoder;
+ Analyzer analyzer;
+ int bits;
+
+ FeatureType() {
+ encoder = null;
+ analyzer = new Analyzer();
+ bits = -1;
+ }
+
+ FeatureType(String key) {
+ // either throws or returns non-null
+ FloatEncoder e = EncoderFactory.getFloatEncoder(key);
+ encoder = e;
+ analyzer = null;
+ bits = -1;
+ }
+
+ void inferUncompressedType() {
+ if (encoder != null)
+ return;
+ encoder = analyzer.inferUncompressedType();
+ analyzer = null;
+ }
+
+ void inferType() {
+ if (encoder != null)
+ return;
+ encoder = analyzer.inferType(bits);
+ analyzer = null;
+ }
+
+ void observe(float value) {
+ if (analyzer != null)
+ analyzer.add(value);
+ }
+
++ @Override
+ public boolean equals(Object t) {
+ if (t != null && t instanceof FeatureType) {
+ FeatureType that = (FeatureType) t;
+ if (this.encoder != null) {
+ return this.encoder.equals(that.encoder);
+ } else {
+ if (that.encoder != null)
+ return false;
+ if (this.analyzer != null)
+ return this.analyzer.equals(that.analyzer);
+ }
+ }
+ return false;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b0b70627/joshua-core/src/main/java/org/apache/joshua/util/io/ExistingUTF8EncodedTextFile.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/util/io/ExistingUTF8EncodedTextFile.java
index 0000000,0000000..42dd236
new file mode 100644
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/ExistingUTF8EncodedTextFile.java
@@@ -1,0 -1,0 +1,77 @@@
++/*
++ * Licensed to the Apache Software Foundation (ASF) under one
++ * or more contributor license agreements. See the NOTICE file
++ * distributed with this work for additional information
++ * regarding copyright ownership. The ASF licenses this file
++ * to you under the Apache License, Version 2.0 (the
++ * "License"); you may not use this file except in compliance
++ * with the License. You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing,
++ * software distributed under the License is distributed on an
++ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
++ * KIND, either express or implied. See the License for the
++ * specific language governing permissions and limitations
++ * under the License.
++ */
++package org.apache.joshua.util.io;
++
++import java.io.FileNotFoundException;
++import java.io.IOException;
++import java.nio.charset.StandardCharsets;
++import java.nio.file.Files;
++import java.nio.file.Path;
++import java.nio.file.Paths;
++import java.util.function.Predicate;
++import java.util.stream.Stream;
++
++/**
++ * A class that represents a {@link StandardCharsets#UTF_8} text file. Will
++ * throw a {@link FileNotFoundException} upon instantiation if the underlying
++ * {@link Path}, or {@link String} representing a Path, is not found.
++ */
++public class ExistingUTF8EncodedTextFile {
++ private static final Predicate<String> emptyStringPredicate = s -> s.isEmpty();
++
++ private final Path p;
++
++ public ExistingUTF8EncodedTextFile(String pathStr) throws FileNotFoundException {
++ this(Paths.get(pathStr));
++ }
++
++ public ExistingUTF8EncodedTextFile(Path p) throws FileNotFoundException {
++ this.p = p;
++ if (!Files.exists(p))
++ throw new FileNotFoundException("Did not find the file at path: " + p.toString());
++ }
++
++ /**
++ * @return the {@link Path} representing this object
++ */
++ public Path getPath() {
++ return this.p;
++ }
++
++ /**
++ * @return the number of lines in the file represented by this object
++ * @throws IOException on inability to read file (maybe it's not a text file)
++ */
++ public int getNumberOfLines() throws IOException {
++ try(Stream<String> ls = Files.lines(this.p, StandardCharsets.UTF_8);) {
++ return (int) ls.count();
++ }
++ }
++
++ /**
++ * @return the number of non-empty lines in the file represented by this object
++ * @throws IOException on inability to read file (maybe it's not a text file)
++ */
++ public int getNumberOfNonEmptyLines() throws IOException {
++ try(Stream<String> ls = Files.lines(this.p, StandardCharsets.UTF_8);) {
++ return (int) ls.filter(emptyStringPredicate.negate())
++ .count();
++ }
++ }
++}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b0b70627/joshua-core/src/main/java/org/apache/joshua/util/io/IndexedReader.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/util/io/IndexedReader.java
index f357e55,0000000..d206544
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/util/io/IndexedReader.java
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/IndexedReader.java
@@@ -1,155 -1,0 +1,160 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.io;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+
+/**
+ * Wraps a reader with "line" index information.
- *
++ *
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $
+ */
+public class IndexedReader<E> implements Reader<E> {
-
+ /** A name for the type of elements the reader produces. */
+ private final String elementName;
+
+ /** The number of elements the reader has delivered so far. */
+ private int lineNumber;
+
+ /** The underlying reader. */
+ private final Reader<E> reader;
+
+ public IndexedReader(String elementName, Reader<E> reader) {
+ this.elementName = elementName;
+ this.lineNumber = 0;
+ this.reader = reader;
+ }
+
- /**
++ /**
+ * Return the number of elements delivered so far.
+ * @return integer representing the number of elements delivered so far
+ */
+ public int index() {
+ return this.lineNumber;
+ }
+
+
+ /**
+ * Wrap an IOException's message with the index when it occurred.
+ * @param oldError the old {@link java.io.IOException} we wish to wrap
+ * @return the new wrapped {@link java.io.IOException}
+ */
+ public IOException wrapIOException(IOException oldError) {
+ IOException newError =
+ new IOException("At " + this.elementName + " " + this.lineNumber + ": "
+ + oldError.getMessage());
+ newError.initCause(oldError);
+ return newError;
+ }
+
+ // ===============================================================
+ // Reader
+ // ===============================================================
+
- /**
++ /**
+ * Delegated to the underlying reader.
+ * @return true if the reader is ready
+ * @throws IOException if there is an error determining readiness
+ */
+ @Override
+ public boolean ready() throws IOException {
+ try {
+ return this.reader.ready();
+ } catch (IOException oldError) {
+ throw wrapIOException(oldError);
+ }
+ }
+
+
+ /**
+ * Delegated to the underlying reader. Note that we do not have a <code>finalize()</code> method;
+ * however, when we fall out of scope, the underlying reader will too, so its finalizer may be
+ * called. For correctness, be sure to manually close all readers.
+ */
++ @Override
+ public void close() throws IOException {
+ try {
+ this.reader.close();
+ } catch (IOException oldError) {
+ throw wrapIOException(oldError);
+ }
+ }
+
+
+ /** Delegated to the underlying reader. */
++ @Override
+ public E readLine() throws IOException {
+ E line;
+ try {
+ line = this.reader.readLine();
+ } catch (IOException oldError) {
+ throw wrapIOException(oldError);
+ }
+ ++this.lineNumber;
+ return line;
+ }
+
+
+ // ===============================================================
+ // Iterable -- because sometimes Java can be very stupid
+ // ===============================================================
+
+ /** Return self as an iterator. */
++ @Override
+ public Iterator<E> iterator() {
+ return this;
+ }
+
+
+ // ===============================================================
+ // Iterator
+ // ===============================================================
+
+ /** Delegated to the underlying reader. */
++ @Override
+ public boolean hasNext() {
+ return this.reader.hasNext();
+ }
+
+
+ /** Delegated to the underlying reader. */
++ @Override
+ public E next() throws NoSuchElementException {
+ E line = this.reader.next();
+ // Let exceptions out, we'll wrap any errors at closing time.
+
+ ++this.lineNumber;
+ return line;
+ }
+
+
+ /**
+ * If the underlying reader supports removal, then so do we. Note that the {@link #index()} method
+ * returns the number of elements delivered to the client, so removing an element from the
+ * underlying collection does not affect that number.
+ */
++ @Override
+ public void remove() throws UnsupportedOperationException {
+ this.reader.remove();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b0b70627/joshua-core/src/main/java/org/apache/joshua/util/io/LineReader.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/util/io/LineReader.java
index d63763d,0000000..ea5d8f1
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/util/io/LineReader.java
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/LineReader.java
@@@ -1,368 -1,0 +1,309 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.io;
+
+import java.io.BufferedReader;
++import java.io.File;
+import java.io.FileDescriptor;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
- import java.io.File;
- import java.nio.charset.Charset;
++import java.nio.charset.StandardCharsets;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.joshua.decoder.Decoder;
+
+/**
+ * This class provides an Iterator interface to a BufferedReader. This covers the most common
+ * use-cases for reading from files without ugly code to check whether we got a line or not.
- *
++ *
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @author Matt Post post@cs.jhu.edu
+ */
+public class LineReader implements Reader<String>, AutoCloseable {
+
+ /*
- * Note: charset name is case-agnostic "UTF-8" is the canonical name "UTF8", "unicode-1-1-utf-8"
- * are aliases Java doesn't distinguish utf8 vs UTF-8 like Perl does
- */
- private static final Charset FILE_ENCODING = Charset.forName("UTF-8");
-
- /*
+ * The reader and its underlying input stream. We need to keep a hold of the underlying
+ * input stream so that we can query how many raw bytes it's read (for a generic progress
+ * meter that works across GZIP'ed and plain text files).
+ */
+ private BufferedReader reader;
+ private ProgressInputStream rawStream;
+
+ private String buffer;
+ private IOException error;
+
+ private int lineno = 0;
-
++
+ private boolean display_progress = false;
-
++
+ private int progress = 0;
+
+ // ===============================================================
+ // Constructors and destructors
+ // ===============================================================
+
+ /**
+ * Opens a file for iterating line by line. The special "-" filename can be used to specify
+ * STDIN. GZIP'd files are tested for automatically.
- *
++ *
+ * @param filename the file to be opened ("-" for STDIN)
+ * @throws IOException if there is an error reading the input file
+ */
+ public LineReader(String filename) throws IOException {
-
++
+ display_progress = (Decoder.VERBOSE >= 1);
-
++
+ progress = 0;
-
- InputStream stream = null;
++
++ InputStream stream = null;
+ long totalBytes = -1;
+ if (filename.equals("-")) {
+ rawStream = null;
+ stream = new FileInputStream(FileDescriptor.in);
+ } else {
+ totalBytes = new File(filename).length();
+ rawStream = new ProgressInputStream(new FileInputStream(filename), totalBytes);
+
+ try {
+ stream = new GZIPInputStream(rawStream);
+ } catch (Exception e) {
+ // GZIP ate a byte, so reset
+ rawStream.close();
+ stream = rawStream = new ProgressInputStream(new FileInputStream(filename), totalBytes);
+ }
- }
-
- this.reader = new BufferedReader(new InputStreamReader(stream, FILE_ENCODING));
++ }
++
++ this.reader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
+ }
-
++
+ public LineReader(String filename, boolean show_progress) throws IOException {
+ this(filename);
+ display_progress = (Decoder.VERBOSE >= 1 && show_progress);
+ }
+
+
+ /**
+ * Wraps an InputStream for iterating line by line. Stream encoding is assumed to be UTF-8.
+ * @param in an {@link java.io.InputStream} to wrap and iterate over line by line
+ */
+ public LineReader(InputStream in) {
- this.reader = new BufferedReader(new InputStreamReader(in, FILE_ENCODING));
++ this.reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
+ display_progress = false;
+ }
-
++
+ /**
- * Chain to the underlying {@link ProgressInputStream}.
- *
++ * Chain to the underlying {@link ProgressInputStream}.
++ *
+ * @return an integer from 0..100, indicating how much of the file has been read.
+ */
+ public int progress() {
+ return rawStream == null ? 0 : rawStream.progress();
+ }
-
++
+ /**
+ * This method will close the file handle, and will raise any exceptions that occurred during
+ * iteration. The method is idempotent, and all calls after the first are no-ops (unless the
+ * thread was interrupted or killed). For correctness, you <b>must</b> call this method before the
+ * object falls out of scope.
+ * @throws IOException if there is an error closing the file handler
+ */
++ @Override
+ public void close() throws IOException {
+
+ this.buffer = null; // Just in case it's a large string
+
+ if (null != this.reader) {
+ try {
+ // We assume the wrappers will percolate this down.
+ this.reader.close();
+
+ } catch (IOException e) {
+ // We need to trash our cached error for idempotence.
+ // Presumably the closing error is the more important
+ // one to throw.
+ this.error = null;
+ throw e;
+
+ } finally {
+ this.reader = null;
+ }
+ }
+
+ if (null != this.error) {
+ IOException e = this.error;
+ this.error = null;
+ throw e;
+ }
+ }
+
-
- /**
- * We attempt to avoid leaking file descriptors if you fail to call close before the object falls
- * out of scope. However, the language spec makes <b>no guarantees</b> about timeliness of garbage
- * collection. It is a bug to rely on this method to release the resources. Also, the garbage
- * collector will discard any exceptions that have queued up, without notifying the application in
- * any way.
- *
- * Having a finalizer means the JVM can't do "fast allocation" of LineReader objects (or
- * subclasses). This isn't too important due to disk latency, but may be worth noting.
- *
- * @see <a
- * href="http://java2go.blogspot.com/2007/09/javaone-2007-performance-tips-2-finish.html">Performance
- * Tips</a>
- * @see <a
- * href="http://www.javaworld.com/javaworld/jw-06-1998/jw-06-techniques.html?page=1">Techniques</a>
- */
- protected void finalize() throws Throwable {
- try {
- this.close();
- } catch (IOException e) {
- // Do nothing. The GC will discard the exception
- // anyways, but it may cause us to linger on the heap.
- } finally {
- super.finalize();
- }
- }
-
-
-
+ // ===============================================================
+ // Reader
+ // ===============================================================
+
+ // Copied from interface documentation.
+ /** Determine if the reader is ready to read a line. */
++ @Override
+ public boolean ready() throws IOException {
+ return this.reader.ready();
+ }
+
+
+ /**
+ * This method is like next() except that it throws the IOException directly. If there are no
+ * lines to be read then null is returned.
+ */
++ @Override
+ public String readLine() throws IOException {
+ if (this.hasNext()) {
+ String line = this.buffer;
+ this.buffer = null;
+ return line;
+
+ } else {
+ if (null != this.error) {
+ IOException e = this.error;
+ this.error = null;
+ throw e;
+ }
+ return null;
+ }
+ }
+
+
+ // ===============================================================
+ // Iterable -- because sometimes Java can be very stupid
+ // ===============================================================
+
+ /** Return self as an iterator. */
++ @Override
+ public Iterator<String> iterator() {
+ return this;
+ }
+
+
+ // ===============================================================
+ // Iterator
+ // ===============================================================
+
+ // Copied from interface documentation.
+ /**
+ * Returns <code>true</code> if the iteration has more elements. (In other words, returns
+ * <code>true</code> if <code>next</code> would return an element rather than throwing an
+ * exception.)
+ */
++ @Override
+ public boolean hasNext() {
+ if (null != this.buffer) {
+ return true;
+
+ } else if (null != this.error) {
+ return false;
+
+ } else {
+ // We're not allowed to throw IOException from within Iterator
+ try {
+ this.buffer = this.reader.readLine();
+ } catch (IOException e) {
+ this.buffer = null;
+ this.error = e;
+ return false;
+ }
+ return (null != this.buffer);
+ }
+ }
+
+
+ /**
+ * Return the next line of the file. If an error is encountered, NoSuchElementException is thrown.
+ * The actual IOException encountered will be thrown later, when the LineReader is closed. Also if
+ * there is no line to be read then NoSuchElementException is thrown.
+ */
++ @Override
+ public String next() throws NoSuchElementException {
+ if (this.hasNext()) {
+ if (display_progress) {
+ int newProgress = (reader != null) ? progress() : 100;
+// System.err.println(String.format("OLD %d NEW %d", progress, newProgress));
-
++
+ if (newProgress > progress) {
+ for (int i = progress + 1; i <= newProgress; i++)
+ if (i == 97) {
+ System.err.print("1");
+ } else if (i == 98) {
+ System.err.print("0");
+ } else if (i == 99) {
+ System.err.print("0");
+ } else if (i == 100) {
+ System.err.println("%");
+ } else if (i % 10 == 0) {
+ System.err.print(String.format("%d", i));
+ System.err.flush();
+ } else if ((i - 1) % 10 == 0)
+ ; // skip at 11 since 10, 20, etc take two digits
+ else {
+ System.err.print(".");
+ System.err.flush();
+ }
+ progress = newProgress;
+ }
+ }
-
++
+ String line = this.buffer;
+ this.lineno++;
+ this.buffer = null;
+ return line;
+ } else {
+ throw new NoSuchElementException();
+ }
+ }
-
++
+ /* Get the line number of the last line that was returned */
+ public int lineno() {
+ return this.lineno;
+ }
+
+ /** Unsupported. */
++ @Override
+ public void remove() throws UnsupportedOperationException {
+ throw new UnsupportedOperationException();
+ }
+
-
+ /**
- * Iterates over all lines, ignoring their contents, and returns the count of lines. If some lines
- * have already been read, this will return the count of remaining lines. Because no lines will
- * remain after calling this method, we implicitly call close.
- *
- * @return the number of lines read
- * @throws IOException if there is an error reading lines
- */
- public int countLines() throws IOException {
- int lines = 0;
-
- while (this.hasNext()) {
- this.next();
- lines++;
- }
- this.close();
-
- return lines;
- }
-
- /**
+ * Example usage code.
+ * @param args an input file
+ */
+ public static void main(String[] args) {
+ if (1 != args.length) {
+ System.out.println("Usage: java LineReader filename");
+ System.exit(1);
+ }
+
- try {
-
- LineReader in = new LineReader(args[0]);
- try {
- for (String line : in) {
-
- System.out.println(line);
-
- }
- } finally {
- in.close();
++ try (LineReader in = new LineReader(args[0]);) {
++ for (String line : in) {
++ System.out.println(line);
+ }
-
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b0b70627/joshua-core/src/main/java/org/apache/joshua/util/io/Reader.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/util/io/Reader.java
index cab6d74,0000000..e3a150e
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/util/io/Reader.java
+++ b/joshua-core/src/main/java/org/apache/joshua/util/io/Reader.java
@@@ -1,51 -1,0 +1,52 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.io;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+/**
+ * Common interface for Reader type objects.
- *
++ *
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $
+ */
- public interface Reader<E> extends Iterable<E>, Iterator<E> {
++public interface Reader<E> extends Iterable<E>, Iterator<E>, AutoCloseable {
+
- /**
++ /**
+ * Close the reader, freeing all resources.
+ * @throws IOException if there is an error closing the reader instance
+ */
++ @Override
+ void close() throws IOException;
+
- /**
++ /**
+ * Determine if the reader is ready to read a line.
+ * @return true if it is ready
+ * @throws IOException if there is an error whilst determining if the reader is ready
+ */
+ boolean ready() throws IOException;
+
- /**
++ /**
+ * Read a "line" and return an object representing it.
+ * @return an object representing a single line
+ * @throws IOException if there is an error reading lines
+ */
+ E readLine() throws IOException;
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b0b70627/joshua-core/src/main/java/org/apache/joshua/util/quantization/Quantizer.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/util/quantization/Quantizer.java
index 33a4e9a,0000000..ab291be
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/util/quantization/Quantizer.java
+++ b/joshua-core/src/main/java/org/apache/joshua/util/quantization/Quantizer.java
@@@ -1,45 -1,0 +1,43 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.quantization;
-
- import java.io.DataInputStream;
- import java.io.DataOutputStream;
- import java.io.IOException;
- import java.nio.ByteBuffer;
-
- public interface Quantizer {
-
- public float read(ByteBuffer stream, int position);
-
- public void write(ByteBuffer stream, float value);
-
- public void initialize();
-
- public void add(float key);
-
- public void finalize();
-
- public String getKey();
-
- public void writeState(DataOutputStream out) throws IOException;
-
- public void readState(DataInputStream in) throws IOException;
-
- public int size();
++
++import java.io.DataInputStream;
++import java.io.DataOutputStream;
++import java.io.IOException;
++import java.nio.ByteBuffer;
++
++public interface Quantizer {
++
++ public float read(ByteBuffer stream, int position);
++
++ public void write(ByteBuffer stream, float value);
++
++ public void initialize();
++
++ public void add(float key);
++
++ public String getKey();
++
++ public void writeState(DataOutputStream out) throws IOException;
++
++ public void readState(DataInputStream in) throws IOException;
++
++ public int size();
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b0b70627/joshua-core/src/main/java/org/apache/joshua/util/quantization/QuantizerConfiguration.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/util/quantization/QuantizerConfiguration.java
index f4765f9,0000000..39aef36
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/util/quantization/QuantizerConfiguration.java
+++ b/joshua-core/src/main/java/org/apache/joshua/util/quantization/QuantizerConfiguration.java
@@@ -1,119 -1,0 +1,114 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.quantization;
+
- import java.io.BufferedInputStream;
- import java.io.BufferedOutputStream;
- import java.io.DataInputStream;
- import java.io.DataOutputStream;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.HashMap;
- import java.util.List;
- import java.util.Map;
++import java.io.BufferedInputStream;
++import java.io.BufferedOutputStream;
++import java.io.DataInputStream;
++import java.io.DataOutputStream;
++import java.io.File;
++import java.io.FileInputStream;
++import java.io.FileOutputStream;
++import java.io.IOException;
++import java.util.ArrayList;
++import java.util.HashMap;
++import java.util.List;
++import java.util.Map;
+
- import org.apache.joshua.corpus.Vocabulary;
++import org.apache.joshua.corpus.Vocabulary;
+
- public class QuantizerConfiguration {
++public class QuantizerConfiguration {
+
- private static final Quantizer DEFAULT;
++ private static final Quantizer DEFAULT;
+
- private ArrayList<Quantizer> quantizers;
- private Map<Integer, Integer> quantizerByFeatureId;
++ private ArrayList<Quantizer> quantizers;
++ private Map<Integer, Integer> quantizerByFeatureId;
+
- static {
- DEFAULT = new BooleanQuantizer();
- }
++ static {
++ DEFAULT = new BooleanQuantizer();
++ }
+
- public QuantizerConfiguration() {
- quantizers = new ArrayList<Quantizer>();
- quantizerByFeatureId = new HashMap<Integer, Integer>();
- }
++ public QuantizerConfiguration() {
++ quantizers = new ArrayList<Quantizer>();
++ quantizerByFeatureId = new HashMap<Integer, Integer>();
++ }
+
- public void add(String quantizer_key, List<Integer> feature_ids) {
- Quantizer q = QuantizerFactory.get(quantizer_key);
- quantizers.add(q);
- int index = quantizers.size() - 1;
- for (int feature_id : feature_ids)
- quantizerByFeatureId.put(feature_id, index);
- }
++ public void add(String quantizer_key, List<Integer> feature_ids) {
++ Quantizer q = QuantizerFactory.get(quantizer_key);
++ quantizers.add(q);
++ int index = quantizers.size() - 1;
++ for (int feature_id : feature_ids)
++ quantizerByFeatureId.put(feature_id, index);
++ }
+
- public void initialize() {
- for (Quantizer q : quantizers)
- q.initialize();
- }
++ public void initialize() {
++ for (Quantizer q : quantizers)
++ q.initialize();
++ }
+
- public void finalize() {
- for (Quantizer q : quantizers)
- q.finalize();
- }
++ public final Quantizer get(int feature_id) {
++ Integer index = quantizerByFeatureId.get(feature_id);
++ return (index != null ? quantizers.get(index) : DEFAULT);
++ }
+
- public final Quantizer get(int feature_id) {
- Integer index = quantizerByFeatureId.get(feature_id);
- return (index != null ? quantizers.get(index) : DEFAULT);
- }
++ public void read(String file_name) throws IOException {
++ quantizers.clear();
++ quantizerByFeatureId.clear();
+
- public void read(String file_name) throws IOException {
- quantizers.clear();
- quantizerByFeatureId.clear();
++ File quantizer_file = new File(file_name);
++ DataInputStream in_stream =
++ new DataInputStream(new BufferedInputStream(new FileInputStream(quantizer_file)));
++ int num_quantizers = in_stream.readInt();
++ quantizers.ensureCapacity(num_quantizers);
++ for (int i = 0; i < num_quantizers; i++) {
++ String key = in_stream.readUTF();
++ Quantizer q = QuantizerFactory.get(key);
++ q.readState(in_stream);
++ quantizers.add(q);
++ }
++ int num_mappings = in_stream.readInt();
++ for (int i = 0; i < num_mappings; i++) {
++ String feature_name = in_stream.readUTF();
++ int feature_id = Vocabulary.id(feature_name);
++ int quantizer_index = in_stream.readInt();
++ if (quantizer_index >= num_quantizers) {
++ throw new RuntimeException("Error deserializing QuanitzerConfig. " + "Feature "
++ + feature_name + " referring to quantizer " + quantizer_index + " when only "
++ + num_quantizers + " known.");
++ }
++ this.quantizerByFeatureId.put(feature_id, quantizer_index);
++ }
++ in_stream.close();
++ }
+
- File quantizer_file = new File(file_name);
- DataInputStream in_stream =
- new DataInputStream(new BufferedInputStream(new FileInputStream(quantizer_file)));
- int num_quantizers = in_stream.readInt();
- quantizers.ensureCapacity(num_quantizers);
- for (int i = 0; i < num_quantizers; i++) {
- String key = in_stream.readUTF();
- Quantizer q = QuantizerFactory.get(key);
- q.readState(in_stream);
- quantizers.add(q);
- }
- int num_mappings = in_stream.readInt();
- for (int i = 0; i < num_mappings; i++) {
- String feature_name = in_stream.readUTF();
- int feature_id = Vocabulary.id(feature_name);
- int quantizer_index = in_stream.readInt();
- if (quantizer_index >= num_quantizers) {
- throw new RuntimeException("Error deserializing QuanitzerConfig. " + "Feature "
- + feature_name + " referring to quantizer " + quantizer_index + " when only "
- + num_quantizers + " known.");
- }
- this.quantizerByFeatureId.put(feature_id, quantizer_index);
- }
- in_stream.close();
- }
-
- public void write(String file_name) throws IOException {
- File vocab_file = new File(file_name);
- DataOutputStream out_stream =
- new DataOutputStream(new BufferedOutputStream(new FileOutputStream(vocab_file)));
- out_stream.writeInt(quantizers.size());
- for (int index = 0; index < quantizers.size(); index++)
- quantizers.get(index).writeState(out_stream);
- out_stream.writeInt(quantizerByFeatureId.size());
- for (int feature_id : quantizerByFeatureId.keySet()) {
- out_stream.writeUTF(Vocabulary.word(feature_id));
- out_stream.writeInt(quantizerByFeatureId.get(feature_id));
- }
- out_stream.close();
- }
++ public void write(String file_name) throws IOException {
++ File vocab_file = new File(file_name);
++ DataOutputStream out_stream =
++ new DataOutputStream(new BufferedOutputStream(new FileOutputStream(vocab_file)));
++ out_stream.writeInt(quantizers.size());
++ for (int index = 0; index < quantizers.size(); index++)
++ quantizers.get(index).writeState(out_stream);
++ out_stream.writeInt(quantizerByFeatureId.size());
++ for (int feature_id : quantizerByFeatureId.keySet()) {
++ out_stream.writeUTF(Vocabulary.word(feature_id));
++ out_stream.writeInt(quantizerByFeatureId.get(feature_id));
++ }
++ out_stream.close();
++ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b0b70627/joshua-core/src/main/java/org/apache/joshua/util/quantization/StatelessQuantizer.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/util/quantization/StatelessQuantizer.java
index e81e945,0000000..a241cdf
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/util/quantization/StatelessQuantizer.java
+++ b/joshua-core/src/main/java/org/apache/joshua/util/quantization/StatelessQuantizer.java
@@@ -1,38 -1,0 +1,40 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.util.quantization;
+
- import java.io.DataInputStream;
- import java.io.DataOutputStream;
- import java.io.IOException;
++import java.io.DataInputStream;
++import java.io.DataOutputStream;
++import java.io.IOException;
+
- abstract class StatelessQuantizer implements Quantizer {
++abstract class StatelessQuantizer implements Quantizer {
+
- public void initialize() {}
++ @Override
++ public void initialize() {}
+
- public void add(float key) {}
++ @Override
++ public void add(float key) {}
+
- public void finalize() {}
++ @Override
++ public void writeState(DataOutputStream out) throws IOException {
++ out.writeUTF(getKey());
++ }
+
- public void writeState(DataOutputStream out) throws IOException {
- out.writeUTF(getKey());
- }
-
- public void readState(DataInputStream in) throws IOException {}
++ @Override
++ public void readState(DataInputStream in) throws IOException {}
+}