You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/01 02:52:13 UTC

[78/94] [abbrv] incubator-joshua git commit: Merge remote-tracking branch 'origin/master' into JOSHUA-252

Merge remote-tracking branch 'origin/master' into JOSHUA-252

Includes many updates for tests, and updating hard-coded invocations of the old Java path.

# Conflicts:
#	src/main/java/org/apache/joshua/decoder/StructuredTranslation.java
#	src/main/java/org/apache/joshua/decoder/Translation.java
#	src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java
#	src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java
#	src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java
#	src/main/java/org/apache/joshua/decoder/hypergraph/KBestExtractor.java
#	src/test/java/org/apache/joshua/corpus/VocabularyTest.java
#	src/test/java/org/apache/joshua/decoder/kbest_extraction/KBestExtractionTest.java
#	src/test/java/org/apache/joshua/decoder/phrase/constrained/ConstrainedPhraseDecodingTest.java
#	src/test/java/org/apache/joshua/decoder/phrase/decode/PhraseDecodingTest.java
#	src/test/java/org/apache/joshua/system/KenLmTest.java
#	src/test/java/org/apache/joshua/system/MultithreadedTranslationTests.java
#	src/test/java/org/apache/joshua/system/StructuredOutputTest.java
#	src/test/java/org/apache/joshua/system/StructuredTranslationTest.java
#	src/test/java/org/apache/joshua/util/FormatUtilsTest.java


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/91400fe2
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/91400fe2
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/91400fe2

Branch: refs/heads/master
Commit: 91400fe2a457d04c45a8daa12d74740d582cbca3
Parents: ef00b03 15c7619
Author: Matt Post <po...@cs.jhu.edu>
Authored: Tue May 31 12:36:43 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Tue May 31 12:36:43 2016 -0400

----------------------------------------------------------------------
 scripts/support/create_glue_grammar.sh          |   2 +-
 scripts/support/grammar-packer.pl               |   2 +-
 scripts/training/pipeline.pl                    |   4 +-
 .../joshua/decoder/StructuredTranslation.java   |  66 ++++------
 .../decoder/StructuredTranslationFactory.java   | 112 +++++++++++++++++
 .../org/apache/joshua/decoder/Translation.java  | 125 +++++++++++++------
 .../BloomFilterLanguageModel.java               |   1 -
 .../joshua/decoder/ff/tm/GrammarReader.java     |   2 -
 .../decoder/ff/tm/format/HieroFormatReader.java |  13 +-
 .../decoder/ff/tm/format/MosesFormatReader.java |  12 +-
 .../tm/hash_based/MemoryBasedBatchGrammar.java  |   1 -
 .../decoder/hypergraph/AlignedSourceTokens.java |  11 +-
 .../decoder/hypergraph/KBestExtractor.java      |  84 +++++++++----
 .../decoder/hypergraph/WordAlignmentState.java  | 103 ++++++++-------
 .../apache/joshua/decoder/io/JSONMessage.java   |   2 +-
 .../org/apache/joshua/tools/GrammarPacker.java  |   3 +-
 .../java/org/apache/joshua/util/Constants.java  |  36 ++++++
 .../org/apache/joshua/util/FileUtility.java     |   1 -
 .../org/apache/joshua/util/io/LineReader.java   |   5 +-
 src/main/resources/log4j.properties             |   4 +-
 .../apache/joshua/corpus/CorpusArrayTest.java   |  14 ---
 .../apache/joshua/corpus/VocabularyTest.java    |  27 ++--
 .../joshua/decoder/DecoderThreadTest.java       |   4 -
 .../kbest_extraction/KBestExtractionTest.java   |  10 +-
 .../ConstrainedPhraseDecodingTest.java          |  10 +-
 .../phrase/decode/PhraseDecodingTest.java       |  10 +-
 .../org/apache/joshua/system/KenLmTest.java     |   2 -
 .../system/MultithreadedTranslationTests.java   |   4 +-
 .../joshua/system/StructuredOutputTest.java     |  20 ++-
 .../system/StructuredTranslationTest.java       | 107 ++++++++++++----
 .../org/apache/joshua/util/FormatUtilsTest.java |   6 -
 src/test/resources/packed-grammar/test.sh       |   2 +-
 .../resources/thrax/filtering/test-exact.sh     |   0
 src/test/resources/thrax/filtering/test-fast.sh |   0
 .../resources/thrax/filtering/test-loose.sh     |   0
 35 files changed, 515 insertions(+), 290 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/scripts/support/create_glue_grammar.sh
----------------------------------------------------------------------
diff --cc scripts/support/create_glue_grammar.sh
index 166e6ee,166e6ee..df60ade
--- a/scripts/support/create_glue_grammar.sh
+++ b/scripts/support/create_glue_grammar.sh
@@@ -23,4 -23,4 +23,4 @@@ if [[ -z "$1" ]]; the
    exit
  fi
  
--java -Xmx2g -cp $JOSHUA/lib/args4j-2.0.29.jar:$JOSHUA/class joshua.decoder.ff.tm.CreateGlueGrammar -g $1
++java -Xmx2g -cp $JOSHUA/target/joshua-*-jar-with-dependencies.jar org.apache.joshua.decoder.ff.tm.CreateGlueGrammar -g $1

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/scripts/support/grammar-packer.pl
----------------------------------------------------------------------
diff --cc scripts/support/grammar-packer.pl
index 7cd3153,7cd3153..a0caf3c
--- a/scripts/support/grammar-packer.pl
+++ b/scripts/support/grammar-packer.pl
@@@ -90,7 -90,7 +90,7 @@@ foreach my $grammar (@grammars) 
  my $grammars = join(" ", @sorted_grammars);
  my $outputs  = join(" ", @outputs);
  my $alignments = $opts{a} ? "--ga" : "";
--my $cmd = "java -Xmx$opts{m} -cp $JOSHUA/lib/args4j-2.0.29.jar:$JOSHUA/lib/guava-19.0.jar:$JOSHUA/class joshua.tools.GrammarPackerCli -g $grammars --outputs $outputs $alignments";
++my $cmd = "java -Xmx$opts{m} -cp $JOSHUA/target/joshua-*-jar-with-dependencies.jar org.apache.joshua.tools.GrammarPackerCli -g $grammars --outputs $outputs $alignments";
  print STDERR "Packing with $cmd...\n" if $opts{v};
  
  my $retval = system($cmd);

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --cc scripts/training/pipeline.pl
index e6db89d,88d641c..d9aec24
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@@ -1377,7 -1362,7 +1377,7 @@@ if ($DO_FILTER_TM and defined $GRAMMAR_
  if ($OPTIMIZER_RUN == 1 and defined $TUNE_GRAMMAR and $GRAMMAR_TYPE ne "phrase" and $GRAMMAR_TYPE ne "moses") {
    if (! defined $GLUE_GRAMMAR_FILE) {
      $cachepipe->cmd("glue-tune",
--                    "java -Xmx2g -cp $JOSHUA/lib/args4j-2.0.29.jar:$JOSHUA/class joshua.decoder.ff.tm.CreateGlueGrammar -g $TUNE_GRAMMAR > $DATA_DIRS{tune}/grammar.glue",
++                    "$JOSHUA/scripts/support/create_glue_grammar.sh $TUNE_GRAMMAR > $DATA_DIRS{tune}/grammar.glue",
                      get_file_from_grammar($TUNE_GRAMMAR),
                      "$DATA_DIRS{tune}/grammar.glue");
      $GLUE_GRAMMAR_FILE = "$DATA_DIRS{tune}/grammar.glue";
@@@ -1594,7 -1579,7 +1594,7 @@@ if ($DO_FILTER_TM and defined $GRAMMAR_
  if ($OPTIMIZER_RUN == 1 and defined $TEST_GRAMMAR and $GRAMMAR_TYPE ne "phrase" and $GRAMMAR_TYPE ne "moses") {
    if (! defined $GLUE_GRAMMAR_FILE) {
      $cachepipe->cmd("glue-test",
--                    "java -Xmx2g -cp $JOSHUA/lib/args4j-2.0.29.jar:$JOSHUA/class joshua.decoder.ff.tm.CreateGlueGrammar -g $TEST_GRAMMAR > $DATA_DIRS{test}/grammar.glue",
++                    "$JOSHUA/scripts/support/create_glue_grammar.sh $TEST_GRAMMAR > $DATA_DIRS{test}/grammar.glue",
                      get_file_from_grammar($TEST_GRAMMAR),
                      "$DATA_DIRS{test}/grammar.glue");
      $GLUE_GRAMMAR_FILE = "$DATA_DIRS{test}/grammar.glue";

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/java/org/apache/joshua/decoder/StructuredTranslation.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/StructuredTranslation.java
index 8aa518e,0000000..b548a74
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/StructuredTranslation.java
+++ b/src/main/java/org/apache/joshua/decoder/StructuredTranslation.java
@@@ -1,126 -1,0 +1,102 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.decoder;
 +
- import static java.util.Arrays.asList;
- import static java.util.Collections.emptyList;
- import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiFeatures;
- import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiString;
- import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiWordAlignmentList;
- import static org.apache.joshua.util.FormatUtils.removeSentenceMarkers;
- 
 +import java.util.List;
 +import java.util.Map;
 +
- import org.apache.joshua.decoder.ff.FeatureFunction;
- import org.apache.joshua.decoder.hypergraph.HyperGraph;
 +import org.apache.joshua.decoder.segment_file.Sentence;
 +
 +/**
-  * <p>structuredTranslation provides a more structured access to translation
-  * results than the Translation class.
-  * Members of instances of this class can be used upstream.</p>
-  * TODO:
-  * Enable K-Best extraction.
++ * A StructuredTranslation instance provides a more structured access to
++ * translation results than the string-based Translation class.
++ * This is useful if the decoder is encapsulated in a larger project, instead
++ * of simply writing to a file or stdout.
++ * StructuredTranslation encodes all relevant information about a derivation,
++ * namely output string, tokens, score, features, and word alignment.
 + * 
 + * @author fhieber
 + */
 +public class StructuredTranslation {
 +  
 +  private final Sentence sourceSentence;
 +  private final String translationString;
 +  private final List<String> translationTokens;
 +  private final float translationScore;
 +  private final List<List<Integer>> translationWordAlignments;
 +  private final Map<String,Float> translationFeatures;
 +  private final float extractionTime;
 +  
-   public StructuredTranslation(final Sentence sourceSentence,
-       final HyperGraph hypergraph,
-       final List<FeatureFunction> featureFunctions) {
-     
-       final long startTime = System.currentTimeMillis();
-       
-       this.sourceSentence = sourceSentence;
-       this.translationString = removeSentenceMarkers(getViterbiString(hypergraph));
-       this.translationTokens = extractTranslationTokens();
-       this.translationScore = extractTranslationScore(hypergraph);
-       this.translationFeatures = getViterbiFeatures(hypergraph, featureFunctions, sourceSentence).getMap();
-       this.translationWordAlignments = getViterbiWordAlignmentList(hypergraph);
-       this.extractionTime = (System.currentTimeMillis() - startTime) / 1000.0f;
-   }
-   
-   private float extractTranslationScore(final HyperGraph hypergraph) {
-     if (hypergraph == null) {
-       return 0;
-     } else {
-       return hypergraph.goalNode.getScore();
-     }
++  public StructuredTranslation(
++      final Sentence sourceSentence,
++      final String translationString,
++      final List<String> translationTokens,
++      final float translationScore,
++      final List<List<Integer>> translationWordAlignments,
++      final Map<String,Float> translationFeatures,
++      final float extractionTime) {
++    this.sourceSentence = sourceSentence;
++    this.translationString = translationString;
++    this.translationTokens = translationTokens;
++    this.translationScore = translationScore;
++    this.translationWordAlignments = translationWordAlignments;
++    this.translationFeatures = translationFeatures;
++    this.extractionTime = extractionTime;
 +  }
 +  
-   private List<String> extractTranslationTokens() {
-     if (translationString.isEmpty()) {
-       return emptyList();
-     } else {
-       return asList(translationString.split("\\s+"));
-     }
-   }
-   
-   // Getters to use upstream
-   
 +  public Sentence getSourceSentence() {
 +    return sourceSentence;
 +  }
 +
 +  public int getSentenceId() {
 +    return sourceSentence.id();
 +  }
 +
 +  public String getTranslationString() {
 +    return translationString;
 +  }
 +
 +  public List<String> getTranslationTokens() {
 +    return translationTokens;
 +  }
 +
 +  public float getTranslationScore() {
 +    return translationScore;
 +  }
 +
 +  /**
 +   * Returns a list of target to source alignments.
 +   * @return a list of target to source alignments
 +   */
 +  public List<List<Integer>> getTranslationWordAlignments() {
 +    return translationWordAlignments;
 +  }
 +  
 +  public Map<String,Float> getTranslationFeatures() {
 +    return translationFeatures;
 +  }
 +  
 +  /**
 +   * Time taken to build output information from the hypergraph.
 +   * @return the time taken to build output information from the hypergraph
 +   */
 +  public Float getExtractionTime() {
 +    return extractionTime;
 +  }
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/java/org/apache/joshua/decoder/StructuredTranslationFactory.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/StructuredTranslationFactory.java
index 0000000,0000000..de5a90b
new file mode 100644
--- /dev/null
+++ b/src/main/java/org/apache/joshua/decoder/StructuredTranslationFactory.java
@@@ -1,0 -1,0 +1,112 @@@
++/*
++ * Licensed to the Apache Software Foundation (ASF) under one
++ * or more contributor license agreements.  See the NOTICE file
++ * distributed with this work for additional information
++ * regarding copyright ownership.  The ASF licenses this file
++ * to you under the Apache License, Version 2.0 (the
++ * "License"); you may not use this file except in compliance
++ * with the License.  You may obtain a copy of the License at
++ *
++ *  http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing,
++ * software distributed under the License is distributed on an
++ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
++ * KIND, either express or implied.  See the License for the
++ * specific language governing permissions and limitations
++ * under the License.
++ */
++package org.apache.joshua.decoder;
++
++import static java.util.Arrays.asList;
++import static java.util.Collections.emptyList;
++import static java.util.Collections.emptyMap;
++import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiFeatures;
++import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiString;
++import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiWordAlignmentList;
++import static org.apache.joshua.util.FormatUtils.removeSentenceMarkers;
++
++import java.util.List;
++
++import org.apache.joshua.decoder.ff.FeatureFunction;
++import org.apache.joshua.decoder.hypergraph.HyperGraph;
++import org.apache.joshua.decoder.hypergraph.KBestExtractor.DerivationState;
++import org.apache.joshua.decoder.segment_file.Sentence;
++
++/**
++ * This factory provides methods to create StructuredTranslation objects
++ * from either Viterbi derivations or KBest derivations.
++ * 
++ * @author fhieber
++ */
++public class StructuredTranslationFactory {
++  
++  /**
++   * Returns a StructuredTranslation instance from the Viterbi derivation.
++   * @return A StructuredTranslation object representing the Viterbi derivation.
++   */
++  public static StructuredTranslation fromViterbiDerivation(
++      final Sentence sourceSentence,
++      final HyperGraph hypergraph,
++      final List<FeatureFunction> featureFunctions) {
++    final long startTime = System.currentTimeMillis();
++    final String translationString = removeSentenceMarkers(getViterbiString(hypergraph));
++    return new StructuredTranslation(
++        sourceSentence,
++        translationString,
++        extractTranslationTokens(translationString),
++        extractTranslationScore(hypergraph),
++        getViterbiWordAlignmentList(hypergraph),
++        getViterbiFeatures(hypergraph, featureFunctions, sourceSentence).getMap(),
++        (System.currentTimeMillis() - startTime) / 1000.0f);
++  }
++  
++  /**
++   * Returns a StructuredTranslation from an empty decoder output
++   * @param sourceSentence
++   * @return
++   */
++  public static StructuredTranslation fromEmptyOutput(final Sentence sourceSentence) {
++        return new StructuredTranslation(
++                sourceSentence, "", emptyList(), 0, emptyList(), emptyMap(), 0f);
++      }
++  
++  /**
++   * Returns a StructuredTranslation instance from a KBest DerivationState. 
++   * @param sourceSentence Sentence object representing the source.
++   * @param derivationState the KBest DerivationState.
++   * @return A StructuredTranslation object representing the derivation encoded by derivationState.
++   */
++  public static StructuredTranslation fromKBestDerivation(
++      final Sentence sourceSentence,
++      final DerivationState derivationState) {
++    final long startTime = System.currentTimeMillis();
++    final String translationString = removeSentenceMarkers(derivationState.getHypothesis());
++    return new StructuredTranslation(
++        sourceSentence,
++        translationString,
++        extractTranslationTokens(translationString),
++        derivationState.getModelCost(),
++        derivationState.getWordAlignmentList(),
++        derivationState.getFeatures().getMap(),
++        (System.currentTimeMillis() - startTime) / 1000.0f);
++  }
++  
++  private static float extractTranslationScore(final HyperGraph hypergraph) {
++    if (hypergraph == null) {
++      return 0;
++    } else {
++      return hypergraph.goalNode.getScore();
++    }
++  }
++  
++  private static List<String> extractTranslationTokens(final String translationString) {
++    if (translationString.isEmpty()) {
++      return emptyList();
++    } else {
++      return asList(translationString.split("\\s+"));
++    }
++  }
++  
++
++}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/java/org/apache/joshua/decoder/Translation.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/Translation.java
index ab37814,0000000..6b8e5e4
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/Translation.java
+++ b/src/main/java/org/apache/joshua/decoder/Translation.java
@@@ -1,203 -1,0 +1,248 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.decoder;
 +
 +import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiFeatures;
 +import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiString;
 +import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiWordAlignments;
++import static org.apache.joshua.decoder.StructuredTranslationFactory.fromViterbiDerivation;
 +import static org.apache.joshua.util.FormatUtils.removeSentenceMarkers;
++import static java.util.Arrays.asList;
 +
 +import java.io.BufferedWriter;
 +import java.io.IOException;
 +import java.io.StringWriter;
 +import java.util.List;
 +
 +import org.apache.joshua.decoder.ff.FeatureFunction;
 +import org.apache.joshua.decoder.ff.FeatureVector;
 +import org.apache.joshua.decoder.ff.lm.StateMinimizingLanguageModel;
 +import org.apache.joshua.decoder.hypergraph.HyperGraph;
 +import org.apache.joshua.decoder.hypergraph.KBestExtractor;
 +import org.apache.joshua.decoder.io.DeNormalize;
 +import org.apache.joshua.decoder.segment_file.Sentence;
 +import org.slf4j.Logger;
 +import org.slf4j.LoggerFactory;
 +
 +/**
 + * This class represents translated input objects (sentences or lattices). It is aware of the source
 + * sentence and id and contains the decoded hypergraph. Translation objects are returned by
 + * DecoderThread instances to the InputHandler, where they are assembled in order for output.
 + * 
++<<<<<<< HEAD:src/main/java/org/apache/joshua/decoder/Translation.java
 + * @author Matt Post post@cs.jhu.edu
++=======
++ * @author Matt Post <po...@cs.jhu.edu>
++ * @author Felix Hieber <fh...@amazon.com>
++>>>>>>> origin/master:src/joshua/decoder/Translation.java
 + */
 +
 +public class Translation {
 +  private static final Logger LOG = LoggerFactory.getLogger(Translation.class);
 +  private Sentence source;
 +
 +  /**
 +   * This stores the output of the translation so we don't have to hold onto the hypergraph while we
 +   * wait for the outputs to be assembled.
 +   */
 +  private String output = null;
 +
-   private StructuredTranslation structuredTranslation = null;
- 
++  /**
++   * Stores the list of StructuredTranslations.
++   * If joshuaConfig.topN == 0, will only contain the Viterbi translation.
++   * Else it will use KBestExtractor to populate this list.
++   */
++  private List<StructuredTranslation> structuredTranslations = null;
++  
 +  public Translation(Sentence source, HyperGraph hypergraph, 
 +      List<FeatureFunction> featureFunctions, JoshuaConfiguration joshuaConfiguration) {
 +    this.source = source;
- 
++    
++    /**
++     * Structured output from Joshua provides a way to programmatically access translation results
++     * from downstream applications, instead of writing results as strings to an output buffer.
++     */
 +    if (joshuaConfiguration.use_structured_output) {
- 
-       structuredTranslation = new StructuredTranslation(
-           source, hypergraph, featureFunctions);
-       this.output = structuredTranslation.getTranslationString();
++      
++      if (joshuaConfiguration.topN == 0) {
++        /*
++         * Obtain Viterbi StructuredTranslation
++         */
++        StructuredTranslation translation = fromViterbiDerivation(source, hypergraph, featureFunctions);
++        this.output = translation.getTranslationString();
++        structuredTranslations = asList(translation);
++        
++      } else {
++        /*
++         * Get K-Best list of StructuredTranslations
++         */
++        final KBestExtractor kBestExtractor = new KBestExtractor(source, featureFunctions, Decoder.weights, false, joshuaConfiguration);
++        structuredTranslations = kBestExtractor.KbestExtractOnHG(hypergraph, joshuaConfiguration.topN);
++        if (structuredTranslations.isEmpty()) {
++            structuredTranslations = asList(StructuredTranslationFactory.fromEmptyOutput(source));
++            this.output = "";
++        } else {
++            this.output = structuredTranslations.get(0).getTranslationString();
++        }
++        // TODO: We omit the BLEU rescoring for now since it is not clear whether it works at all and what the desired output is below.
++      }
 +
 +    } else {
 +
 +      StringWriter sw = new StringWriter();
 +      BufferedWriter out = new BufferedWriter(sw);
 +
 +      try {
++        
 +        if (hypergraph != null) {
++          
 +          if (!joshuaConfiguration.hypergraphFilePattern.equals("")) {
 +            hypergraph.dump(String.format(joshuaConfiguration.hypergraphFilePattern, source.id()), featureFunctions);
 +          }
 +
 +          long startTime = System.currentTimeMillis();
 +
 +          // We must put this weight as zero, otherwise we get an error when we try to retrieve it
 +          // without checking
 +          Decoder.weights.increment("BLEU", 0);
 +
 +          if (joshuaConfiguration.topN == 0) {
 +
 +            /* construct Viterbi output */
 +            final String best = getViterbiString(hypergraph);
 +
 +            LOG.info("Translation {}: {} {}", source.id(), hypergraph.goalNode.getScore(), best);
 +
 +            /*
 +             * Setting topN to 0 turns off k-best extraction, in which case we need to parse through
 +             * the output-string, with the understanding that we can only substitute variables for the
 +             * output string, sentence number, and model score.
 +             */
 +            String translation = joshuaConfiguration.outputFormat
 +                .replace("%s", removeSentenceMarkers(best))
 +                .replace("%S", DeNormalize.processSingleLine(best))
 +                .replace("%c", String.format("%.3f", hypergraph.goalNode.getScore()))
 +                .replace("%i", String.format("%d", source.id()));
 +
 +            if (joshuaConfiguration.outputFormat.contains("%a")) {
 +              translation = translation.replace("%a", getViterbiWordAlignments(hypergraph));
 +            }
 +
 +            if (joshuaConfiguration.outputFormat.contains("%f")) {
 +              final FeatureVector features = getViterbiFeatures(hypergraph, featureFunctions, source);
 +              translation = translation.replace("%f", joshuaConfiguration.moses ? features.mosesString() : features.toString());
 +            }
 +
 +            out.write(translation);
 +            out.newLine();
 +
 +          } else {
 +
 +            final KBestExtractor kBestExtractor = new KBestExtractor(
 +                source, featureFunctions, Decoder.weights, false, joshuaConfiguration);
 +            kBestExtractor.lazyKBestExtractOnHG(hypergraph, joshuaConfiguration.topN, out);
 +
 +            if (joshuaConfiguration.rescoreForest) {
 +              Decoder.weights.increment("BLEU", joshuaConfiguration.rescoreForestWeight);
 +              kBestExtractor.lazyKBestExtractOnHG(hypergraph, joshuaConfiguration.topN, out);
 +
 +              Decoder.weights.increment("BLEU", -joshuaConfiguration.rescoreForestWeight);
 +              kBestExtractor.lazyKBestExtractOnHG(hypergraph, joshuaConfiguration.topN, out);
 +            }
 +          }
 +
 +          float seconds = (float) (System.currentTimeMillis() - startTime) / 1000.0f;
 +          LOG.info("Input {}: {}-best extraction took {} seconds", id(),
 +              joshuaConfiguration.topN, seconds);
 +
 +        } else {
- 
++          
 +          // Failed translations and blank lines get empty formatted outputs
-           // @formatter:off
-           String outputString = joshuaConfiguration.outputFormat
-               .replace("%s", source.source())
-               .replace("%e", "")
-               .replace("%S", "")
-               .replace("%t", "()")
-               .replace("%i", Integer.toString(source.id()))
-               .replace("%f", "")
-               .replace("%c", "0.000");
-           // @formatter:on
- 
-           out.write(outputString);
++          out.write(getFailedTranslationOutput(source, joshuaConfiguration));
 +          out.newLine();
++          
 +        }
 +
 +        out.flush();
++        
 +      } catch (IOException e) {
 +        throw new RuntimeException(e);
 +      }
 +
 +      this.output = sw.toString();
 +
 +    }
- 
-     /*
-      * KenLM hack. If using KenLMFF, we need to tell KenLM to delete the pool used to create chart
-      * objects for this sentence.
-      */
-     for (FeatureFunction feature : featureFunctions) {
-       if (feature instanceof StateMinimizingLanguageModel) {
-         ((StateMinimizingLanguageModel) feature).destroyPool(getSourceSentence().id());
-         break;
-       }
-     }
++    
++    // remove state from StateMinimizingLanguageModel instances in features.
++    destroyKenLMStates(featureFunctions);
 +
 +  }
 +
 +  public Sentence getSourceSentence() {
 +    return this.source;
 +  }
 +
 +  public int id() {
 +    return source.id();
 +  }
 +
 +  @Override
 +  public String toString() {
 +    return output;
 +  }
- 
++  
++  private String getFailedTranslationOutput(final Sentence source, final JoshuaConfiguration joshuaConfiguration) {
++    return joshuaConfiguration.outputFormat
++        .replace("%s", source.source())
++        .replace("%e", "")
++        .replace("%S", "")
++        .replace("%t", "()")
++        .replace("%i", Integer.toString(source.id()))
++        .replace("%f", "")
++        .replace("%c", "0.000");
++  }
++  
++  /**
++   * Returns the StructuredTranslations
++   * if JoshuaConfiguration.use_structured_output == True.
++   * @throws RuntimeException if JoshuaConfiguration.use_structured_output == False.
++   * @return List of StructuredTranslations.
++   */
++  public List<StructuredTranslation> getStructuredTranslations() {
++    if (structuredTranslations == null) {
++      throw new RuntimeException(
++          "No StructuredTranslation objects created. You should set JoshuaConfigration.use_structured_output = true");
++    }
++    return structuredTranslations;
++  }
++  
 +  /**
-    * Returns the StructuredTranslation object
-    * if JoshuaConfiguration.construct_structured_output == True.
-    * @throws RuntimeException if StructuredTranslation object not set.
-    * @return {@link org.apache.joshua.decoder.StructuredTranslation} object
++   * KenLM hack. If using KenLMFF, we need to tell KenLM to delete the pool used to create chart
++   * objects for this sentence.
 +   */
-   public StructuredTranslation getStructuredTranslation() {
-     if (structuredTranslation == null) {
-       throw new RuntimeException("No StructuredTranslation object created. You should set JoshuaConfigration.construct_structured_output = true");
++  private void destroyKenLMStates(final List<FeatureFunction> featureFunctions) {
++    for (FeatureFunction feature : featureFunctions) {
++      if (feature instanceof StateMinimizingLanguageModel) {
++        ((StateMinimizingLanguageModel) feature).destroyPool(getSourceSentence().id());
++        break;
++      }
 +    }
-     return structuredTranslation;
 +  }
 +
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java
index 7d0e599,0000000..90d90b8
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java
@@@ -1,561 -1,0 +1,560 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.decoder.ff.lm.bloomfilter_lm;
 +
 +import java.io.Externalizable;
 +import java.io.FileInputStream;
- import java.io.FileNotFoundException;
 +import java.io.FileOutputStream;
 +import java.io.IOException;
 +import java.io.InputStream;
 +import java.io.ObjectInput;
 +import java.io.ObjectInputStream;
 +import java.io.ObjectOutput;
 +import java.io.ObjectOutputStream;
 +import java.util.HashMap;
 +import java.util.zip.GZIPInputStream;
 +import java.util.zip.GZIPOutputStream;
 +
 +import org.apache.joshua.corpus.Vocabulary;
 +import org.apache.joshua.decoder.ff.lm.DefaultNGramLanguageModel;
 +import org.apache.joshua.util.Regex;
 +import org.apache.joshua.util.io.LineReader;
 +import org.slf4j.Logger;
 +import org.slf4j.LoggerFactory;
 +
 +/**
 + * An n-gram language model with linearly-interpolated Witten-Bell smoothing, using a Bloom filter
 + * as its main data structure. A Bloom filter is a lossy data structure that can be used to test for
 + * set membership.
 + */
 +public class BloomFilterLanguageModel extends DefaultNGramLanguageModel implements Externalizable {
 +  /**
 +   * An initial value used for hashing n-grams so that they can be stored in a bloom filter.
 +   */
 +  public static final int HASH_SEED = 17;
 +
 +  /**
 +   * Another value used in the process of hashing n-grams.
 +   */
 +  public static final int HASH_OFFSET = 37;
 +
 +  /**
 +   * The maximum score that a language model feature function can return to the Joshua decoder.
 +   */
 +  public static final double MAX_SCORE = 100.0;
 +
 +  /**
 +   * The logger for this class.
 +   */
 +  private static final Logger LOG = LoggerFactory.getLogger(BloomFilterLanguageModel.class);
 +
 +  /**
 +   * The Bloom filter data structure itself.
 +   */
 +  private BloomFilter bf;
 +
 +  /**
 +   * The base of the logarithm used to quantize n-gram counts. N-gram counts are quantized
 +   * logarithmically to reduce the number of times we need to query the Bloom filter.
 +   */
 +  private double quantizationBase;
 +
 +  /**
 +   * Natural log of the number of tokens seen in the training corpus.
 +   */
 +  private double numTokens;
 +
 +  /**
 +   * An array of pairs of long, used as hash functions for storing or retreiving the count of an
 +   * n-gram in the Bloom filter.
 +   */
 +  private long[][] countFuncs;
 +  /**
 +   * An array of pairs of long, used as hash functions for storing or retreiving the number of
 +   * distinct types observed after an n-gram.
 +   */
 +  private long[][] typesFuncs;
 +
 +  /**
 +   * The smoothed probability of an unseen n-gram. This is also the probability of any n-gram under
 +   * the zeroth-order model.
 +   */
 +  transient private double p0;
 +
 +  /**
 +   * The interpolation constant between Witten-Bell models of order zero and one. Stored in a field
 +   * because it can be calculated ahead of time; it doesn't depend on the particular n-gram.
 +   */
 +  transient private double lambda0;
 +
 +  /**
 +   * The maximum possible quantized count of any n-gram stored in the Bloom filter. Used as an upper
 +   * bound on the count that could be returned when querying the Bloom filter.
 +   */
 +  transient private int maxQ; // max quantized count
 +
 +  /**
 +   * Constructor called from the Joshua decoder. This constructor assumes that the LM has already
 +   * been built, and takes the name of the file where the LM is stored.
 +   * 
 +   * @param order the order of the language model
 +   * @param filename path to the file where the language model is stored
 +   * @throws IOException if the bloom filter language model cannot be rebuilt from the input file
 +   */
 +  public BloomFilterLanguageModel(int order, String filename) throws IOException {
 +    super(order);
 +    try {
 +      readExternal(new ObjectInputStream(new GZIPInputStream(new FileInputStream(filename))));
 +    } catch (ClassNotFoundException e) {
 +      IOException ioe = new IOException("Could not rebuild bloom filter LM from file " + filename);
 +      ioe.initCause(e);
 +      throw ioe;
 +    }
 +
 +    int vocabSize = Vocabulary.size();
 +    p0 = -Math.log(vocabSize + 1);
 +    double oneMinusLambda0 = numTokens - logAdd(Math.log(vocabSize), numTokens);
 +    p0 += oneMinusLambda0;
 +    lambda0 = Math.log(vocabSize) - logAdd(Math.log(vocabSize), numTokens);
 +    maxQ = quantize((long) Math.exp(numTokens));
 +  }
 +
 +  /**
 +   * Constructor to be used by the main function. This constructor is used to build a new language
 +   * model from scratch. An LM should be built with the main function before using it in the Joshua
 +   * decoder.
 +   * 
 +   * @param filename path to the file of training corpus statistics
 +   * @param order the order of the language model
 +   * @param size the size of the Bloom filter, in bits
 +   * @param base a double. The base of the logarithm for quantization.
 +   */
 +  private BloomFilterLanguageModel(String filename, int order, int size, double base) {
 +    super(order);
 +    quantizationBase = base;
 +    populateBloomFilter(size, filename);
 +  }
 +
 +  /**
 +   * calculates the linearly-interpolated Witten-Bell probability for a given ngram. this is
 +   * calculated as: p(w|h) = pML(w|h)L(h) - (1 - L(h))p(w|h') where: w is a word and h is a history
 +   * h' is the history h with the first word removed pML is the maximum-likelihood estimate of the
 +   * probability L(.) is lambda, the interpolation factor, which depends only on the history h: L(h)
 +   * = s(h) / s(h) + c(h) where s(.) is the observed number of distinct types after h, and c is the
 +   * observed number of counts of h in the training corpus.
 +   * <p>
 +   * in fact this model calculates the probability starting from the lowest order and working its
 +   * way up, to take advantage of the one- sided error rate inherent in using a bloom filter data
 +   * structure.
 +   * 
 +   * @param ngram the ngram whose probability is to be calculated
 +   * @param ngramOrder the order of the ngram.
 +   * 
 +   * @return the linearly-interpolated Witten-Bell smoothed probability of an ngram
 +   */
 +  private float wittenBell(int[] ngram, int ngramOrder) {
 +    int end = ngram.length;
 +    double p = p0; // current calculated probability
 +    // note that p0 and lambda0 are independent of the given
 +    // ngram so they are calculated ahead of time.
 +    int MAX_QCOUNT = getCount(ngram, ngram.length - 1, ngram.length, maxQ);
 +    if (MAX_QCOUNT == 0) // OOV!
 +      return (float) p;
 +    double pML = Math.log(unQuantize(MAX_QCOUNT)) - numTokens;
 +
 +    // p += lambda0 * pML;
 +    p = logAdd(p, (lambda0 + pML));
 +    if (ngram.length == 1) { // if it's a unigram, we're done
 +      return (float) p;
 +    }
 +    // otherwise we calculate the linear interpolation
 +    // with higher order models.
 +    for (int i = end - 2; i >= end - ngramOrder && i >= 0; i--) {
 +      int historyCnt = getCount(ngram, i, end, MAX_QCOUNT);
 +      // if the count for the history is zero, all higher
 +      // terms in the interpolation must be zero, so we
 +      // are done here.
 +      if (historyCnt == 0) {
 +        return (float) p;
 +      }
 +      int historyTypesAfter = getTypesAfter(ngram, i, end, historyCnt);
 +      // unQuantize the counts we got from the BF
 +      double HC = unQuantize(historyCnt);
 +      double HTA = 1 + unQuantize(historyTypesAfter);
 +      // interpolation constant
 +      double lambda = Math.log(HTA) - Math.log(HTA + HC);
 +      double oneMinusLambda = Math.log(HC) - Math.log(HTA + HC);
 +      // p *= 1 - lambda
 +      p += oneMinusLambda;
 +      int wordCount = getCount(ngram, i + 1, end, historyTypesAfter);
 +      double WC = unQuantize(wordCount);
 +      // p += lambda * p_ML(w|h)
 +      if (WC == 0) return (float) p;
 +      p = logAdd(p, lambda + Math.log(WC) - Math.log(HC));
 +      MAX_QCOUNT = wordCount;
 +    }
 +    return (float) p;
 +  }
 +
 +  /**
 +   * Retrieve the count of a ngram from the Bloom filter. That is, how many times did we see this
 +   * ngram in the training corpus? This corresponds roughly to algorithm 2 in Talbot and Osborne's
 +   * "Tera-Scale LMs on the Cheap."
 +   * 
 +   * @param ngram array containing the ngram as a sub-array
 +   * @param start the index of the first word of the ngram
 +   * @param end the index after the last word of the ngram
 +   * @param qcount the maximum possible count to be returned
 +   * 
 +   * @return the number of times the ngram was seen in the training corpus, quantized
 +   */
 +  private int getCount(int[] ngram, int start, int end, int qcount) {
 +    for (int i = 1; i <= qcount; i++) {
 +      int hash = hashNgram(ngram, start, end, i);
 +      if (!bf.query(hash, countFuncs)) {
 +        return i - 1;
 +      }
 +    }
 +    return qcount;
 +  }
 +
 +  /**
 +   * Retrieve the number of distinct types that follow an ngram in the training corpus.
 +   * 
 +   * This is another version of algorithm 2. As noted in the paper, we have different algorithms for
 +   * getting ngram counts versus suffix counts because c(x) = 1 is a proxy item for s(x) = 1
 +   * 
 +   * @param ngram an array the contains the ngram as a sub-array
 +   * @param start the index of the first word of the ngram
 +   * @param end the index after the last word of the ngram
 +   * @param qcount the maximum possible return value
 +   * 
 +   * @return the number of distinct types observed to follow an ngram in the training corpus,
 +   *         quantized
 +   */
 +  private int getTypesAfter(int[] ngram, int start, int end, int qcount) {
 +    // first we check c(x) >= 1
 +    int hash = hashNgram(ngram, start, end, 1);
 +    if (!bf.query(hash, countFuncs)) {
 +      return 0;
 +    }
 +    // if c(x) >= 1, we check for the stored suffix count
 +    for (int i = 1; i < qcount; i++) {
 +      hash = hashNgram(ngram, start, end, i);
 +      if (!bf.query(hash, typesFuncs)) {
 +        return i - 1;
 +      }
 +    }
 +    return qcount;
 +  }
 +
 +  /**
 +   * Logarithmically quantizes raw counts. The quantization scheme is described in Talbot and
 +   * Osborne's paper "Tera-Scale LMs on the Cheap."
 +   * 
 +   * @param x long giving the raw count to be quantized
 +   * 
 +   * @return the quantized count
 +   */
 +  private int quantize(long x) {
 +    return 1 + (int) Math.floor(Math.log(x) / Math.log(quantizationBase));
 +  }
 +
 +  /**
 +   * Unquantizes a quantized count.
 +   * 
 +   * @param x the quantized count
 +   * 
 +   * @return the expected raw value of the quantized count
 +   */
 +  private double unQuantize(int x) {
 +    if (x == 0) {
 +      return 0;
 +    } else {
 +      return ((quantizationBase + 1) * Math.pow(quantizationBase, x - 1) - 1) / 2;
 +    }
 +  }
 +
 +  /**
 +   * Converts an n-gram and a count into a value that can be stored into a Bloom filter. This is
 +   * adapted directly from <code>AbstractPhrase.hashCode()</code> elsewhere in the Joshua code base.
 +   * 
 +   * @param ngram an array containing the ngram as a sub-array
 +   * @param start the index of the first word of the ngram
 +   * @param end the index after the last word of the ngram
 +   * @param val the count of the ngram
 +   * 
 +   * @return a value suitable to be stored in a Bloom filter
 +   */
 +  private int hashNgram(int[] ngram, int start, int end, int val) {
 +    int result = HASH_OFFSET * HASH_SEED + val;
 +    for (int i = start; i < end; i++)
 +      result = HASH_OFFSET * result + ngram[i];
 +    return result;
 +  }
 +
 +  /**
 +   * Adds two numbers that are in the log domain, avoiding underflow.
 +   * 
 +   * @param x one summand
 +   * @param y the other summand
 +   * 
 +   * @return the log of the sum of the exponent of the two numbers.
 +   */
 +  private static double logAdd(double x, double y) {
 +    if (y <= x) {
 +      return x + Math.log1p(Math.exp(y - x));
 +    } else {
 +      return y + Math.log1p(Math.exp(x - y));
 +    }
 +  }
 +
 +  /**
 +   * Builds a language model and stores it in a file.
 +   * 
 +   * @param argv command-line arguments
 +   */
 +  public static void main(String[] argv) {
 +    if (argv.length < 5) {
 +      String msg = "usage: BloomFilterLanguageModel <statistics file> <order> <size>"
 +          + " <quantization base> <output file>";
 +      System.err.println(msg);
 +      LOG.error(msg);
 +      return;
 +    }
 +    int order = Integer.parseInt(argv[1]);
 +    int size = (int) (Integer.parseInt(argv[2]) * Math.pow(2, 23));
 +    double base = Double.parseDouble(argv[3]);
 +
 +    try {
 +      BloomFilterLanguageModel lm = new BloomFilterLanguageModel(argv[0], order, size, base);
 +
 +      ObjectOutputStream out =
 +          new ObjectOutputStream(new GZIPOutputStream(new FileOutputStream(argv[4])));
 +
 +      lm.writeExternal(out);
 +      out.close(); //TODO: try-with-resources
 +    } catch (IOException e) {
 +      LOG.error(e.getMessage(), e);
 +    }
 +  }
 +  
 +  /**
 +   * Adds ngram counts and counts of distinct types after ngrams, read from a file, to the Bloom
 +   * filter.
 +   * <p>
 +   * The file format should look like this: ngram1 count types-after ngram2 count types-after ...
 +   * 
 +   * @param bloomFilterSize the size of the Bloom filter, in bits
 +   * @param filename path to the statistics file
 +   */
 +  private void populateBloomFilter(int bloomFilterSize, String filename) {
 +    HashMap<String, Long> typesAfter = new HashMap<String, Long>();
 +    try {
 +      FileInputStream file_in = new FileInputStream(filename);
 +      FileInputStream file_in_copy = new FileInputStream(filename);
 +      InputStream in;
 +      InputStream estimateStream;
 +      if (filename.endsWith(".gz")) {
 +        in = new GZIPInputStream(file_in);
 +        estimateStream = new GZIPInputStream(file_in_copy);
 +      } else {
 +        in = file_in;
 +        estimateStream = file_in_copy;
 +      }
 +      int numObjects = estimateNumberOfObjects(estimateStream);
 +      LOG.debug("Estimated number of objects: {}", numObjects);
 +      bf = new BloomFilter(bloomFilterSize, numObjects);
 +      countFuncs = bf.initializeHashFunctions();
 +      populateFromInputStream(in, typesAfter);
 +      in.close();
 +    } catch (IOException e) {
 +      LOG.error(e.getMessage(), e);
 +      return;
 +    }
 +    typesFuncs = bf.initializeHashFunctions();
 +    for (String history : typesAfter.keySet()) {
 +      String[] toks = Regex.spaces.split(history);
 +      int[] hist = new int[toks.length];
 +      for (int i = 0; i < toks.length; i++)
 +        hist[i] = Vocabulary.id(toks[i]);
 +      add(hist, typesAfter.get(history), typesFuncs);
 +    }
 +    return;
 +  }
 +
 +  /**
 +   * Estimate the number of objects that will be stored in the Bloom filter. The optimum number of
 +   * hash functions depends on the number of items that will be stored, so we want a guess before we
 +   * begin to read the statistics file and store it.
 +   * 
 +   * @param source an InputStream pointing to the training corpus stats
 +   * 
 +   * @return an estimate of the number of objects to be stored in the Bloom filter
 +   */
 +  private int estimateNumberOfObjects(InputStream source) {
 +    int numLines = 0;
 +    long maxCount = 0;
 +    for (String line: new LineReader(source)) {
 +      if (line.trim().equals("")) continue;
 +      String[] toks = Regex.spaces.split(line);
 +      if (toks.length > ngramOrder + 1) continue;
 +      try {
 +        long cnt = Long.parseLong(toks[toks.length - 1]);
 +        if (cnt > maxCount) maxCount = cnt;
 +      } catch (NumberFormatException e) {
 +        LOG.error(e.getMessage(), e);
 +        break;
 +      }
 +      numLines++;
 +    }
 +    double estimate = Math.log(maxCount) / Math.log(quantizationBase);
 +    return (int) Math.round(numLines * estimate);
 +  }
 +
 +  /**
 +   * Reads the statistics from a source and stores them in the Bloom filter. The ngram counts are
 +   * stored immediately in the Bloom filter, but the counts of distinct types following each ngram
 +   * are accumulated from the file as we go.
 +   * 
 +   * @param source an InputStream pointing to the statistics
 +   * @param types a HashMap that will stores the accumulated counts of distinct types observed to
 +   *        follow each ngram
 +   */
 +  private void populateFromInputStream(InputStream source, HashMap<String, Long> types) {
 +    numTokens = Double.NEGATIVE_INFINITY; // = log(0)
 +    for (String line: new LineReader(source)) {
 +      String[] toks = Regex.spaces.split(line);
 +      if ((toks.length < 2) || (toks.length > ngramOrder + 1)) continue;
 +      int[] ngram = new int[toks.length - 1];
 +      StringBuilder history = new StringBuilder();
 +      for (int i = 0; i < toks.length - 1; i++) {
 +        ngram[i] = Vocabulary.id(toks[i]);
 +        if (i < toks.length - 2) history.append(toks[i]).append(" ");
 +      }
 +
 +      long cnt = Long.parseLong(toks[toks.length - 1]);
 +      add(ngram, cnt, countFuncs);
 +      if (toks.length == 2) { // unigram
 +        numTokens = logAdd(numTokens, Math.log(cnt));
 +        // no need to count types after ""
 +        // that's what vocabulary.size() is for.
 +        continue;
 +      }
 +      if (types.get(history) == null)
 +        types.put(history.toString(), 1L);
 +      else {
 +        long x = (Long) types.get(history);
 +        types.put(history.toString(), x + 1);
 +      }
 +    }
 +    return;
 +  }
 +
 +  /**
 +   * Adds an ngram, along with an associated value, to the Bloom filter. This corresponds to Talbot
 +   * and Osborne's "Tera-scale LMs on the cheap", algorithm 1.
 +   * 
 +   * @param ngram an array representing the ngram
 +   * @param value the value to be associated with the ngram
 +   * @param funcs an array of long to be used as hash functions
 +   */
 +  private void add(int[] ngram, long value, long[][] funcs) {
 +    if (ngram == null) return;
 +    int qValue = quantize(value);
 +    for (int i = 1; i <= qValue; i++) {
 +      int hash = hashNgram(ngram, 0, ngram.length, i);
 +      bf.add(hash, funcs);
 +    }
 +  }
 +
 +  /**
 +   * Read a Bloom filter LM from an external file.
 +   * 
 +   * @param in an ObjectInput stream to read from
 +   */
 +  public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
 +    int vocabSize = in.readInt();
 +    for (int i = 0; i < vocabSize; i++) {
 +      String line = in.readUTF();
 +      Vocabulary.id(line);
 +    }
 +    numTokens = in.readDouble();
 +    countFuncs = new long[in.readInt()][2];
 +    for (int i = 0; i < countFuncs.length; i++) {
 +      countFuncs[i][0] = in.readLong();
 +      countFuncs[i][1] = in.readLong();
 +    }
 +    typesFuncs = new long[in.readInt()][2];
 +    for (int i = 0; i < typesFuncs.length; i++) {
 +      typesFuncs[i][0] = in.readLong();
 +      typesFuncs[i][1] = in.readLong();
 +    }
 +    quantizationBase = in.readDouble();
 +    bf = new BloomFilter();
 +    bf.readExternal(in);
 +  }
 +
 +  /**
 +   * Write a Bloom filter LM to some external location.
 +   * 
 +   * @param out an ObjectOutput stream to write to
 +   * 
 +   * @throws IOException if an input or output exception occurred
 +   */
 +  public void writeExternal(ObjectOutput out) throws IOException {
 +    out.writeInt(Vocabulary.size());
 +    for (int i = 0; i < Vocabulary.size(); i++) {
 +      // out.writeBytes(vocabulary.getWord(i));
 +      // out.writeChar('\n'); // newline
 +      out.writeUTF(Vocabulary.word(i));
 +    }
 +    out.writeDouble(numTokens);
 +    out.writeInt(countFuncs.length);
 +    for (int i = 0; i < countFuncs.length; i++) {
 +      out.writeLong(countFuncs[i][0]);
 +      out.writeLong(countFuncs[i][1]);
 +    }
 +    out.writeInt(typesFuncs.length);
 +    for (int i = 0; i < typesFuncs.length; i++) {
 +      out.writeLong(typesFuncs[i][0]);
 +      out.writeLong(typesFuncs[i][1]);
 +    }
 +    out.writeDouble(quantizationBase);
 +    bf.writeExternal(out);
 +  }
 +
 +  /**
 +   * Returns the language model score for an n-gram. This is called from the rest of the Joshua
 +   * decoder.
 +   * 
 +   * @param ngram the ngram to score
 +   * @param order the order of the model
 +   * 
 +   * @return the language model score of the ngram
 +   */
 +  @Override
 +  protected float ngramLogProbability_helper(int[] ngram, int order) {
 +    int[] lm_ngram = new int[ngram.length];
 +    for (int i = 0; i < ngram.length; i++) {
 +      lm_ngram[i] = Vocabulary.id(Vocabulary.word(ngram[i]));
 +    }
 +    return wittenBell(lm_ngram, order);
 +  }
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java
index 501be62,0000000..df00255
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java
@@@ -1,160 -1,0 +1,158 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.decoder.ff.tm;
 +
 +import java.io.IOException;
 +import java.util.Iterator;
 +
- import org.apache.joshua.corpus.Vocabulary;
 +import org.apache.joshua.decoder.Decoder;
 +import org.apache.joshua.util.io.LineReader;
 +import org.slf4j.Logger;
 +import org.slf4j.LoggerFactory;
 +
 +/**
 + * This is a base class for simple, ASCII line-based grammars that are stored on disk.
 + * 
 + * @author Juri Ganitkevitch
 + * 
 + */
 +public abstract class GrammarReader<R extends Rule> implements Iterable<R>, Iterator<R> {
 +
 +  private static final Logger LOG = LoggerFactory.getLogger(GrammarReader.class);
 +
 +  protected static String fieldDelimiter;
- 
 +  protected static String description;
 +
 +  protected String fileName;
 +  protected LineReader reader;
 +  protected String lookAhead;
 +  protected int numRulesRead;
 +
 +
 +  // dummy constructor for
 +  public GrammarReader() {
 +    this.fileName = null;
 +  }
 +
 +  public GrammarReader(String fileName) throws IOException {
 +    this.fileName = fileName;
 +    this.reader = new LineReader(fileName);
 +    LOG.info("Reading grammar from file {}...", fileName);
 +    numRulesRead = 0;
 +    advanceReader();
 +  }
 +
 +  // the reader is the iterator itself
 +  public Iterator<R> iterator() {
 +    return this;
 +  }
 +
 +  /** Unsupported Iterator method. */
 +  public void remove() throws UnsupportedOperationException {
 +    throw new UnsupportedOperationException();
 +  }
 +
 +  public void close() {
 +    if (null != this.reader) {
 +      try {
 +        this.reader.close();
 +      } catch (IOException e) {
 +        LOG.warn(e.getMessage(), e);
 +        LOG.error("Error closing grammar file stream: {}",  this.fileName);
 +      }
 +      this.reader = null;
 +    }
 +  }
 +
 +  /**
 +   * For correct behavior <code>close</code> must be called on every GrammarReader, however this
 +   * code attempts to avoid resource leaks.
 +   * 
 +   * @see org.apache.joshua.util.io.LineReader
 +   */
 +  @Override
 +  protected void finalize() throws Throwable {
 +    if (this.reader != null) {
 +      LOG.error("Grammar file stream was not closed, this indicates a coding error: {}",
 +          this.fileName);
 +    }
 +
 +    this.close();
 +    super.finalize();
 +  }
 +
 +  @Override
 +  public boolean hasNext() {
 +    return lookAhead != null;
 +  }
 +
 +  private void advanceReader() {
 +    try {
 +      lookAhead = reader.readLine();
 +      numRulesRead++;
 +    } catch (IOException e) {
 +      LOG.error("Error reading grammar from file: {}", fileName);
 +      LOG.error(e.getMessage(), e);
 +    }
 +    if (lookAhead == null && reader != null) {
 +      this.close();
 +    }
 +  }
 +
 +  /**
 +   * Read the next line, and print reader progress.
 +   */
 +  @Override
 +  public R next() {
 +    String line = lookAhead;
 +
 +    int oldProgress = reader.progress();
 +    advanceReader();
 +
 +
 +    if (Decoder.VERBOSE >= 1) {
 +      int newProgress = (reader != null) ? reader.progress() : 100;
 +
 +      //TODO: review this code. It is better to print progress based on time gap (like for every 1s or 2sec) than %!
 +      if (newProgress > oldProgress) {
 +        for (int i = oldProgress + 1; i <= newProgress; i++)
 +          if (i == 97) {
 +            System.err.print("1");
 +          } else if (i == 98) {
 +            System.err.print("0");
 +          } else if (i == 99) {
 +            System.err.print("0");
 +          } else if (i == 100) {
 +            System.err.println("%");
 +          } else if (i % 10 == 0) {
 +            System.err.print(String.format("%d", i));
 +            System.err.flush();
 +          } else if ((i - 1) % 10 == 0)
 +            ; // skip at 11 since 10, 20, etc take two digits
 +          else {
 +            System.err.print(".");
 +            System.err.flush();
 +          }
 +      }
 +    }
 +    return parseLine(line);
 +  }
 +
 +  protected abstract R parseLine(String line);
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java
index d80638d,0000000..45c8c33
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java
@@@ -1,107 -1,0 +1,106 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.decoder.ff.tm.format;
 +
 +import java.io.IOException;
 +
 +import org.apache.joshua.corpus.Vocabulary;
 +import org.apache.joshua.decoder.ff.tm.GrammarReader;
 +import org.apache.joshua.decoder.ff.tm.Rule;
++import org.apache.joshua.util.Constants;
 +import org.apache.joshua.util.FormatUtils;
 +
 +/**
 + * This class implements reading files in the format defined by David Chiang for Hiero. 
 + * 
 + * @author Matt Post post@cs.jhu.edu
 + */
 +
 +public class HieroFormatReader extends GrammarReader<Rule> {
 +
 +  static {
-     fieldDelimiter = "\\s\\|{3}\\s";
 +    description = "Original Hiero format";
 +  }
 +
 +  public HieroFormatReader() {
 +    super();
 +  }
 +
 +  public HieroFormatReader(String grammarFile) throws IOException {
 +    super(grammarFile);
 +  }
 +
 +  @Override
 +  public Rule parseLine(String line) {
-     String[] fields = line.split(fieldDelimiter);
++    String[] fields = line.split(Constants.fieldDelimiter);
 +    if (fields.length < 3) {
 +      throw new RuntimeException(String.format("Rule '%s' does not have four fields", line));
 +    }
 +
 +    int lhs = Vocabulary.id(fields[0]);
 +
 +    /**
 +     * On the foreign side, we map nonterminals to negative IDs, and terminals to positive IDs.
 +     */
 +    int arity = 0;
 +    String[] sourceWords = fields[1].split("\\s+");
 +    int[] sourceIDs = new int[sourceWords.length];
 +    for (int i = 0; i < sourceWords.length; i++) {
++      /* NOTE: This redundantly creates vocab items for terms like [X,1]. This might actually
++       * be necessary, so don't try to turn this into an if/else.
++       */
 +      sourceIDs[i] = Vocabulary.id(sourceWords[i]);
 +      if (FormatUtils.isNonterminal(sourceWords[i])) {
 +        sourceIDs[i] = Vocabulary.id(FormatUtils.stripNonTerminalIndex(sourceWords[i]));
 +        arity++;
 +        
 +        // TODO: the arity here (after incrementing) should match the rule index. Should
 +        // check that arity == FormatUtils.getNonterminalIndex(foreignWords[i]), throw runtime
 +        // error if not
 +      }
 +    }
 +
 +    /**
 +     * The English side maps terminal symbols to positive IDs. Nonterminal symbols are linked
 +     * to the index of the source-side nonterminal they are linked to. So for a rule
 +     * 
 +     * [X] ||| [X,1] [X,2] [X,3] ||| [X,2] [X,1] [X,3] ||| ...
 +     * 
 +     * the English side nonterminals will be -2, -1, -3. This assumes that the source side of
 +     * the rule is always listed monotonically.
 +     */
 +    String[] targetWords = fields[2].split("\\s+");
 +    int[] targetIDs = new int[targetWords.length];
 +    for (int i = 0; i < targetWords.length; i++) {
 +      targetIDs[i] = Vocabulary.id(targetWords[i]);
 +      if (FormatUtils.isNonterminal(targetWords[i])) {
 +        targetIDs[i] = -FormatUtils.getNonterminalIndex(targetWords[i]);
 +      }
 +    }
 +
 +    String sparse_features = (fields.length > 3 ? fields[3] : "");
 +    String alignment = (fields.length > 4) ? fields[4] : null;
 +
 +    return new Rule(lhs, sourceIDs, targetIDs, sparse_features, arity, alignment);
 +  }
- 
-   public static String getFieldDelimiter() {
-     return fieldDelimiter;
-   }
- 
++  
 +  public static boolean isNonTerminal(final String word) {
 +    return FormatUtils.isNonterminal(word);
 +  }
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java
index 0252636,0000000..7811b3b
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java
@@@ -1,106 -1,0 +1,108 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.decoder.ff.tm.format;
 +
 +import java.io.IOException;
 +
 +import org.apache.joshua.corpus.Vocabulary;
 +import org.apache.joshua.decoder.ff.tm.Rule;
 +import org.apache.joshua.util.io.LineReader;
++import org.apache.joshua.util.Constants;
++import org.apache.joshua.util.FormatUtils;
 +
 +/***
 + * This class reads in the Moses phrase table format, with support for the source and target side,
 + * list of features, and word alignments. It works by
 + * 
 + * - casting the phrase-based rules to left-branching hierarchical rules and passing them on \
 + *   to its parent class, {@link HieroFormatReader}.
 + * - converting the probabilities to -log probabilities
 + * 
 + * There is also a tool to convert the grammars directly, so that they can be suitably packed. Usage:
 + * 
 + * <pre>
 + *     cat PHRASE_TABLE | java -cp $JOSHUA/target/classes org.apache.joshua.decoder.ff.tm.format.MosesFormatReader
 + * </pre>
 + * 
 + * @author Matt Post post@cs.jhu.edu
 + *
 + */
 +
 +public class MosesFormatReader extends HieroFormatReader {
 +
 +  public MosesFormatReader(String grammarFile) throws IOException {
 +    super(grammarFile);
-     Vocabulary.id("[X]");
++    Vocabulary.id(Constants.defaultNT);
 +  }
 +  
 +  public MosesFormatReader() {
 +    super();
-     Vocabulary.id("[X]");
++    Vocabulary.id(Constants.defaultNT);
 +  }
 +  
 +  /**
 +   * When dealing with Moses format, this munges a Moses-style phrase table into a grammar.
 +   * 
 +   *    mots francaises ||| French words ||| 1 2 3 ||| 0-1 1-0
 +   *    
 +   * becomes
 +   * 
 +   *    [X] ||| [X,1] mots francaises ||| [X,1] French words ||| 1 2 3  ||| 0-1 1-0
 +   *    
 +   * For thrax-extracted phrasal grammars, it transforms
 +   * 
 +   *    [X] ||| mots francaises ||| French words ||| 1 2 3 ||| 0-1 1-0
 +   *
 +   * into
 +   * 
 +   *    [X] ||| [X,1] mots francaises ||| [X,1] French words ||| 1 2 3 ||| 0-1 1-0
 +   */
 +  @Override
 +  public Rule parseLine(String line) {
-     String[] fields = line.split(fieldDelimiter);
++    String[] fields = line.split(Constants.fieldDelimiter);
 +    
-     StringBuffer hieroLine = new StringBuffer();
-     hieroLine.append("[X] ||| [X,1] " + fields[0] + " ||| [X,1] " + fields[1] + " |||");
++    String nt = FormatUtils.cleanNonTerminal(Constants.defaultNT);
++    StringBuffer hieroLine = new StringBuffer(Constants.defaultNT + " ||| [" + nt + ",1] " + fields[0] + " ||| [" + nt + ",1] " + fields[1] + " |||");
 +
 +    String mosesFeatureString = fields[2];
 +    for (String value: mosesFeatureString.split(" ")) {
 +      float f = Float.parseFloat(value);
 +      hieroLine.append(String.format(" %f", f <= 0.0 ? -100 : -Math.log(f)));
 +    }
 +
 +    // alignments
 +    if (fields.length >= 4)
 +      hieroLine.append(" ||| " + fields[3]);
 +
 +    return super.parseLine(hieroLine.toString());
 +  }
 +  
 +  /**
 +   * Converts a Moses phrase table to a Joshua grammar. 
 +   * 
 +   * @param args the command-line arguments
 +   */
 +  public static void main(String[] args) {
 +    MosesFormatReader reader = new MosesFormatReader();
 +    for (String line: new LineReader(System.in)) {
 +      Rule rule = reader.parseLine(line);
 +      System.out.println(rule.textFormat());
 +    }    
 +  }
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
index 3a37e08,0000000..2bfa8c1
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
@@@ -1,317 -1,0 +1,316 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.decoder.ff.tm.hash_based;
 +
 +import java.io.IOException;
 +import java.util.ArrayList;
 +import java.util.HashMap;
 +import java.util.List;
 +
 +import org.apache.joshua.corpus.Vocabulary;
- import org.apache.joshua.decoder.Decoder;
 +import org.apache.joshua.decoder.JoshuaConfiguration;
 +import org.apache.joshua.decoder.JoshuaConfiguration.OOVItem;
 +import org.apache.joshua.decoder.ff.FeatureFunction;
 +import org.apache.joshua.decoder.ff.tm.AbstractGrammar;
 +import org.apache.joshua.decoder.ff.tm.Rule;
 +import org.apache.joshua.decoder.ff.tm.GrammarReader;
 +import org.apache.joshua.decoder.ff.tm.Trie;
 +import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader;
 +import org.apache.joshua.decoder.ff.tm.format.MosesFormatReader;
 +import org.apache.joshua.util.FormatUtils;
 +import org.slf4j.Logger;
 +import org.slf4j.LoggerFactory;
 +
 +/**
 + * This class implements a memory-based bilingual BatchGrammar.
 + * <p>
 + * The rules are stored in a trie. Each trie node has: (1) RuleBin: a list of rules matching the
 + * french sides so far (2) A HashMap of next-layer trie nodes, the next french word used as the key
 + * in HashMap
 + * 
 + * @author Zhifei Li zhifei.work@gmail.com
 + * @author Matt Post post@cs.jhu.edu
 + */
 +public class MemoryBasedBatchGrammar extends AbstractGrammar {
 +
 +  private static final Logger LOG = LoggerFactory.getLogger(MemoryBasedBatchGrammar.class);
 +
 +  // ===============================================================
 +  // Instance Fields
 +  // ===============================================================
 +
 +  /* The number of rules read. */
 +  private int qtyRulesRead = 0;
 +
 +  /* The number of distinct source sides. */
 +  private int qtyRuleBins = 0;
 +
 +  private int numDenseFeatures = 0;
 +
 +  /* The trie root. */
 +  private MemoryBasedTrie root = null;
 +
 +  /* The file containing the grammar. */
 +  private String grammarFile;
 +
 +  private GrammarReader<Rule> modelReader;
 +
 +  /* Whether the grammar's rules contain regular expressions. */
 +  private boolean isRegexpGrammar = false;
 +
 +  // ===============================================================
 +  // Static Fields
 +  // ===============================================================
 +
 +  // ===============================================================
 +  // Constructors
 +  // ===============================================================
 +
 +  public MemoryBasedBatchGrammar(JoshuaConfiguration joshuaConfiguration) {
 +    super(joshuaConfiguration);
 +    this.root = new MemoryBasedTrie();
 +    this.joshuaConfiguration = joshuaConfiguration;
 +  }
 +
 +  public MemoryBasedBatchGrammar(String owner, JoshuaConfiguration joshuaConfiguration) {
 +    this(joshuaConfiguration);
 +    this.owner = Vocabulary.id(owner);
 +  }
 +
 +  public MemoryBasedBatchGrammar(GrammarReader<Rule> gr, JoshuaConfiguration joshuaConfiguration) {
 +    // this.defaultOwner = Vocabulary.id(defaultOwner);
 +    // this.defaultLHS = Vocabulary.id(defaultLHSSymbol);
 +    this(joshuaConfiguration);
 +    modelReader = gr;
 +  }
 +
 +  public MemoryBasedBatchGrammar(String formatKeyword, String grammarFile, String owner,
 +      String defaultLHSSymbol, int spanLimit, JoshuaConfiguration joshuaConfiguration)
 +      throws IOException {
 +
 +    this(joshuaConfiguration);
 +    this.owner = Vocabulary.id(owner);
 +    Vocabulary.id(defaultLHSSymbol);
 +    this.spanLimit = spanLimit;
 +    this.grammarFile = grammarFile;
 +    this.setRegexpGrammar(formatKeyword.equals("regexp"));
 +
 +    // ==== loading grammar
 +    this.modelReader = createReader(formatKeyword, grammarFile);
 +    if (modelReader != null) {
 +      for (Rule rule : modelReader)
 +        if (rule != null) {
 +          addRule(rule);
 +        }
 +    } else {
 +      LOG.info("Couldn't create a GrammarReader for file {} with format {}",
 +          grammarFile, formatKeyword);
 +    }
 +
 +    this.printGrammar();
 +  }
 +
 +  protected GrammarReader<Rule> createReader(String format, String grammarFile) throws IOException {
 +
 +    if (grammarFile != null) {
 +      if ("hiero".equals(format) || "thrax".equals(format) || "regexp".equals(format)) {
 +        return new HieroFormatReader(grammarFile);
 +      } else if ("moses".equals(format)) {
 +        return new MosesFormatReader(grammarFile);
 +      } else {
 +        throw new RuntimeException(String.format("* FATAL: unknown grammar format '%s'", format));
 +      }
 +    }
 +    return null;
 +  }
 +
 +  // ===============================================================
 +  // Methods
 +  // ===============================================================
 +
 +  public void setSpanLimit(int spanLimit) {
 +    this.spanLimit = spanLimit;
 +  }
 +
 +  @Override
 +  public int getNumRules() {
 +    return this.qtyRulesRead;
 +  }
 +
 +  @Override
 +  public Rule constructManualRule(int lhs, int[] sourceWords, int[] targetWords,
 +      float[] denseScores, int arity) {
 +    return null;
 +  }
 +
 +  /**
 +   * if the span covered by the chart bin is greater than the limit, then return false
 +   */
 +  public boolean hasRuleForSpan(int i, int j, int pathLength) {
 +    if (this.spanLimit == -1) { // mono-glue grammar
 +      return (i == 0);
 +    } else {
 +      // System.err.println(String.format("%s HASRULEFORSPAN(%d,%d,%d)/%d = %s",
 +      // Vocabulary.word(this.owner), i, j, pathLength, spanLimit, pathLength <= this.spanLimit));
 +      return (pathLength <= this.spanLimit);
 +    }
 +  }
 +
 +  public Trie getTrieRoot() {
 +    return this.root;
 +  }
 +
 +  /**
 +   * Adds a rule to the grammar.
 +   */
 +  public void addRule(Rule rule) {
 +
 +    // TODO: Why two increments?
 +    this.qtyRulesRead++;
 +
 +    // if (owner == -1) {
 +    // System.err.println("* FATAL: MemoryBasedBatchGrammar::addRule(): owner not set for grammar");
 +    // System.exit(1);
 +    // }
 +    rule.setOwner(owner);
 +
 +    if (numDenseFeatures == 0)
 +      numDenseFeatures = rule.getFeatureVector().getDenseFeatures().size();
 +
 +    // === identify the position, and insert the trie nodes as necessary
 +    MemoryBasedTrie pos = root;
 +    int[] french = rule.getFrench();
 +
 +    maxSourcePhraseLength = Math.max(maxSourcePhraseLength, french.length);
 +
 +    for (int k = 0; k < french.length; k++) {
 +      int curSymID = french[k];
 +
 +      /*
 +       * Note that the nonTerminal symbol in the french is not cleaned (i.e., will be sth like
 +       * [X,1]), but the symbol in the Trie has to be cleaned, so that the match does not care about
 +       * the markup (i.e., [X,1] or [X,2] means the same thing, that is X) if
 +       * (Vocabulary.nt(french[k])) { curSymID = modelReader.cleanNonTerminal(french[k]); if
 +       * (logger.isLoggable(Level.FINEST)) logger.finest("Amended to: " + curSymID); }
 +       */
 +
 +      MemoryBasedTrie nextLayer = (MemoryBasedTrie) pos.match(curSymID);
 +      if (null == nextLayer) {
 +        nextLayer = new MemoryBasedTrie();
 +        if (pos.hasExtensions() == false) {
 +          pos.childrenTbl = new HashMap<Integer, MemoryBasedTrie>();
 +        }
 +        pos.childrenTbl.put(curSymID, nextLayer);
 +      }
 +      pos = nextLayer;
 +    }
 +
 +    // === add the rule into the trie node
 +    if (!pos.hasRules()) {
 +      pos.ruleBin = new MemoryBasedRuleBin(rule.getArity(), rule.getFrench());
 +      this.qtyRuleBins++;
 +    }
 +    pos.ruleBin.addRule(rule);
 +  }
 +
 +  protected void printGrammar() {
 +    LOG.info("MemoryBasedBatchGrammar: Read {} rules with {} distinct source sides from '{}'",
 +        this.qtyRulesRead, this.qtyRuleBins, grammarFile);
 +  }
 +
 +  /**
 +   * This returns true if the grammar contains rules that are regular expressions, possibly matching
 +   * many different inputs.
 +   * 
 +   * @return true if the grammar's rules may contain regular expressions.
 +   */
 +  @Override
 +  public boolean isRegexpGrammar() {
 +    return this.isRegexpGrammar;
 +  }
 +
 +  public void setRegexpGrammar(boolean value) {
 +    this.isRegexpGrammar = value;
 +  }
 +
 +  /***
 +   * Takes an input word and creates an OOV rule in the current grammar for that word.
 +   * 
 +   * @param sourceWord integer representation of word
 +   * @param featureFunctions {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
 +   */
 +  @Override
 +  public void addOOVRules(int sourceWord, List<FeatureFunction> featureFunctions) {
 +
 +    // TODO: _OOV shouldn't be outright added, since the word might not be OOV for the LM (but now
 +    // almost
 +    // certainly is)
 +    final int targetWord = this.joshuaConfiguration.mark_oovs ? Vocabulary.id(Vocabulary
 +        .word(sourceWord) + "_OOV") : sourceWord;
 +
 +    int[] sourceWords = { sourceWord };
 +    int[] targetWords = { targetWord };
 +    final String oovAlignment = "0-0";
 +
 +    if (this.joshuaConfiguration.oovList != null && this.joshuaConfiguration.oovList.size() != 0) {
 +      for (OOVItem item : this.joshuaConfiguration.oovList) {
 +        Rule oovRule = new Rule(Vocabulary.id(item.label), sourceWords, targetWords, "", 0,
 +            oovAlignment);
 +        addRule(oovRule);
 +        oovRule.estimateRuleCost(featureFunctions);
 +      }
 +    } else {
 +      int nt_i = Vocabulary.id(this.joshuaConfiguration.default_non_terminal);
 +      Rule oovRule = new Rule(nt_i, sourceWords, targetWords, "", 0, oovAlignment);
 +      addRule(oovRule);
 +      oovRule.estimateRuleCost(featureFunctions);
 +    }
 +  }
 +
 +  /**
 +   * Adds a default set of glue rules.
 +   * 
 +   * @param featureFunctions an {@link java.util.ArrayList} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
 +   */
 +  public void addGlueRules(ArrayList<FeatureFunction> featureFunctions) {
 +    HieroFormatReader reader = new HieroFormatReader();
 +
 +    String goalNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.goal_symbol);
 +    String defaultNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.default_non_terminal);
 +
 +    String[] ruleStrings = new String[] {
 +        String.format("[%s] ||| %s ||| %s ||| 0", goalNT, Vocabulary.START_SYM,
 +            Vocabulary.START_SYM),
 +        String.format("[%s] ||| [%s,1] [%s,2] ||| [%s,1] [%s,2] ||| -1", goalNT, goalNT, defaultNT,
 +            goalNT, defaultNT),
 +        String.format("[%s] ||| [%s,1] %s ||| [%s,1] %s ||| 0", goalNT, goalNT,
 +            Vocabulary.STOP_SYM, goalNT, Vocabulary.STOP_SYM) };
 +
 +    for (String ruleString : ruleStrings) {
 +      Rule rule = reader.parseLine(ruleString);
 +      addRule(rule);
 +      rule.estimateRuleCost(featureFunctions);
 +    }
 +  }
 +
 +  @Override
 +  public int getNumDenseFeatures() {
 +    return numDenseFeatures;
 +  }
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/java/org/apache/joshua/decoder/hypergraph/AlignedSourceTokens.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/hypergraph/AlignedSourceTokens.java
index 948001f,0000000..864b383
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/hypergraph/AlignedSourceTokens.java
+++ b/src/main/java/org/apache/joshua/decoder/hypergraph/AlignedSourceTokens.java
@@@ -1,115 -1,0 +1,112 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.decoder.hypergraph;
 +
 +import java.util.LinkedList;
 +import java.util.ListIterator;
 +
 +/**
 + * Class that represents a one to (possibly) many alignment from target to
 + * source. Extends from a LinkedList. Instances of this class are updated by the
 + * WordAlignmentExtractor.substitute() method. 
 + * The {@link org.apache.joshua.decoder.hypergraph.AlignedSourceTokens#shiftBy(int, int)} 
 + * method shifts the
 + * elements in the list by a scalar to reflect substitutions of non terminals in
 + * the rule. if indexes are final, i.e. the point instance has been substituted
 + * into a parent WordAlignmentState once, 
 + * {@link org.apache.joshua.decoder.hypergraph.AlignedSourceTokens#isFinal} is set to true. 
 + * This is
 + * necessary since the final source index of a point is known once we have
 + * substituted in a complete WordAlignmentState into its parent. If the index in
 + * the list is a non terminal, {@link org.apache.joshua.decoder.hypergraph.AlignedSourceTokens#isNonTerminal} = true
 + */
 +class AlignedSourceTokens extends LinkedList<Integer> {
 +
 +  private static final long serialVersionUID = 1L;
 +  /** whether this Point refers to a non terminal in source&target */
 +  private boolean isNonTerminal = false;
 +  /** whether this instance does not need to be updated anymore */
 +  private boolean isFinal = false;
 +  /** whether the word this Point corresponds to has no alignment in source */
 +  private boolean isNull = false;
 +
-   AlignedSourceTokens() {
-   }
++  AlignedSourceTokens() {}
 +
 +  void setFinal() {
 +    isFinal = true;
 +  }
 +
 +  void setNonTerminal() {
 +    isNonTerminal = true;
 +  }
 +
 +  void setNull() {
 +    isNull = true;
 +  }
 +
 +  @Override
 +  /**
 +   * returns true if element was added.
 +   */
 +  public boolean add(Integer x) {
-     if (isNull || isNonTerminal)
-       return false;
-     return super.add(x);
++    return isNull ? false : super.add(x);
 +  }
 +
 +  public boolean isNonTerminal() {
 +    return isNonTerminal;
 +  }
 +
 +  public boolean isFinal() {
 +    return isFinal;
 +  }
 +
 +  public boolean isNull() {
 +    return isNull;
 +  }
 +
 +  /**
 +   * shifts each item in the LinkedList by <shift>.
 +   * Only applies to items larger than <start>
 +   */
 +  void shiftBy(int start, int shift) {
 +    if (!isFinal && !isNull) {
-       ListIterator<Integer> it = this.listIterator();
++      final ListIterator<Integer> it = this.listIterator();
 +      while (it.hasNext()) {
-         int x = it.next();
++        final int x = it.next();
 +        if (x > start) {
 +          it.set(x + shift);
 +        }
 +      }
 +    }
 +  }
 +
 +  public String toString() {
 +    StringBuilder sb = new StringBuilder();
 +    if (isFinal)
 +      sb.append("f");
 +    if (isNull) {
 +      sb.append("[NULL]");
 +    } else {
 +      sb.append(super.toString());
 +    }
 +    if (isNonTerminal)
 +      sb.append("^");
 +    return sb.toString();
 +  }
 +}