You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/01 02:52:13 UTC
[78/94] [abbrv] incubator-joshua git commit: Merge remote-tracking
branch 'origin/master' into JOSHUA-252
Merge remote-tracking branch 'origin/master' into JOSHUA-252
Includes many updates for tests, and updating hard-coded invocations of the old Java path.
# Conflicts:
# src/main/java/org/apache/joshua/decoder/StructuredTranslation.java
# src/main/java/org/apache/joshua/decoder/Translation.java
# src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java
# src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java
# src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java
# src/main/java/org/apache/joshua/decoder/hypergraph/KBestExtractor.java
# src/test/java/org/apache/joshua/corpus/VocabularyTest.java
# src/test/java/org/apache/joshua/decoder/kbest_extraction/KBestExtractionTest.java
# src/test/java/org/apache/joshua/decoder/phrase/constrained/ConstrainedPhraseDecodingTest.java
# src/test/java/org/apache/joshua/decoder/phrase/decode/PhraseDecodingTest.java
# src/test/java/org/apache/joshua/system/KenLmTest.java
# src/test/java/org/apache/joshua/system/MultithreadedTranslationTests.java
# src/test/java/org/apache/joshua/system/StructuredOutputTest.java
# src/test/java/org/apache/joshua/system/StructuredTranslationTest.java
# src/test/java/org/apache/joshua/util/FormatUtilsTest.java
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/91400fe2
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/91400fe2
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/91400fe2
Branch: refs/heads/master
Commit: 91400fe2a457d04c45a8daa12d74740d582cbca3
Parents: ef00b03 15c7619
Author: Matt Post <po...@cs.jhu.edu>
Authored: Tue May 31 12:36:43 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Tue May 31 12:36:43 2016 -0400
----------------------------------------------------------------------
scripts/support/create_glue_grammar.sh | 2 +-
scripts/support/grammar-packer.pl | 2 +-
scripts/training/pipeline.pl | 4 +-
.../joshua/decoder/StructuredTranslation.java | 66 ++++------
.../decoder/StructuredTranslationFactory.java | 112 +++++++++++++++++
.../org/apache/joshua/decoder/Translation.java | 125 +++++++++++++------
.../BloomFilterLanguageModel.java | 1 -
.../joshua/decoder/ff/tm/GrammarReader.java | 2 -
.../decoder/ff/tm/format/HieroFormatReader.java | 13 +-
.../decoder/ff/tm/format/MosesFormatReader.java | 12 +-
.../tm/hash_based/MemoryBasedBatchGrammar.java | 1 -
.../decoder/hypergraph/AlignedSourceTokens.java | 11 +-
.../decoder/hypergraph/KBestExtractor.java | 84 +++++++++----
.../decoder/hypergraph/WordAlignmentState.java | 103 ++++++++-------
.../apache/joshua/decoder/io/JSONMessage.java | 2 +-
.../org/apache/joshua/tools/GrammarPacker.java | 3 +-
.../java/org/apache/joshua/util/Constants.java | 36 ++++++
.../org/apache/joshua/util/FileUtility.java | 1 -
.../org/apache/joshua/util/io/LineReader.java | 5 +-
src/main/resources/log4j.properties | 4 +-
.../apache/joshua/corpus/CorpusArrayTest.java | 14 ---
.../apache/joshua/corpus/VocabularyTest.java | 27 ++--
.../joshua/decoder/DecoderThreadTest.java | 4 -
.../kbest_extraction/KBestExtractionTest.java | 10 +-
.../ConstrainedPhraseDecodingTest.java | 10 +-
.../phrase/decode/PhraseDecodingTest.java | 10 +-
.../org/apache/joshua/system/KenLmTest.java | 2 -
.../system/MultithreadedTranslationTests.java | 4 +-
.../joshua/system/StructuredOutputTest.java | 20 ++-
.../system/StructuredTranslationTest.java | 107 ++++++++++++----
.../org/apache/joshua/util/FormatUtilsTest.java | 6 -
src/test/resources/packed-grammar/test.sh | 2 +-
.../resources/thrax/filtering/test-exact.sh | 0
src/test/resources/thrax/filtering/test-fast.sh | 0
.../resources/thrax/filtering/test-loose.sh | 0
35 files changed, 515 insertions(+), 290 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/scripts/support/create_glue_grammar.sh
----------------------------------------------------------------------
diff --cc scripts/support/create_glue_grammar.sh
index 166e6ee,166e6ee..df60ade
--- a/scripts/support/create_glue_grammar.sh
+++ b/scripts/support/create_glue_grammar.sh
@@@ -23,4 -23,4 +23,4 @@@ if [[ -z "$1" ]]; the
exit
fi
--java -Xmx2g -cp $JOSHUA/lib/args4j-2.0.29.jar:$JOSHUA/class joshua.decoder.ff.tm.CreateGlueGrammar -g $1
++java -Xmx2g -cp $JOSHUA/target/joshua-*-jar-with-dependencies.jar org.apache.joshua.decoder.ff.tm.CreateGlueGrammar -g $1
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/scripts/support/grammar-packer.pl
----------------------------------------------------------------------
diff --cc scripts/support/grammar-packer.pl
index 7cd3153,7cd3153..a0caf3c
--- a/scripts/support/grammar-packer.pl
+++ b/scripts/support/grammar-packer.pl
@@@ -90,7 -90,7 +90,7 @@@ foreach my $grammar (@grammars)
my $grammars = join(" ", @sorted_grammars);
my $outputs = join(" ", @outputs);
my $alignments = $opts{a} ? "--ga" : "";
--my $cmd = "java -Xmx$opts{m} -cp $JOSHUA/lib/args4j-2.0.29.jar:$JOSHUA/lib/guava-19.0.jar:$JOSHUA/class joshua.tools.GrammarPackerCli -g $grammars --outputs $outputs $alignments";
++my $cmd = "java -Xmx$opts{m} -cp $JOSHUA/target/joshua-*-jar-with-dependencies.jar org.apache.joshua.tools.GrammarPackerCli -g $grammars --outputs $outputs $alignments";
print STDERR "Packing with $cmd...\n" if $opts{v};
my $retval = system($cmd);
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --cc scripts/training/pipeline.pl
index e6db89d,88d641c..d9aec24
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@@ -1377,7 -1362,7 +1377,7 @@@ if ($DO_FILTER_TM and defined $GRAMMAR_
if ($OPTIMIZER_RUN == 1 and defined $TUNE_GRAMMAR and $GRAMMAR_TYPE ne "phrase" and $GRAMMAR_TYPE ne "moses") {
if (! defined $GLUE_GRAMMAR_FILE) {
$cachepipe->cmd("glue-tune",
-- "java -Xmx2g -cp $JOSHUA/lib/args4j-2.0.29.jar:$JOSHUA/class joshua.decoder.ff.tm.CreateGlueGrammar -g $TUNE_GRAMMAR > $DATA_DIRS{tune}/grammar.glue",
++ "$JOSHUA/scripts/support/create_glue_grammar.sh $TUNE_GRAMMAR > $DATA_DIRS{tune}/grammar.glue",
get_file_from_grammar($TUNE_GRAMMAR),
"$DATA_DIRS{tune}/grammar.glue");
$GLUE_GRAMMAR_FILE = "$DATA_DIRS{tune}/grammar.glue";
@@@ -1594,7 -1579,7 +1594,7 @@@ if ($DO_FILTER_TM and defined $GRAMMAR_
if ($OPTIMIZER_RUN == 1 and defined $TEST_GRAMMAR and $GRAMMAR_TYPE ne "phrase" and $GRAMMAR_TYPE ne "moses") {
if (! defined $GLUE_GRAMMAR_FILE) {
$cachepipe->cmd("glue-test",
-- "java -Xmx2g -cp $JOSHUA/lib/args4j-2.0.29.jar:$JOSHUA/class joshua.decoder.ff.tm.CreateGlueGrammar -g $TEST_GRAMMAR > $DATA_DIRS{test}/grammar.glue",
++ "$JOSHUA/scripts/support/create_glue_grammar.sh $TEST_GRAMMAR > $DATA_DIRS{test}/grammar.glue",
get_file_from_grammar($TEST_GRAMMAR),
"$DATA_DIRS{test}/grammar.glue");
$GLUE_GRAMMAR_FILE = "$DATA_DIRS{test}/grammar.glue";
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/java/org/apache/joshua/decoder/StructuredTranslation.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/StructuredTranslation.java
index 8aa518e,0000000..b548a74
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/StructuredTranslation.java
+++ b/src/main/java/org/apache/joshua/decoder/StructuredTranslation.java
@@@ -1,126 -1,0 +1,102 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder;
+
- import static java.util.Arrays.asList;
- import static java.util.Collections.emptyList;
- import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiFeatures;
- import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiString;
- import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiWordAlignmentList;
- import static org.apache.joshua.util.FormatUtils.removeSentenceMarkers;
-
+import java.util.List;
+import java.util.Map;
+
- import org.apache.joshua.decoder.ff.FeatureFunction;
- import org.apache.joshua.decoder.hypergraph.HyperGraph;
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+/**
- * <p>structuredTranslation provides a more structured access to translation
- * results than the Translation class.
- * Members of instances of this class can be used upstream.</p>
- * TODO:
- * Enable K-Best extraction.
++ * A StructuredTranslation instance provides a more structured access to
++ * translation results than the string-based Translation class.
++ * This is useful if the decoder is encapsulated in a larger project, instead
++ * of simply writing to a file or stdout.
++ * StructuredTranslation encodes all relevant information about a derivation,
++ * namely output string, tokens, score, features, and word alignment.
+ *
+ * @author fhieber
+ */
+public class StructuredTranslation {
+
+ private final Sentence sourceSentence;
+ private final String translationString;
+ private final List<String> translationTokens;
+ private final float translationScore;
+ private final List<List<Integer>> translationWordAlignments;
+ private final Map<String,Float> translationFeatures;
+ private final float extractionTime;
+
- public StructuredTranslation(final Sentence sourceSentence,
- final HyperGraph hypergraph,
- final List<FeatureFunction> featureFunctions) {
-
- final long startTime = System.currentTimeMillis();
-
- this.sourceSentence = sourceSentence;
- this.translationString = removeSentenceMarkers(getViterbiString(hypergraph));
- this.translationTokens = extractTranslationTokens();
- this.translationScore = extractTranslationScore(hypergraph);
- this.translationFeatures = getViterbiFeatures(hypergraph, featureFunctions, sourceSentence).getMap();
- this.translationWordAlignments = getViterbiWordAlignmentList(hypergraph);
- this.extractionTime = (System.currentTimeMillis() - startTime) / 1000.0f;
- }
-
- private float extractTranslationScore(final HyperGraph hypergraph) {
- if (hypergraph == null) {
- return 0;
- } else {
- return hypergraph.goalNode.getScore();
- }
++ public StructuredTranslation(
++ final Sentence sourceSentence,
++ final String translationString,
++ final List<String> translationTokens,
++ final float translationScore,
++ final List<List<Integer>> translationWordAlignments,
++ final Map<String,Float> translationFeatures,
++ final float extractionTime) {
++ this.sourceSentence = sourceSentence;
++ this.translationString = translationString;
++ this.translationTokens = translationTokens;
++ this.translationScore = translationScore;
++ this.translationWordAlignments = translationWordAlignments;
++ this.translationFeatures = translationFeatures;
++ this.extractionTime = extractionTime;
+ }
+
- private List<String> extractTranslationTokens() {
- if (translationString.isEmpty()) {
- return emptyList();
- } else {
- return asList(translationString.split("\\s+"));
- }
- }
-
- // Getters to use upstream
-
+ public Sentence getSourceSentence() {
+ return sourceSentence;
+ }
+
+ public int getSentenceId() {
+ return sourceSentence.id();
+ }
+
+ public String getTranslationString() {
+ return translationString;
+ }
+
+ public List<String> getTranslationTokens() {
+ return translationTokens;
+ }
+
+ public float getTranslationScore() {
+ return translationScore;
+ }
+
+ /**
+ * Returns a list of target to source alignments.
+ * @return a list of target to source alignments
+ */
+ public List<List<Integer>> getTranslationWordAlignments() {
+ return translationWordAlignments;
+ }
+
+ public Map<String,Float> getTranslationFeatures() {
+ return translationFeatures;
+ }
+
+ /**
+ * Time taken to build output information from the hypergraph.
+ * @return the time taken to build output information from the hypergraph
+ */
+ public Float getExtractionTime() {
+ return extractionTime;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/java/org/apache/joshua/decoder/StructuredTranslationFactory.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/StructuredTranslationFactory.java
index 0000000,0000000..de5a90b
new file mode 100644
--- /dev/null
+++ b/src/main/java/org/apache/joshua/decoder/StructuredTranslationFactory.java
@@@ -1,0 -1,0 +1,112 @@@
++/*
++ * Licensed to the Apache Software Foundation (ASF) under one
++ * or more contributor license agreements. See the NOTICE file
++ * distributed with this work for additional information
++ * regarding copyright ownership. The ASF licenses this file
++ * to you under the Apache License, Version 2.0 (the
++ * "License"); you may not use this file except in compliance
++ * with the License. You may obtain a copy of the License at
++ *
++ * http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing,
++ * software distributed under the License is distributed on an
++ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
++ * KIND, either express or implied. See the License for the
++ * specific language governing permissions and limitations
++ * under the License.
++ */
++package org.apache.joshua.decoder;
++
++import static java.util.Arrays.asList;
++import static java.util.Collections.emptyList;
++import static java.util.Collections.emptyMap;
++import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiFeatures;
++import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiString;
++import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiWordAlignmentList;
++import static org.apache.joshua.util.FormatUtils.removeSentenceMarkers;
++
++import java.util.List;
++
++import org.apache.joshua.decoder.ff.FeatureFunction;
++import org.apache.joshua.decoder.hypergraph.HyperGraph;
++import org.apache.joshua.decoder.hypergraph.KBestExtractor.DerivationState;
++import org.apache.joshua.decoder.segment_file.Sentence;
++
++/**
++ * This factory provides methods to create StructuredTranslation objects
++ * from either Viterbi derivations or KBest derivations.
++ *
++ * @author fhieber
++ */
++public class StructuredTranslationFactory {
++
++ /**
++ * Returns a StructuredTranslation instance from the Viterbi derivation.
++ * @return A StructuredTranslation object representing the Viterbi derivation.
++ */
++ public static StructuredTranslation fromViterbiDerivation(
++ final Sentence sourceSentence,
++ final HyperGraph hypergraph,
++ final List<FeatureFunction> featureFunctions) {
++ final long startTime = System.currentTimeMillis();
++ final String translationString = removeSentenceMarkers(getViterbiString(hypergraph));
++ return new StructuredTranslation(
++ sourceSentence,
++ translationString,
++ extractTranslationTokens(translationString),
++ extractTranslationScore(hypergraph),
++ getViterbiWordAlignmentList(hypergraph),
++ getViterbiFeatures(hypergraph, featureFunctions, sourceSentence).getMap(),
++ (System.currentTimeMillis() - startTime) / 1000.0f);
++ }
++
++ /**
++ * Returns a StructuredTranslation from an empty decoder output
++ * @param sourceSentence
++ * @return
++ */
++ public static StructuredTranslation fromEmptyOutput(final Sentence sourceSentence) {
++ return new StructuredTranslation(
++ sourceSentence, "", emptyList(), 0, emptyList(), emptyMap(), 0f);
++ }
++
++ /**
++ * Returns a StructuredTranslation instance from a KBest DerivationState.
++ * @param sourceSentence Sentence object representing the source.
++ * @param derivationState the KBest DerivationState.
++ * @return A StructuredTranslation object representing the derivation encoded by derivationState.
++ */
++ public static StructuredTranslation fromKBestDerivation(
++ final Sentence sourceSentence,
++ final DerivationState derivationState) {
++ final long startTime = System.currentTimeMillis();
++ final String translationString = removeSentenceMarkers(derivationState.getHypothesis());
++ return new StructuredTranslation(
++ sourceSentence,
++ translationString,
++ extractTranslationTokens(translationString),
++ derivationState.getModelCost(),
++ derivationState.getWordAlignmentList(),
++ derivationState.getFeatures().getMap(),
++ (System.currentTimeMillis() - startTime) / 1000.0f);
++ }
++
++ private static float extractTranslationScore(final HyperGraph hypergraph) {
++ if (hypergraph == null) {
++ return 0;
++ } else {
++ return hypergraph.goalNode.getScore();
++ }
++ }
++
++ private static List<String> extractTranslationTokens(final String translationString) {
++ if (translationString.isEmpty()) {
++ return emptyList();
++ } else {
++ return asList(translationString.split("\\s+"));
++ }
++ }
++
++
++}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/java/org/apache/joshua/decoder/Translation.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/Translation.java
index ab37814,0000000..6b8e5e4
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/Translation.java
+++ b/src/main/java/org/apache/joshua/decoder/Translation.java
@@@ -1,203 -1,0 +1,248 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder;
+
+import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiFeatures;
+import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiString;
+import static org.apache.joshua.decoder.hypergraph.ViterbiExtractor.getViterbiWordAlignments;
++import static org.apache.joshua.decoder.StructuredTranslationFactory.fromViterbiDerivation;
+import static org.apache.joshua.util.FormatUtils.removeSentenceMarkers;
++import static java.util.Arrays.asList;
+
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.StringWriter;
+import java.util.List;
+
+import org.apache.joshua.decoder.ff.FeatureFunction;
+import org.apache.joshua.decoder.ff.FeatureVector;
+import org.apache.joshua.decoder.ff.lm.StateMinimizingLanguageModel;
+import org.apache.joshua.decoder.hypergraph.HyperGraph;
+import org.apache.joshua.decoder.hypergraph.KBestExtractor;
+import org.apache.joshua.decoder.io.DeNormalize;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class represents translated input objects (sentences or lattices). It is aware of the source
+ * sentence and id and contains the decoded hypergraph. Translation objects are returned by
+ * DecoderThread instances to the InputHandler, where they are assembled in order for output.
+ *
++<<<<<<< HEAD:src/main/java/org/apache/joshua/decoder/Translation.java
+ * @author Matt Post post@cs.jhu.edu
++=======
++ * @author Matt Post <po...@cs.jhu.edu>
++ * @author Felix Hieber <fh...@amazon.com>
++>>>>>>> origin/master:src/joshua/decoder/Translation.java
+ */
+
+public class Translation {
+ private static final Logger LOG = LoggerFactory.getLogger(Translation.class);
+ private Sentence source;
+
+ /**
+ * This stores the output of the translation so we don't have to hold onto the hypergraph while we
+ * wait for the outputs to be assembled.
+ */
+ private String output = null;
+
- private StructuredTranslation structuredTranslation = null;
-
++ /**
++ * Stores the list of StructuredTranslations.
++ * If joshuaConfig.topN == 0, will only contain the Viterbi translation.
++ * Else it will use KBestExtractor to populate this list.
++ */
++ private List<StructuredTranslation> structuredTranslations = null;
++
+ public Translation(Sentence source, HyperGraph hypergraph,
+ List<FeatureFunction> featureFunctions, JoshuaConfiguration joshuaConfiguration) {
+ this.source = source;
-
++
++ /**
++ * Structured output from Joshua provides a way to programmatically access translation results
++ * from downstream applications, instead of writing results as strings to an output buffer.
++ */
+ if (joshuaConfiguration.use_structured_output) {
-
- structuredTranslation = new StructuredTranslation(
- source, hypergraph, featureFunctions);
- this.output = structuredTranslation.getTranslationString();
++
++ if (joshuaConfiguration.topN == 0) {
++ /*
++ * Obtain Viterbi StructuredTranslation
++ */
++ StructuredTranslation translation = fromViterbiDerivation(source, hypergraph, featureFunctions);
++ this.output = translation.getTranslationString();
++ structuredTranslations = asList(translation);
++
++ } else {
++ /*
++ * Get K-Best list of StructuredTranslations
++ */
++ final KBestExtractor kBestExtractor = new KBestExtractor(source, featureFunctions, Decoder.weights, false, joshuaConfiguration);
++ structuredTranslations = kBestExtractor.KbestExtractOnHG(hypergraph, joshuaConfiguration.topN);
++ if (structuredTranslations.isEmpty()) {
++ structuredTranslations = asList(StructuredTranslationFactory.fromEmptyOutput(source));
++ this.output = "";
++ } else {
++ this.output = structuredTranslations.get(0).getTranslationString();
++ }
++ // TODO: We omit the BLEU rescoring for now since it is not clear whether it works at all and what the desired output is below.
++ }
+
+ } else {
+
+ StringWriter sw = new StringWriter();
+ BufferedWriter out = new BufferedWriter(sw);
+
+ try {
++
+ if (hypergraph != null) {
++
+ if (!joshuaConfiguration.hypergraphFilePattern.equals("")) {
+ hypergraph.dump(String.format(joshuaConfiguration.hypergraphFilePattern, source.id()), featureFunctions);
+ }
+
+ long startTime = System.currentTimeMillis();
+
+ // We must put this weight as zero, otherwise we get an error when we try to retrieve it
+ // without checking
+ Decoder.weights.increment("BLEU", 0);
+
+ if (joshuaConfiguration.topN == 0) {
+
+ /* construct Viterbi output */
+ final String best = getViterbiString(hypergraph);
+
+ LOG.info("Translation {}: {} {}", source.id(), hypergraph.goalNode.getScore(), best);
+
+ /*
+ * Setting topN to 0 turns off k-best extraction, in which case we need to parse through
+ * the output-string, with the understanding that we can only substitute variables for the
+ * output string, sentence number, and model score.
+ */
+ String translation = joshuaConfiguration.outputFormat
+ .replace("%s", removeSentenceMarkers(best))
+ .replace("%S", DeNormalize.processSingleLine(best))
+ .replace("%c", String.format("%.3f", hypergraph.goalNode.getScore()))
+ .replace("%i", String.format("%d", source.id()));
+
+ if (joshuaConfiguration.outputFormat.contains("%a")) {
+ translation = translation.replace("%a", getViterbiWordAlignments(hypergraph));
+ }
+
+ if (joshuaConfiguration.outputFormat.contains("%f")) {
+ final FeatureVector features = getViterbiFeatures(hypergraph, featureFunctions, source);
+ translation = translation.replace("%f", joshuaConfiguration.moses ? features.mosesString() : features.toString());
+ }
+
+ out.write(translation);
+ out.newLine();
+
+ } else {
+
+ final KBestExtractor kBestExtractor = new KBestExtractor(
+ source, featureFunctions, Decoder.weights, false, joshuaConfiguration);
+ kBestExtractor.lazyKBestExtractOnHG(hypergraph, joshuaConfiguration.topN, out);
+
+ if (joshuaConfiguration.rescoreForest) {
+ Decoder.weights.increment("BLEU", joshuaConfiguration.rescoreForestWeight);
+ kBestExtractor.lazyKBestExtractOnHG(hypergraph, joshuaConfiguration.topN, out);
+
+ Decoder.weights.increment("BLEU", -joshuaConfiguration.rescoreForestWeight);
+ kBestExtractor.lazyKBestExtractOnHG(hypergraph, joshuaConfiguration.topN, out);
+ }
+ }
+
+ float seconds = (float) (System.currentTimeMillis() - startTime) / 1000.0f;
+ LOG.info("Input {}: {}-best extraction took {} seconds", id(),
+ joshuaConfiguration.topN, seconds);
+
+ } else {
-
++
+ // Failed translations and blank lines get empty formatted outputs
- // @formatter:off
- String outputString = joshuaConfiguration.outputFormat
- .replace("%s", source.source())
- .replace("%e", "")
- .replace("%S", "")
- .replace("%t", "()")
- .replace("%i", Integer.toString(source.id()))
- .replace("%f", "")
- .replace("%c", "0.000");
- // @formatter:on
-
- out.write(outputString);
++ out.write(getFailedTranslationOutput(source, joshuaConfiguration));
+ out.newLine();
++
+ }
+
+ out.flush();
++
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ this.output = sw.toString();
+
+ }
-
- /*
- * KenLM hack. If using KenLMFF, we need to tell KenLM to delete the pool used to create chart
- * objects for this sentence.
- */
- for (FeatureFunction feature : featureFunctions) {
- if (feature instanceof StateMinimizingLanguageModel) {
- ((StateMinimizingLanguageModel) feature).destroyPool(getSourceSentence().id());
- break;
- }
- }
++
++ // remove state from StateMinimizingLanguageModel instances in features.
++ destroyKenLMStates(featureFunctions);
+
+ }
+
+ public Sentence getSourceSentence() {
+ return this.source;
+ }
+
+ public int id() {
+ return source.id();
+ }
+
+ @Override
+ public String toString() {
+ return output;
+ }
-
++
++ private String getFailedTranslationOutput(final Sentence source, final JoshuaConfiguration joshuaConfiguration) {
++ return joshuaConfiguration.outputFormat
++ .replace("%s", source.source())
++ .replace("%e", "")
++ .replace("%S", "")
++ .replace("%t", "()")
++ .replace("%i", Integer.toString(source.id()))
++ .replace("%f", "")
++ .replace("%c", "0.000");
++ }
++
++ /**
++ * Returns the StructuredTranslations
++ * if JoshuaConfiguration.use_structured_output == True.
++ * @throws RuntimeException if JoshuaConfiguration.use_structured_output == False.
++ * @return List of StructuredTranslations.
++ */
++ public List<StructuredTranslation> getStructuredTranslations() {
++ if (structuredTranslations == null) {
++ throw new RuntimeException(
++ "No StructuredTranslation objects created. You should set JoshuaConfigration.use_structured_output = true");
++ }
++ return structuredTranslations;
++ }
++
+ /**
- * Returns the StructuredTranslation object
- * if JoshuaConfiguration.construct_structured_output == True.
- * @throws RuntimeException if StructuredTranslation object not set.
- * @return {@link org.apache.joshua.decoder.StructuredTranslation} object
++ * KenLM hack. If using KenLMFF, we need to tell KenLM to delete the pool used to create chart
++ * objects for this sentence.
+ */
- public StructuredTranslation getStructuredTranslation() {
- if (structuredTranslation == null) {
- throw new RuntimeException("No StructuredTranslation object created. You should set JoshuaConfigration.construct_structured_output = true");
++ private void destroyKenLMStates(final List<FeatureFunction> featureFunctions) {
++ for (FeatureFunction feature : featureFunctions) {
++ if (feature instanceof StateMinimizingLanguageModel) {
++ ((StateMinimizingLanguageModel) feature).destroyPool(getSourceSentence().id());
++ break;
++ }
+ }
- return structuredTranslation;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java
index 7d0e599,0000000..90d90b8
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java
@@@ -1,561 -1,0 +1,560 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.lm.bloomfilter_lm;
+
+import java.io.Externalizable;
+import java.io.FileInputStream;
- import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInput;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutput;
+import java.io.ObjectOutputStream;
+import java.util.HashMap;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.ff.lm.DefaultNGramLanguageModel;
+import org.apache.joshua.util.Regex;
+import org.apache.joshua.util.io.LineReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An n-gram language model with linearly-interpolated Witten-Bell smoothing, using a Bloom filter
+ * as its main data structure. A Bloom filter is a lossy data structure that can be used to test for
+ * set membership.
+ */
+public class BloomFilterLanguageModel extends DefaultNGramLanguageModel implements Externalizable {
+ /**
+ * An initial value used for hashing n-grams so that they can be stored in a bloom filter.
+ */
+ public static final int HASH_SEED = 17;
+
+ /**
+ * Another value used in the process of hashing n-grams.
+ */
+ public static final int HASH_OFFSET = 37;
+
+ /**
+ * The maximum score that a language model feature function can return to the Joshua decoder.
+ */
+ public static final double MAX_SCORE = 100.0;
+
+ /**
+ * The logger for this class.
+ */
+ private static final Logger LOG = LoggerFactory.getLogger(BloomFilterLanguageModel.class);
+
+ /**
+ * The Bloom filter data structure itself.
+ */
+ private BloomFilter bf;
+
+ /**
+ * The base of the logarithm used to quantize n-gram counts. N-gram counts are quantized
+ * logarithmically to reduce the number of times we need to query the Bloom filter.
+ */
+ private double quantizationBase;
+
+ /**
+ * Natural log of the number of tokens seen in the training corpus.
+ */
+ private double numTokens;
+
+ /**
+ * An array of pairs of long, used as hash functions for storing or retreiving the count of an
+ * n-gram in the Bloom filter.
+ */
+ private long[][] countFuncs;
+ /**
+ * An array of pairs of long, used as hash functions for storing or retreiving the number of
+ * distinct types observed after an n-gram.
+ */
+ private long[][] typesFuncs;
+
+ /**
+ * The smoothed probability of an unseen n-gram. This is also the probability of any n-gram under
+ * the zeroth-order model.
+ */
+ transient private double p0;
+
+ /**
+ * The interpolation constant between Witten-Bell models of order zero and one. Stored in a field
+ * because it can be calculated ahead of time; it doesn't depend on the particular n-gram.
+ */
+ transient private double lambda0;
+
+ /**
+ * The maximum possible quantized count of any n-gram stored in the Bloom filter. Used as an upper
+ * bound on the count that could be returned when querying the Bloom filter.
+ */
+ transient private int maxQ; // max quantized count
+
+ /**
+ * Constructor called from the Joshua decoder. This constructor assumes that the LM has already
+ * been built, and takes the name of the file where the LM is stored.
+ *
+ * @param order the order of the language model
+ * @param filename path to the file where the language model is stored
+ * @throws IOException if the bloom filter language model cannot be rebuilt from the input file
+ */
+ public BloomFilterLanguageModel(int order, String filename) throws IOException {
+ super(order);
+ try {
+ readExternal(new ObjectInputStream(new GZIPInputStream(new FileInputStream(filename))));
+ } catch (ClassNotFoundException e) {
+ IOException ioe = new IOException("Could not rebuild bloom filter LM from file " + filename);
+ ioe.initCause(e);
+ throw ioe;
+ }
+
+ int vocabSize = Vocabulary.size();
+ p0 = -Math.log(vocabSize + 1);
+ double oneMinusLambda0 = numTokens - logAdd(Math.log(vocabSize), numTokens);
+ p0 += oneMinusLambda0;
+ lambda0 = Math.log(vocabSize) - logAdd(Math.log(vocabSize), numTokens);
+ maxQ = quantize((long) Math.exp(numTokens));
+ }
+
+ /**
+ * Constructor to be used by the main function. This constructor is used to build a new language
+ * model from scratch. An LM should be built with the main function before using it in the Joshua
+ * decoder.
+ *
+ * @param filename path to the file of training corpus statistics
+ * @param order the order of the language model
+ * @param size the size of the Bloom filter, in bits
+ * @param base a double. The base of the logarithm for quantization.
+ */
+ private BloomFilterLanguageModel(String filename, int order, int size, double base) {
+ super(order);
+ quantizationBase = base;
+ populateBloomFilter(size, filename);
+ }
+
+ /**
+ * calculates the linearly-interpolated Witten-Bell probability for a given ngram. this is
+ * calculated as: p(w|h) = pML(w|h)L(h) - (1 - L(h))p(w|h') where: w is a word and h is a history
+ * h' is the history h with the first word removed pML is the maximum-likelihood estimate of the
+ * probability L(.) is lambda, the interpolation factor, which depends only on the history h: L(h)
+ * = s(h) / s(h) + c(h) where s(.) is the observed number of distinct types after h, and c is the
+ * observed number of counts of h in the training corpus.
+ * <p>
+ * in fact this model calculates the probability starting from the lowest order and working its
+ * way up, to take advantage of the one- sided error rate inherent in using a bloom filter data
+ * structure.
+ *
+ * @param ngram the ngram whose probability is to be calculated
+ * @param ngramOrder the order of the ngram.
+ *
+ * @return the linearly-interpolated Witten-Bell smoothed probability of an ngram
+ */
+ private float wittenBell(int[] ngram, int ngramOrder) {
+ int end = ngram.length;
+ double p = p0; // current calculated probability
+ // note that p0 and lambda0 are independent of the given
+ // ngram so they are calculated ahead of time.
+ int MAX_QCOUNT = getCount(ngram, ngram.length - 1, ngram.length, maxQ);
+ if (MAX_QCOUNT == 0) // OOV!
+ return (float) p;
+ double pML = Math.log(unQuantize(MAX_QCOUNT)) - numTokens;
+
+ // p += lambda0 * pML;
+ p = logAdd(p, (lambda0 + pML));
+ if (ngram.length == 1) { // if it's a unigram, we're done
+ return (float) p;
+ }
+ // otherwise we calculate the linear interpolation
+ // with higher order models.
+ for (int i = end - 2; i >= end - ngramOrder && i >= 0; i--) {
+ int historyCnt = getCount(ngram, i, end, MAX_QCOUNT);
+ // if the count for the history is zero, all higher
+ // terms in the interpolation must be zero, so we
+ // are done here.
+ if (historyCnt == 0) {
+ return (float) p;
+ }
+ int historyTypesAfter = getTypesAfter(ngram, i, end, historyCnt);
+ // unQuantize the counts we got from the BF
+ double HC = unQuantize(historyCnt);
+ double HTA = 1 + unQuantize(historyTypesAfter);
+ // interpolation constant
+ double lambda = Math.log(HTA) - Math.log(HTA + HC);
+ double oneMinusLambda = Math.log(HC) - Math.log(HTA + HC);
+ // p *= 1 - lambda
+ p += oneMinusLambda;
+ int wordCount = getCount(ngram, i + 1, end, historyTypesAfter);
+ double WC = unQuantize(wordCount);
+ // p += lambda * p_ML(w|h)
+ if (WC == 0) return (float) p;
+ p = logAdd(p, lambda + Math.log(WC) - Math.log(HC));
+ MAX_QCOUNT = wordCount;
+ }
+ return (float) p;
+ }
+
+ /**
+ * Retrieve the count of a ngram from the Bloom filter. That is, how many times did we see this
+ * ngram in the training corpus? This corresponds roughly to algorithm 2 in Talbot and Osborne's
+ * "Tera-Scale LMs on the Cheap."
+ *
+ * @param ngram array containing the ngram as a sub-array
+ * @param start the index of the first word of the ngram
+ * @param end the index after the last word of the ngram
+ * @param qcount the maximum possible count to be returned
+ *
+ * @return the number of times the ngram was seen in the training corpus, quantized
+ */
+ private int getCount(int[] ngram, int start, int end, int qcount) {
+ for (int i = 1; i <= qcount; i++) {
+ int hash = hashNgram(ngram, start, end, i);
+ if (!bf.query(hash, countFuncs)) {
+ return i - 1;
+ }
+ }
+ return qcount;
+ }
+
+ /**
+ * Retrieve the number of distinct types that follow an ngram in the training corpus.
+ *
+ * This is another version of algorithm 2. As noted in the paper, we have different algorithms for
+ * getting ngram counts versus suffix counts because c(x) = 1 is a proxy item for s(x) = 1
+ *
+ * @param ngram an array the contains the ngram as a sub-array
+ * @param start the index of the first word of the ngram
+ * @param end the index after the last word of the ngram
+ * @param qcount the maximum possible return value
+ *
+ * @return the number of distinct types observed to follow an ngram in the training corpus,
+ * quantized
+ */
+ private int getTypesAfter(int[] ngram, int start, int end, int qcount) {
+ // first we check c(x) >= 1
+ int hash = hashNgram(ngram, start, end, 1);
+ if (!bf.query(hash, countFuncs)) {
+ return 0;
+ }
+ // if c(x) >= 1, we check for the stored suffix count
+ for (int i = 1; i < qcount; i++) {
+ hash = hashNgram(ngram, start, end, i);
+ if (!bf.query(hash, typesFuncs)) {
+ return i - 1;
+ }
+ }
+ return qcount;
+ }
+
+ /**
+ * Logarithmically quantizes raw counts. The quantization scheme is described in Talbot and
+ * Osborne's paper "Tera-Scale LMs on the Cheap."
+ *
+ * @param x long giving the raw count to be quantized
+ *
+ * @return the quantized count
+ */
+ private int quantize(long x) {
+ return 1 + (int) Math.floor(Math.log(x) / Math.log(quantizationBase));
+ }
+
+ /**
+ * Unquantizes a quantized count.
+ *
+ * @param x the quantized count
+ *
+ * @return the expected raw value of the quantized count
+ */
+ private double unQuantize(int x) {
+ if (x == 0) {
+ return 0;
+ } else {
+ return ((quantizationBase + 1) * Math.pow(quantizationBase, x - 1) - 1) / 2;
+ }
+ }
+
+ /**
+ * Converts an n-gram and a count into a value that can be stored into a Bloom filter. This is
+ * adapted directly from <code>AbstractPhrase.hashCode()</code> elsewhere in the Joshua code base.
+ *
+ * @param ngram an array containing the ngram as a sub-array
+ * @param start the index of the first word of the ngram
+ * @param end the index after the last word of the ngram
+ * @param val the count of the ngram
+ *
+ * @return a value suitable to be stored in a Bloom filter
+ */
+ private int hashNgram(int[] ngram, int start, int end, int val) {
+ int result = HASH_OFFSET * HASH_SEED + val;
+ for (int i = start; i < end; i++)
+ result = HASH_OFFSET * result + ngram[i];
+ return result;
+ }
+
+ /**
+ * Adds two numbers that are in the log domain, avoiding underflow.
+ *
+ * @param x one summand
+ * @param y the other summand
+ *
+ * @return the log of the sum of the exponent of the two numbers.
+ */
+ private static double logAdd(double x, double y) {
+ if (y <= x) {
+ return x + Math.log1p(Math.exp(y - x));
+ } else {
+ return y + Math.log1p(Math.exp(x - y));
+ }
+ }
+
+ /**
+ * Builds a language model and stores it in a file.
+ *
+ * @param argv command-line arguments
+ */
+ public static void main(String[] argv) {
+ if (argv.length < 5) {
+ String msg = "usage: BloomFilterLanguageModel <statistics file> <order> <size>"
+ + " <quantization base> <output file>";
+ System.err.println(msg);
+ LOG.error(msg);
+ return;
+ }
+ int order = Integer.parseInt(argv[1]);
+ int size = (int) (Integer.parseInt(argv[2]) * Math.pow(2, 23));
+ double base = Double.parseDouble(argv[3]);
+
+ try {
+ BloomFilterLanguageModel lm = new BloomFilterLanguageModel(argv[0], order, size, base);
+
+ ObjectOutputStream out =
+ new ObjectOutputStream(new GZIPOutputStream(new FileOutputStream(argv[4])));
+
+ lm.writeExternal(out);
+ out.close(); //TODO: try-with-resources
+ } catch (IOException e) {
+ LOG.error(e.getMessage(), e);
+ }
+ }
+
+ /**
+ * Adds ngram counts and counts of distinct types after ngrams, read from a file, to the Bloom
+ * filter.
+ * <p>
+ * The file format should look like this: ngram1 count types-after ngram2 count types-after ...
+ *
+ * @param bloomFilterSize the size of the Bloom filter, in bits
+ * @param filename path to the statistics file
+ */
+ private void populateBloomFilter(int bloomFilterSize, String filename) {
+ HashMap<String, Long> typesAfter = new HashMap<String, Long>();
+ try {
+ FileInputStream file_in = new FileInputStream(filename);
+ FileInputStream file_in_copy = new FileInputStream(filename);
+ InputStream in;
+ InputStream estimateStream;
+ if (filename.endsWith(".gz")) {
+ in = new GZIPInputStream(file_in);
+ estimateStream = new GZIPInputStream(file_in_copy);
+ } else {
+ in = file_in;
+ estimateStream = file_in_copy;
+ }
+ int numObjects = estimateNumberOfObjects(estimateStream);
+ LOG.debug("Estimated number of objects: {}", numObjects);
+ bf = new BloomFilter(bloomFilterSize, numObjects);
+ countFuncs = bf.initializeHashFunctions();
+ populateFromInputStream(in, typesAfter);
+ in.close();
+ } catch (IOException e) {
+ LOG.error(e.getMessage(), e);
+ return;
+ }
+ typesFuncs = bf.initializeHashFunctions();
+ for (String history : typesAfter.keySet()) {
+ String[] toks = Regex.spaces.split(history);
+ int[] hist = new int[toks.length];
+ for (int i = 0; i < toks.length; i++)
+ hist[i] = Vocabulary.id(toks[i]);
+ add(hist, typesAfter.get(history), typesFuncs);
+ }
+ return;
+ }
+
+ /**
+ * Estimate the number of objects that will be stored in the Bloom filter. The optimum number of
+ * hash functions depends on the number of items that will be stored, so we want a guess before we
+ * begin to read the statistics file and store it.
+ *
+ * @param source an InputStream pointing to the training corpus stats
+ *
+ * @return an estimate of the number of objects to be stored in the Bloom filter
+ */
+ private int estimateNumberOfObjects(InputStream source) {
+ int numLines = 0;
+ long maxCount = 0;
+ for (String line: new LineReader(source)) {
+ if (line.trim().equals("")) continue;
+ String[] toks = Regex.spaces.split(line);
+ if (toks.length > ngramOrder + 1) continue;
+ try {
+ long cnt = Long.parseLong(toks[toks.length - 1]);
+ if (cnt > maxCount) maxCount = cnt;
+ } catch (NumberFormatException e) {
+ LOG.error(e.getMessage(), e);
+ break;
+ }
+ numLines++;
+ }
+ double estimate = Math.log(maxCount) / Math.log(quantizationBase);
+ return (int) Math.round(numLines * estimate);
+ }
+
+ /**
+ * Reads the statistics from a source and stores them in the Bloom filter. The ngram counts are
+ * stored immediately in the Bloom filter, but the counts of distinct types following each ngram
+ * are accumulated from the file as we go.
+ *
+ * @param source an InputStream pointing to the statistics
+ * @param types a HashMap that will stores the accumulated counts of distinct types observed to
+ * follow each ngram
+ */
+ private void populateFromInputStream(InputStream source, HashMap<String, Long> types) {
+ numTokens = Double.NEGATIVE_INFINITY; // = log(0)
+ for (String line: new LineReader(source)) {
+ String[] toks = Regex.spaces.split(line);
+ if ((toks.length < 2) || (toks.length > ngramOrder + 1)) continue;
+ int[] ngram = new int[toks.length - 1];
+ StringBuilder history = new StringBuilder();
+ for (int i = 0; i < toks.length - 1; i++) {
+ ngram[i] = Vocabulary.id(toks[i]);
+ if (i < toks.length - 2) history.append(toks[i]).append(" ");
+ }
+
+ long cnt = Long.parseLong(toks[toks.length - 1]);
+ add(ngram, cnt, countFuncs);
+ if (toks.length == 2) { // unigram
+ numTokens = logAdd(numTokens, Math.log(cnt));
+ // no need to count types after ""
+ // that's what vocabulary.size() is for.
+ continue;
+ }
+ if (types.get(history) == null)
+ types.put(history.toString(), 1L);
+ else {
+ long x = (Long) types.get(history);
+ types.put(history.toString(), x + 1);
+ }
+ }
+ return;
+ }
+
+ /**
+ * Adds an ngram, along with an associated value, to the Bloom filter. This corresponds to Talbot
+ * and Osborne's "Tera-scale LMs on the cheap", algorithm 1.
+ *
+ * @param ngram an array representing the ngram
+ * @param value the value to be associated with the ngram
+ * @param funcs an array of long to be used as hash functions
+ */
+ private void add(int[] ngram, long value, long[][] funcs) {
+ if (ngram == null) return;
+ int qValue = quantize(value);
+ for (int i = 1; i <= qValue; i++) {
+ int hash = hashNgram(ngram, 0, ngram.length, i);
+ bf.add(hash, funcs);
+ }
+ }
+
+ /**
+ * Read a Bloom filter LM from an external file.
+ *
+ * @param in an ObjectInput stream to read from
+ */
+ public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
+ int vocabSize = in.readInt();
+ for (int i = 0; i < vocabSize; i++) {
+ String line = in.readUTF();
+ Vocabulary.id(line);
+ }
+ numTokens = in.readDouble();
+ countFuncs = new long[in.readInt()][2];
+ for (int i = 0; i < countFuncs.length; i++) {
+ countFuncs[i][0] = in.readLong();
+ countFuncs[i][1] = in.readLong();
+ }
+ typesFuncs = new long[in.readInt()][2];
+ for (int i = 0; i < typesFuncs.length; i++) {
+ typesFuncs[i][0] = in.readLong();
+ typesFuncs[i][1] = in.readLong();
+ }
+ quantizationBase = in.readDouble();
+ bf = new BloomFilter();
+ bf.readExternal(in);
+ }
+
+ /**
+ * Write a Bloom filter LM to some external location.
+ *
+ * @param out an ObjectOutput stream to write to
+ *
+ * @throws IOException if an input or output exception occurred
+ */
+ public void writeExternal(ObjectOutput out) throws IOException {
+ out.writeInt(Vocabulary.size());
+ for (int i = 0; i < Vocabulary.size(); i++) {
+ // out.writeBytes(vocabulary.getWord(i));
+ // out.writeChar('\n'); // newline
+ out.writeUTF(Vocabulary.word(i));
+ }
+ out.writeDouble(numTokens);
+ out.writeInt(countFuncs.length);
+ for (int i = 0; i < countFuncs.length; i++) {
+ out.writeLong(countFuncs[i][0]);
+ out.writeLong(countFuncs[i][1]);
+ }
+ out.writeInt(typesFuncs.length);
+ for (int i = 0; i < typesFuncs.length; i++) {
+ out.writeLong(typesFuncs[i][0]);
+ out.writeLong(typesFuncs[i][1]);
+ }
+ out.writeDouble(quantizationBase);
+ bf.writeExternal(out);
+ }
+
+ /**
+ * Returns the language model score for an n-gram. This is called from the rest of the Joshua
+ * decoder.
+ *
+ * @param ngram the ngram to score
+ * @param order the order of the model
+ *
+ * @return the language model score of the ngram
+ */
+ @Override
+ protected float ngramLogProbability_helper(int[] ngram, int order) {
+ int[] lm_ngram = new int[ngram.length];
+ for (int i = 0; i < ngram.length; i++) {
+ lm_ngram[i] = Vocabulary.id(Vocabulary.word(ngram[i]));
+ }
+ return wittenBell(lm_ngram, order);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java
index 501be62,0000000..df00255
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java
@@@ -1,160 -1,0 +1,158 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm;
+
+import java.io.IOException;
+import java.util.Iterator;
+
- import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.util.io.LineReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This is a base class for simple, ASCII line-based grammars that are stored on disk.
+ *
+ * @author Juri Ganitkevitch
+ *
+ */
+public abstract class GrammarReader<R extends Rule> implements Iterable<R>, Iterator<R> {
+
+ private static final Logger LOG = LoggerFactory.getLogger(GrammarReader.class);
+
+ protected static String fieldDelimiter;
-
+ protected static String description;
+
+ protected String fileName;
+ protected LineReader reader;
+ protected String lookAhead;
+ protected int numRulesRead;
+
+
+ // dummy constructor for
+ public GrammarReader() {
+ this.fileName = null;
+ }
+
+ public GrammarReader(String fileName) throws IOException {
+ this.fileName = fileName;
+ this.reader = new LineReader(fileName);
+ LOG.info("Reading grammar from file {}...", fileName);
+ numRulesRead = 0;
+ advanceReader();
+ }
+
+ // the reader is the iterator itself
+ public Iterator<R> iterator() {
+ return this;
+ }
+
+ /** Unsupported Iterator method. */
+ public void remove() throws UnsupportedOperationException {
+ throw new UnsupportedOperationException();
+ }
+
+ public void close() {
+ if (null != this.reader) {
+ try {
+ this.reader.close();
+ } catch (IOException e) {
+ LOG.warn(e.getMessage(), e);
+ LOG.error("Error closing grammar file stream: {}", this.fileName);
+ }
+ this.reader = null;
+ }
+ }
+
+ /**
+ * For correct behavior <code>close</code> must be called on every GrammarReader, however this
+ * code attempts to avoid resource leaks.
+ *
+ * @see org.apache.joshua.util.io.LineReader
+ */
+ @Override
+ protected void finalize() throws Throwable {
+ if (this.reader != null) {
+ LOG.error("Grammar file stream was not closed, this indicates a coding error: {}",
+ this.fileName);
+ }
+
+ this.close();
+ super.finalize();
+ }
+
+ @Override
+ public boolean hasNext() {
+ return lookAhead != null;
+ }
+
+ private void advanceReader() {
+ try {
+ lookAhead = reader.readLine();
+ numRulesRead++;
+ } catch (IOException e) {
+ LOG.error("Error reading grammar from file: {}", fileName);
+ LOG.error(e.getMessage(), e);
+ }
+ if (lookAhead == null && reader != null) {
+ this.close();
+ }
+ }
+
+ /**
+ * Read the next line, and print reader progress.
+ */
+ @Override
+ public R next() {
+ String line = lookAhead;
+
+ int oldProgress = reader.progress();
+ advanceReader();
+
+
+ if (Decoder.VERBOSE >= 1) {
+ int newProgress = (reader != null) ? reader.progress() : 100;
+
+ //TODO: review this code. It is better to print progress based on time gap (like for every 1s or 2sec) than %!
+ if (newProgress > oldProgress) {
+ for (int i = oldProgress + 1; i <= newProgress; i++)
+ if (i == 97) {
+ System.err.print("1");
+ } else if (i == 98) {
+ System.err.print("0");
+ } else if (i == 99) {
+ System.err.print("0");
+ } else if (i == 100) {
+ System.err.println("%");
+ } else if (i % 10 == 0) {
+ System.err.print(String.format("%d", i));
+ System.err.flush();
+ } else if ((i - 1) % 10 == 0)
+ ; // skip at 11 since 10, 20, etc take two digits
+ else {
+ System.err.print(".");
+ System.err.flush();
+ }
+ }
+ }
+ return parseLine(line);
+ }
+
+ protected abstract R parseLine(String line);
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java
index d80638d,0000000..45c8c33
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java
@@@ -1,107 -1,0 +1,106 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm.format;
+
+import java.io.IOException;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.ff.tm.GrammarReader;
+import org.apache.joshua.decoder.ff.tm.Rule;
++import org.apache.joshua.util.Constants;
+import org.apache.joshua.util.FormatUtils;
+
+/**
+ * This class implements reading files in the format defined by David Chiang for Hiero.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ */
+
+public class HieroFormatReader extends GrammarReader<Rule> {
+
+ static {
- fieldDelimiter = "\\s\\|{3}\\s";
+ description = "Original Hiero format";
+ }
+
+ public HieroFormatReader() {
+ super();
+ }
+
+ public HieroFormatReader(String grammarFile) throws IOException {
+ super(grammarFile);
+ }
+
+ @Override
+ public Rule parseLine(String line) {
- String[] fields = line.split(fieldDelimiter);
++ String[] fields = line.split(Constants.fieldDelimiter);
+ if (fields.length < 3) {
+ throw new RuntimeException(String.format("Rule '%s' does not have four fields", line));
+ }
+
+ int lhs = Vocabulary.id(fields[0]);
+
+ /**
+ * On the foreign side, we map nonterminals to negative IDs, and terminals to positive IDs.
+ */
+ int arity = 0;
+ String[] sourceWords = fields[1].split("\\s+");
+ int[] sourceIDs = new int[sourceWords.length];
+ for (int i = 0; i < sourceWords.length; i++) {
++ /* NOTE: This redundantly creates vocab items for terms like [X,1]. This might actually
++ * be necessary, so don't try to turn this into an if/else.
++ */
+ sourceIDs[i] = Vocabulary.id(sourceWords[i]);
+ if (FormatUtils.isNonterminal(sourceWords[i])) {
+ sourceIDs[i] = Vocabulary.id(FormatUtils.stripNonTerminalIndex(sourceWords[i]));
+ arity++;
+
+ // TODO: the arity here (after incrementing) should match the rule index. Should
+ // check that arity == FormatUtils.getNonterminalIndex(foreignWords[i]), throw runtime
+ // error if not
+ }
+ }
+
+ /**
+ * The English side maps terminal symbols to positive IDs. Nonterminal symbols are linked
+ * to the index of the source-side nonterminal they are linked to. So for a rule
+ *
+ * [X] ||| [X,1] [X,2] [X,3] ||| [X,2] [X,1] [X,3] ||| ...
+ *
+ * the English side nonterminals will be -2, -1, -3. This assumes that the source side of
+ * the rule is always listed monotonically.
+ */
+ String[] targetWords = fields[2].split("\\s+");
+ int[] targetIDs = new int[targetWords.length];
+ for (int i = 0; i < targetWords.length; i++) {
+ targetIDs[i] = Vocabulary.id(targetWords[i]);
+ if (FormatUtils.isNonterminal(targetWords[i])) {
+ targetIDs[i] = -FormatUtils.getNonterminalIndex(targetWords[i]);
+ }
+ }
+
+ String sparse_features = (fields.length > 3 ? fields[3] : "");
+ String alignment = (fields.length > 4) ? fields[4] : null;
+
+ return new Rule(lhs, sourceIDs, targetIDs, sparse_features, arity, alignment);
+ }
-
- public static String getFieldDelimiter() {
- return fieldDelimiter;
- }
-
++
+ public static boolean isNonTerminal(final String word) {
+ return FormatUtils.isNonterminal(word);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java
index 0252636,0000000..7811b3b
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/tm/format/MosesFormatReader.java
@@@ -1,106 -1,0 +1,108 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm.format;
+
+import java.io.IOException;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.util.io.LineReader;
++import org.apache.joshua.util.Constants;
++import org.apache.joshua.util.FormatUtils;
+
+/***
+ * This class reads in the Moses phrase table format, with support for the source and target side,
+ * list of features, and word alignments. It works by
+ *
+ * - casting the phrase-based rules to left-branching hierarchical rules and passing them on \
+ * to its parent class, {@link HieroFormatReader}.
+ * - converting the probabilities to -log probabilities
+ *
+ * There is also a tool to convert the grammars directly, so that they can be suitably packed. Usage:
+ *
+ * <pre>
+ * cat PHRASE_TABLE | java -cp $JOSHUA/target/classes org.apache.joshua.decoder.ff.tm.format.MosesFormatReader
+ * </pre>
+ *
+ * @author Matt Post post@cs.jhu.edu
+ *
+ */
+
+public class MosesFormatReader extends HieroFormatReader {
+
+ public MosesFormatReader(String grammarFile) throws IOException {
+ super(grammarFile);
- Vocabulary.id("[X]");
++ Vocabulary.id(Constants.defaultNT);
+ }
+
+ public MosesFormatReader() {
+ super();
- Vocabulary.id("[X]");
++ Vocabulary.id(Constants.defaultNT);
+ }
+
+ /**
+ * When dealing with Moses format, this munges a Moses-style phrase table into a grammar.
+ *
+ * mots francaises ||| French words ||| 1 2 3 ||| 0-1 1-0
+ *
+ * becomes
+ *
+ * [X] ||| [X,1] mots francaises ||| [X,1] French words ||| 1 2 3 ||| 0-1 1-0
+ *
+ * For thrax-extracted phrasal grammars, it transforms
+ *
+ * [X] ||| mots francaises ||| French words ||| 1 2 3 ||| 0-1 1-0
+ *
+ * into
+ *
+ * [X] ||| [X,1] mots francaises ||| [X,1] French words ||| 1 2 3 ||| 0-1 1-0
+ */
+ @Override
+ public Rule parseLine(String line) {
- String[] fields = line.split(fieldDelimiter);
++ String[] fields = line.split(Constants.fieldDelimiter);
+
- StringBuffer hieroLine = new StringBuffer();
- hieroLine.append("[X] ||| [X,1] " + fields[0] + " ||| [X,1] " + fields[1] + " |||");
++ String nt = FormatUtils.cleanNonTerminal(Constants.defaultNT);
++ StringBuffer hieroLine = new StringBuffer(Constants.defaultNT + " ||| [" + nt + ",1] " + fields[0] + " ||| [" + nt + ",1] " + fields[1] + " |||");
+
+ String mosesFeatureString = fields[2];
+ for (String value: mosesFeatureString.split(" ")) {
+ float f = Float.parseFloat(value);
+ hieroLine.append(String.format(" %f", f <= 0.0 ? -100 : -Math.log(f)));
+ }
+
+ // alignments
+ if (fields.length >= 4)
+ hieroLine.append(" ||| " + fields[3]);
+
+ return super.parseLine(hieroLine.toString());
+ }
+
+ /**
+ * Converts a Moses phrase table to a Joshua grammar.
+ *
+ * @param args the command-line arguments
+ */
+ public static void main(String[] args) {
+ MosesFormatReader reader = new MosesFormatReader();
+ for (String line: new LineReader(System.in)) {
+ Rule rule = reader.parseLine(line);
+ System.out.println(rule.textFormat());
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
index 3a37e08,0000000..2bfa8c1
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
@@@ -1,317 -1,0 +1,316 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm.hash_based;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.joshua.corpus.Vocabulary;
- import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.JoshuaConfiguration.OOVItem;
+import org.apache.joshua.decoder.ff.FeatureFunction;
+import org.apache.joshua.decoder.ff.tm.AbstractGrammar;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.ff.tm.GrammarReader;
+import org.apache.joshua.decoder.ff.tm.Trie;
+import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader;
+import org.apache.joshua.decoder.ff.tm.format.MosesFormatReader;
+import org.apache.joshua.util.FormatUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class implements a memory-based bilingual BatchGrammar.
+ * <p>
+ * The rules are stored in a trie. Each trie node has: (1) RuleBin: a list of rules matching the
+ * french sides so far (2) A HashMap of next-layer trie nodes, the next french word used as the key
+ * in HashMap
+ *
+ * @author Zhifei Li zhifei.work@gmail.com
+ * @author Matt Post post@cs.jhu.edu
+ */
+public class MemoryBasedBatchGrammar extends AbstractGrammar {
+
+ private static final Logger LOG = LoggerFactory.getLogger(MemoryBasedBatchGrammar.class);
+
+ // ===============================================================
+ // Instance Fields
+ // ===============================================================
+
+ /* The number of rules read. */
+ private int qtyRulesRead = 0;
+
+ /* The number of distinct source sides. */
+ private int qtyRuleBins = 0;
+
+ private int numDenseFeatures = 0;
+
+ /* The trie root. */
+ private MemoryBasedTrie root = null;
+
+ /* The file containing the grammar. */
+ private String grammarFile;
+
+ private GrammarReader<Rule> modelReader;
+
+ /* Whether the grammar's rules contain regular expressions. */
+ private boolean isRegexpGrammar = false;
+
+ // ===============================================================
+ // Static Fields
+ // ===============================================================
+
+ // ===============================================================
+ // Constructors
+ // ===============================================================
+
+ public MemoryBasedBatchGrammar(JoshuaConfiguration joshuaConfiguration) {
+ super(joshuaConfiguration);
+ this.root = new MemoryBasedTrie();
+ this.joshuaConfiguration = joshuaConfiguration;
+ }
+
+ public MemoryBasedBatchGrammar(String owner, JoshuaConfiguration joshuaConfiguration) {
+ this(joshuaConfiguration);
+ this.owner = Vocabulary.id(owner);
+ }
+
+ public MemoryBasedBatchGrammar(GrammarReader<Rule> gr, JoshuaConfiguration joshuaConfiguration) {
+ // this.defaultOwner = Vocabulary.id(defaultOwner);
+ // this.defaultLHS = Vocabulary.id(defaultLHSSymbol);
+ this(joshuaConfiguration);
+ modelReader = gr;
+ }
+
+ public MemoryBasedBatchGrammar(String formatKeyword, String grammarFile, String owner,
+ String defaultLHSSymbol, int spanLimit, JoshuaConfiguration joshuaConfiguration)
+ throws IOException {
+
+ this(joshuaConfiguration);
+ this.owner = Vocabulary.id(owner);
+ Vocabulary.id(defaultLHSSymbol);
+ this.spanLimit = spanLimit;
+ this.grammarFile = grammarFile;
+ this.setRegexpGrammar(formatKeyword.equals("regexp"));
+
+ // ==== loading grammar
+ this.modelReader = createReader(formatKeyword, grammarFile);
+ if (modelReader != null) {
+ for (Rule rule : modelReader)
+ if (rule != null) {
+ addRule(rule);
+ }
+ } else {
+ LOG.info("Couldn't create a GrammarReader for file {} with format {}",
+ grammarFile, formatKeyword);
+ }
+
+ this.printGrammar();
+ }
+
+ protected GrammarReader<Rule> createReader(String format, String grammarFile) throws IOException {
+
+ if (grammarFile != null) {
+ if ("hiero".equals(format) || "thrax".equals(format) || "regexp".equals(format)) {
+ return new HieroFormatReader(grammarFile);
+ } else if ("moses".equals(format)) {
+ return new MosesFormatReader(grammarFile);
+ } else {
+ throw new RuntimeException(String.format("* FATAL: unknown grammar format '%s'", format));
+ }
+ }
+ return null;
+ }
+
+ // ===============================================================
+ // Methods
+ // ===============================================================
+
+ public void setSpanLimit(int spanLimit) {
+ this.spanLimit = spanLimit;
+ }
+
+ @Override
+ public int getNumRules() {
+ return this.qtyRulesRead;
+ }
+
+ @Override
+ public Rule constructManualRule(int lhs, int[] sourceWords, int[] targetWords,
+ float[] denseScores, int arity) {
+ return null;
+ }
+
+ /**
+ * if the span covered by the chart bin is greater than the limit, then return false
+ */
+ public boolean hasRuleForSpan(int i, int j, int pathLength) {
+ if (this.spanLimit == -1) { // mono-glue grammar
+ return (i == 0);
+ } else {
+ // System.err.println(String.format("%s HASRULEFORSPAN(%d,%d,%d)/%d = %s",
+ // Vocabulary.word(this.owner), i, j, pathLength, spanLimit, pathLength <= this.spanLimit));
+ return (pathLength <= this.spanLimit);
+ }
+ }
+
+ public Trie getTrieRoot() {
+ return this.root;
+ }
+
+ /**
+ * Adds a rule to the grammar.
+ */
+ public void addRule(Rule rule) {
+
+ // TODO: Why two increments?
+ this.qtyRulesRead++;
+
+ // if (owner == -1) {
+ // System.err.println("* FATAL: MemoryBasedBatchGrammar::addRule(): owner not set for grammar");
+ // System.exit(1);
+ // }
+ rule.setOwner(owner);
+
+ if (numDenseFeatures == 0)
+ numDenseFeatures = rule.getFeatureVector().getDenseFeatures().size();
+
+ // === identify the position, and insert the trie nodes as necessary
+ MemoryBasedTrie pos = root;
+ int[] french = rule.getFrench();
+
+ maxSourcePhraseLength = Math.max(maxSourcePhraseLength, french.length);
+
+ for (int k = 0; k < french.length; k++) {
+ int curSymID = french[k];
+
+ /*
+ * Note that the nonTerminal symbol in the french is not cleaned (i.e., will be sth like
+ * [X,1]), but the symbol in the Trie has to be cleaned, so that the match does not care about
+ * the markup (i.e., [X,1] or [X,2] means the same thing, that is X) if
+ * (Vocabulary.nt(french[k])) { curSymID = modelReader.cleanNonTerminal(french[k]); if
+ * (logger.isLoggable(Level.FINEST)) logger.finest("Amended to: " + curSymID); }
+ */
+
+ MemoryBasedTrie nextLayer = (MemoryBasedTrie) pos.match(curSymID);
+ if (null == nextLayer) {
+ nextLayer = new MemoryBasedTrie();
+ if (pos.hasExtensions() == false) {
+ pos.childrenTbl = new HashMap<Integer, MemoryBasedTrie>();
+ }
+ pos.childrenTbl.put(curSymID, nextLayer);
+ }
+ pos = nextLayer;
+ }
+
+ // === add the rule into the trie node
+ if (!pos.hasRules()) {
+ pos.ruleBin = new MemoryBasedRuleBin(rule.getArity(), rule.getFrench());
+ this.qtyRuleBins++;
+ }
+ pos.ruleBin.addRule(rule);
+ }
+
+ protected void printGrammar() {
+ LOG.info("MemoryBasedBatchGrammar: Read {} rules with {} distinct source sides from '{}'",
+ this.qtyRulesRead, this.qtyRuleBins, grammarFile);
+ }
+
+ /**
+ * This returns true if the grammar contains rules that are regular expressions, possibly matching
+ * many different inputs.
+ *
+ * @return true if the grammar's rules may contain regular expressions.
+ */
+ @Override
+ public boolean isRegexpGrammar() {
+ return this.isRegexpGrammar;
+ }
+
+ public void setRegexpGrammar(boolean value) {
+ this.isRegexpGrammar = value;
+ }
+
+ /***
+ * Takes an input word and creates an OOV rule in the current grammar for that word.
+ *
+ * @param sourceWord integer representation of word
+ * @param featureFunctions {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+ */
+ @Override
+ public void addOOVRules(int sourceWord, List<FeatureFunction> featureFunctions) {
+
+ // TODO: _OOV shouldn't be outright added, since the word might not be OOV for the LM (but now
+ // almost
+ // certainly is)
+ final int targetWord = this.joshuaConfiguration.mark_oovs ? Vocabulary.id(Vocabulary
+ .word(sourceWord) + "_OOV") : sourceWord;
+
+ int[] sourceWords = { sourceWord };
+ int[] targetWords = { targetWord };
+ final String oovAlignment = "0-0";
+
+ if (this.joshuaConfiguration.oovList != null && this.joshuaConfiguration.oovList.size() != 0) {
+ for (OOVItem item : this.joshuaConfiguration.oovList) {
+ Rule oovRule = new Rule(Vocabulary.id(item.label), sourceWords, targetWords, "", 0,
+ oovAlignment);
+ addRule(oovRule);
+ oovRule.estimateRuleCost(featureFunctions);
+ }
+ } else {
+ int nt_i = Vocabulary.id(this.joshuaConfiguration.default_non_terminal);
+ Rule oovRule = new Rule(nt_i, sourceWords, targetWords, "", 0, oovAlignment);
+ addRule(oovRule);
+ oovRule.estimateRuleCost(featureFunctions);
+ }
+ }
+
+ /**
+ * Adds a default set of glue rules.
+ *
+ * @param featureFunctions an {@link java.util.ArrayList} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+ */
+ public void addGlueRules(ArrayList<FeatureFunction> featureFunctions) {
+ HieroFormatReader reader = new HieroFormatReader();
+
+ String goalNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.goal_symbol);
+ String defaultNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.default_non_terminal);
+
+ String[] ruleStrings = new String[] {
+ String.format("[%s] ||| %s ||| %s ||| 0", goalNT, Vocabulary.START_SYM,
+ Vocabulary.START_SYM),
+ String.format("[%s] ||| [%s,1] [%s,2] ||| [%s,1] [%s,2] ||| -1", goalNT, goalNT, defaultNT,
+ goalNT, defaultNT),
+ String.format("[%s] ||| [%s,1] %s ||| [%s,1] %s ||| 0", goalNT, goalNT,
+ Vocabulary.STOP_SYM, goalNT, Vocabulary.STOP_SYM) };
+
+ for (String ruleString : ruleStrings) {
+ Rule rule = reader.parseLine(ruleString);
+ addRule(rule);
+ rule.estimateRuleCost(featureFunctions);
+ }
+ }
+
+ @Override
+ public int getNumDenseFeatures() {
+ return numDenseFeatures;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/java/org/apache/joshua/decoder/hypergraph/AlignedSourceTokens.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/hypergraph/AlignedSourceTokens.java
index 948001f,0000000..864b383
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/hypergraph/AlignedSourceTokens.java
+++ b/src/main/java/org/apache/joshua/decoder/hypergraph/AlignedSourceTokens.java
@@@ -1,115 -1,0 +1,112 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.hypergraph;
+
+import java.util.LinkedList;
+import java.util.ListIterator;
+
+/**
+ * Class that represents a one to (possibly) many alignment from target to
+ * source. Extends from a LinkedList. Instances of this class are updated by the
+ * WordAlignmentExtractor.substitute() method.
+ * The {@link org.apache.joshua.decoder.hypergraph.AlignedSourceTokens#shiftBy(int, int)}
+ * method shifts the
+ * elements in the list by a scalar to reflect substitutions of non terminals in
+ * the rule. if indexes are final, i.e. the point instance has been substituted
+ * into a parent WordAlignmentState once,
+ * {@link org.apache.joshua.decoder.hypergraph.AlignedSourceTokens#isFinal} is set to true.
+ * This is
+ * necessary since the final source index of a point is known once we have
+ * substituted in a complete WordAlignmentState into its parent. If the index in
+ * the list is a non terminal, {@link org.apache.joshua.decoder.hypergraph.AlignedSourceTokens#isNonTerminal} = true
+ */
+class AlignedSourceTokens extends LinkedList<Integer> {
+
+ private static final long serialVersionUID = 1L;
+ /** whether this Point refers to a non terminal in source&target */
+ private boolean isNonTerminal = false;
+ /** whether this instance does not need to be updated anymore */
+ private boolean isFinal = false;
+ /** whether the word this Point corresponds to has no alignment in source */
+ private boolean isNull = false;
+
- AlignedSourceTokens() {
- }
++ AlignedSourceTokens() {}
+
+ void setFinal() {
+ isFinal = true;
+ }
+
+ void setNonTerminal() {
+ isNonTerminal = true;
+ }
+
+ void setNull() {
+ isNull = true;
+ }
+
+ @Override
+ /**
+ * returns true if element was added.
+ */
+ public boolean add(Integer x) {
- if (isNull || isNonTerminal)
- return false;
- return super.add(x);
++ return isNull ? false : super.add(x);
+ }
+
+ public boolean isNonTerminal() {
+ return isNonTerminal;
+ }
+
+ public boolean isFinal() {
+ return isFinal;
+ }
+
+ public boolean isNull() {
+ return isNull;
+ }
+
+ /**
+ * shifts each item in the LinkedList by <shift>.
+ * Only applies to items larger than <start>
+ */
+ void shiftBy(int start, int shift) {
+ if (!isFinal && !isNull) {
- ListIterator<Integer> it = this.listIterator();
++ final ListIterator<Integer> it = this.listIterator();
+ while (it.hasNext()) {
- int x = it.next();
++ final int x = it.next();
+ if (x > start) {
+ it.set(x + shift);
+ }
+ }
+ }
+ }
+
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ if (isFinal)
+ sb.append("f");
+ if (isNull) {
+ sb.append("[NULL]");
+ } else {
+ sb.append(super.toString());
+ }
+ if (isNonTerminal)
+ sb.append("^");
+ return sb.toString();
+ }
+}