You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/11/17 18:25:10 UTC
[5/8] incubator-joshua git commit: cleanup in log messages, typos
cleanup in log messages, typos
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/81baa658
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/81baa658
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/81baa658
Branch: refs/heads/save_custom_grammars
Commit: 81baa658207f53623f2b07a5ae7d6a6e6ef0bbd8
Parents: 561799a
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed Nov 16 06:41:03 2016 -0500
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Thu Nov 17 13:24:07 2016 -0500
----------------------------------------------------------------------
.../joshua/decoder/chart_parser/Chart.java | 4 +-
.../decoder/ff/tm/SentenceFilteredGrammar.java | 366 -------------------
.../tm/hash_based/MemoryBasedBatchGrammar.java | 10 +-
.../joshua/decoder/segment_file/Token.java | 2 +-
src/test/resources/decoder/n-ary/joshua.config | 2 -
5 files changed, 6 insertions(+), 378 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/81baa658/src/main/java/org/apache/joshua/decoder/chart_parser/Chart.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/chart_parser/Chart.java b/src/main/java/org/apache/joshua/decoder/chart_parser/Chart.java
index bd91a6f..883e20d 100644
--- a/src/main/java/org/apache/joshua/decoder/chart_parser/Chart.java
+++ b/src/main/java/org/apache/joshua/decoder/chart_parser/Chart.java
@@ -424,7 +424,7 @@ public class Chart {
if (null == this.cells.get(0, sourceLength)
|| !this.goalBin.transitToGoal(this.cells.get(0, sourceLength), this.featureFunctions,
this.sourceLength)) {
- LOG.warn("Input {}: Parse failure (either no derivations exist or pruning is too aggressive",
+ LOG.warn("Input {}: Parse failure (either no derivations exist, or pruning is too aggressive)",
sentence.id());
return null;
}
@@ -621,7 +621,7 @@ public class Chart {
if (null == this.cells.get(0, sourceLength)
|| !this.goalBin.transitToGoal(this.cells.get(0, sourceLength), this.featureFunctions,
this.sourceLength)) {
- LOG.warn("Input {}: Parse failure (either no derivations exist or pruning is too aggressive",
+ LOG.warn("Input {}: Parse failure (either no derivations exist, or pruning is too aggressive)",
sentence.id());
return null;
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/81baa658/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java b/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
deleted file mode 100644
index 4f545b7..0000000
--- a/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
+++ /dev/null
@@ -1,366 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.joshua.decoder.ff.tm;
-
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map.Entry;
-
-import org.apache.joshua.decoder.ff.tm.hash_based.ExtensionIterator;
-import org.apache.joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
-import org.apache.joshua.decoder.segment_file.Sentence;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * This class implements dynamic sentence-level filtering. This is accomplished with a parallel
- * trie, a subset of the original trie, that only contains trie paths that are reachable from
- * traversals of the current sentence.
- *
- * @author Matt Post post@cs.jhu.edu
- */
-public class SentenceFilteredGrammar extends MemoryBasedBatchGrammar {
-
- private static final Logger LOG = LoggerFactory.getLogger(SentenceFilteredGrammar.class);
-
- private final AbstractGrammar baseGrammar;
- private final SentenceFilteredTrie filteredTrie;
- private final int[] tokens;
- private final Sentence sentence;
-
- /**
- * Construct a new sentence-filtered grammar. The main work is done in the enclosed trie (obtained
- * from the base grammar, which contains the complete grammar).
- *
- * @param baseGrammar a new {@link org.apache.joshua.decoder.ff.tm.AbstractGrammar} to populate
- * @param sentence {@link org.apache.joshua.lattice.Lattice} input
- */
- SentenceFilteredGrammar(AbstractGrammar baseGrammar, Sentence sentence) {
- super(OwnerMap.getOwner(baseGrammar.getOwner()), baseGrammar.joshuaConfiguration, baseGrammar.getSpanLimit());
- this.baseGrammar = baseGrammar;
- this.sentence = sentence;
- this.tokens = sentence.getWordIDs();
-
- int origCount = getNumRules(baseGrammar.getTrieRoot());
- long startTime = System.currentTimeMillis();
-
- /* Filter the rules; returns non-null object */
- this.filteredTrie = filter(baseGrammar.getTrieRoot());
- int filteredCount = getNumRules();
-
- float seconds = (System.currentTimeMillis() - startTime) / 1000.0f;
-
- LOG.debug("Sentence-level filtering of sentence {} ({} -> {} rules) in {} seconds",
- sentence.id(), origCount, filteredCount, seconds);
- }
-
- @Override
- public Trie getTrieRoot() {
- return filteredTrie;
- }
-
- /**
- * This function is poorly named: it doesn't mean whether a rule exists in the grammar for the
- * current span, but whether the grammar is permitted to apply rules to the current span (a
- * grammar-level parameter). As such we can just chain to the underlying grammar.
- */
- @Override
- public boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength) {
- return baseGrammar.hasRuleForSpan(startIndex, endIndex, pathLength);
- }
-
- @Override
- public int getNumRules() {
- return getNumRules(getTrieRoot());
- }
-
- /**
- * A convenience function that counts the number of rules in a grammar's trie.
- *
- * @param node the {@link org.apache.joshua.decoder.ff.tm.Trie} implementation for which to count rules
- * @return the number of rules
- */
- public int getNumRules(Trie node) {
- int numRules = 0;
- if (node != null) {
- if (node.getRuleCollection() != null)
- numRules += node.getRuleCollection().getRules().size();
-
- if (node.getExtensions() != null)
- for (Trie child : node.getExtensions())
- numRules += getNumRules(child);
- }
-
- return numRules;
- }
-
- /**
- * What is the algorithm?
- *
- * Take the first word of the sentence, and start at the root of the trie. There are two things to
- * consider: (a) word matches and (b) nonterminal matches.
- *
- * For a word match, simply follow that arc along the trie. We create a parallel arc in our
- * filtered grammar to represent it. Each arc in the filtered trie knows about its
- * corresponding/underlying node in the unfiltered grammar trie.
- *
- * A nonterminal is always permitted to match. The question then is how much of the input sentence
- * we imagine it consumed. The answer is that it could have been any amount. So the recursive call
- * has to be a set of calls, one each to the next trie node with different lengths of the sentence
- * remaining.
- *
- * A problem occurs when we have multiple sequential nonterminals. For scope-3 grammars, there can
- * be four sequential nonterminals (in the case when they are grounded by terminals on both ends
- * of the nonterminal chain). We'd like to avoid looking at all possible ways to split up the
- * subsequence, because with respect to filtering rules, they are all the same.
- *
- * We accomplish this with the following restriction: for purposes of grammar filtering, only the
- * first in a sequence of nonterminal traversals can consume more than one word. Each of the
- * subsequent ones would have to consume just one word. We then just have to record in the
- * recursive call whether the last traversal was a nonterminal or not.
- *
- * @param unfilteredTrieRoot todo
- * @return the root of the filtered trie
- */
- private SentenceFilteredTrie filter(Trie unfilteredTrieRoot) {
- SentenceFilteredTrie filteredTrieRoot = new SentenceFilteredTrie(unfilteredTrieRoot);
-
- // System.err.println(String.format("FILTERING TO SENTENCE\n %s\n",
- // Vocabulary.getWords(tokens)));
-
- /*
- * The root of the trie is where rule applications start, so we simply try all possible
- * positions in the sentence.
- */
- for (int i = 0; i < tokens.length; i++) {
- filter(i, filteredTrieRoot, false);
- }
-
- return filteredTrieRoot;
- }
-
- /**
- * Matches rules against the sentence. Intelligently handles chains of sequential nonterminals.
- * Marks arcs that are traversable for this sentence.
- *
- * @param i the position in the sentence to start matching
- * @param trie the trie node to match against
- * @param lastWasNT true if the match that brought us here was against a nonterminal
- */
- private void filter(int i, SentenceFilteredTrie trieNode, boolean lastWasNT) {
- if (i >= tokens.length)
- return;
-
- /* Make sure the underlying unfiltered node has children. */
- Trie unfilteredTrieNode = trieNode.unfilteredTrieNode;
- if (unfilteredTrieNode.getChildren() == null) {
- // trieNode.path.retreat();
- return;
- }
-
- /* Match a word */
- Trie trie = unfilteredTrieNode.match(tokens[i]);
- if (trie != null) {
- /*
- * The current filtered node might already have an arc for this label. If so, retrieve it
- * (since we still need to follow it); if not, create it.
- */
- SentenceFilteredTrie nextFilteredTrie = trieNode.match(tokens[i]);
- if (nextFilteredTrie == null) {
- nextFilteredTrie = new SentenceFilteredTrie(trie);
- trieNode.children.put(tokens[i], nextFilteredTrie);
- }
-
- /*
- * Now continue, trying to match the child node against the next position in the sentence. The
- * third argument records that this match was not against a nonterminal.
- */
- filter(i + 1, nextFilteredTrie, false);
- }
-
- /*
- * Now we attempt to match nonterminals. Any nonterminal is permitted to match any region of the
- * sentence, up to the maximum span for that grammar. So we enumerate all children of the
- * current (unfiltered) trie grammar node, looking for nonterminals (items whose label value is
- * less than 0), then recurse.
- *
- * There is one subtlely. Adjacent nonterminals in a grammar rule can match a span (i, j) in (j
- * - i - 1) ways, but for purposes of determining whether a rule fits, this is all wasted
- * effort. To handle this, we allow the first nonterminal in a sequence to record 1, 2, 3, ...
- * terminals (up to the grammar's span limit, or the rest of the sentence, whichever is
- * shorter). Subsequent adjacent nonterminals are permitted to consume only a single terminal.
- */
- HashMap<Integer, ? extends Trie> children = unfilteredTrieNode.getChildren();
- if (children != null) {
- for (int label : children.keySet()) {
- if (label < 0) {
- SentenceFilteredTrie nextFilteredTrie = trieNode.match(label);
- if (nextFilteredTrie == null) {
- nextFilteredTrie = new SentenceFilteredTrie(unfilteredTrieNode.match(label));
- trieNode.children.put(label, nextFilteredTrie);
- }
-
- /*
- * Recurse. If the last match was a nonterminal, we can only consume one more token.
- *
- * TODO: This goes too far by looking at the whole sentence; each grammar has a maximum
- * span limit which should be consulted. What we should be doing is passing the point
- * where we started matching the current sentence, so we can apply this span limit, which
- * is easily accessible (baseGrammar.spanLimit).
- */
- int maxJ = lastWasNT ? (i + 1) : tokens.length;
- for (int j = i + 1; j <= maxJ; j++) {
- filter(j, nextFilteredTrie, true);
- }
- }
- }
- }
- }
-
- /**
- * Alternate filter that uses regular expressions, walking the grammar trie and matching the
- * source side of each rule collection against the input sentence. Failed matches are discarded,
- * and trie nodes extending from that position need not be explored.
- *
- * @param unfilteredTrie todo
- * @return the root of the filtered trie if any rules were retained, otherwise null
- */
- @SuppressWarnings("unused")
- private SentenceFilteredTrie filter_regexp(Trie unfilteredTrie) {
- SentenceFilteredTrie trie = null;
-
- /* Case 1: keep the trie node if it has a rule collection that matches the sentence */
- if (unfilteredTrie.hasRules())
- if (matchesSentence(unfilteredTrie))
- trie = new SentenceFilteredTrie(unfilteredTrie);
- else
- return null;
-
- /* Case 2: keep the trie node if it has children who have valid rule collections */
- if (unfilteredTrie.hasExtensions())
- for (Entry<Integer, ? extends Trie> arc : unfilteredTrie.getChildren().entrySet()) {
- Trie unfilteredChildTrie = arc.getValue();
- SentenceFilteredTrie nextTrie = filter_regexp(unfilteredChildTrie);
- if (nextTrie != null) {
- if (trie == null)
- trie = new SentenceFilteredTrie(unfilteredTrie);
- trie.children.put(arc.getKey(), nextTrie);
- }
- }
-
- return trie;
- }
-
- private boolean matchesSentence(Trie childTrie) {
- Rule rule = childTrie.getRuleCollection().getRules().get(0);
- return rule.matches(sentence);
- }
-
- /**
- * Implements a filtered trie, by sitting on top of a base trie and annotating nodes that match
- * the given input sentence.
- *
- * @author Matt Post post@cs.jhu.edu
- *
- */
- public class SentenceFilteredTrie implements Trie {
-
- /* The underlying unfiltered trie node. */
- private final Trie unfilteredTrieNode;
-
- /* The child nodes in the filtered trie. */
- private HashMap<Integer, SentenceFilteredTrie> children = null;
-
- /**
- * Constructor.
- *
- * @param unfilteredTrieNode todo
- */
- public SentenceFilteredTrie(Trie unfilteredTrieNode) {
- this.unfilteredTrieNode = unfilteredTrieNode;
- this.children = new HashMap<>();
- }
-
- @Override
- public SentenceFilteredTrie match(int wordID) {
- if (children != null)
- return children.get(wordID);
- return null;
- }
-
- @Override
- public boolean hasExtensions() {
- return children != null;
- }
-
- @Override
- public Collection<SentenceFilteredTrie> getExtensions() {
- if (children != null)
- return children.values();
-
- return null;
- }
-
- @Override
- public HashMap<Integer, SentenceFilteredTrie> getChildren() {
- return children;
- }
-
- @Override
- public boolean hasRules() {
- // Chain to the underlying unfiltered node.
- return unfilteredTrieNode.hasRules();
- }
-
- @Override
- public RuleCollection getRuleCollection() {
- // Chain to the underlying unfiltered node, since the rule collection just varies by target
- // side.
- return unfilteredTrieNode.getRuleCollection();
- }
-
- /**
- * Counts the number of rules.
- *
- * @return the number of rules rooted at this node.
- */
- public int getNumRules() {
- int numRules = 0;
- if (getTrieRoot() != null)
- if (getTrieRoot().getRuleCollection() != null)
- numRules += getTrieRoot().getRuleCollection().getRules().size();
-
- for (SentenceFilteredTrie node : getExtensions())
- numRules += node.getNumRules();
-
- return numRules;
- }
-
- @Override
- public Iterator<Integer> getTerminalExtensionIterator() {
- return new ExtensionIterator(children, true);
- }
-
- @Override
- public Iterator<Integer> getNonterminalExtensionIterator() {
- return new ExtensionIterator(children, false);
- }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/81baa658/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java b/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
index cd2d3af..ebfa996 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
@@ -109,8 +109,9 @@ public class MemoryBasedBatchGrammar extends AbstractGrammar {
LOG.info("Couldn't create a GrammarReader for file {} with format {}",
grammarFile, formatKeyword);
}
-
- this.printGrammar();
+
+ LOG.info("MemoryBasedBatchGrammar: Read {} rules with {} distinct source sides from '{}'",
+ this.qtyRulesRead, this.qtyRuleBins, grammarFile);
}
protected GrammarReader<Rule> createReader(String format, String grammarFile) throws IOException {
@@ -199,11 +200,6 @@ public class MemoryBasedBatchGrammar extends AbstractGrammar {
pos.ruleBin.addRule(rule);
}
- protected void printGrammar() {
- LOG.info("MemoryBasedBatchGrammar: Read {} rules with {} distinct source sides from '{}'",
- this.qtyRulesRead, this.qtyRuleBins, grammarFile);
- }
-
/***
* Takes an input word and creates an OOV rule in the current grammar for that word.
*
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/81baa658/src/main/java/org/apache/joshua/decoder/segment_file/Token.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/segment_file/Token.java b/src/main/java/org/apache/joshua/decoder/segment_file/Token.java
index 4cbc7fa..11ba88f 100644
--- a/src/main/java/org/apache/joshua/decoder/segment_file/Token.java
+++ b/src/main/java/org/apache/joshua/decoder/segment_file/Token.java
@@ -113,7 +113,7 @@ public class Token {
else
annotations.put("lettercase", "lower");
- LOG.info("TOKEN: {} -> {} ({})", token, token.toLowerCase(), annotations.get("lettercase"));
+ LOG.debug("TOKEN: {} -> {} ({})", token, token.toLowerCase(), annotations.get("lettercase"));
token = token.toLowerCase();
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/81baa658/src/test/resources/decoder/n-ary/joshua.config
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/n-ary/joshua.config b/src/test/resources/decoder/n-ary/joshua.config
index 1710c50..afc99bf 100644
--- a/src/test/resources/decoder/n-ary/joshua.config
+++ b/src/test/resources/decoder/n-ary/joshua.config
@@ -13,8 +13,6 @@ pop-limit = 100
#nbest config
use_unique_nbest = true
-use_tree_nbest = false
-add_combined_cost = true
top_n = 1
output-format = %c %s