You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/11/17 18:25:10 UTC
[5/8] incubator-joshua git commit: cleanup in log messages, typos

cleanup in log messages, typos


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/81baa658
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/81baa658
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/81baa658

Branch: refs/heads/save_custom_grammars
Commit: 81baa658207f53623f2b07a5ae7d6a6e6ef0bbd8
Parents: 561799a
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed Nov 16 06:41:03 2016 -0500
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Thu Nov 17 13:24:07 2016 -0500

----------------------------------------------------------------------
 .../joshua/decoder/chart_parser/Chart.java      |   4 +-
 .../decoder/ff/tm/SentenceFilteredGrammar.java  | 366 -------------------
 .../tm/hash_based/MemoryBasedBatchGrammar.java  |  10 +-
 .../joshua/decoder/segment_file/Token.java      |   2 +-
 src/test/resources/decoder/n-ary/joshua.config  |   2 -
 5 files changed, 6 insertions(+), 378 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/81baa658/src/main/java/org/apache/joshua/decoder/chart_parser/Chart.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/chart_parser/Chart.java b/src/main/java/org/apache/joshua/decoder/chart_parser/Chart.java
index bd91a6f..883e20d 100644
--- a/src/main/java/org/apache/joshua/decoder/chart_parser/Chart.java
+++ b/src/main/java/org/apache/joshua/decoder/chart_parser/Chart.java
@@ -424,7 +424,7 @@ public class Chart {
     if (null == this.cells.get(0, sourceLength)
         || !this.goalBin.transitToGoal(this.cells.get(0, sourceLength), this.featureFunctions,
             this.sourceLength)) {
-      LOG.warn("Input {}: Parse failure (either no derivations exist or pruning is too aggressive",
+      LOG.warn("Input {}: Parse failure (either no derivations exist, or pruning is too aggressive)",
           sentence.id());
       return null;
     }
@@ -621,7 +621,7 @@ public class Chart {
     if (null == this.cells.get(0, sourceLength)
         || !this.goalBin.transitToGoal(this.cells.get(0, sourceLength), this.featureFunctions,
             this.sourceLength)) {
-      LOG.warn("Input {}: Parse failure (either no derivations exist or pruning is too aggressive",
+      LOG.warn("Input {}: Parse failure (either no derivations exist, or pruning is too aggressive)",
           sentence.id());
       return null;
     }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/81baa658/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java b/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
deleted file mode 100644
index 4f545b7..0000000
--- a/src/main/java/org/apache/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
+++ /dev/null
@@ -1,366 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.joshua.decoder.ff.tm;
-
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map.Entry;
-
-import org.apache.joshua.decoder.ff.tm.hash_based.ExtensionIterator;
-import org.apache.joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
-import org.apache.joshua.decoder.segment_file.Sentence;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * This class implements dynamic sentence-level filtering. This is accomplished with a parallel
- * trie, a subset of the original trie, that only contains trie paths that are reachable from
- * traversals of the current sentence.
- * 
- * @author Matt Post post@cs.jhu.edu
- */
-public class SentenceFilteredGrammar extends MemoryBasedBatchGrammar {
-
-  private static final Logger LOG = LoggerFactory.getLogger(SentenceFilteredGrammar.class);
-
-  private final AbstractGrammar baseGrammar;
-  private final SentenceFilteredTrie filteredTrie;
-  private final int[] tokens;
-  private final Sentence sentence;
-
-  /**
-   * Construct a new sentence-filtered grammar. The main work is done in the enclosed trie (obtained
-   * from the base grammar, which contains the complete grammar).
-   * 
-   * @param baseGrammar a new {@link org.apache.joshua.decoder.ff.tm.AbstractGrammar} to populate
-   * @param sentence {@link org.apache.joshua.lattice.Lattice} input
-   */
-  SentenceFilteredGrammar(AbstractGrammar baseGrammar, Sentence sentence) {
-    super(OwnerMap.getOwner(baseGrammar.getOwner()), baseGrammar.joshuaConfiguration, baseGrammar.getSpanLimit());
-    this.baseGrammar = baseGrammar;
-    this.sentence = sentence;
-    this.tokens = sentence.getWordIDs();
-
-    int origCount = getNumRules(baseGrammar.getTrieRoot());
-    long startTime = System.currentTimeMillis();
-
-    /* Filter the rules; returns non-null object */
-    this.filteredTrie = filter(baseGrammar.getTrieRoot());
-    int filteredCount = getNumRules();
-
-    float seconds = (System.currentTimeMillis() - startTime) / 1000.0f;
-
-    LOG.debug("Sentence-level filtering of sentence {} ({} -> {} rules) in {} seconds",
-        sentence.id(), origCount, filteredCount, seconds);
-  }
-
-  @Override
-  public Trie getTrieRoot() {
-    return filteredTrie;
-  }
-
-  /**
-   * This function is poorly named: it doesn't mean whether a rule exists in the grammar for the
-   * current span, but whether the grammar is permitted to apply rules to the current span (a
-   * grammar-level parameter). As such we can just chain to the underlying grammar.
-   */
-  @Override
-  public boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength) {
-    return baseGrammar.hasRuleForSpan(startIndex, endIndex, pathLength);
-  }
-
-  @Override
-  public int getNumRules() {
-    return getNumRules(getTrieRoot());
-  }
-
-  /**
-   * A convenience function that counts the number of rules in a grammar's trie.
-   * 
-   * @param node the {@link org.apache.joshua.decoder.ff.tm.Trie} implementation for which to count rules
-   * @return the number of rules
-   */
-  public int getNumRules(Trie node) {
-    int numRules = 0;
-    if (node != null) {
-      if (node.getRuleCollection() != null)
-        numRules += node.getRuleCollection().getRules().size();
-
-      if (node.getExtensions() != null)
-        for (Trie child : node.getExtensions())
-          numRules += getNumRules(child);
-    }
-
-    return numRules;
-  }
-
-  /**
-   * What is the algorithm?
-   * 
-   * Take the first word of the sentence, and start at the root of the trie. There are two things to
-   * consider: (a) word matches and (b) nonterminal matches.
-   * 
-   * For a word match, simply follow that arc along the trie. We create a parallel arc in our
-   * filtered grammar to represent it. Each arc in the filtered trie knows about its
-   * corresponding/underlying node in the unfiltered grammar trie.
-   * 
-   * A nonterminal is always permitted to match. The question then is how much of the input sentence
-   * we imagine it consumed. The answer is that it could have been any amount. So the recursive call
-   * has to be a set of calls, one each to the next trie node with different lengths of the sentence
-   * remaining.
-   * 
-   * A problem occurs when we have multiple sequential nonterminals. For scope-3 grammars, there can
-   * be four sequential nonterminals (in the case when they are grounded by terminals on both ends
-   * of the nonterminal chain). We'd like to avoid looking at all possible ways to split up the
-   * subsequence, because with respect to filtering rules, they are all the same.
-   * 
-   * We accomplish this with the following restriction: for purposes of grammar filtering, only the
-   * first in a sequence of nonterminal traversals can consume more than one word. Each of the
-   * subsequent ones would have to consume just one word. We then just have to record in the
-   * recursive call whether the last traversal was a nonterminal or not.
-   * 
-   * @param unfilteredTrieRoot todo
-   * @return the root of the filtered trie
-   */
-  private SentenceFilteredTrie filter(Trie unfilteredTrieRoot) {
-    SentenceFilteredTrie filteredTrieRoot = new SentenceFilteredTrie(unfilteredTrieRoot);
-
-    // System.err.println(String.format("FILTERING TO SENTENCE\n  %s\n",
-    // Vocabulary.getWords(tokens)));
-
-    /*
-     * The root of the trie is where rule applications start, so we simply try all possible
-     * positions in the sentence.
-     */
-    for (int i = 0; i < tokens.length; i++) {
-      filter(i, filteredTrieRoot, false);
-    }
-
-    return filteredTrieRoot;
-  }
-
-  /**
-   * Matches rules against the sentence. Intelligently handles chains of sequential nonterminals.
-   * Marks arcs that are traversable for this sentence.
-   * 
-   * @param i the position in the sentence to start matching
-   * @param trie the trie node to match against
-   * @param lastWasNT true if the match that brought us here was against a nonterminal
-   */
-  private void filter(int i, SentenceFilteredTrie trieNode, boolean lastWasNT) {
-    if (i >= tokens.length)
-      return;
-
-    /* Make sure the underlying unfiltered node has children. */
-    Trie unfilteredTrieNode = trieNode.unfilteredTrieNode;
-    if (unfilteredTrieNode.getChildren() == null) {
-      // trieNode.path.retreat();
-      return;
-    }
-
-    /* Match a word */
-    Trie trie = unfilteredTrieNode.match(tokens[i]);
-    if (trie != null) {
-      /*
-       * The current filtered node might already have an arc for this label. If so, retrieve it
-       * (since we still need to follow it); if not, create it.
-       */
-      SentenceFilteredTrie nextFilteredTrie = trieNode.match(tokens[i]);
-      if (nextFilteredTrie == null) {
-        nextFilteredTrie = new SentenceFilteredTrie(trie);
-        trieNode.children.put(tokens[i], nextFilteredTrie);
-      }
-
-      /*
-       * Now continue, trying to match the child node against the next position in the sentence. The
-       * third argument records that this match was not against a nonterminal.
-       */
-      filter(i + 1, nextFilteredTrie, false);
-    }
-
-    /*
-     * Now we attempt to match nonterminals. Any nonterminal is permitted to match any region of the
-     * sentence, up to the maximum span for that grammar. So we enumerate all children of the
-     * current (unfiltered) trie grammar node, looking for nonterminals (items whose label value is
-     * less than 0), then recurse.
-     * 
-     * There is one subtlely. Adjacent nonterminals in a grammar rule can match a span (i, j) in (j
-     * - i - 1) ways, but for purposes of determining whether a rule fits, this is all wasted
-     * effort. To handle this, we allow the first nonterminal in a sequence to record 1, 2, 3, ...
-     * terminals (up to the grammar's span limit, or the rest of the sentence, whichever is
-     * shorter). Subsequent adjacent nonterminals are permitted to consume only a single terminal.
-     */
-    HashMap<Integer, ? extends Trie> children = unfilteredTrieNode.getChildren();
-    if (children != null) {
-      for (int label : children.keySet()) {
-        if (label < 0) {
-          SentenceFilteredTrie nextFilteredTrie = trieNode.match(label);
-          if (nextFilteredTrie == null) {
-            nextFilteredTrie = new SentenceFilteredTrie(unfilteredTrieNode.match(label));
-            trieNode.children.put(label, nextFilteredTrie);
-          }
-
-          /*
-           * Recurse. If the last match was a nonterminal, we can only consume one more token.
-           * 
-           * TODO: This goes too far by looking at the whole sentence; each grammar has a maximum
-           * span limit which should be consulted. What we should be doing is passing the point
-           * where we started matching the current sentence, so we can apply this span limit, which
-           * is easily accessible (baseGrammar.spanLimit).
-           */
-          int maxJ = lastWasNT ? (i + 1) : tokens.length;
-          for (int j = i + 1; j <= maxJ; j++) {
-            filter(j, nextFilteredTrie, true);
-          }
-        }
-      }
-    }
-  }
-
-  /**
-   * Alternate filter that uses regular expressions, walking the grammar trie and matching the
-   * source side of each rule collection against the input sentence. Failed matches are discarded,
-   * and trie nodes extending from that position need not be explored.
-   * 
-   * @param unfilteredTrie todo
-   * @return the root of the filtered trie if any rules were retained, otherwise null
-   */
-  @SuppressWarnings("unused")
-  private SentenceFilteredTrie filter_regexp(Trie unfilteredTrie) {
-    SentenceFilteredTrie trie = null;
-
-    /* Case 1: keep the trie node if it has a rule collection that matches the sentence */
-    if (unfilteredTrie.hasRules())
-      if (matchesSentence(unfilteredTrie))
-        trie = new SentenceFilteredTrie(unfilteredTrie);
-      else
-        return null;
-
-    /* Case 2: keep the trie node if it has children who have valid rule collections */
-    if (unfilteredTrie.hasExtensions())
-      for (Entry<Integer, ? extends Trie> arc : unfilteredTrie.getChildren().entrySet()) {
-        Trie unfilteredChildTrie = arc.getValue();
-        SentenceFilteredTrie nextTrie = filter_regexp(unfilteredChildTrie);
-        if (nextTrie != null) {
-          if (trie == null)
-            trie = new SentenceFilteredTrie(unfilteredTrie);
-          trie.children.put(arc.getKey(), nextTrie);
-        }
-      }
-
-    return trie;
-  }
-
-  private boolean matchesSentence(Trie childTrie) {
-    Rule rule = childTrie.getRuleCollection().getRules().get(0);
-    return rule.matches(sentence);
-  }
-
-  /**
-   * Implements a filtered trie, by sitting on top of a base trie and annotating nodes that match
-   * the given input sentence.
-   * 
-   * @author Matt Post post@cs.jhu.edu
-   * 
-   */
-  public class SentenceFilteredTrie implements Trie {
-
-    /* The underlying unfiltered trie node. */
-    private final Trie unfilteredTrieNode;
-
-    /* The child nodes in the filtered trie. */
-    private HashMap<Integer, SentenceFilteredTrie> children = null;
-
-    /**
-     * Constructor.
-     * 
-     * @param unfilteredTrieNode todo
-     */
-    public SentenceFilteredTrie(Trie unfilteredTrieNode) {
-      this.unfilteredTrieNode = unfilteredTrieNode;
-      this.children = new HashMap<>();
-    }
-
-    @Override
-    public SentenceFilteredTrie match(int wordID) {
-      if (children != null)
-        return children.get(wordID);
-      return null;
-    }
-
-    @Override
-    public boolean hasExtensions() {
-      return children != null;
-    }
-
-    @Override
-    public Collection<SentenceFilteredTrie> getExtensions() {
-      if (children != null)
-        return children.values();
-
-      return null;
-    }
-
-    @Override
-    public HashMap<Integer, SentenceFilteredTrie> getChildren() {
-      return children;
-    }
-
-    @Override
-    public boolean hasRules() {
-      // Chain to the underlying unfiltered node.
-      return unfilteredTrieNode.hasRules();
-    }
-
-    @Override
-    public RuleCollection getRuleCollection() {
-      // Chain to the underlying unfiltered node, since the rule collection just varies by target
-      // side.
-      return unfilteredTrieNode.getRuleCollection();
-    }
-
-    /**
-     * Counts the number of rules.
-     * 
-     * @return the number of rules rooted at this node.
-     */
-    public int getNumRules() {
-      int numRules = 0;
-      if (getTrieRoot() != null)
-        if (getTrieRoot().getRuleCollection() != null)
-          numRules += getTrieRoot().getRuleCollection().getRules().size();
-
-      for (SentenceFilteredTrie node : getExtensions())
-        numRules += node.getNumRules();
-
-      return numRules;
-    }
-
-    @Override
-    public Iterator<Integer> getTerminalExtensionIterator() {
-      return new ExtensionIterator(children, true);
-    }
-
-    @Override
-    public Iterator<Integer> getNonterminalExtensionIterator() {
-      return new ExtensionIterator(children, false);
-    }
-  }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/81baa658/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java b/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
index cd2d3af..ebfa996 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
@@ -109,8 +109,9 @@ public class MemoryBasedBatchGrammar extends AbstractGrammar {
       LOG.info("Couldn't create a GrammarReader for file {} with format {}",
           grammarFile, formatKeyword);
     }
-
-    this.printGrammar();
+    
+    LOG.info("MemoryBasedBatchGrammar: Read {} rules with {} distinct source sides from '{}'",
+        this.qtyRulesRead, this.qtyRuleBins, grammarFile);
   }
 
   protected GrammarReader<Rule> createReader(String format, String grammarFile) throws IOException {
@@ -199,11 +200,6 @@ public class MemoryBasedBatchGrammar extends AbstractGrammar {
     pos.ruleBin.addRule(rule);
   }
 
-  protected void printGrammar() {
-    LOG.info("MemoryBasedBatchGrammar: Read {} rules with {} distinct source sides from '{}'",
-        this.qtyRulesRead, this.qtyRuleBins, grammarFile);
-  }
-
   /***
    * Takes an input word and creates an OOV rule in the current grammar for that word.
    * 

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/81baa658/src/main/java/org/apache/joshua/decoder/segment_file/Token.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/segment_file/Token.java b/src/main/java/org/apache/joshua/decoder/segment_file/Token.java
index 4cbc7fa..11ba88f 100644
--- a/src/main/java/org/apache/joshua/decoder/segment_file/Token.java
+++ b/src/main/java/org/apache/joshua/decoder/segment_file/Token.java
@@ -113,7 +113,7 @@ public class Token {
       else
         annotations.put("lettercase",  "lower");
       
-      LOG.info("TOKEN: {} -> {} ({})", token, token.toLowerCase(), annotations.get("lettercase"));
+      LOG.debug("TOKEN: {} -> {} ({})", token, token.toLowerCase(), annotations.get("lettercase"));
       token = token.toLowerCase(); 
     }
     

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/81baa658/src/test/resources/decoder/n-ary/joshua.config
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/n-ary/joshua.config b/src/test/resources/decoder/n-ary/joshua.config
index 1710c50..afc99bf 100644
--- a/src/test/resources/decoder/n-ary/joshua.config
+++ b/src/test/resources/decoder/n-ary/joshua.config
@@ -13,8 +13,6 @@ pop-limit = 100
 
 #nbest config
 use_unique_nbest = true
-use_tree_nbest = false
-add_combined_cost = true
 top_n = 1
 
 output-format = %c %s