You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/05/25 20:38:17 UTC

[1/5] incubator-joshua git commit: added comments clarifying importance of a weird vocab call

Repository: incubator-joshua
Updated Branches:
  refs/heads/JOSHUA-273 89b5673f4 -> 31c66f2e7


added comments clarifying importance of a weird vocab call


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/05bd5221
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/05bd5221
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/05bd5221

Branch: refs/heads/JOSHUA-273
Commit: 05bd522157e76f16a7706e1b1194175eb02e3162
Parents: 4d73c17
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed May 25 15:05:57 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed May 25 15:05:57 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/ff/tm/format/HieroFormatReader.java | 3 +++
 src/joshua/tools/GrammarPacker.java                    | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/05bd5221/src/joshua/decoder/ff/tm/format/HieroFormatReader.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/format/HieroFormatReader.java b/src/joshua/decoder/ff/tm/format/HieroFormatReader.java
index 92d7ddb..9c21fb0 100644
--- a/src/joshua/decoder/ff/tm/format/HieroFormatReader.java
+++ b/src/joshua/decoder/ff/tm/format/HieroFormatReader.java
@@ -63,6 +63,9 @@ public class HieroFormatReader extends GrammarReader<Rule> {
     String[] sourceWords = fields[1].split("\\s+");
     int[] sourceIDs = new int[sourceWords.length];
     for (int i = 0; i < sourceWords.length; i++) {
+      /* NOTE: This redundantly creates vocab items for terms like [X,1]. This might actually
+       * be necessary, so don't try to turn this into an if/else.
+       */
       sourceIDs[i] = Vocabulary.id(sourceWords[i]);
       if (FormatUtils.isNonterminal(sourceWords[i])) {
         sourceIDs[i] = Vocabulary.id(FormatUtils.stripNonTerminalIndex(sourceWords[i]));

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/05bd5221/src/joshua/tools/GrammarPacker.java
----------------------------------------------------------------------
diff --git a/src/joshua/tools/GrammarPacker.java b/src/joshua/tools/GrammarPacker.java
index 8c39582..1a14789 100644
--- a/src/joshua/tools/GrammarPacker.java
+++ b/src/joshua/tools/GrammarPacker.java
@@ -256,7 +256,8 @@ public class GrammarPacker {
        * NOTE: In case of nonterminals, we add both stripped versions ("[X]")
        * and "[X,1]" to the vocabulary.
        * 
-       * TODO: MJP May 2016: Is it necessary to add [X,1]?
+       * TODO: MJP May 2016: Is it necessary to add [X,1]? This is currently being done in
+       * {@link HieroFormatReader}, which is called by {@link MosesFormatReader}. 
        */
 
       // Add feature names to vocabulary and pass the value through the


[5/5] incubator-joshua git commit: bugfix: wasn't incrementing k!

Posted by mj...@apache.org.
bugfix: wasn't incrementing k!


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/31c66f2e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/31c66f2e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/31c66f2e

Branch: refs/heads/JOSHUA-273
Commit: 31c66f2e70a8b7b415905871a3e1516279d62fb1
Parents: fb0f6ab
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed May 25 16:37:35 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed May 25 16:37:35 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/Decoder.java | 2 ++
 1 file changed, 2 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/31c66f2e/src/joshua/decoder/Decoder.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/Decoder.java b/src/joshua/decoder/Decoder.java
index 1a48110..c3850a3 100644
--- a/src/joshua/decoder/Decoder.java
+++ b/src/joshua/decoder/Decoder.java
@@ -514,6 +514,8 @@ public class Decoder {
           String bestString = t.getFormattedTranslation();
           out.write(bestString.getBytes());
           out.write("\n".getBytes());
+          
+          k++;
         }
       }
       out.flush();


[4/5] incubator-joshua git commit: Merge branch 'master' into JOSHUA-273

Posted by mj...@apache.org.
Merge branch 'master' into JOSHUA-273


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/fb0f6aba
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/fb0f6aba
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/fb0f6aba

Branch: refs/heads/JOSHUA-273
Commit: fb0f6aba9a2c63ebf176c78326819ac6b88ccf0d
Parents: 89b5673 b1961e1
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed May 25 16:33:27 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed May 25 16:33:27 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/ff/tm/GrammarReader.java     |  2 --
 .../decoder/ff/tm/format/HieroFormatReader.java | 13 ++++---
 .../decoder/ff/tm/format/MosesFormatReader.java | 12 ++++---
 src/joshua/tools/GrammarPacker.java             |  3 +-
 src/joshua/util/Constants.java                  | 36 ++++++++++++++++++++
 5 files changed, 51 insertions(+), 15 deletions(-)
----------------------------------------------------------------------



[3/5] incubator-joshua git commit: made a place for constants and pushed a few things into it

Posted by mj...@apache.org.
made a place for constants and pushed a few things into it


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/b1961e17
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/b1961e17
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/b1961e17

Branch: refs/heads/JOSHUA-273
Commit: b1961e17f59811c5e4cd070ab8691691574bb8ec
Parents: 89d3b18
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed May 25 16:33:13 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed May 25 16:33:13 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/ff/tm/GrammarReader.java     |  2 --
 .../decoder/ff/tm/format/HieroFormatReader.java | 10 ++----
 .../decoder/ff/tm/format/MosesFormatReader.java | 12 ++++---
 src/joshua/util/Constants.java                  | 36 ++++++++++++++++++++
 4 files changed, 46 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b1961e17/src/joshua/decoder/ff/tm/GrammarReader.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/GrammarReader.java b/src/joshua/decoder/ff/tm/GrammarReader.java
index 3432e53..6f8c312 100644
--- a/src/joshua/decoder/ff/tm/GrammarReader.java
+++ b/src/joshua/decoder/ff/tm/GrammarReader.java
@@ -34,8 +34,6 @@ import joshua.util.io.LineReader;
  */
 public abstract class GrammarReader<R extends Rule> implements Iterable<R>, Iterator<R> {
 
-  protected static String fieldDelimiter;
-
   protected static String description;
 
   protected String fileName;

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b1961e17/src/joshua/decoder/ff/tm/format/HieroFormatReader.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/format/HieroFormatReader.java b/src/joshua/decoder/ff/tm/format/HieroFormatReader.java
index 9c21fb0..9b2039e 100644
--- a/src/joshua/decoder/ff/tm/format/HieroFormatReader.java
+++ b/src/joshua/decoder/ff/tm/format/HieroFormatReader.java
@@ -23,6 +23,7 @@ import java.io.IOException;
 import joshua.corpus.Vocabulary;
 import joshua.decoder.ff.tm.GrammarReader;
 import joshua.decoder.ff.tm.Rule;
+import joshua.util.Constants;
 import joshua.util.FormatUtils;
 
 /**
@@ -35,7 +36,6 @@ import joshua.util.FormatUtils;
 public class HieroFormatReader extends GrammarReader<Rule> {
 
   static {
-    fieldDelimiter = "\\s\\|{3}\\s";
     description = "Original Hiero format";
   }
 
@@ -49,7 +49,7 @@ public class HieroFormatReader extends GrammarReader<Rule> {
 
   @Override
   public Rule parseLine(String line) {
-    String[] fields = line.split(fieldDelimiter);
+    String[] fields = line.split(Constants.fieldDelimiter);
     if (fields.length < 3) {
       throw new RuntimeException(String.format("Rule '%s' does not have four fields", line));
     }
@@ -100,11 +100,7 @@ public class HieroFormatReader extends GrammarReader<Rule> {
 
     return new Rule(lhs, sourceIDs, targetIDs, sparse_features, arity, alignment);
   }
-
-  public static String getFieldDelimiter() {
-    return fieldDelimiter;
-  }
-
+  
   public static boolean isNonTerminal(final String word) {
     return FormatUtils.isNonterminal(word);
   }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b1961e17/src/joshua/decoder/ff/tm/format/MosesFormatReader.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/format/MosesFormatReader.java b/src/joshua/decoder/ff/tm/format/MosesFormatReader.java
index 47a3e46..a2ada68 100644
--- a/src/joshua/decoder/ff/tm/format/MosesFormatReader.java
+++ b/src/joshua/decoder/ff/tm/format/MosesFormatReader.java
@@ -22,6 +22,8 @@ import java.io.IOException;
 
 import joshua.corpus.Vocabulary;
 import joshua.decoder.ff.tm.Rule;
+import joshua.util.Constants;
+import joshua.util.FormatUtils;
 import joshua.util.io.LineReader;
 
 /***
@@ -46,12 +48,12 @@ public class MosesFormatReader extends HieroFormatReader {
 
   public MosesFormatReader(String grammarFile) throws IOException {
     super(grammarFile);
-    Vocabulary.id("[X]");
+    Vocabulary.id(Constants.defaultNT);
   }
   
   public MosesFormatReader() {
     super();
-    Vocabulary.id("[X]");
+    Vocabulary.id(Constants.defaultNT);
   }
   
   /**
@@ -73,10 +75,10 @@ public class MosesFormatReader extends HieroFormatReader {
    */
   @Override
   public Rule parseLine(String line) {
-    String[] fields = line.split(fieldDelimiter);
+    String[] fields = line.split(Constants.fieldDelimiter);
     
-    StringBuffer hieroLine = new StringBuffer();
-    hieroLine.append("[X] ||| [X,1] " + fields[0] + " ||| [X,1] " + fields[1] + " |||");
+    String nt = FormatUtils.cleanNonTerminal(Constants.defaultNT);
+    StringBuffer hieroLine = new StringBuffer(Constants.defaultNT + " ||| [" + nt + ",1] " + fields[0] + " ||| [" + nt + ",1] " + fields[1] + " |||");
 
     String mosesFeatureString = fields[2];
     for (String value: mosesFeatureString.split(" ")) {

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b1961e17/src/joshua/util/Constants.java
----------------------------------------------------------------------
diff --git a/src/joshua/util/Constants.java b/src/joshua/util/Constants.java
new file mode 100644
index 0000000..90e3016
--- /dev/null
+++ b/src/joshua/util/Constants.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.util;
+
+/***
+ * One day, all constants should be moved here (many are in Vocabulary).
+ * 
+ * @author Matt Post <po...@cs.jhu.edu>
+ */
+
+public final class Constants {
+  public static String defaultNT = "[X]";
+
+  public static final String START_SYM = "<s>";
+  public static final String STOP_SYM = "</s>";
+  public static final String UNKNOWN_WORD = "<unk>";
+  
+  public static final String fieldDelimiter = "\\s\\|{3}\\s";
+  public static final String spaceSeparator = "\\s+";
+}


[2/5] incubator-joshua git commit: removed unused file

Posted by mj...@apache.org.
removed unused file


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/89d3b183
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/89d3b183
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/89d3b183

Branch: refs/heads/JOSHUA-273
Commit: 89d3b183b6de06e830d1097dbf0757e25590d1ae
Parents: 05bd522
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed May 25 15:06:13 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed May 25 15:06:13 2016 -0400

----------------------------------------------------------------------
 .../chart_parser/ManualConstraintsHandler.java  | 217 -------------------
 1 file changed, 217 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/89d3b183/src/joshua/decoder/chart_parser/ManualConstraintsHandler.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/chart_parser/ManualConstraintsHandler.java b/src/joshua/decoder/chart_parser/ManualConstraintsHandler.java
deleted file mode 100644
index baed984..0000000
--- a/src/joshua/decoder/chart_parser/ManualConstraintsHandler.java
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.chart_parser;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-
-import joshua.corpus.Vocabulary;
-import joshua.decoder.ff.tm.Grammar;
-import joshua.decoder.ff.tm.Rule;
-import joshua.decoder.segment_file.ConstraintRule;
-import joshua.decoder.segment_file.ConstraintSpan;
-
-/**
- * @author Zhifei Li, <zh...@gmail.com>
- */
-
-public class ManualConstraintsHandler {
-
-  // TODO: each span only has one ConstraintSpan
-  // contain spans that have LHS or RHS constraints (they are always hard)
-  private HashMap<String, ConstraintSpan> constraintSpansForFiltering;
-
-  // contain spans that have hard "rule" constraint; key: start_span; value:
-  // end_span
-  private ArrayList<Span> spansWithHardRuleConstraint;
-
-  private Chart chart;
-  private Grammar grammarForConstructManualRule;
-
-  private static final Logger logger = Logger.getLogger(ManualConstraintsHandler.class.getName());
-
-  public ManualConstraintsHandler(Chart chart, Grammar grammarForConstructManualRule,
-      List<ConstraintSpan> constraintSpans) {
-    this.chart = chart;
-    this.grammarForConstructManualRule = grammarForConstructManualRule;
-    initialize(constraintSpans);
-  }
-
-  private void initialize(List<ConstraintSpan> constraintSpans) {
-    /**
-     * Note that manual constraints or OOV handling is not part of seeding
-     * */
-    /**
-     * (1) add manual rule (only allow flat rules) into the chart as constraints (2) add RHS or LHS
-     * constraint into constraintSpansForFiltering (3) add span signature into
-     * setOfSpansWithHardRuleConstraint; if the span contains a hard "RULE" constraint
-     */
-    if (null != constraintSpans) {
-
-      for (ConstraintSpan cSpan : constraintSpans) {
-        if (null != cSpan.rules()) {
-          boolean shouldAdd = false; // contain LHS or RHS constraints?
-          for (ConstraintRule cRule : cSpan.rules()) {
-            /**
-             * Note that LHS and RHS constraints are always hard, while Rule constraint can be soft
-             * or hard
-             **/
-            switch (cRule.type()) {
-              case RULE:
-                // == prepare the feature scores
-                // TODO: this require the input always specify the right number of
-                // features
-                float[] featureScores = new float[cRule.features().length];
-
-                for (int i = 0; i < featureScores.length; i++) {
-                  if (cSpan.isHard()) {
-                    featureScores[i] = 0; // force the feature cost as zero
-                  } else {
-                    featureScores[i] = cRule.features()[i];
-                  }
-                }
-
-                /**
-                 * If the RULE constraint is hard, then we should filter all out all consituents
-                 * (within this span), which are contructed from regular grammar
-                 */
-                if (cSpan.isHard()) {
-                  if (null == this.spansWithHardRuleConstraint) {
-                    this.spansWithHardRuleConstraint = new ArrayList<Span>();
-                  }
-                  this.spansWithHardRuleConstraint.add(new Span(cSpan.start(), cSpan.end()));
-                }
-
-                int arity = 0; // only allow flat rule (i.e. arity=0)
-                Rule rule =
-                    this.grammarForConstructManualRule.constructManualRule(
-                        Vocabulary.id(cRule.lhs()), Vocabulary.addAll(cRule.foreignRhs()),
-                        Vocabulary.addAll(cRule.nativeRhs()), featureScores, arity);
-
-                // add to the chart
-                chart.addAxiom(cSpan.start(), cSpan.end(), rule, new SourcePath());
-                if (logger.isLoggable(Level.INFO))
-                  logger.info("Adding RULE constraint for span " + cSpan.start() + ", "
-                      + cSpan.end() + "; isHard=" + cSpan.isHard() + rule.getLHS());
-                break;
-
-              default:
-                shouldAdd = true;
-            }
-          }
-          if (shouldAdd) {
-            if (logger.isLoggable(Level.INFO))
-              logger.info("Adding LHS or RHS constraint for span " + cSpan.start() + ", "
-                  + cSpan.end());
-            if (null == this.constraintSpansForFiltering) {
-              this.constraintSpansForFiltering = new HashMap<String, ConstraintSpan>();
-            }
-            this.constraintSpansForFiltering.put(getSpanSignature(cSpan.start(), cSpan.end()),
-                cSpan);
-          }
-        }
-      }
-    }
-
-  }
-
-  // ===============================================================
-  // Manual constraint annotation methods and classes
-  // ===============================================================
-
-  /**
-   * if there are any LHS or RHS constraints for a span, then all the applicable grammar rules in
-   * that span will have to pass the filter.
-   */
-  public List<Rule> filterRules(int i, int j, List<Rule> rulesIn) {
-    if (null == this.constraintSpansForFiltering) return rulesIn;
-    ConstraintSpan cSpan = this.constraintSpansForFiltering.get(getSpanSignature(i, j));
-    if (null == cSpan) { // no filtering
-      return rulesIn;
-    } else {
-
-      List<Rule> rulesOut = new ArrayList<Rule>();
-      for (Rule gRule : rulesIn) {
-        // gRule will survive, if any constraint (LHS or RHS) lets it survive
-        for (ConstraintRule cRule : cSpan.rules()) {
-          if (shouldSurvive(cRule, gRule)) {
-            rulesOut.add(gRule);
-            break;
-          }
-        }
-      }
-      return rulesOut;
-    }
-  }
-
-  /**
-   * should we filter out the gRule based on the manually provided constraint cRule
-   */
-  public boolean shouldSurvive(ConstraintRule cRule, Rule gRule) {
-
-    switch (cRule.type()) {
-      case LHS:
-        return (gRule.getLHS() == Vocabulary.id(cRule.lhs()));
-      case RHS:
-        int[] targetWords = Vocabulary.addAll(cRule.nativeRhs());
-
-        if (targetWords.length != gRule.getEnglish().length) return false;
-
-        for (int t = 0; t < targetWords.length; t++) {
-          if (targetWords[t] != gRule.getEnglish()[t]) return false;
-        }
-
-        return true;
-      default: // not surviving
-        return false;
-    }
-  }
-
-  /**
-   * if a span is *within* the coverage of a *hard* rule constraint, then this span will be only
-   * allowed to use the mannual rules
-   */
-  public boolean containHardRuleConstraint(int startSpan, int endSpan) {
-    if (null != this.spansWithHardRuleConstraint) {
-      for (Span span : this.spansWithHardRuleConstraint) {
-        if (startSpan >= span.startPos && endSpan <= span.endPos) return true;
-      }
-    }
-    return false;
-  }
-
-  private String getSpanSignature(int i, int j) {
-    return i + " " + j;
-  }
-
-  private static class Span {
-
-    int startPos;
-    int endPos;
-
-    public Span(int startPos, int endPos) {
-      this.startPos = startPos;
-      this.endPos = endPos;
-    }
-  }
-
-}