You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/24 22:53:22 UTC

[01/18] incubator-joshua git commit: OOV fix for class-based LM

Repository: incubator-joshua
Updated Branches:
  refs/heads/master 3f4fa9928 -> f2ae90433
  refs/heads/morph a86ae8e87 -> 00eaf7168


OOV fix for class-based LM


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/b7f23108
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/b7f23108
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/b7f23108

Branch: refs/heads/morph
Commit: b7f23108ffce1451fac45dcf6ac7ff6efa44ec56
Parents: 5396c5f
Author: Matt Post <po...@cs.jhu.edu>
Authored: Thu Apr 21 09:24:06 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Thu Apr 21 09:24:06 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/ff/lm/LanguageModelFF.java | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b7f23108/src/joshua/decoder/ff/lm/LanguageModelFF.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/LanguageModelFF.java b/src/joshua/decoder/ff/lm/LanguageModelFF.java
index 732229c..18c149d 100644
--- a/src/joshua/decoder/ff/lm/LanguageModelFF.java
+++ b/src/joshua/decoder/ff/lm/LanguageModelFF.java
@@ -92,7 +92,7 @@ public class LanguageModelFF extends StatefulFF {
   
   protected class ClassMap {
 
-    private final int OOV_id = 10;
+    private final int OOV_id = Vocabulary.getUnknownId();
     private HashMap<Integer, Integer> classMap;
 
     public ClassMap(String file_name) throws IOException {
@@ -101,11 +101,7 @@ public class LanguageModelFF extends StatefulFF {
     }
 
     public int getClassID(int wordID) {
-      if (this.classMap.containsKey(wordID)) {
-        return this.classMap.get(wordID);
-      } else {
-        return OOV_id;
-      }
+      return this.classMap.getOrDefault(wordID, OOV_id);
     }
 
     /**


[10/18] incubator-joshua git commit: bugfix in looking up source word classifier and prediction

Posted by mj...@apache.org.
bugfix in looking up source word classifier and prediction


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/c30bddba
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/c30bddba
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/c30bddba

Branch: refs/heads/morph
Commit: c30bddbafaa12f72020f7e746e4c3e138cf2294c
Parents: 4b8c640
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 23:40:32 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 23:40:32 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/ff/LexicalSharpener.java | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/c30bddba/src/joshua/decoder/ff/LexicalSharpener.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/LexicalSharpener.java b/src/joshua/decoder/ff/LexicalSharpener.java
index 16d1021..b8f0c39 100644
--- a/src/joshua/decoder/ff/LexicalSharpener.java
+++ b/src/joshua/decoder/ff/LexicalSharpener.java
@@ -151,6 +151,8 @@ public class LexicalSharpener extends StatelessFF {
   public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
       Sentence sentence, Accumulator acc) {
     
+    int[] resolved = anchorRuleSourceToSentence(rule, tailNodes, i);
+    
     Map<Integer, List<Integer>> points = rule.getAlignmentMap();
     for (int t: points.keySet()) {
       List<Integer> source_indices = points.get(t);
@@ -158,12 +160,14 @@ public class LexicalSharpener extends StatelessFF {
         continue;
       
       int targetID = rule.getEnglish()[t];
-      int s = i + source_indices.get(0);
-      Token sourceToken = sentence.getTokens().get(s);
+      String targetWord = Vocabulary.word(targetID);
+      int sourceIndex = resolved[source_indices.get(0)];
+      Token sourceToken = sentence.getTokens().get(sourceIndex);
+      String sourceWord = Vocabulary.word(sourceToken.getWord());
       String featureString = sourceToken.getAnnotationString().replace('|', ' ');
       
-      System.err.println(String.format("%s: %s -> %s?",  name, sourceToken, Vocabulary.word(targetID)));
-      Classification result = predict(sourceToken.getWord(), targetID, featureString);
+      System.err.println(String.format("%s: %s -> %s?",  name, sourceWord, targetWord));
+      Classification result = predict(sourceWord, targetWord, featureString);
       if (result != null) {
         Labeling labeling = result.getLabeling();
         int num = labeling.numLocations();
@@ -193,12 +197,11 @@ public class LexicalSharpener extends StatelessFF {
       return "21+";
   }
   
-  public Classification predict(int sourceID, int targetID, String featureString) {
-    String word = Vocabulary.word(sourceID);
-    if (classifiers.containsKey(word)) {
-      MalletPredictor predictor = classifiers.get(word);
+  public Classification predict(String sourceWord, String targetWord, String featureString) {
+    if (classifiers.containsKey(sourceWord)) {
+      MalletPredictor predictor = classifiers.get(sourceWord);
       if (predictor != null)
-        return predictor.predict(word, featureString);
+        return predictor.predict(targetWord, featureString);
     }
 
     return null;
@@ -273,7 +276,7 @@ public class LexicalSharpener extends StatelessFF {
       String sourceWord = tokens[0];
       String targetWord = tokens[1];
       String features = tokens[2];
-      Classification result = ts.predict(Vocabulary.id(sourceWord), Vocabulary.id(targetWord), features);
+      Classification result = ts.predict(sourceWord, targetWord, features);
       if (result != null)
         System.out.println(String.format("%s %f", result.getLabelVector().getBestLabel(), result.getLabelVector().getBestValue()));
       else 


[16/18] incubator-joshua git commit: added lowercaser option to pipeline (set to 'cat' for null)

Posted by mj...@apache.org.
added lowercaser option to pipeline (set to 'cat' for null)


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/f2ae9043
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/f2ae9043
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/f2ae9043

Branch: refs/heads/morph
Commit: f2ae90433c7e3e3dc95fa697c3565e4462306ba0
Parents: 3f4fa99
Author: Matt Post <po...@cs.jhu.edu>
Authored: Sun Apr 24 15:39:01 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Sun Apr 24 15:39:01 2016 -0400

----------------------------------------------------------------------
 scripts/training/pipeline.pl | 1 +
 1 file changed, 1 insertion(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2ae9043/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 8629508..a438e60 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -264,6 +264,7 @@ my $retval = GetOptions(
   "tokenizer-source=s"      => \$TOKENIZER_SOURCE,
   "tokenizer-target=s"      => \$TOKENIZER_TARGET,
   "normalizer=s"      => \$NORMALIZER,
+  "lowercaser=s"      => \$LOWERCASER,
   "joshua-config=s"   => \$_JOSHUA_CONFIG,
   "joshua-args=s"      => \$_JOSHUA_ARGS,
   "joshua-mem=s"      => \$JOSHUA_MEM,


[02/18] incubator-joshua git commit: pack alignments by default!

Posted by mj...@apache.org.
pack alignments by default!


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/a330afe4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/a330afe4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/a330afe4

Branch: refs/heads/morph
Commit: a330afe4de2fab29dd4eee49c1f3834e435ed540
Parents: a86ae8e
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 10:02:27 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 10:02:27 2016 -0400

----------------------------------------------------------------------
 scripts/support/grammar-packer.pl | 8 +++++---
 scripts/support/run_bundler.py    | 1 +
 scripts/training/pipeline.pl      | 2 +-
 3 files changed, 7 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a330afe4/scripts/support/grammar-packer.pl
----------------------------------------------------------------------
diff --git a/scripts/support/grammar-packer.pl b/scripts/support/grammar-packer.pl
index d2b1627..e485513 100755
--- a/scripts/support/grammar-packer.pl
+++ b/scripts/support/grammar-packer.pl
@@ -20,20 +20,21 @@ use File::Temp qw/tempfile/;
 use File::Basename qw/basename/;
 
 my %opts = (
+  a => 0,         # whether alignments are included in the grammar(s)
   g => '',        # comma-separated list of grammars to pack
   o => '',        # comma-separated list of grammar output directories
   m => '8g',      # amount of memory to give the packer
   T => '/tmp',    # location of temporary space
   v => 0,         # verbose
 );
-getopts("m:T:vg:o:", \%opts) || die usage();
+getopts("am:T:vg:o:", \%opts) || die usage();
 die usage() if (@ARGV);
 
 my $JOSHUA = $ENV{JOSHUA} or die "you must defined \$JOSHUA";
 my $CAT    = "$JOSHUA/scripts/training/scat";
 
 sub usage {
-  print "Usage: grammar-packer.pl [-m MEM] [-T /path/to/tmp] -g 'grammar [grammar2 ...]' -o 'grammar.packed [grammar2.packed ...]'\n";
+  print "Usage: grammar-packer.pl [-a] [-m MEM] [-T /path/to/tmp] -g 'grammar [grammar2 ...]' -o 'grammar.packed [grammar2.packed ...]'\n";
   exit 1;
 }
 
@@ -88,7 +89,8 @@ foreach my $grammar (@grammars) {
 # Do the packing using the config.
 my $grammars = join(" ", @sorted_grammars);
 my $outputs  = join(" ", @outputs);
-my $cmd = "java -Xmx$opts{m} -cp $JOSHUA/lib/args4j-2.0.29.jar:$JOSHUA/class joshua.tools.GrammarPackerCli -g $grammars --outputs $outputs";
+my $alignments = $opts{a} ? "--ga" : "";
+my $cmd = "java -Xmx$opts{m} -cp $JOSHUA/lib/args4j-2.0.29.jar:$JOSHUA/class joshua.tools.GrammarPackerCli -g $grammars --outputs $outputs $alignments";
 print STDERR "Packing with $cmd...\n" if $opts{v};
 
 my $retval = system($cmd);

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a330afe4/scripts/support/run_bundler.py
----------------------------------------------------------------------
diff --git a/scripts/support/run_bundler.py b/scripts/support/run_bundler.py
index ae54221..b64b6f7 100755
--- a/scripts/support/run_bundler.py
+++ b/scripts/support/run_bundler.py
@@ -326,6 +326,7 @@ def recursive_copy(src, dest, symlink = False):
 
 def run_grammar_packer(src_path, dest_path):
     cmd = [os.path.join(JOSHUA_PATH, "scripts/support/grammar-packer.pl"),
+           "-a",
            "-T", opts.tmpdir,
            "-g", src_path, "-o", dest_path]
     logging.info(

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a330afe4/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 8629508..fd9436e 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -1644,7 +1644,7 @@ if ($DO_PACK_GRAMMARS) {
   my $packed_dir = "$DATA_DIRS{test}/grammar.packed";
   if ($OPTIMIZER_RUN == 1 and ! is_packed($TEST_GRAMMAR)) {
     $cachepipe->cmd("test-pack",
-                    "$SCRIPTDIR/support/grammar-packer.pl -T $TMPDIR -m $PACKER_MEM -g $TEST_GRAMMAR -o $packed_dir",
+                    "$SCRIPTDIR/support/grammar-packer.pl -a -T $TMPDIR -m $PACKER_MEM -g $TEST_GRAMMAR -o $packed_dir",
                     $TEST_GRAMMAR,
                     "$packed_dir/vocabulary",
                     "$packed_dir/encoding",


[17/18] incubator-joshua git commit: added lowercaser option to pipeline (set to 'cat' for null)

Posted by mj...@apache.org.
added lowercaser option to pipeline (set to 'cat' for null)


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/f2ae9043
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/f2ae9043
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/f2ae9043

Branch: refs/heads/master
Commit: f2ae90433c7e3e3dc95fa697c3565e4462306ba0
Parents: 3f4fa99
Author: Matt Post <po...@cs.jhu.edu>
Authored: Sun Apr 24 15:39:01 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Sun Apr 24 15:39:01 2016 -0400

----------------------------------------------------------------------
 scripts/training/pipeline.pl | 1 +
 1 file changed, 1 insertion(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2ae9043/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 8629508..a438e60 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -264,6 +264,7 @@ my $retval = GetOptions(
   "tokenizer-source=s"      => \$TOKENIZER_SOURCE,
   "tokenizer-target=s"      => \$TOKENIZER_TARGET,
   "normalizer=s"      => \$NORMALIZER,
+  "lowercaser=s"      => \$LOWERCASER,
   "joshua-config=s"   => \$_JOSHUA_CONFIG,
   "joshua-args=s"      => \$_JOSHUA_ARGS,
   "joshua-mem=s"      => \$JOSHUA_MEM,


[05/18] incubator-joshua git commit: Model now serializes

Posted by mj...@apache.org.
Model now serializes


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/68b01bc1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/68b01bc1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/68b01bc1

Branch: refs/heads/morph
Commit: 68b01bc168298db382334e9f01bdf2992db85b01
Parents: 1c8aaa5
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 15:59:39 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 15:59:39 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/ff/LexicalSharpener.java | 184 ++++++-----------------
 src/joshua/decoder/ff/MalletPredictor.java  |  97 ++++++++++++
 2 files changed, 143 insertions(+), 138 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/68b01bc1/src/joshua/decoder/ff/LexicalSharpener.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/LexicalSharpener.java b/src/joshua/decoder/ff/LexicalSharpener.java
index 2c96f83..8671d57 100644
--- a/src/joshua/decoder/ff/LexicalSharpener.java
+++ b/src/joshua/decoder/ff/LexicalSharpener.java
@@ -19,24 +19,16 @@ package joshua.decoder.ff;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.FileOutputStream;
-import java.io.FileReader;
 import java.io.IOException;
 import java.io.ObjectInputStream;
 import java.io.ObjectOutputStream;
-import java.io.StringReader;
-import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Scanner;
 
 import cc.mallet.classify.*;
-import cc.mallet.pipe.*;
-import cc.mallet.pipe.iterator.CsvIterator;
-import cc.mallet.types.Alphabet;
-import cc.mallet.types.Instance;
-import cc.mallet.types.InstanceList;
-import cc.mallet.types.LabelAlphabet;
+import cc.mallet.types.Labeling;
 import joshua.corpus.Vocabulary;
 import joshua.decoder.Decoder;
 import joshua.decoder.JoshuaConfiguration;
@@ -52,7 +44,8 @@ import joshua.util.io.LineReader;
 
 public class LexicalSharpener extends StatelessFF {
 
-  private HashMap<Integer,Predictor> classifiers = null;
+  private HashMap<String,MalletPredictor> classifiers = null;
+
   public LexicalSharpener(final FeatureVector weights, String[] args, JoshuaConfiguration config) {
     super(weights, "LexicalSharpener", args, config);
 
@@ -63,6 +56,13 @@ public class LexicalSharpener extends StatelessFF {
         System.err.println(String.format("* FATAL[LexicalSharpener]: can't load %s", parsedArgs.get("training-data")));
         System.exit(1);
       }
+    } else if (parsedArgs.containsKey("model")) {
+      try {
+        loadClassifiers(parsedArgs.get("model"));
+      } catch (ClassNotFoundException | IOException e) {
+        // TODO Auto-generated catch block
+        e.printStackTrace();
+      }
     }
   }
   
@@ -75,7 +75,7 @@ public class LexicalSharpener extends StatelessFF {
    */
   public void trainAll(String dataFile) throws FileNotFoundException {
   
-    classifiers = new HashMap<Integer, Predictor>();
+    classifiers = new HashMap<String, MalletPredictor>();
 
     Decoder.LOG(1, "Reading " + dataFile);
     LineReader lineReader = null;
@@ -92,7 +92,7 @@ public class LexicalSharpener extends StatelessFF {
     for (String line : lineReader) {
       String sourceWord = line.substring(0, line.indexOf(' '));
       if (lastSourceWord != null && ! sourceWord.equals(lastSourceWord)) {
-        classifiers.put(Vocabulary.id(lastSourceWord), new Predictor(lastSourceWord, examples));
+        classifiers.put(lastSourceWord, new MalletPredictor(lastSourceWord, examples));
         //        System.err.println(String.format("WORD %s:\n%s\n", lastOutcome, buffer));
         examples = "";
       }
@@ -101,18 +101,18 @@ public class LexicalSharpener extends StatelessFF {
       lastSourceWord = sourceWord;
       linesRead++;
     }
-    classifiers.put(Vocabulary.id(lastSourceWord), new Predictor(lastSourceWord, examples));
+    classifiers.put(lastSourceWord, new MalletPredictor(lastSourceWord, examples));
   
     System.err.println(String.format("Read %d lines from training file", linesRead));
   }
 
   public void loadClassifiers(String modelFile) throws ClassNotFoundException, IOException {
     ObjectInputStream ois = new ObjectInputStream(new FileInputStream(modelFile));
-    classifiers = (HashMap<Integer,Predictor>) ois.readObject();
+    classifiers = (HashMap<String,MalletPredictor>) ois.readObject();
     ois.close();
     
     System.err.println(String.format("Loaded model with %d keys", classifiers.keySet().size()));
-    for (int key: classifiers.keySet()) {
+    for (String key: classifiers.keySet()) {
       System.err.println("  " + key);
     }
   }
@@ -133,8 +133,6 @@ public class LexicalSharpener extends StatelessFF {
   public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
       Sentence sentence, Accumulator acc) {
     
-    System.err.println(String.format("RULE: %s",  rule));
-        
     Map<Integer, List<Integer>> points = rule.getAlignmentMap();
     for (int t: points.keySet()) {
       List<Integer> source_indices = points.get(t);
@@ -142,27 +140,46 @@ public class LexicalSharpener extends StatelessFF {
         continue;
       
       int targetID = rule.getEnglish()[t];
-      String targetWord = Vocabulary.word(targetID);
       int s = i + source_indices.get(0);
       Token sourceToken = sentence.getTokens().get(s);
       String featureString = sourceToken.getAnnotationString().replace('|', ' ');
       
       Classification result = predict(sourceToken.getWord(), targetID, featureString);
-      System.out.println("RESULT: " + result.getLabeling());
-      if (result.bestLabelIsCorrect()) {
-        acc.add(String.format("%s_match", name), 1);
+      if (result != null) {
+        Labeling labeling = result.getLabeling();
+        int num = labeling.numLocations();
+        int predicted = Vocabulary.id(labeling.getBestLabel().toString());
+//        System.err.println(String.format("LexicalSharpener: predicted %s (rule %s) %.5f",
+//            labeling.getBestLabel().toString(), Vocabulary.word(targetID), Math.log(labeling.getBestValue())));
+        if (num > 1 && predicted == targetID) {
+          acc.add(String.format("%s_match_%s", name, getBin(num)), 1);
+        }
+        acc.add(String.format("%s_weight", name), (float) Math.log(labeling.getBestValue()));
       }
     }
     
     return null;
   }
   
+  private String getBin(int num) {
+    if (num == 2)
+      return "2";
+    else if (num <= 5)
+      return "3-5";
+    else if (num <= 10)
+      return "6-10";
+    else if (num <= 20)
+      return "11-20";
+    else
+      return "21+";
+  }
+  
   public Classification predict(int sourceID, int targetID, String featureString) {
     String word = Vocabulary.word(sourceID);
-    if (classifiers.containsKey(sourceID)) {
-      Predictor predictor = classifiers.get(sourceID);
+    if (classifiers.containsKey(word)) {
+      MalletPredictor predictor = classifiers.get(word);
       if (predictor != null)
-        return predictor.predict(Vocabulary.word(targetID), featureString);
+        return predictor.predict(word, featureString);
     }
 
     return null;
@@ -212,112 +229,6 @@ public class LexicalSharpener extends StatelessFF {
     return anchoredSource;
   }
   
-  public class Predictor {
-    
-    private SerialPipes pipes = null;
-    private InstanceList instances = null;
-    private String sourceWord = null;
-    private String examples = null;
-    private Classifier classifier = null;
-    
-    public Predictor(String word, String examples) {
-      this.sourceWord = word;
-      this.examples = examples;
-      ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
-
-      // I don't know if this is needed
-      pipeList.add(new Target2Label());
-      // Convert custom lines to Instance objects (svmLight2FeatureVectorAndLabel not versatile enough)
-      pipeList.add(new SvmLight2FeatureVectorAndLabel());
-      // Validation
-//      pipeList.add(new PrintInputAndTarget());
-      
-      // name: english word
-      // data: features (FeatureVector)
-      // target: foreign inflection
-      // source: null
-
-      pipes = new SerialPipes(pipeList);
-      instances = new InstanceList(pipes);
-    }
-
-    /**
-       * Returns a Classification object a list of features. Uses "which" to determine which classifier
-       * to use.
-       *   
-       * @param which the classifier to use
-       * @param features the set of features
-       * @return
-       */
-    public Classification predict(String outcome, String features) {
-      Instance instance = new Instance(features, outcome, null, null);
-      System.err.println("PREDICT targetWord = " + (String) instance.getTarget());
-      System.err.println("PREDICT features = " + (String) instance.getData());
-
-      if (classifier == null)
-        train();
-
-      Classification result = (Classification) classifier.classify(pipes.instanceFrom(instance));
-      return result;
-    }
-
-    public void train() {
-//      System.err.println(String.format("Word %s: training model", sourceWord));
-//      System.err.println(String.format("  Examples: %s", examples));
-      
-      StringReader reader = new StringReader(examples);
-
-      // Constructs an instance with everything shoved into the data field
-      instances.addThruPipe(new CsvIterator(reader, "(\\S+)\\s+(.*)", 2, -1, 1));
-
-      ClassifierTrainer trainer = new MaxEntTrainer();
-      classifier = trainer.train(instances);
-      
-      System.err.println(String.format("Trained a model for %s with %d outcomes", 
-          sourceWord, pipes.getTargetAlphabet().size()));
-    }
-
-    /**
-     * Returns the number of distinct outcomes. Requires the model to have been trained!
-     * 
-     * @return
-     */
-    public int getNumOutcomes() {
-      if (classifier == null)
-        train();
-      return pipes.getTargetAlphabet().size();
-    }
-  }
-  
-  public static void example(String[] args) throws IOException, ClassNotFoundException {
-
-    ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
-
-    Alphabet dataAlphabet = new Alphabet();
-    LabelAlphabet labelAlphabet = new LabelAlphabet();
-    
-    pipeList.add(new Target2Label(dataAlphabet, labelAlphabet));
-    // Basically, SvmLight but with a custom (fixed) alphabet)
-    pipeList.add(new SvmLight2FeatureVectorAndLabel());
-
-    FileReader reader1 = new FileReader("data.1");
-    FileReader reader2 = new FileReader("data.2");
-
-    SerialPipes pipes = new SerialPipes(pipeList);
-    InstanceList instances = new InstanceList(dataAlphabet, labelAlphabet);
-    instances.setPipe(pipes);
-    instances.addThruPipe(new CsvIterator(reader1, "(\\S+)\\s+(\\S+)\\s+(.*)", 3, 2, 1));
-    ClassifierTrainer trainer1 = new MaxEntTrainer();
-    Classifier classifier1 = trainer1.train(instances);
-    
-    pipes = new SerialPipes(pipeList);
-    instances = new InstanceList(dataAlphabet, labelAlphabet);
-    instances.setPipe(pipes);
-    instances.addThruPipe(new CsvIterator(reader2, "(\\S+)\\s+(\\S+)\\s+(.*)", 3, 2, 1));
-    ClassifierTrainer trainer2 = new MaxEntTrainer();
-    Classifier classifier2 = trainer2.train(instances);
-  }
-  
   public static void main(String[] args) throws IOException, ClassNotFoundException {
     LexicalSharpener ts = new LexicalSharpener(null, args, null);
     
@@ -329,14 +240,11 @@ public class LexicalSharpener extends StatelessFF {
       System.err.println("Training model from file " + dataFile);
       ts.trainAll(dataFile);
     
-//      if (args.length > 1)
-//        modelFile = args[1];
-//      
-//      System.err.println("Writing model to file " + modelFile); 
-//      ts.saveClassifiers(modelFile);
-//    } else {
-//      System.err.println("Loading model from file " + modelFile);
-//      ts.loadClassifiers(modelFile);
+      if (args.length > 1)
+        modelFile = args[1];
+      
+      System.err.println("Writing model to file " + modelFile); 
+      ts.saveClassifiers(modelFile);
     }
     
     Scanner stdin = new Scanner(System.in);

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/68b01bc1/src/joshua/decoder/ff/MalletPredictor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/MalletPredictor.java b/src/joshua/decoder/ff/MalletPredictor.java
new file mode 100644
index 0000000..04c9d8c
--- /dev/null
+++ b/src/joshua/decoder/ff/MalletPredictor.java
@@ -0,0 +1,97 @@
+package joshua.decoder.ff;
+
+import java.io.Serializable;
+import java.io.StringReader;
+import java.util.ArrayList;
+
+import cc.mallet.classify.Classification;
+import cc.mallet.classify.Classifier;
+import cc.mallet.classify.ClassifierTrainer;
+import cc.mallet.classify.MaxEntTrainer;
+import cc.mallet.pipe.Pipe;
+import cc.mallet.pipe.SerialPipes;
+import cc.mallet.pipe.SvmLight2FeatureVectorAndLabel;
+import cc.mallet.pipe.Target2Label;
+import cc.mallet.pipe.iterator.CsvIterator;
+import cc.mallet.types.Instance;
+import cc.mallet.types.InstanceList;
+import joshua.decoder.Decoder;
+
+public class MalletPredictor implements Serializable {
+    
+    private static final long serialVersionUID = 1L;
+
+    private SerialPipes pipes = null;
+    private InstanceList instances = null;
+    private String sourceWord = null;
+    private String examples = null;
+    private Classifier classifier = null;
+    
+    public MalletPredictor(String word, String examples) {
+      this.sourceWord = word;
+      this.examples = examples;
+      ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
+
+      // I don't know if this is needed
+      pipeList.add(new Target2Label());
+      // Convert custom lines to Instance objects (svmLight2FeatureVectorAndLabel not versatile enough)
+      pipeList.add(new SvmLight2FeatureVectorAndLabel());
+      // Validation
+//      pipeList.add(new PrintInputAndTarget());
+      
+      // name: english word
+      // data: features (FeatureVector)
+      // target: foreign inflection
+      // source: null
+
+      pipes = new SerialPipes(pipeList);
+      instances = new InstanceList(pipes);
+    }
+
+    /**
+       * Returns a Classification object a list of features. Uses "which" to determine which classifier
+       * to use.
+       *   
+       * @param which the classifier to use
+       * @param features the set of features
+       * @return
+       */
+    public Classification predict(String outcome, String features) {
+      Instance instance = new Instance(features, outcome, null, null);
+//      SYSTEM.ERR.PRINTLN("PREDICT TARGETWORD = " + (STRING) INSTANCE.GETTARGET());
+//      SYSTEM.ERR.PRINTLN("PREDICT FEATURES = " + (STRING) INSTANCE.GETDATA());
+
+      if (classifier == null)
+        train();
+
+      Classification result = (Classification) classifier.classify(pipes.instanceFrom(instance));
+      return result;
+    }
+
+    public void train() {
+      Decoder.LOG(2, String.format("Word %s: training model from %d examples", 
+          sourceWord, examples.split("\\n").length));
+      
+      StringReader reader = new StringReader(examples);
+
+      // Constructs an instance with everything shoved into the data field
+      instances.addThruPipe(new CsvIterator(reader, "(\\S+)\\s+(.*)", 2, -1, 1));
+
+      ClassifierTrainer trainer = new MaxEntTrainer();
+      classifier = trainer.train(instances);
+      
+//      Decoder.LOG(1, String.format("%s: Trained a model for %s with %d outcomes", 
+//          name, sourceWord, pipes.getTargetAlphabet().size()));
+    }
+
+    /**
+     * Returns the number of distinct outcomes. Requires the model to have been trained!
+     * 
+     * @return
+     */
+    public int getNumOutcomes() {
+      if (classifier == null)
+        train();
+      return pipes.getTargetAlphabet().size();
+    }
+  }
\ No newline at end of file


[18/18] incubator-joshua git commit: Merge branch 'master' into morph

Posted by mj...@apache.org.
Merge branch 'master' into morph


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/00eaf716
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/00eaf716
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/00eaf716

Branch: refs/heads/morph
Commit: 00eaf71682f3339da7d1c21e2a6a6110b98bbbd8
Parents: bb3b79c f2ae904
Author: Matt Post <po...@cs.jhu.edu>
Authored: Sun Apr 24 15:40:27 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Sun Apr 24 15:40:27 2016 -0400

----------------------------------------------------------------------
 scripts/training/pipeline.pl                    |   1 +
 src/joshua/decoder/JoshuaConfiguration.java     |   7 +
 .../decoder/hypergraph/KBestExtractor.java      |  37 ++++-
 .../hypergraph/WordAlignmentExtractor.java      |   2 -
 .../decoder/hypergraph/WordAlignmentState.java  |   1 -
 src/joshua/decoder/segment_file/Sentence.java   |   8 +-
 src/joshua/decoder/segment_file/Token.java      |  28 +++-
 src/joshua/lattice/Lattice.java                 |  31 ++--
 src/joshua/util/FormatUtils.java                |  19 +++
 test/decoder/lowercaser/config                  | 140 +++++++++++++++++++
 test/decoder/lowercaser/grammar.glue            |   4 +
 test/decoder/lowercaser/grammar.test            |   1 +
 test/decoder/lowercaser/output.gold             |   3 +
 test/decoder/lowercaser/test.sh                 |  18 +++
 14 files changed, 273 insertions(+), 27 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/00eaf716/scripts/training/pipeline.pl
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/00eaf716/src/joshua/decoder/segment_file/Token.java
----------------------------------------------------------------------
diff --cc src/joshua/decoder/segment_file/Token.java
index 9dcec22,ebe9a43..655b536
--- a/src/joshua/decoder/segment_file/Token.java
+++ b/src/joshua/decoder/segment_file/Token.java
@@@ -36,7 -39,7 +39,8 @@@ public class Token 
    private int tokenID;
  
    private HashMap<String,String> annotations = null;
 +  private String annotationString;
+   private JoshuaConfiguration joshuaConfiguration;
  
    /**
     * Constructor : Creates a Token object from a raw word
@@@ -59,10 -62,11 +63,12 @@@
     * @param rawWord A word with annotation information (possibly)
     *  
     */
-   public Token(String rawWord) {
+   public Token(String rawWord, JoshuaConfiguration config) {
+     
+     this.joshuaConfiguration = config;
      
      annotations = new HashMap<String,String>();
 +    annotationString = "";
      
      // Matches a word with an annotation
      // Check guidelines in constructor description
@@@ -123,11 -143,4 +145,11 @@@
      
      return null;
    }
 -}
 +  
 +  /**
 +   * Returns the raw annotation string
 +   */
 +  public String getAnnotationString() {
 +    return annotationString;
 +  }
- }
++}


[08/18] incubator-joshua git commit: pruning out predictors with too many outcomes

Posted by mj...@apache.org.
pruning out predictors with too many outcomes


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/8b59b99d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/8b59b99d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/8b59b99d

Branch: refs/heads/morph
Commit: 8b59b99d8efa3b65ceb258ea3c65dc17534accea
Parents: dc6b411
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 21:52:35 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 21:52:35 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/ff/LexicalSharpener.java | 25 +++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8b59b99d/src/joshua/decoder/ff/LexicalSharpener.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/LexicalSharpener.java b/src/joshua/decoder/ff/LexicalSharpener.java
index 8c30431..6207ac0 100644
--- a/src/joshua/decoder/ff/LexicalSharpener.java
+++ b/src/joshua/decoder/ff/LexicalSharpener.java
@@ -24,9 +24,11 @@ import java.io.ObjectInputStream;
 import java.io.ObjectOutputStream;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Scanner;
+import java.util.Set;
 
 import cc.mallet.classify.*;
 import cc.mallet.types.Labeling;
@@ -89,17 +91,22 @@ public class LexicalSharpener extends StatelessFF {
   
     String lastSourceWord = null;
     ArrayList<String> examples = new ArrayList<String>();
+    HashMap<String,Integer> targets = new HashMap<String,Integer>();
     int linesRead = 0;
     for (String line : lineReader) {
-      String sourceWord = line.substring(0, line.indexOf(' '));
+      String[] tokens = line.split("\\s+", 3);
+      String sourceWord = tokens[0];
+      String targetWord = tokens[1];
 
       if (lastSourceWord != null && ! sourceWord.equals(lastSourceWord)) {
-        classifiers.put(lastSourceWord, new MalletPredictor(lastSourceWord, examples));
+        classifiers.put(lastSourceWord, createClassifier(lastSourceWord, targets, examples));
 //                System.err.println(String.format("WORD %s:\n%s\n", lastSourceWord, examples));
         examples = new ArrayList<String>();
+        targets = new HashMap<String,Integer>();
       }
   
       examples.add(line);
+      targets.put(targetWord, targets.getOrDefault(targetWord, 0));
       lastSourceWord = sourceWord;
       linesRead++;
     }
@@ -108,15 +115,23 @@ public class LexicalSharpener extends StatelessFF {
     System.err.println(String.format("Read %d lines from training file", linesRead));
   }
 
+  private MalletPredictor createClassifier(String lastSourceWord, HashMap<String, Integer> counts,
+      ArrayList<String> examples) {
+    
+    int numExamples = examples.size();
+    
+    if (examples.size() < 75)
+      return new MalletPredictor(lastSourceWord, examples);
+    
+    return null;
+  }
+
   public void loadClassifiers(String modelFile) throws ClassNotFoundException, IOException {
     ObjectInputStream ois = new ObjectInputStream(new FileInputStream(modelFile));
     classifiers = (HashMap<String,MalletPredictor>) ois.readObject();
     ois.close();
     
     System.err.println(String.format("Loaded model with %d keys", classifiers.keySet().size()));
-    for (String key: classifiers.keySet()) {
-      System.err.println("  " + key);
-    }
   }
 
   public void saveClassifiers(String modelFile) throws FileNotFoundException, IOException {


[06/18] incubator-joshua git commit: huge efficiency fix in reading in the data

Posted by mj...@apache.org.
huge efficiency fix in reading in the data


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/155249f9
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/155249f9
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/155249f9

Branch: refs/heads/morph
Commit: 155249f9d0f5c00ea2c7d70917c94b06df401e33
Parents: 68b01bc
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 16:52:42 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 16:52:42 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/ff/LexicalSharpener.java | 10 +++---
 src/joshua/decoder/ff/MalletPredictor.java  | 45 +++++++++++++-----------
 2 files changed, 31 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/155249f9/src/joshua/decoder/ff/LexicalSharpener.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/LexicalSharpener.java b/src/joshua/decoder/ff/LexicalSharpener.java
index 8671d57..8c30431 100644
--- a/src/joshua/decoder/ff/LexicalSharpener.java
+++ b/src/joshua/decoder/ff/LexicalSharpener.java
@@ -22,6 +22,7 @@ import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.ObjectInputStream;
 import java.io.ObjectOutputStream;
+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -87,17 +88,18 @@ public class LexicalSharpener extends StatelessFF {
     }
   
     String lastSourceWord = null;
-    String examples = "";
+    ArrayList<String> examples = new ArrayList<String>();
     int linesRead = 0;
     for (String line : lineReader) {
       String sourceWord = line.substring(0, line.indexOf(' '));
+
       if (lastSourceWord != null && ! sourceWord.equals(lastSourceWord)) {
         classifiers.put(lastSourceWord, new MalletPredictor(lastSourceWord, examples));
-        //        System.err.println(String.format("WORD %s:\n%s\n", lastOutcome, buffer));
-        examples = "";
+//                System.err.println(String.format("WORD %s:\n%s\n", lastSourceWord, examples));
+        examples = new ArrayList<String>();
       }
   
-      examples += line + "\n";
+      examples.add(line);
       lastSourceWord = sourceWord;
       linesRead++;
     }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/155249f9/src/joshua/decoder/ff/MalletPredictor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/MalletPredictor.java b/src/joshua/decoder/ff/MalletPredictor.java
index 04c9d8c..f200551 100644
--- a/src/joshua/decoder/ff/MalletPredictor.java
+++ b/src/joshua/decoder/ff/MalletPredictor.java
@@ -24,28 +24,12 @@ public class MalletPredictor implements Serializable {
     private SerialPipes pipes = null;
     private InstanceList instances = null;
     private String sourceWord = null;
-    private String examples = null;
+    private ArrayList<String> examples = null;
     private Classifier classifier = null;
     
-    public MalletPredictor(String word, String examples) {
+    public MalletPredictor(String word, ArrayList<String> examples) {
       this.sourceWord = word;
       this.examples = examples;
-      ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
-
-      // I don't know if this is needed
-      pipeList.add(new Target2Label());
-      // Convert custom lines to Instance objects (svmLight2FeatureVectorAndLabel not versatile enough)
-      pipeList.add(new SvmLight2FeatureVectorAndLabel());
-      // Validation
-//      pipeList.add(new PrintInputAndTarget());
-      
-      // name: english word
-      // data: features (FeatureVector)
-      // target: foreign inflection
-      // source: null
-
-      pipes = new SerialPipes(pipeList);
-      instances = new InstanceList(pipes);
     }
 
     /**
@@ -70,9 +54,30 @@ public class MalletPredictor implements Serializable {
 
     public void train() {
       Decoder.LOG(2, String.format("Word %s: training model from %d examples", 
-          sourceWord, examples.split("\\n").length));
+          sourceWord, examples.size()));
+      
+      ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
+
+      // I don't know if this is needed
+      pipeList.add(new Target2Label());
+      // Convert custom lines to Instance objects (svmLight2FeatureVectorAndLabel not versatile enough)
+      pipeList.add(new SvmLight2FeatureVectorAndLabel());
+      // Validation
+//      pipeList.add(new PrintInputAndTarget());
+      
+      // name: english word
+      // data: features (FeatureVector)
+      // target: foreign inflection
+      // source: null
+
+      pipes = new SerialPipes(pipeList);
+      instances = new InstanceList(pipes);
       
-      StringReader reader = new StringReader(examples);
+      /* I know, this is *terrible*, but I need it to work *now* */
+      String exampleList = "";
+      for (String example: examples)
+        exampleList += example + "\n";
+      StringReader reader = new StringReader(exampleList);
 
       // Constructs an instance with everything shoved into the data field
       instances.addThruPipe(new CsvIterator(reader, "(\\S+)\\s+(.*)", 2, -1, 1));


[03/18] incubator-joshua git commit: Merge branch 'master' into morph

Posted by mj...@apache.org.
Merge branch 'master' into morph


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/71f808e5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/71f808e5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/71f808e5

Branch: refs/heads/morph
Commit: 71f808e56b175363b10b5e17d1d6f1edc802f6d6
Parents: a330afe b7f2310
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 10:45:27 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 10:45:27 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/ff/lm/LanguageModelFF.java | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)
----------------------------------------------------------------------



[07/18] incubator-joshua git commit: added training script

Posted by mj...@apache.org.
added training script


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/dc6b4112
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/dc6b4112
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/dc6b4112

Branch: refs/heads/morph
Commit: dc6b41129d21dd647dbf20919c4093b4495f80fb
Parents: 155249f
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 19:09:39 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 19:09:39 2016 -0400

----------------------------------------------------------------------
 scripts/morph/train-mallet.sh | 13 +++++++++++++
 1 file changed, 13 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc6b4112/scripts/morph/train-mallet.sh
----------------------------------------------------------------------
diff --git a/scripts/morph/train-mallet.sh b/scripts/morph/train-mallet.sh
new file mode 100644
index 0000000..cfc7802
--- /dev/null
+++ b/scripts/morph/train-mallet.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Trains a mallet model on source-annotated data of the form
+#
+# source_word target_word feat:val feat:val feat:val
+
+if [[ -z $2 ]]; then
+  echo "Usage: train-mallet.sh DATA_FILE MODEL_FILE"
+  echo "This will read data from DATA_FILE and serialize the models to MODEL_FILE"
+  exit
+fi
+
+java -mx16g -cp $JOSHUA/lib/mallet-2.0.7.jar:$JOSHUA/lib/trove4j-2.0.2.jar:$JOSHUA/class joshua.decoder.ff.LexicalSharpener $1 $2


[12/18] incubator-joshua git commit: moved files that were strangely under joshua-6

Posted by mj...@apache.org.
moved files that were strangely under joshua-6


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/bc83a1a6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/bc83a1a6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/bc83a1a6

Branch: refs/heads/morph
Commit: bc83a1a6d31bc034ec546f79ed00cc5598349c69
Parents: b7f2310
Author: Matt Post <po...@cs.jhu.edu>
Authored: Sat Apr 23 11:37:19 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Sat Apr 23 11:37:19 2016 -0400

----------------------------------------------------------------------
 .../joshua/decoder/StructuredTranslation.java   | 143 -------------------
 .../ViterbiFeatureVectorWalkerFunction.java     |  44 ------
 .../ViterbiOutputStringWalkerFunction.java      |  96 -------------
 src/joshua/decoder/StructuredTranslation.java   | 143 +++++++++++++++++++
 .../ViterbiFeatureVectorWalkerFunction.java     |  44 ++++++
 .../ViterbiOutputStringWalkerFunction.java      |  96 +++++++++++++
 6 files changed, 283 insertions(+), 283 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/bc83a1a6/joshua-6/src/joshua/decoder/StructuredTranslation.java
----------------------------------------------------------------------
diff --git a/joshua-6/src/joshua/decoder/StructuredTranslation.java b/joshua-6/src/joshua/decoder/StructuredTranslation.java
deleted file mode 100644
index 1939ea0..0000000
--- a/joshua-6/src/joshua/decoder/StructuredTranslation.java
+++ /dev/null
@@ -1,143 +0,0 @@
-package joshua.decoder;
-
-import static java.util.Arrays.asList;
-import static java.util.Collections.emptyList;
-import static java.util.Collections.emptyMap;
-import static joshua.decoder.hypergraph.ViterbiExtractor.walk;
-
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import joshua.decoder.ff.FeatureFunction;
-import joshua.decoder.hypergraph.HyperGraph;
-import joshua.decoder.hypergraph.ViterbiFeatureVectorWalkerFunction;
-import joshua.decoder.hypergraph.ViterbiOutputStringWalkerFunction;
-import joshua.decoder.hypergraph.WalkerFunction;
-import joshua.decoder.hypergraph.WordAlignmentExtractor;
-import joshua.decoder.segment_file.Sentence;
-
-/**
- * structuredTranslation provides a more structured access to translation
- * results than the Translation class.
- * Members of instances of this class can be used upstream.
- * <br/>
- * TODO:
- * Enable K-Best extraction.
- * 
- * @author fhieber
- */
-public class StructuredTranslation {
-  
-  private final Sentence sourceSentence;
-  private final List<FeatureFunction> featureFunctions;
-  
-  private final String translationString;
-  private final List<String> translationTokens;
-  private final float translationScore;
-  private List<List<Integer>> translationWordAlignments;
-  private Map<String,Float> translationFeatures;
-  private final float extractionTime;
-  
-  public StructuredTranslation(final Sentence sourceSentence,
-      final HyperGraph hypergraph,
-      final List<FeatureFunction> featureFunctions) {
-    
-      final long startTime = System.currentTimeMillis();
-      
-      this.sourceSentence = sourceSentence;
-      this.featureFunctions = featureFunctions;
-      this.translationString = extractViterbiString(hypergraph);
-      this.translationTokens = extractTranslationTokens();
-      this.translationScore = extractTranslationScore(hypergraph);
-      this.translationFeatures = extractViterbiFeatures(hypergraph);
-      this.translationWordAlignments = extractViterbiWordAlignment(hypergraph);
-      this.extractionTime = (System.currentTimeMillis() - startTime) / 1000.0f;
-  }
-  
-  private Map<String,Float> extractViterbiFeatures(final HyperGraph hypergraph) {
-    if (hypergraph == null) {
-      return emptyMap(); 
-    } else {
-      ViterbiFeatureVectorWalkerFunction viterbiFeatureVectorWalker = new ViterbiFeatureVectorWalkerFunction(featureFunctions, sourceSentence);
-      walk(hypergraph.goalNode, viterbiFeatureVectorWalker);
-      return new HashMap<String,Float>(viterbiFeatureVectorWalker.getFeaturesMap());
-    }
-  }
-
-  private List<List<Integer>> extractViterbiWordAlignment(final HyperGraph hypergraph) {
-    if (hypergraph == null) {
-      return emptyList();
-    } else {
-      final WordAlignmentExtractor wordAlignmentWalker = new WordAlignmentExtractor();
-      walk(hypergraph.goalNode, wordAlignmentWalker);
-      return wordAlignmentWalker.getFinalWordAlignments();
-    }
-  }
-  
-  private float extractTranslationScore(final HyperGraph hypergraph) {
-    if (hypergraph == null) {
-      return 0;
-    } else {
-      return hypergraph.goalNode.getScore();
-    }
-  }
-  
-  private String extractViterbiString(final HyperGraph hypergraph) {
-    if (hypergraph == null) {
-      return sourceSentence.source();
-    } else {
-      final WalkerFunction viterbiOutputStringWalker = new ViterbiOutputStringWalkerFunction();
-      walk(hypergraph.goalNode, viterbiOutputStringWalker);
-      return viterbiOutputStringWalker.toString();
-    }
-  }
-  
-  private List<String> extractTranslationTokens() {
-    if (translationString.isEmpty()) {
-      return emptyList();
-    } else {
-      return asList(translationString.split("\\s+"));
-    }
-  }
-  
-  // Getters to use upstream
-  
-  public Sentence getSourceSentence() {
-    return sourceSentence;
-  }
-
-  public int getSentenceId() {
-    return sourceSentence.id();
-  }
-
-  public String getTranslationString() {
-    return translationString;
-  }
-
-  public List<String> getTranslationTokens() {
-    return translationTokens;
-  }
-
-  public float getTranslationScore() {
-    return translationScore;
-  }
-
-  /**
-   * Returns a list of target to source alignments.
-   */
-  public List<List<Integer>> getTranslationWordAlignments() {
-    return translationWordAlignments;
-  }
-  
-  public Map<String,Float> getTranslationFeatures() {
-    return translationFeatures;
-  }
-  
-  /**
-   * Time taken to build output information from the hypergraph.
-   */
-  public Float getExtractionTime() {
-    return extractionTime;
-  }
-}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/bc83a1a6/joshua-6/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
----------------------------------------------------------------------
diff --git a/joshua-6/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java b/joshua-6/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
deleted file mode 100644
index 5af6c4d..0000000
--- a/joshua-6/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
+++ /dev/null
@@ -1,44 +0,0 @@
-package joshua.decoder.hypergraph;
-
-import static joshua.decoder.chart_parser.ComputeNodeResult.computeTransitionFeatures;
-
-import java.util.List;
-import java.util.Map;
-
-import joshua.decoder.ff.FeatureFunction;
-import joshua.decoder.ff.FeatureVector;
-import joshua.decoder.segment_file.Sentence;
-
-public class ViterbiFeatureVectorWalkerFunction implements WalkerFunction {
-  
-  private final FeatureVector features;
-  private final List<FeatureFunction> featureFunctions;
-  private final Sentence sourceSentence;
-  
-  public ViterbiFeatureVectorWalkerFunction(
-      final List<FeatureFunction> featureFunctions,
-      final Sentence sourceSentence) {
-    this.features = new FeatureVector();
-    this.featureFunctions = featureFunctions;
-    this.sourceSentence = sourceSentence;
-  }
-
-  /**
-   * Recompute feature values for each Viterbi edge and add to features.
-   */
-  @Override
-  public void apply(HGNode node) {
-    final FeatureVector edgeFeatures = computeTransitionFeatures(
-        featureFunctions, node.bestHyperedge, node.i, node.j, sourceSentence);
-    features.add(edgeFeatures);
-  }
-  
-  public FeatureVector getFeatures() {
-    return features;
-  }
-  
-  public Map<String,Float> getFeaturesMap() {
-    return features.getMap();
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/bc83a1a6/joshua-6/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
----------------------------------------------------------------------
diff --git a/joshua-6/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java b/joshua-6/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
deleted file mode 100644
index 0c84375..0000000
--- a/joshua-6/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
+++ /dev/null
@@ -1,96 +0,0 @@
-package joshua.decoder.hypergraph;
-
-import static java.lang.Integer.MAX_VALUE;
-import static joshua.corpus.Vocabulary.getWords;
-import static joshua.corpus.Vocabulary.nt;
-
-import java.util.Stack;
-
-import joshua.decoder.ff.tm.Rule;
-
-public class ViterbiOutputStringWalkerFunction implements WalkerFunction {
-  
-  private Stack<int[]> viterbiWords = new Stack<int[]>();
-
-  @Override
-  public void apply(HGNode node) {
-    final Rule rule = node.bestHyperedge.getRule();
-    if (rule != null) {
-      merge(rule.getEnglish());
-    }
-  }
-  
-  private boolean containsNonTerminals(final int[] ids) {
-    boolean hasNonTerminals = false;
-    for (int i = 0; i < ids.length; i++) {
-      if (nt(ids[i])) {
-        hasNonTerminals = true;
-        break;
-      }
-    }
-    return hasNonTerminals;
-  }
-  
-  /**
-   * Returns the index of the next non-terminal slot to fill.
-   * Since non-terminals in right hand sides of rules are indexed by
-   * their order on the source side, this function looks for the largest
-   * negative id in ids and returns its index. 
-   */
-  private int getNextNonTerminalIndexToFill(final int[] ids) {
-    int nextIndex = 0;
-    int nextNonTerminal = -MAX_VALUE;
-    for (int i = 0; i < ids.length; i++) {
-      if (nt(ids[i]) && ids[i] > nextNonTerminal) {
-        nextIndex = i;
-        nextNonTerminal = ids[i];
-      }
-    }
-    return nextIndex;
-  }
-  
-  private int[] substituteNonTerminal(final int[] parentWords, final int[] childWords) {
-    final int ntIndex = getNextNonTerminalIndexToFill(parentWords);
-    final int[] result = new int[parentWords.length + childWords.length - 1];
-    int resultIndex = 0;
-    for (int i = 0; i < ntIndex; i++) {
-      result[resultIndex++] = parentWords[i];
-    }
-    for (int i = 0; i < childWords.length; i++) {
-      result[resultIndex++] = childWords[i];
-    }
-    for (int i = ntIndex + 1; i < parentWords.length; i++) {
-      result[resultIndex++] = parentWords[i];
-    }
-    return result;
-  }
-
-  private void merge(final int[] words) {
-    if (!containsNonTerminals(words)
-        && !viterbiWords.isEmpty()
-        && containsNonTerminals(viterbiWords.peek())) {
-      merge(substituteNonTerminal(viterbiWords.pop(), words));
-    } else {
-      viterbiWords.add(words);
-    }
-  }
-  
-  @Override
-  public String toString() {
-    if (viterbiWords.isEmpty()) {
-      return "";
-    }
-    
-    if (viterbiWords.size() != 1) {
-      throw new RuntimeException(
-          String.format(
-              "Stack of ViterbiOutputStringWalker should contain only a single (last) element, but was size %d", viterbiWords.size()));
-    }
-    
-    String result = getWords(viterbiWords.peek());
-    // strip of sentence markers (<s>,</s>)
-    result = result.substring(result.indexOf(' ') + 1, result.lastIndexOf(' '));
-    return result.trim();
-  }
-  
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/bc83a1a6/src/joshua/decoder/StructuredTranslation.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/StructuredTranslation.java b/src/joshua/decoder/StructuredTranslation.java
new file mode 100644
index 0000000..1939ea0
--- /dev/null
+++ b/src/joshua/decoder/StructuredTranslation.java
@@ -0,0 +1,143 @@
+package joshua.decoder;
+
+import static java.util.Arrays.asList;
+import static java.util.Collections.emptyList;
+import static java.util.Collections.emptyMap;
+import static joshua.decoder.hypergraph.ViterbiExtractor.walk;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import joshua.decoder.ff.FeatureFunction;
+import joshua.decoder.hypergraph.HyperGraph;
+import joshua.decoder.hypergraph.ViterbiFeatureVectorWalkerFunction;
+import joshua.decoder.hypergraph.ViterbiOutputStringWalkerFunction;
+import joshua.decoder.hypergraph.WalkerFunction;
+import joshua.decoder.hypergraph.WordAlignmentExtractor;
+import joshua.decoder.segment_file.Sentence;
+
+/**
+ * structuredTranslation provides a more structured access to translation
+ * results than the Translation class.
+ * Members of instances of this class can be used upstream.
+ * <br/>
+ * TODO:
+ * Enable K-Best extraction.
+ * 
+ * @author fhieber
+ */
+public class StructuredTranslation {
+  
+  private final Sentence sourceSentence;
+  private final List<FeatureFunction> featureFunctions;
+  
+  private final String translationString;
+  private final List<String> translationTokens;
+  private final float translationScore;
+  private List<List<Integer>> translationWordAlignments;
+  private Map<String,Float> translationFeatures;
+  private final float extractionTime;
+  
+  public StructuredTranslation(final Sentence sourceSentence,
+      final HyperGraph hypergraph,
+      final List<FeatureFunction> featureFunctions) {
+    
+      final long startTime = System.currentTimeMillis();
+      
+      this.sourceSentence = sourceSentence;
+      this.featureFunctions = featureFunctions;
+      this.translationString = extractViterbiString(hypergraph);
+      this.translationTokens = extractTranslationTokens();
+      this.translationScore = extractTranslationScore(hypergraph);
+      this.translationFeatures = extractViterbiFeatures(hypergraph);
+      this.translationWordAlignments = extractViterbiWordAlignment(hypergraph);
+      this.extractionTime = (System.currentTimeMillis() - startTime) / 1000.0f;
+  }
+  
+  private Map<String,Float> extractViterbiFeatures(final HyperGraph hypergraph) {
+    if (hypergraph == null) {
+      return emptyMap(); 
+    } else {
+      ViterbiFeatureVectorWalkerFunction viterbiFeatureVectorWalker = new ViterbiFeatureVectorWalkerFunction(featureFunctions, sourceSentence);
+      walk(hypergraph.goalNode, viterbiFeatureVectorWalker);
+      return new HashMap<String,Float>(viterbiFeatureVectorWalker.getFeaturesMap());
+    }
+  }
+
+  private List<List<Integer>> extractViterbiWordAlignment(final HyperGraph hypergraph) {
+    if (hypergraph == null) {
+      return emptyList();
+    } else {
+      final WordAlignmentExtractor wordAlignmentWalker = new WordAlignmentExtractor();
+      walk(hypergraph.goalNode, wordAlignmentWalker);
+      return wordAlignmentWalker.getFinalWordAlignments();
+    }
+  }
+  
+  private float extractTranslationScore(final HyperGraph hypergraph) {
+    if (hypergraph == null) {
+      return 0;
+    } else {
+      return hypergraph.goalNode.getScore();
+    }
+  }
+  
+  private String extractViterbiString(final HyperGraph hypergraph) {
+    if (hypergraph == null) {
+      return sourceSentence.source();
+    } else {
+      final WalkerFunction viterbiOutputStringWalker = new ViterbiOutputStringWalkerFunction();
+      walk(hypergraph.goalNode, viterbiOutputStringWalker);
+      return viterbiOutputStringWalker.toString();
+    }
+  }
+  
+  private List<String> extractTranslationTokens() {
+    if (translationString.isEmpty()) {
+      return emptyList();
+    } else {
+      return asList(translationString.split("\\s+"));
+    }
+  }
+  
+  // Getters to use upstream
+  
+  public Sentence getSourceSentence() {
+    return sourceSentence;
+  }
+
+  public int getSentenceId() {
+    return sourceSentence.id();
+  }
+
+  public String getTranslationString() {
+    return translationString;
+  }
+
+  public List<String> getTranslationTokens() {
+    return translationTokens;
+  }
+
+  public float getTranslationScore() {
+    return translationScore;
+  }
+
+  /**
+   * Returns a list of target to source alignments.
+   */
+  public List<List<Integer>> getTranslationWordAlignments() {
+    return translationWordAlignments;
+  }
+  
+  public Map<String,Float> getTranslationFeatures() {
+    return translationFeatures;
+  }
+  
+  /**
+   * Time taken to build output information from the hypergraph.
+   */
+  public Float getExtractionTime() {
+    return extractionTime;
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/bc83a1a6/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java b/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
new file mode 100644
index 0000000..5af6c4d
--- /dev/null
+++ b/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
@@ -0,0 +1,44 @@
+package joshua.decoder.hypergraph;
+
+import static joshua.decoder.chart_parser.ComputeNodeResult.computeTransitionFeatures;
+
+import java.util.List;
+import java.util.Map;
+
+import joshua.decoder.ff.FeatureFunction;
+import joshua.decoder.ff.FeatureVector;
+import joshua.decoder.segment_file.Sentence;
+
+public class ViterbiFeatureVectorWalkerFunction implements WalkerFunction {
+  
+  private final FeatureVector features;
+  private final List<FeatureFunction> featureFunctions;
+  private final Sentence sourceSentence;
+  
+  public ViterbiFeatureVectorWalkerFunction(
+      final List<FeatureFunction> featureFunctions,
+      final Sentence sourceSentence) {
+    this.features = new FeatureVector();
+    this.featureFunctions = featureFunctions;
+    this.sourceSentence = sourceSentence;
+  }
+
+  /**
+   * Recompute feature values for each Viterbi edge and add to features.
+   */
+  @Override
+  public void apply(HGNode node) {
+    final FeatureVector edgeFeatures = computeTransitionFeatures(
+        featureFunctions, node.bestHyperedge, node.i, node.j, sourceSentence);
+    features.add(edgeFeatures);
+  }
+  
+  public FeatureVector getFeatures() {
+    return features;
+  }
+  
+  public Map<String,Float> getFeaturesMap() {
+    return features.getMap();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/bc83a1a6/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java b/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
new file mode 100644
index 0000000..0c84375
--- /dev/null
+++ b/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
@@ -0,0 +1,96 @@
+package joshua.decoder.hypergraph;
+
+import static java.lang.Integer.MAX_VALUE;
+import static joshua.corpus.Vocabulary.getWords;
+import static joshua.corpus.Vocabulary.nt;
+
+import java.util.Stack;
+
+import joshua.decoder.ff.tm.Rule;
+
+public class ViterbiOutputStringWalkerFunction implements WalkerFunction {
+  
+  private Stack<int[]> viterbiWords = new Stack<int[]>();
+
+  @Override
+  public void apply(HGNode node) {
+    final Rule rule = node.bestHyperedge.getRule();
+    if (rule != null) {
+      merge(rule.getEnglish());
+    }
+  }
+  
+  private boolean containsNonTerminals(final int[] ids) {
+    boolean hasNonTerminals = false;
+    for (int i = 0; i < ids.length; i++) {
+      if (nt(ids[i])) {
+        hasNonTerminals = true;
+        break;
+      }
+    }
+    return hasNonTerminals;
+  }
+  
+  /**
+   * Returns the index of the next non-terminal slot to fill.
+   * Since non-terminals in right hand sides of rules are indexed by
+   * their order on the source side, this function looks for the largest
+   * negative id in ids and returns its index. 
+   */
+  private int getNextNonTerminalIndexToFill(final int[] ids) {
+    int nextIndex = 0;
+    int nextNonTerminal = -MAX_VALUE;
+    for (int i = 0; i < ids.length; i++) {
+      if (nt(ids[i]) && ids[i] > nextNonTerminal) {
+        nextIndex = i;
+        nextNonTerminal = ids[i];
+      }
+    }
+    return nextIndex;
+  }
+  
+  private int[] substituteNonTerminal(final int[] parentWords, final int[] childWords) {
+    final int ntIndex = getNextNonTerminalIndexToFill(parentWords);
+    final int[] result = new int[parentWords.length + childWords.length - 1];
+    int resultIndex = 0;
+    for (int i = 0; i < ntIndex; i++) {
+      result[resultIndex++] = parentWords[i];
+    }
+    for (int i = 0; i < childWords.length; i++) {
+      result[resultIndex++] = childWords[i];
+    }
+    for (int i = ntIndex + 1; i < parentWords.length; i++) {
+      result[resultIndex++] = parentWords[i];
+    }
+    return result;
+  }
+
+  private void merge(final int[] words) {
+    if (!containsNonTerminals(words)
+        && !viterbiWords.isEmpty()
+        && containsNonTerminals(viterbiWords.peek())) {
+      merge(substituteNonTerminal(viterbiWords.pop(), words));
+    } else {
+      viterbiWords.add(words);
+    }
+  }
+  
+  @Override
+  public String toString() {
+    if (viterbiWords.isEmpty()) {
+      return "";
+    }
+    
+    if (viterbiWords.size() != 1) {
+      throw new RuntimeException(
+          String.format(
+              "Stack of ViterbiOutputStringWalker should contain only a single (last) element, but was size %d", viterbiWords.size()));
+    }
+    
+    String result = getWords(viterbiWords.peek());
+    // strip of sentence markers (<s>,</s>)
+    result = result.substring(result.indexOf(' ') + 1, result.lastIndexOf(' '));
+    return result.trim();
+  }
+  
+}
\ No newline at end of file


[11/18] incubator-joshua git commit: too much stderr!

Posted by mj...@apache.org.
too much stderr!


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/2fa4b42a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/2fa4b42a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/2fa4b42a

Branch: refs/heads/morph
Commit: 2fa4b42abdfa554ae6f04c791e38f468bf6851d0
Parents: c30bddb
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 23:55:56 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 23:55:56 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/ff/LexicalSharpener.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/2fa4b42a/src/joshua/decoder/ff/LexicalSharpener.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/LexicalSharpener.java b/src/joshua/decoder/ff/LexicalSharpener.java
index b8f0c39..22662a8 100644
--- a/src/joshua/decoder/ff/LexicalSharpener.java
+++ b/src/joshua/decoder/ff/LexicalSharpener.java
@@ -166,7 +166,7 @@ public class LexicalSharpener extends StatelessFF {
       String sourceWord = Vocabulary.word(sourceToken.getWord());
       String featureString = sourceToken.getAnnotationString().replace('|', ' ');
       
-      System.err.println(String.format("%s: %s -> %s?",  name, sourceWord, targetWord));
+//      System.err.println(String.format("%s: %s -> %s?",  name, sourceWord, targetWord));
       Classification result = predict(sourceWord, targetWord, featureString);
       if (result != null) {
         Labeling labeling = result.getLabeling();


[13/18] incubator-joshua git commit: Revert "moved files that were strangely under joshua-6"

Posted by mj...@apache.org.
Revert "moved files that were strangely under joshua-6"

This reverts commit bc83a1a6d31bc034ec546f79ed00cc5598349c69.


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/4f2bec7c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/4f2bec7c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/4f2bec7c

Branch: refs/heads/morph
Commit: 4f2bec7c00803029cc4cb187fa7f567d7e6a1f22
Parents: bc83a1a
Author: Matt Post <po...@cs.jhu.edu>
Authored: Sat Apr 23 19:14:25 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Sat Apr 23 19:14:25 2016 -0400

----------------------------------------------------------------------
 .../joshua/decoder/StructuredTranslation.java   | 143 +++++++++++++++++++
 .../ViterbiFeatureVectorWalkerFunction.java     |  44 ++++++
 .../ViterbiOutputStringWalkerFunction.java      |  96 +++++++++++++
 src/joshua/decoder/StructuredTranslation.java   | 143 -------------------
 .../ViterbiFeatureVectorWalkerFunction.java     |  44 ------
 .../ViterbiOutputStringWalkerFunction.java      |  96 -------------
 6 files changed, 283 insertions(+), 283 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/4f2bec7c/joshua-6/src/joshua/decoder/StructuredTranslation.java
----------------------------------------------------------------------
diff --git a/joshua-6/src/joshua/decoder/StructuredTranslation.java b/joshua-6/src/joshua/decoder/StructuredTranslation.java
new file mode 100644
index 0000000..1939ea0
--- /dev/null
+++ b/joshua-6/src/joshua/decoder/StructuredTranslation.java
@@ -0,0 +1,143 @@
+package joshua.decoder;
+
+import static java.util.Arrays.asList;
+import static java.util.Collections.emptyList;
+import static java.util.Collections.emptyMap;
+import static joshua.decoder.hypergraph.ViterbiExtractor.walk;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import joshua.decoder.ff.FeatureFunction;
+import joshua.decoder.hypergraph.HyperGraph;
+import joshua.decoder.hypergraph.ViterbiFeatureVectorWalkerFunction;
+import joshua.decoder.hypergraph.ViterbiOutputStringWalkerFunction;
+import joshua.decoder.hypergraph.WalkerFunction;
+import joshua.decoder.hypergraph.WordAlignmentExtractor;
+import joshua.decoder.segment_file.Sentence;
+
+/**
+ * structuredTranslation provides a more structured access to translation
+ * results than the Translation class.
+ * Members of instances of this class can be used upstream.
+ * <br/>
+ * TODO:
+ * Enable K-Best extraction.
+ * 
+ * @author fhieber
+ */
+public class StructuredTranslation {
+  
+  private final Sentence sourceSentence;
+  private final List<FeatureFunction> featureFunctions;
+  
+  private final String translationString;
+  private final List<String> translationTokens;
+  private final float translationScore;
+  private List<List<Integer>> translationWordAlignments;
+  private Map<String,Float> translationFeatures;
+  private final float extractionTime;
+  
+  public StructuredTranslation(final Sentence sourceSentence,
+      final HyperGraph hypergraph,
+      final List<FeatureFunction> featureFunctions) {
+    
+      final long startTime = System.currentTimeMillis();
+      
+      this.sourceSentence = sourceSentence;
+      this.featureFunctions = featureFunctions;
+      this.translationString = extractViterbiString(hypergraph);
+      this.translationTokens = extractTranslationTokens();
+      this.translationScore = extractTranslationScore(hypergraph);
+      this.translationFeatures = extractViterbiFeatures(hypergraph);
+      this.translationWordAlignments = extractViterbiWordAlignment(hypergraph);
+      this.extractionTime = (System.currentTimeMillis() - startTime) / 1000.0f;
+  }
+  
+  private Map<String,Float> extractViterbiFeatures(final HyperGraph hypergraph) {
+    if (hypergraph == null) {
+      return emptyMap(); 
+    } else {
+      ViterbiFeatureVectorWalkerFunction viterbiFeatureVectorWalker = new ViterbiFeatureVectorWalkerFunction(featureFunctions, sourceSentence);
+      walk(hypergraph.goalNode, viterbiFeatureVectorWalker);
+      return new HashMap<String,Float>(viterbiFeatureVectorWalker.getFeaturesMap());
+    }
+  }
+
+  private List<List<Integer>> extractViterbiWordAlignment(final HyperGraph hypergraph) {
+    if (hypergraph == null) {
+      return emptyList();
+    } else {
+      final WordAlignmentExtractor wordAlignmentWalker = new WordAlignmentExtractor();
+      walk(hypergraph.goalNode, wordAlignmentWalker);
+      return wordAlignmentWalker.getFinalWordAlignments();
+    }
+  }
+  
+  private float extractTranslationScore(final HyperGraph hypergraph) {
+    if (hypergraph == null) {
+      return 0;
+    } else {
+      return hypergraph.goalNode.getScore();
+    }
+  }
+  
+  private String extractViterbiString(final HyperGraph hypergraph) {
+    if (hypergraph == null) {
+      return sourceSentence.source();
+    } else {
+      final WalkerFunction viterbiOutputStringWalker = new ViterbiOutputStringWalkerFunction();
+      walk(hypergraph.goalNode, viterbiOutputStringWalker);
+      return viterbiOutputStringWalker.toString();
+    }
+  }
+  
+  private List<String> extractTranslationTokens() {
+    if (translationString.isEmpty()) {
+      return emptyList();
+    } else {
+      return asList(translationString.split("\\s+"));
+    }
+  }
+  
+  // Getters to use upstream
+  
+  public Sentence getSourceSentence() {
+    return sourceSentence;
+  }
+
+  public int getSentenceId() {
+    return sourceSentence.id();
+  }
+
+  public String getTranslationString() {
+    return translationString;
+  }
+
+  public List<String> getTranslationTokens() {
+    return translationTokens;
+  }
+
+  public float getTranslationScore() {
+    return translationScore;
+  }
+
+  /**
+   * Returns a list of target to source alignments.
+   */
+  public List<List<Integer>> getTranslationWordAlignments() {
+    return translationWordAlignments;
+  }
+  
+  public Map<String,Float> getTranslationFeatures() {
+    return translationFeatures;
+  }
+  
+  /**
+   * Time taken to build output information from the hypergraph.
+   */
+  public Float getExtractionTime() {
+    return extractionTime;
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/4f2bec7c/joshua-6/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
----------------------------------------------------------------------
diff --git a/joshua-6/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java b/joshua-6/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
new file mode 100644
index 0000000..5af6c4d
--- /dev/null
+++ b/joshua-6/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
@@ -0,0 +1,44 @@
+package joshua.decoder.hypergraph;
+
+import static joshua.decoder.chart_parser.ComputeNodeResult.computeTransitionFeatures;
+
+import java.util.List;
+import java.util.Map;
+
+import joshua.decoder.ff.FeatureFunction;
+import joshua.decoder.ff.FeatureVector;
+import joshua.decoder.segment_file.Sentence;
+
+public class ViterbiFeatureVectorWalkerFunction implements WalkerFunction {
+  
+  private final FeatureVector features;
+  private final List<FeatureFunction> featureFunctions;
+  private final Sentence sourceSentence;
+  
+  public ViterbiFeatureVectorWalkerFunction(
+      final List<FeatureFunction> featureFunctions,
+      final Sentence sourceSentence) {
+    this.features = new FeatureVector();
+    this.featureFunctions = featureFunctions;
+    this.sourceSentence = sourceSentence;
+  }
+
+  /**
+   * Recompute feature values for each Viterbi edge and add to features.
+   */
+  @Override
+  public void apply(HGNode node) {
+    final FeatureVector edgeFeatures = computeTransitionFeatures(
+        featureFunctions, node.bestHyperedge, node.i, node.j, sourceSentence);
+    features.add(edgeFeatures);
+  }
+  
+  public FeatureVector getFeatures() {
+    return features;
+  }
+  
+  public Map<String,Float> getFeaturesMap() {
+    return features.getMap();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/4f2bec7c/joshua-6/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
----------------------------------------------------------------------
diff --git a/joshua-6/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java b/joshua-6/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
new file mode 100644
index 0000000..0c84375
--- /dev/null
+++ b/joshua-6/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
@@ -0,0 +1,96 @@
+package joshua.decoder.hypergraph;
+
+import static java.lang.Integer.MAX_VALUE;
+import static joshua.corpus.Vocabulary.getWords;
+import static joshua.corpus.Vocabulary.nt;
+
+import java.util.Stack;
+
+import joshua.decoder.ff.tm.Rule;
+
+public class ViterbiOutputStringWalkerFunction implements WalkerFunction {
+  
+  private Stack<int[]> viterbiWords = new Stack<int[]>();
+
+  @Override
+  public void apply(HGNode node) {
+    final Rule rule = node.bestHyperedge.getRule();
+    if (rule != null) {
+      merge(rule.getEnglish());
+    }
+  }
+  
+  private boolean containsNonTerminals(final int[] ids) {
+    boolean hasNonTerminals = false;
+    for (int i = 0; i < ids.length; i++) {
+      if (nt(ids[i])) {
+        hasNonTerminals = true;
+        break;
+      }
+    }
+    return hasNonTerminals;
+  }
+  
+  /**
+   * Returns the index of the next non-terminal slot to fill.
+   * Since non-terminals in right hand sides of rules are indexed by
+   * their order on the source side, this function looks for the largest
+   * negative id in ids and returns its index. 
+   */
+  private int getNextNonTerminalIndexToFill(final int[] ids) {
+    int nextIndex = 0;
+    int nextNonTerminal = -MAX_VALUE;
+    for (int i = 0; i < ids.length; i++) {
+      if (nt(ids[i]) && ids[i] > nextNonTerminal) {
+        nextIndex = i;
+        nextNonTerminal = ids[i];
+      }
+    }
+    return nextIndex;
+  }
+  
+  private int[] substituteNonTerminal(final int[] parentWords, final int[] childWords) {
+    final int ntIndex = getNextNonTerminalIndexToFill(parentWords);
+    final int[] result = new int[parentWords.length + childWords.length - 1];
+    int resultIndex = 0;
+    for (int i = 0; i < ntIndex; i++) {
+      result[resultIndex++] = parentWords[i];
+    }
+    for (int i = 0; i < childWords.length; i++) {
+      result[resultIndex++] = childWords[i];
+    }
+    for (int i = ntIndex + 1; i < parentWords.length; i++) {
+      result[resultIndex++] = parentWords[i];
+    }
+    return result;
+  }
+
+  private void merge(final int[] words) {
+    if (!containsNonTerminals(words)
+        && !viterbiWords.isEmpty()
+        && containsNonTerminals(viterbiWords.peek())) {
+      merge(substituteNonTerminal(viterbiWords.pop(), words));
+    } else {
+      viterbiWords.add(words);
+    }
+  }
+  
+  @Override
+  public String toString() {
+    if (viterbiWords.isEmpty()) {
+      return "";
+    }
+    
+    if (viterbiWords.size() != 1) {
+      throw new RuntimeException(
+          String.format(
+              "Stack of ViterbiOutputStringWalker should contain only a single (last) element, but was size %d", viterbiWords.size()));
+    }
+    
+    String result = getWords(viterbiWords.peek());
+    // strip of sentence markers (<s>,</s>)
+    result = result.substring(result.indexOf(' ') + 1, result.lastIndexOf(' '));
+    return result.trim();
+  }
+  
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/4f2bec7c/src/joshua/decoder/StructuredTranslation.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/StructuredTranslation.java b/src/joshua/decoder/StructuredTranslation.java
deleted file mode 100644
index 1939ea0..0000000
--- a/src/joshua/decoder/StructuredTranslation.java
+++ /dev/null
@@ -1,143 +0,0 @@
-package joshua.decoder;
-
-import static java.util.Arrays.asList;
-import static java.util.Collections.emptyList;
-import static java.util.Collections.emptyMap;
-import static joshua.decoder.hypergraph.ViterbiExtractor.walk;
-
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import joshua.decoder.ff.FeatureFunction;
-import joshua.decoder.hypergraph.HyperGraph;
-import joshua.decoder.hypergraph.ViterbiFeatureVectorWalkerFunction;
-import joshua.decoder.hypergraph.ViterbiOutputStringWalkerFunction;
-import joshua.decoder.hypergraph.WalkerFunction;
-import joshua.decoder.hypergraph.WordAlignmentExtractor;
-import joshua.decoder.segment_file.Sentence;
-
-/**
- * structuredTranslation provides a more structured access to translation
- * results than the Translation class.
- * Members of instances of this class can be used upstream.
- * <br/>
- * TODO:
- * Enable K-Best extraction.
- * 
- * @author fhieber
- */
-public class StructuredTranslation {
-  
-  private final Sentence sourceSentence;
-  private final List<FeatureFunction> featureFunctions;
-  
-  private final String translationString;
-  private final List<String> translationTokens;
-  private final float translationScore;
-  private List<List<Integer>> translationWordAlignments;
-  private Map<String,Float> translationFeatures;
-  private final float extractionTime;
-  
-  public StructuredTranslation(final Sentence sourceSentence,
-      final HyperGraph hypergraph,
-      final List<FeatureFunction> featureFunctions) {
-    
-      final long startTime = System.currentTimeMillis();
-      
-      this.sourceSentence = sourceSentence;
-      this.featureFunctions = featureFunctions;
-      this.translationString = extractViterbiString(hypergraph);
-      this.translationTokens = extractTranslationTokens();
-      this.translationScore = extractTranslationScore(hypergraph);
-      this.translationFeatures = extractViterbiFeatures(hypergraph);
-      this.translationWordAlignments = extractViterbiWordAlignment(hypergraph);
-      this.extractionTime = (System.currentTimeMillis() - startTime) / 1000.0f;
-  }
-  
-  private Map<String,Float> extractViterbiFeatures(final HyperGraph hypergraph) {
-    if (hypergraph == null) {
-      return emptyMap(); 
-    } else {
-      ViterbiFeatureVectorWalkerFunction viterbiFeatureVectorWalker = new ViterbiFeatureVectorWalkerFunction(featureFunctions, sourceSentence);
-      walk(hypergraph.goalNode, viterbiFeatureVectorWalker);
-      return new HashMap<String,Float>(viterbiFeatureVectorWalker.getFeaturesMap());
-    }
-  }
-
-  private List<List<Integer>> extractViterbiWordAlignment(final HyperGraph hypergraph) {
-    if (hypergraph == null) {
-      return emptyList();
-    } else {
-      final WordAlignmentExtractor wordAlignmentWalker = new WordAlignmentExtractor();
-      walk(hypergraph.goalNode, wordAlignmentWalker);
-      return wordAlignmentWalker.getFinalWordAlignments();
-    }
-  }
-  
-  private float extractTranslationScore(final HyperGraph hypergraph) {
-    if (hypergraph == null) {
-      return 0;
-    } else {
-      return hypergraph.goalNode.getScore();
-    }
-  }
-  
-  private String extractViterbiString(final HyperGraph hypergraph) {
-    if (hypergraph == null) {
-      return sourceSentence.source();
-    } else {
-      final WalkerFunction viterbiOutputStringWalker = new ViterbiOutputStringWalkerFunction();
-      walk(hypergraph.goalNode, viterbiOutputStringWalker);
-      return viterbiOutputStringWalker.toString();
-    }
-  }
-  
-  private List<String> extractTranslationTokens() {
-    if (translationString.isEmpty()) {
-      return emptyList();
-    } else {
-      return asList(translationString.split("\\s+"));
-    }
-  }
-  
-  // Getters to use upstream
-  
-  public Sentence getSourceSentence() {
-    return sourceSentence;
-  }
-
-  public int getSentenceId() {
-    return sourceSentence.id();
-  }
-
-  public String getTranslationString() {
-    return translationString;
-  }
-
-  public List<String> getTranslationTokens() {
-    return translationTokens;
-  }
-
-  public float getTranslationScore() {
-    return translationScore;
-  }
-
-  /**
-   * Returns a list of target to source alignments.
-   */
-  public List<List<Integer>> getTranslationWordAlignments() {
-    return translationWordAlignments;
-  }
-  
-  public Map<String,Float> getTranslationFeatures() {
-    return translationFeatures;
-  }
-  
-  /**
-   * Time taken to build output information from the hypergraph.
-   */
-  public Float getExtractionTime() {
-    return extractionTime;
-  }
-}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/4f2bec7c/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java b/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
deleted file mode 100644
index 5af6c4d..0000000
--- a/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
+++ /dev/null
@@ -1,44 +0,0 @@
-package joshua.decoder.hypergraph;
-
-import static joshua.decoder.chart_parser.ComputeNodeResult.computeTransitionFeatures;
-
-import java.util.List;
-import java.util.Map;
-
-import joshua.decoder.ff.FeatureFunction;
-import joshua.decoder.ff.FeatureVector;
-import joshua.decoder.segment_file.Sentence;
-
-public class ViterbiFeatureVectorWalkerFunction implements WalkerFunction {
-  
-  private final FeatureVector features;
-  private final List<FeatureFunction> featureFunctions;
-  private final Sentence sourceSentence;
-  
-  public ViterbiFeatureVectorWalkerFunction(
-      final List<FeatureFunction> featureFunctions,
-      final Sentence sourceSentence) {
-    this.features = new FeatureVector();
-    this.featureFunctions = featureFunctions;
-    this.sourceSentence = sourceSentence;
-  }
-
-  /**
-   * Recompute feature values for each Viterbi edge and add to features.
-   */
-  @Override
-  public void apply(HGNode node) {
-    final FeatureVector edgeFeatures = computeTransitionFeatures(
-        featureFunctions, node.bestHyperedge, node.i, node.j, sourceSentence);
-    features.add(edgeFeatures);
-  }
-  
-  public FeatureVector getFeatures() {
-    return features;
-  }
-  
-  public Map<String,Float> getFeaturesMap() {
-    return features.getMap();
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/4f2bec7c/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java b/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
deleted file mode 100644
index 0c84375..0000000
--- a/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
+++ /dev/null
@@ -1,96 +0,0 @@
-package joshua.decoder.hypergraph;
-
-import static java.lang.Integer.MAX_VALUE;
-import static joshua.corpus.Vocabulary.getWords;
-import static joshua.corpus.Vocabulary.nt;
-
-import java.util.Stack;
-
-import joshua.decoder.ff.tm.Rule;
-
-public class ViterbiOutputStringWalkerFunction implements WalkerFunction {
-  
-  private Stack<int[]> viterbiWords = new Stack<int[]>();
-
-  @Override
-  public void apply(HGNode node) {
-    final Rule rule = node.bestHyperedge.getRule();
-    if (rule != null) {
-      merge(rule.getEnglish());
-    }
-  }
-  
-  private boolean containsNonTerminals(final int[] ids) {
-    boolean hasNonTerminals = false;
-    for (int i = 0; i < ids.length; i++) {
-      if (nt(ids[i])) {
-        hasNonTerminals = true;
-        break;
-      }
-    }
-    return hasNonTerminals;
-  }
-  
-  /**
-   * Returns the index of the next non-terminal slot to fill.
-   * Since non-terminals in right hand sides of rules are indexed by
-   * their order on the source side, this function looks for the largest
-   * negative id in ids and returns its index. 
-   */
-  private int getNextNonTerminalIndexToFill(final int[] ids) {
-    int nextIndex = 0;
-    int nextNonTerminal = -MAX_VALUE;
-    for (int i = 0; i < ids.length; i++) {
-      if (nt(ids[i]) && ids[i] > nextNonTerminal) {
-        nextIndex = i;
-        nextNonTerminal = ids[i];
-      }
-    }
-    return nextIndex;
-  }
-  
-  private int[] substituteNonTerminal(final int[] parentWords, final int[] childWords) {
-    final int ntIndex = getNextNonTerminalIndexToFill(parentWords);
-    final int[] result = new int[parentWords.length + childWords.length - 1];
-    int resultIndex = 0;
-    for (int i = 0; i < ntIndex; i++) {
-      result[resultIndex++] = parentWords[i];
-    }
-    for (int i = 0; i < childWords.length; i++) {
-      result[resultIndex++] = childWords[i];
-    }
-    for (int i = ntIndex + 1; i < parentWords.length; i++) {
-      result[resultIndex++] = parentWords[i];
-    }
-    return result;
-  }
-
-  private void merge(final int[] words) {
-    if (!containsNonTerminals(words)
-        && !viterbiWords.isEmpty()
-        && containsNonTerminals(viterbiWords.peek())) {
-      merge(substituteNonTerminal(viterbiWords.pop(), words));
-    } else {
-      viterbiWords.add(words);
-    }
-  }
-  
-  @Override
-  public String toString() {
-    if (viterbiWords.isEmpty()) {
-      return "";
-    }
-    
-    if (viterbiWords.size() != 1) {
-      throw new RuntimeException(
-          String.format(
-              "Stack of ViterbiOutputStringWalker should contain only a single (last) element, but was size %d", viterbiWords.size()));
-    }
-    
-    String result = getWords(viterbiWords.peek());
-    // strip of sentence markers (<s>,</s>)
-    result = result.substring(result.indexOf(' ') + 1, result.lastIndexOf(' '));
-    return result.trim();
-  }
-  
-}
\ No newline at end of file


[15/18] incubator-joshua git commit: permissions

Posted by mj...@apache.org.
permissions


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/bb3b79cc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/bb3b79cc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/bb3b79cc

Branch: refs/heads/morph
Commit: bb3b79cc1cbdab21ffae221c64b149f960d2da77
Parents: 2fa4b42
Author: Matt Post <po...@cs.jhu.edu>
Authored: Sun Apr 24 14:42:48 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Sun Apr 24 14:42:48 2016 -0400

----------------------------------------------------------------------
 scripts/morph/train-mallet.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/bb3b79cc/scripts/morph/train-mallet.sh
----------------------------------------------------------------------
diff --git a/scripts/morph/train-mallet.sh b/scripts/morph/train-mallet.sh
old mode 100644
new mode 100755


[09/18] incubator-joshua git commit: added some debugging...

Posted by mj...@apache.org.
added some debugging...


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/4b8c640c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/4b8c640c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/4b8c640c

Branch: refs/heads/morph
Commit: 4b8c640c69a015a39d2a38b6483ac696d5fa6b2e
Parents: 8b59b99
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 22:40:05 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 22:40:05 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/ff/LexicalSharpener.java | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/4b8c640c/src/joshua/decoder/ff/LexicalSharpener.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/LexicalSharpener.java b/src/joshua/decoder/ff/LexicalSharpener.java
index 6207ac0..16d1021 100644
--- a/src/joshua/decoder/ff/LexicalSharpener.java
+++ b/src/joshua/decoder/ff/LexicalSharpener.java
@@ -112,7 +112,7 @@ public class LexicalSharpener extends StatelessFF {
     }
     classifiers.put(lastSourceWord, new MalletPredictor(lastSourceWord, examples));
   
-    System.err.println(String.format("Read %d lines from training file", linesRead));
+    Decoder.LOG(1, String.format("Read %d lines from training file", linesRead));
   }
 
   private MalletPredictor createClassifier(String lastSourceWord, HashMap<String, Integer> counts,
@@ -131,7 +131,8 @@ public class LexicalSharpener extends StatelessFF {
     classifiers = (HashMap<String,MalletPredictor>) ois.readObject();
     ois.close();
     
-    System.err.println(String.format("Loaded model with %d keys", classifiers.keySet().size()));
+    System.err.println(String.format("%s: Loaded model with %d keys", 
+        name, classifiers.keySet().size()));
   }
 
   public void saveClassifiers(String modelFile) throws FileNotFoundException, IOException {
@@ -161,6 +162,7 @@ public class LexicalSharpener extends StatelessFF {
       Token sourceToken = sentence.getTokens().get(s);
       String featureString = sourceToken.getAnnotationString().replace('|', ' ');
       
+      System.err.println(String.format("%s: %s -> %s?",  name, sourceToken, Vocabulary.word(targetID)));
       Classification result = predict(sourceToken.getWord(), targetID, featureString);
       if (result != null) {
         Labeling labeling = result.getLabeling();


[14/18] incubator-joshua git commit: Added -lowercase option to enable source-side projection of case

Posted by mj...@apache.org.
Added -lowercase option to enable source-side projection of case

If you add -lowercase to Joshua, it will lowercase all input, adding an annotation to each token of the form

    lettercase = {lower, upper, all-upper}

Then, at output time, the source-side input case will be projected to the target side.


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/3f4fa992
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/3f4fa992
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/3f4fa992

Branch: refs/heads/morph
Commit: 3f4fa992803fd9a7ac6dc3c51d803b65fda9d83d
Parents: 4f2bec7
Author: Matt Post <po...@cs.jhu.edu>
Authored: Sun Apr 24 14:38:26 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Sun Apr 24 14:38:26 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/JoshuaConfiguration.java     |   7 +
 .../decoder/hypergraph/KBestExtractor.java      |  37 ++++-
 .../hypergraph/WordAlignmentExtractor.java      |   2 -
 .../decoder/hypergraph/WordAlignmentState.java  |   1 -
 src/joshua/decoder/segment_file/Sentence.java   |   8 +-
 src/joshua/decoder/segment_file/Token.java      |  26 +++-
 src/joshua/lattice/Lattice.java                 |  31 ++--
 src/joshua/util/FormatUtils.java                |  19 +++
 test/decoder/lowercaser/config                  | 140 +++++++++++++++++++
 test/decoder/lowercaser/grammar.glue            |   4 +
 test/decoder/lowercaser/grammar.test            |   1 +
 test/decoder/lowercaser/output.gold             |   3 +
 test/decoder/lowercaser/test.sh                 |  18 +++
 13 files changed, 271 insertions(+), 26 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/JoshuaConfiguration.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/JoshuaConfiguration.java b/src/joshua/decoder/JoshuaConfiguration.java
index c61720c..6c8edf6 100644
--- a/src/joshua/decoder/JoshuaConfiguration.java
+++ b/src/joshua/decoder/JoshuaConfiguration.java
@@ -51,6 +51,10 @@ public class JoshuaConfiguration {
   // whether to construct a StructuredTranslation object for each request instead of 
   // printing to stdout. Used when the Decoder is used from Java directly.
   public Boolean use_structured_output = false;
+  
+  // If set to true, Joshua will lowercase the input, creating an annotation that marks the
+  // original case
+  public boolean lowercase = false;
 
   // List of grammar files to read
   public ArrayList<String> tms = new ArrayList<String>();
@@ -638,6 +642,9 @@ public class JoshuaConfiguration {
           } else if (parameter.equals(normalize_key("cached-rules-size"))) {
               // Check source sentence
               cachedRuleSize = Integer.parseInt(fds[1]);
+          } else if (parameter.equals(normalize_key("lowercase"))) {
+            lowercase = true;
+            
           } else {
 
             if (parameter.equals(normalize_key("use-sent-specific-tm"))

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/hypergraph/KBestExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/KBestExtractor.java b/src/joshua/decoder/hypergraph/KBestExtractor.java
index 42539cc..45b9ccb 100644
--- a/src/joshua/decoder/hypergraph/KBestExtractor.java
+++ b/src/joshua/decoder/hypergraph/KBestExtractor.java
@@ -42,6 +42,8 @@ import joshua.decoder.ff.state_maintenance.DPState;
 import joshua.decoder.ff.tm.Rule;
 import joshua.decoder.io.DeNormalize;
 import joshua.decoder.segment_file.Sentence;
+import joshua.decoder.segment_file.Token;
+import joshua.util.FormatUtils;
 
 /**
  * This class implements lazy k-best extraction on a hyper-graph.
@@ -185,12 +187,12 @@ public class KBestExtractor {
           .replaceAll("-lsb-", "[")
           .replaceAll("-rsb-", "]")
           .replaceAll("-pipe-", "|");
-
+      
 
       outputString = joshuaConfiguration.outputFormat
           .replace("%k", Integer.toString(k))
-          .replace("%s", hypothesis)
-          .replace("%S", DeNormalize.processSingleLine(hypothesis))
+          .replace("%s", recapitalize(hypothesis, node))
+          .replace("%S", DeNormalize.processSingleLine(recapitalize(hypothesis, node)))
           .replace("%i", Integer.toString(sentence.id()))
           .replace("%f", joshuaConfiguration.moses ? features.mosesString() : features.toString())
           .replace("%c", String.format("%.3f", derivationState.cost));
@@ -283,6 +285,35 @@ public class KBestExtractor {
     }
     return virtualNode;
   }
+  
+  private String recapitalize(String input, HGNode goalNode) {
+    WordAlignmentState alignment = ViterbiExtractor.buildViterbiAlignment(goalNode);
+
+    String[] tokens = input.split("\\s+");
+    
+    List<List<Integer>> points = alignment.toFinalList();
+    for (int i = 0; i < points.size(); i++) {
+      List<Integer> target = points.get(i);
+      for (int source: target) {
+        Token token = sentence.getTokens().get(source + 1); // skip <s>
+        String annotation = "";
+        if (token != null && token.getAnnotation("lettercase") != null)
+          annotation = token.getAnnotation("lettercase");
+        if (source != 0 && annotation.equals("upper"))
+          tokens[i] = FormatUtils.capitalize(tokens[i]);
+        else if (annotation.equals("all-upper"))
+          tokens[i] = tokens[i].toUpperCase();
+      }
+    }
+
+    String cap = new String();
+    for (int i = 0; i < tokens.length; i++) {
+      if (i > 0)
+        cap += " ";
+      cap += tokens[i];
+    }
+    return cap; 
+  }
 
 
   /**

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java b/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
index 63619ee..8e0c2a6 100644
--- a/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
+++ b/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
@@ -2,8 +2,6 @@ package joshua.decoder.hypergraph;
 
 import java.util.Stack;
 
-import joshua.corpus.Vocabulary;
-import joshua.decoder.Decoder;
 import joshua.decoder.ff.tm.Rule;
 import joshua.decoder.hypergraph.KBestExtractor.DerivationState;
 import joshua.decoder.hypergraph.KBestExtractor.DerivationVisitor;

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/hypergraph/WordAlignmentState.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/WordAlignmentState.java b/src/joshua/decoder/hypergraph/WordAlignmentState.java
index e3b9598..d47fa38 100644
--- a/src/joshua/decoder/hypergraph/WordAlignmentState.java
+++ b/src/joshua/decoder/hypergraph/WordAlignmentState.java
@@ -1,7 +1,6 @@
 package joshua.decoder.hypergraph;
 
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.ListIterator;

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/segment_file/Sentence.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/segment_file/Sentence.java b/src/joshua/decoder/segment_file/Sentence.java
index c1eeca8..b51d509 100644
--- a/src/joshua/decoder/segment_file/Sentence.java
+++ b/src/joshua/decoder/segment_file/Sentence.java
@@ -191,7 +191,7 @@ public class Sentence {
             for (int i = 0; i <= chars.length - width; i++) {
               int j = i + width;
               if (width != chars.length) {
-                Token token = new Token(word.substring(i, j));
+                Token token = new Token(word.substring(i, j), config);
                 if (vocabulary.contains(id)) {
                   nodes.get(i).addArc(nodes.get(j), 0.0f, token);
                   wordChart.set(i, j, true);
@@ -386,7 +386,7 @@ public class Sentence {
    */
   public Lattice<String> stringLattice() {
     assert isLinearChain();
-    return Lattice.createStringLatticeFromString(source());
+    return Lattice.createStringLatticeFromString(source(), config);
   }
 
   public List<ConstraintSpan> constraints() {
@@ -400,10 +400,10 @@ public class Sentence {
           System.err.println("* FATAL: lattice decoding currently not supported for stack-based search algorithm.");
           System.exit(12);
         }
-        this.sourceLattice = Lattice.createTokenLatticeFromPLF(rawSource());
+        this.sourceLattice = Lattice.createTokenLatticeFromPLF(rawSource(), config);
       } else
         this.sourceLattice = Lattice.createTokenLatticeFromString(String.format("%s %s %s", Vocabulary.START_SYM,
-            rawSource(), Vocabulary.STOP_SYM));
+            rawSource(), Vocabulary.STOP_SYM), config);
     }
     return this.sourceLattice;
   }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/segment_file/Token.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/segment_file/Token.java b/src/joshua/decoder/segment_file/Token.java
index 12e2b68..ebe9a43 100644
--- a/src/joshua/decoder/segment_file/Token.java
+++ b/src/joshua/decoder/segment_file/Token.java
@@ -23,6 +23,9 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import joshua.corpus.Vocabulary;
+import joshua.decoder.Decoder;
+import joshua.decoder.JoshuaConfiguration;
+import joshua.util.FormatUtils;
 
 /**
  * Stores the identity of a word and its annotations in a sentence.
@@ -36,6 +39,7 @@ public class Token {
   private int tokenID;
 
   private HashMap<String,String> annotations = null;
+  private JoshuaConfiguration joshuaConfiguration;
 
   /**
    * Constructor : Creates a Token object from a raw word
@@ -58,7 +62,9 @@ public class Token {
    * @param rawWord A word with annotation information (possibly)
    *  
    */
-  public Token(String rawWord) {
+  public Token(String rawWord, JoshuaConfiguration config) {
+    
+    this.joshuaConfiguration = config;
     
     annotations = new HashMap<String,String>();
     
@@ -89,9 +95,21 @@ public class Token {
         .replaceAll("\\]",  "-rsb-")
         .replaceAll("\\|",  "-pipe-");
 
+    if (joshuaConfiguration != null && joshuaConfiguration.lowercase) {
+      if (FormatUtils.ISALLUPPERCASE(token))
+        annotations.put("lettercase", "all-upper");
+      else if (Character.isUpperCase(token.charAt(0)))
+        annotations.put("lettercase",  "upper");
+      else
+        annotations.put("lettercase",  "lower");
+      
+      Decoder.LOG(2, String.format("TOKEN: %s -> %s (%s)", token, token.toLowerCase(), annotations.get("lettercase")));
+      token = token.toLowerCase(); 
+    }
+    
     tokenID = Vocabulary.id(token);
   }
-
+  
   /**
    * Returns the word ID (vocab ID) for this token
    * 
@@ -108,6 +126,10 @@ public class Token {
   public String getWordIdentity() {
     return token;
   }
+  
+  public String toString() {
+    return token;
+  }
 
   /**
    * Returns the annotationID (vocab ID)

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/lattice/Lattice.java
----------------------------------------------------------------------
diff --git a/src/joshua/lattice/Lattice.java b/src/joshua/lattice/Lattice.java
index abe43b2..bf2bf87 100644
--- a/src/joshua/lattice/Lattice.java
+++ b/src/joshua/lattice/Lattice.java
@@ -30,6 +30,7 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import joshua.corpus.Vocabulary;
+import joshua.decoder.JoshuaConfiguration;
 import joshua.decoder.segment_file.Token;
 import joshua.util.ChartSpan;
 
@@ -61,6 +62,8 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
 
   /** Logger for this class. */
   private static final Logger logger = Logger.getLogger(Lattice.class.getName());
+  
+  JoshuaConfiguration config = null;
 
   /**
    * Constructs a new lattice from an existing list of (connected) nodes.
@@ -70,13 +73,13 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
    * 
    * @param nodes A list of nodes which must be in topological order.
    */
-  public Lattice(List<Node<Value>> nodes) {
+  public Lattice(List<Node<Value>> nodes, JoshuaConfiguration config) {
     this.nodes = nodes;
 //    this.distances = calculateAllPairsShortestPath();
     this.latticeHasAmbiguity = true;
   }
 
-  public Lattice(List<Node<Value>> nodes, boolean isAmbiguous) {
+  public Lattice(List<Node<Value>> nodes, boolean isAmbiguous, JoshuaConfiguration config) {
     // Node<Value> sink = new Node<Value>(nodes.size());
     // nodes.add(sink);
     this.nodes = nodes;
@@ -89,7 +92,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
    * 
    * @param linearChain a sequence of Value objects
    */
-  public Lattice(Value[] linearChain) {
+  public Lattice(Value[] linearChain, JoshuaConfiguration config) {
     this.latticeHasAmbiguity = false;
     this.nodes = new ArrayList<Node<Value>>();
 
@@ -140,17 +143,17 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
    * @param linearChain
    * @return Lattice representation of the linear chain.
    */
-  public static Lattice<Token> createTokenLatticeFromString(String source) {
+  public static Lattice<Token> createTokenLatticeFromString(String source, JoshuaConfiguration config) {
     String[] tokens = source.split("\\s+");
     Token[] integerSentence = new Token[tokens.length];
     for (int i = 0; i < tokens.length; i++) {
-      integerSentence[i] = new Token(tokens[i]);
+      integerSentence[i] = new Token(tokens[i], config);
     }
 
-    return new Lattice<Token>(integerSentence);
+    return new Lattice<Token>(integerSentence, config);
   }
 
-  public static Lattice<Token> createTokenLatticeFromPLF(String data) {
+  public static Lattice<Token> createTokenLatticeFromPLF(String data, JoshuaConfiguration config) {
     ArrayList<Node<Token>> nodes = new ArrayList<Node<Token>>();
     
     // This matches a sequence of tuples, which describe arcs leaving this node
@@ -211,7 +214,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
 
         String remainingArcs = arcMatcher.group(4);
 
-        Token arcToken = new Token(arcLabel);
+        Token arcToken = new Token(arcLabel, config);
         currentNode.addArc(destinationNode, arcWeight, arcToken);
 
         arcMatcher = arcPattern.matcher(remainingArcs);
@@ -225,16 +228,16 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
     /* Add <s> to the start of the lattice. */
     if (nodes.size() > 1 && nodes.get(1) != null) {
       Node<Token> firstNode = nodes.get(1);
-      startNode.addArc(firstNode, 0.0f, new Token(Vocabulary.START_SYM));
+      startNode.addArc(firstNode, 0.0f, new Token(Vocabulary.START_SYM, config));
     }
 
     /* Add </s> as a final state, connect it to the previous end-state */
     nodeID = nodes.get(nodes.size()-1).getNumber() + 1;
     Node<Token> endNode = new Node<Token>(nodeID);
-    nodes.get(nodes.size()-1).addArc(endNode, 0.0f, new Token(Vocabulary.STOP_SYM));
+    nodes.get(nodes.size()-1).addArc(endNode, 0.0f, new Token(Vocabulary.STOP_SYM, config));
     nodes.add(endNode);
 
-    return new Lattice<Token>(nodes, latticeIsAmbiguous);
+    return new Lattice<Token>(nodes, latticeIsAmbiguous, config);
   }
 
   /**
@@ -243,7 +246,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
    * @param data String representation of a lattice.
    * @return A lattice that corresponds to the given string.
    */
-  public static Lattice<String> createStringLatticeFromString(String data) {
+  public static Lattice<String> createStringLatticeFromString(String data, JoshuaConfiguration config) {
 
     Map<Integer, Node<String>> nodes = new HashMap<Integer, Node<String>>();
 
@@ -303,7 +306,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
 
     logger.fine(nodeList.toString());
 
-    return new Lattice<String>(nodeList);
+    return new Lattice<String>(nodeList, config);
   }
 
   /**
@@ -431,7 +434,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
     nodes.get(2).addArc(nodes.get(3), 3.0f, "b");
     nodes.get(2).addArc(nodes.get(3), 5.0f, "c");
 
-    Lattice<String> graph = new Lattice<String>(nodes);
+    Lattice<String> graph = new Lattice<String>(nodes, null);
 
     System.out.println("Shortest path from 0 to 3: " + graph.getShortestPath(0, 3));
   }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/util/FormatUtils.java
----------------------------------------------------------------------
diff --git a/src/joshua/util/FormatUtils.java b/src/joshua/util/FormatUtils.java
index 3bd53e9..c196328 100644
--- a/src/joshua/util/FormatUtils.java
+++ b/src/joshua/util/FormatUtils.java
@@ -170,4 +170,23 @@ public class FormatUtils {
       return false;
     }
   }
+  
+  /**
+   * Determines if a string contains ALL CAPS
+   * 
+   * @param token
+   * @return true if the string is all in uppercase, false otherwise
+   */
+  public static boolean ISALLUPPERCASE(String token) {
+    for (int i = 0; i < token.length(); i++)
+      if (! Character.isUpperCase(token.charAt(i)))
+        return false;
+    return true;
+  }
+
+  public static String capitalize(String word) {
+    if (word == null || word.length() == 0)
+      return word;
+    return word.substring(0, 1).toUpperCase() + word.substring(1);
+  }
 }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/test/decoder/lowercaser/config
----------------------------------------------------------------------
diff --git a/test/decoder/lowercaser/config b/test/decoder/lowercaser/config
new file mode 100644
index 0000000..efa787e
--- /dev/null
+++ b/test/decoder/lowercaser/config
@@ -0,0 +1,140 @@
+# This file is a template for the Joshua pipeline; variables enclosed
+# in <angle-brackets> are substituted by the pipeline script as
+# appropriate.  This file also serves to document Joshua's many
+# parameters.
+
+# These are the grammar file specifications.  Joshua supports an
+# arbitrary number of grammar files, each specified on its own line
+# using the following format:
+#
+#   tm = TYPE OWNER LIMIT FILE
+# 
+# TYPE is "packed", "thrax", or "samt".  The latter denotes the format
+# used in Zollmann and Venugopal's SAMT decoder
+# (http://www.cs.cmu.edu/~zollmann/samt/).
+# 
+# OWNER is the "owner" of the rules in the grammar; this is used to
+# determine which set of phrasal features apply to the grammar's
+# rules.  Having different owners allows different features to be
+# applied to different grammars, and for grammars to share features
+# across files.
+#
+# LIMIT is the maximum input span permitted for the application of
+# grammar rules found in the grammar file.  A value of -1 implies no limit.
+#
+# FILE is the grammar file (or directory when using packed grammars).
+# The file can be compressed with gzip, which is determined by the
+# presence or absence of a ".gz" file extension.
+#
+# By a convention defined by Chiang (2007), the grammars are split
+# into two files: the main translation grammar containing all the
+# learned translation rules, and a glue grammar which supports
+# monotonic concatenation of hierarchical phrases. The glue grammar's
+# main distinction from the regular grammar is that the span limit
+# does not apply to it.  
+
+tm = hiero -maxspan 20 -path grammar.test -owner pt
+tm = thrax -path grammar.glue -maxspan -1 -owner glue
+
+# This symbol is used over unknown words in the source language
+
+default-non-terminal = X
+
+# This is the goal nonterminal, used to determine when a complete
+# parse is found.  It should correspond to the root-level rules in the
+# glue grammar.
+
+goal-symbol = GOAL
+
+# Language model config.
+#
+# Multiple language models are supported.  For each language model,
+# create one of the following lines:
+#
+# feature-function = LanguageModel -lm_type TYPE -lm_order ORDER -lm_file FILE
+# feature-function = StateMinimizingLanguageModel -lm_order ORDER -lm_file FILE
+#
+# - TYPE is one of "kenlm" or "berkeleylm"
+# - ORDER is the order of the language model (default 5)
+# - FILE is the path to the LM file. This can be binarized if appropriate to the type
+#   (e.g., KenLM has a compiled format)
+#
+# A state-minimizing LM collapses left-state. Currently only KenLM supports this.
+#
+# For each LM, add a weight lm_INDEX below, where indexing starts from 0.
+
+
+
+# The suffix _OOV is appended to unknown source-language words if this
+# is set to true.
+
+mark-oovs = false
+
+# The search algorithm: "cky" for hierarchical / phrase-based decoding, 
+# "stack" for phrase-based decoding
+search = cky
+
+# The pop-limit for decoding.  This determines how many hypotheses are
+# considered over each span of the input.
+
+pop-limit = 100
+
+# How many hypotheses to output
+
+top-n = 1
+
+# Whether those hypotheses should be distinct strings
+
+use-unique-nbest = true
+
+# This is the default format of the output printed to STDOUT.  The variables that can be
+# substituted are:
+#
+# %i: the sentence number (0-indexed)
+# %s: the translated sentence
+# %t: the derivation tree
+# %f: the feature string
+# %c: the model cost
+
+output-format = %s
+
+# When printing the trees (%t in 'output-format'), this controls whether the alignments
+# are also printed.
+
+include-align-index = false
+
+# And these are the feature functions to activate.
+feature-function = OOVPenalty
+feature-function = WordPenalty
+
+## Model weights #####################################################
+
+# For each language model line listed above, create a weight in the
+# following format: the keyword "lm", a 0-based index, and the weight.
+# lm_INDEX WEIGHT
+
+
+# The phrasal weights correspond to weights stored with each of the
+# grammar rules.  The format is
+#
+#   tm_OWNER_COLUMN WEIGHT
+#
+# where COLUMN denotes the 0-based order of the parameter in the
+# grammar file and WEIGHT is the corresponding weight.  In the future,
+# we plan to add a sparse feature representation which will simplify
+# this.
+
+# The wordpenalty feature counts the number of words in each hypothesis.
+
+
+# This feature counts the number of unknown words in the hypothesis.
+
+
+# This feature weights paths through an input lattice.  It is only activated
+# when decoding lattices.
+
+WordPenalty -4.72455379476569
+OOVPenalty 0.7897219562429866
+tm_pt_0 0.3137696816891433
+tm_glue_0 -0.04493059277470993
+

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/test/decoder/lowercaser/grammar.glue
----------------------------------------------------------------------
diff --git a/test/decoder/lowercaser/grammar.glue b/test/decoder/lowercaser/grammar.glue
new file mode 100644
index 0000000..69e1520
--- /dev/null
+++ b/test/decoder/lowercaser/grammar.glue
@@ -0,0 +1,4 @@
+[GOAL] ||| <s> ||| <s> ||| 0
+[GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1
+[GOAL] ||| [GOAL,1] </s> ||| [GOAL,1] </s> ||| 0
+[GOAL] ||| <s> [X,1] </s> ||| <s> [X,1] </s> ||| 0

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/test/decoder/lowercaser/grammar.test
----------------------------------------------------------------------
diff --git a/test/decoder/lowercaser/grammar.test b/test/decoder/lowercaser/grammar.test
new file mode 100644
index 0000000..3745008
--- /dev/null
+++ b/test/decoder/lowercaser/grammar.test
@@ -0,0 +1 @@
+[X] ||| ella ||| she ||| 1 ||| 0-0

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/test/decoder/lowercaser/output.gold
----------------------------------------------------------------------
diff --git a/test/decoder/lowercaser/output.gold b/test/decoder/lowercaser/output.gold
new file mode 100644
index 0000000..0c9c1eb
--- /dev/null
+++ b/test/decoder/lowercaser/output.gold
@@ -0,0 +1,3 @@
+ELLA
+she
+SHE

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/test/decoder/lowercaser/test.sh
----------------------------------------------------------------------
diff --git a/test/decoder/lowercaser/test.sh b/test/decoder/lowercaser/test.sh
new file mode 100755
index 0000000..4db1251
--- /dev/null
+++ b/test/decoder/lowercaser/test.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+set -u
+
+(
+echo -e "ELLA" | $JOSHUA/bin/joshua-decoder -config config
+echo -e "Ella" | $JOSHUA/bin/joshua-decoder -config config -lowercase
+echo -e "ELLA" | $JOSHUA/bin/joshua-decoder -config config -lowercase
+) > output 2> .log
+
+diff -u output output.gold > diff
+
+if [ $? -eq 0 ]; then
+    rm -f .log output diff
+    exit 0
+else
+    exit 1
+fi


[04/18] incubator-joshua git commit: added to build path

Posted by mj...@apache.org.
added to build path


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/1c8aaa5e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/1c8aaa5e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/1c8aaa5e

Branch: refs/heads/morph
Commit: 1c8aaa5eb89a800b51478c352b890f535067c5e4
Parents: 71f808e
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 10:50:54 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 10:50:54 2016 -0400

----------------------------------------------------------------------
 build.xml | 2 ++
 1 file changed, 2 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1c8aaa5e/build.xml
----------------------------------------------------------------------
diff --git a/build.xml b/build.xml
index c0c6132..1489c4e 100644
--- a/build.xml
+++ b/build.xml
@@ -29,6 +29,8 @@
       <include name="args4j-2.0.29.jar" />
       <include name="gson-2.5.jar" />
       <include name="guava-19.0.jar" />
+      <include name="mallet-2.0.7.jar" />
+      <include name="trove4j-2.0.2.jar" />
     </fileset>
     <fileset dir="${thraxlib}">
       <include name="thrax.jar" />