You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/24 22:53:22 UTC
[01/18] incubator-joshua git commit: OOV fix for class-based LM
Repository: incubator-joshua
Updated Branches:
refs/heads/master 3f4fa9928 -> f2ae90433
refs/heads/morph a86ae8e87 -> 00eaf7168
OOV fix for class-based LM
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/b7f23108
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/b7f23108
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/b7f23108
Branch: refs/heads/morph
Commit: b7f23108ffce1451fac45dcf6ac7ff6efa44ec56
Parents: 5396c5f
Author: Matt Post <po...@cs.jhu.edu>
Authored: Thu Apr 21 09:24:06 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Thu Apr 21 09:24:06 2016 -0400
----------------------------------------------------------------------
src/joshua/decoder/ff/lm/LanguageModelFF.java | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b7f23108/src/joshua/decoder/ff/lm/LanguageModelFF.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/LanguageModelFF.java b/src/joshua/decoder/ff/lm/LanguageModelFF.java
index 732229c..18c149d 100644
--- a/src/joshua/decoder/ff/lm/LanguageModelFF.java
+++ b/src/joshua/decoder/ff/lm/LanguageModelFF.java
@@ -92,7 +92,7 @@ public class LanguageModelFF extends StatefulFF {
protected class ClassMap {
- private final int OOV_id = 10;
+ private final int OOV_id = Vocabulary.getUnknownId();
private HashMap<Integer, Integer> classMap;
public ClassMap(String file_name) throws IOException {
@@ -101,11 +101,7 @@ public class LanguageModelFF extends StatefulFF {
}
public int getClassID(int wordID) {
- if (this.classMap.containsKey(wordID)) {
- return this.classMap.get(wordID);
- } else {
- return OOV_id;
- }
+ return this.classMap.getOrDefault(wordID, OOV_id);
}
/**
[10/18] incubator-joshua git commit: bugfix in looking up source word
classifier and prediction
Posted by mj...@apache.org.
bugfix in looking up source word classifier and prediction
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/c30bddba
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/c30bddba
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/c30bddba
Branch: refs/heads/morph
Commit: c30bddbafaa12f72020f7e746e4c3e138cf2294c
Parents: 4b8c640
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 23:40:32 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 23:40:32 2016 -0400
----------------------------------------------------------------------
src/joshua/decoder/ff/LexicalSharpener.java | 23 +++++++++++++----------
1 file changed, 13 insertions(+), 10 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/c30bddba/src/joshua/decoder/ff/LexicalSharpener.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/LexicalSharpener.java b/src/joshua/decoder/ff/LexicalSharpener.java
index 16d1021..b8f0c39 100644
--- a/src/joshua/decoder/ff/LexicalSharpener.java
+++ b/src/joshua/decoder/ff/LexicalSharpener.java
@@ -151,6 +151,8 @@ public class LexicalSharpener extends StatelessFF {
public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
Sentence sentence, Accumulator acc) {
+ int[] resolved = anchorRuleSourceToSentence(rule, tailNodes, i);
+
Map<Integer, List<Integer>> points = rule.getAlignmentMap();
for (int t: points.keySet()) {
List<Integer> source_indices = points.get(t);
@@ -158,12 +160,14 @@ public class LexicalSharpener extends StatelessFF {
continue;
int targetID = rule.getEnglish()[t];
- int s = i + source_indices.get(0);
- Token sourceToken = sentence.getTokens().get(s);
+ String targetWord = Vocabulary.word(targetID);
+ int sourceIndex = resolved[source_indices.get(0)];
+ Token sourceToken = sentence.getTokens().get(sourceIndex);
+ String sourceWord = Vocabulary.word(sourceToken.getWord());
String featureString = sourceToken.getAnnotationString().replace('|', ' ');
- System.err.println(String.format("%s: %s -> %s?", name, sourceToken, Vocabulary.word(targetID)));
- Classification result = predict(sourceToken.getWord(), targetID, featureString);
+ System.err.println(String.format("%s: %s -> %s?", name, sourceWord, targetWord));
+ Classification result = predict(sourceWord, targetWord, featureString);
if (result != null) {
Labeling labeling = result.getLabeling();
int num = labeling.numLocations();
@@ -193,12 +197,11 @@ public class LexicalSharpener extends StatelessFF {
return "21+";
}
- public Classification predict(int sourceID, int targetID, String featureString) {
- String word = Vocabulary.word(sourceID);
- if (classifiers.containsKey(word)) {
- MalletPredictor predictor = classifiers.get(word);
+ public Classification predict(String sourceWord, String targetWord, String featureString) {
+ if (classifiers.containsKey(sourceWord)) {
+ MalletPredictor predictor = classifiers.get(sourceWord);
if (predictor != null)
- return predictor.predict(word, featureString);
+ return predictor.predict(targetWord, featureString);
}
return null;
@@ -273,7 +276,7 @@ public class LexicalSharpener extends StatelessFF {
String sourceWord = tokens[0];
String targetWord = tokens[1];
String features = tokens[2];
- Classification result = ts.predict(Vocabulary.id(sourceWord), Vocabulary.id(targetWord), features);
+ Classification result = ts.predict(sourceWord, targetWord, features);
if (result != null)
System.out.println(String.format("%s %f", result.getLabelVector().getBestLabel(), result.getLabelVector().getBestValue()));
else
[16/18] incubator-joshua git commit: added lowercaser option to
pipeline (set to 'cat' for null)
Posted by mj...@apache.org.
added lowercaser option to pipeline (set to 'cat' for null)
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/f2ae9043
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/f2ae9043
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/f2ae9043
Branch: refs/heads/morph
Commit: f2ae90433c7e3e3dc95fa697c3565e4462306ba0
Parents: 3f4fa99
Author: Matt Post <po...@cs.jhu.edu>
Authored: Sun Apr 24 15:39:01 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Sun Apr 24 15:39:01 2016 -0400
----------------------------------------------------------------------
scripts/training/pipeline.pl | 1 +
1 file changed, 1 insertion(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2ae9043/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 8629508..a438e60 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -264,6 +264,7 @@ my $retval = GetOptions(
"tokenizer-source=s" => \$TOKENIZER_SOURCE,
"tokenizer-target=s" => \$TOKENIZER_TARGET,
"normalizer=s" => \$NORMALIZER,
+ "lowercaser=s" => \$LOWERCASER,
"joshua-config=s" => \$_JOSHUA_CONFIG,
"joshua-args=s" => \$_JOSHUA_ARGS,
"joshua-mem=s" => \$JOSHUA_MEM,
[02/18] incubator-joshua git commit: pack alignments by default!
Posted by mj...@apache.org.
pack alignments by default!
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/a330afe4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/a330afe4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/a330afe4
Branch: refs/heads/morph
Commit: a330afe4de2fab29dd4eee49c1f3834e435ed540
Parents: a86ae8e
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 10:02:27 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 10:02:27 2016 -0400
----------------------------------------------------------------------
scripts/support/grammar-packer.pl | 8 +++++---
scripts/support/run_bundler.py | 1 +
scripts/training/pipeline.pl | 2 +-
3 files changed, 7 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a330afe4/scripts/support/grammar-packer.pl
----------------------------------------------------------------------
diff --git a/scripts/support/grammar-packer.pl b/scripts/support/grammar-packer.pl
index d2b1627..e485513 100755
--- a/scripts/support/grammar-packer.pl
+++ b/scripts/support/grammar-packer.pl
@@ -20,20 +20,21 @@ use File::Temp qw/tempfile/;
use File::Basename qw/basename/;
my %opts = (
+ a => 0, # whether alignments are included in the grammar(s)
g => '', # comma-separated list of grammars to pack
o => '', # comma-separated list of grammar output directories
m => '8g', # amount of memory to give the packer
T => '/tmp', # location of temporary space
v => 0, # verbose
);
-getopts("m:T:vg:o:", \%opts) || die usage();
+getopts("am:T:vg:o:", \%opts) || die usage();
die usage() if (@ARGV);
my $JOSHUA = $ENV{JOSHUA} or die "you must defined \$JOSHUA";
my $CAT = "$JOSHUA/scripts/training/scat";
sub usage {
- print "Usage: grammar-packer.pl [-m MEM] [-T /path/to/tmp] -g 'grammar [grammar2 ...]' -o 'grammar.packed [grammar2.packed ...]'\n";
+ print "Usage: grammar-packer.pl [-a] [-m MEM] [-T /path/to/tmp] -g 'grammar [grammar2 ...]' -o 'grammar.packed [grammar2.packed ...]'\n";
exit 1;
}
@@ -88,7 +89,8 @@ foreach my $grammar (@grammars) {
# Do the packing using the config.
my $grammars = join(" ", @sorted_grammars);
my $outputs = join(" ", @outputs);
-my $cmd = "java -Xmx$opts{m} -cp $JOSHUA/lib/args4j-2.0.29.jar:$JOSHUA/class joshua.tools.GrammarPackerCli -g $grammars --outputs $outputs";
+my $alignments = $opts{a} ? "--ga" : "";
+my $cmd = "java -Xmx$opts{m} -cp $JOSHUA/lib/args4j-2.0.29.jar:$JOSHUA/class joshua.tools.GrammarPackerCli -g $grammars --outputs $outputs $alignments";
print STDERR "Packing with $cmd...\n" if $opts{v};
my $retval = system($cmd);
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a330afe4/scripts/support/run_bundler.py
----------------------------------------------------------------------
diff --git a/scripts/support/run_bundler.py b/scripts/support/run_bundler.py
index ae54221..b64b6f7 100755
--- a/scripts/support/run_bundler.py
+++ b/scripts/support/run_bundler.py
@@ -326,6 +326,7 @@ def recursive_copy(src, dest, symlink = False):
def run_grammar_packer(src_path, dest_path):
cmd = [os.path.join(JOSHUA_PATH, "scripts/support/grammar-packer.pl"),
+ "-a",
"-T", opts.tmpdir,
"-g", src_path, "-o", dest_path]
logging.info(
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a330afe4/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 8629508..fd9436e 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -1644,7 +1644,7 @@ if ($DO_PACK_GRAMMARS) {
my $packed_dir = "$DATA_DIRS{test}/grammar.packed";
if ($OPTIMIZER_RUN == 1 and ! is_packed($TEST_GRAMMAR)) {
$cachepipe->cmd("test-pack",
- "$SCRIPTDIR/support/grammar-packer.pl -T $TMPDIR -m $PACKER_MEM -g $TEST_GRAMMAR -o $packed_dir",
+ "$SCRIPTDIR/support/grammar-packer.pl -a -T $TMPDIR -m $PACKER_MEM -g $TEST_GRAMMAR -o $packed_dir",
$TEST_GRAMMAR,
"$packed_dir/vocabulary",
"$packed_dir/encoding",
[17/18] incubator-joshua git commit: added lowercaser option to
pipeline (set to 'cat' for null)
Posted by mj...@apache.org.
added lowercaser option to pipeline (set to 'cat' for null)
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/f2ae9043
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/f2ae9043
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/f2ae9043
Branch: refs/heads/master
Commit: f2ae90433c7e3e3dc95fa697c3565e4462306ba0
Parents: 3f4fa99
Author: Matt Post <po...@cs.jhu.edu>
Authored: Sun Apr 24 15:39:01 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Sun Apr 24 15:39:01 2016 -0400
----------------------------------------------------------------------
scripts/training/pipeline.pl | 1 +
1 file changed, 1 insertion(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2ae9043/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 8629508..a438e60 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -264,6 +264,7 @@ my $retval = GetOptions(
"tokenizer-source=s" => \$TOKENIZER_SOURCE,
"tokenizer-target=s" => \$TOKENIZER_TARGET,
"normalizer=s" => \$NORMALIZER,
+ "lowercaser=s" => \$LOWERCASER,
"joshua-config=s" => \$_JOSHUA_CONFIG,
"joshua-args=s" => \$_JOSHUA_ARGS,
"joshua-mem=s" => \$JOSHUA_MEM,
[05/18] incubator-joshua git commit: Model now serializes
Posted by mj...@apache.org.
Model now serializes
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/68b01bc1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/68b01bc1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/68b01bc1
Branch: refs/heads/morph
Commit: 68b01bc168298db382334e9f01bdf2992db85b01
Parents: 1c8aaa5
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 15:59:39 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 15:59:39 2016 -0400
----------------------------------------------------------------------
src/joshua/decoder/ff/LexicalSharpener.java | 184 ++++++-----------------
src/joshua/decoder/ff/MalletPredictor.java | 97 ++++++++++++
2 files changed, 143 insertions(+), 138 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/68b01bc1/src/joshua/decoder/ff/LexicalSharpener.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/LexicalSharpener.java b/src/joshua/decoder/ff/LexicalSharpener.java
index 2c96f83..8671d57 100644
--- a/src/joshua/decoder/ff/LexicalSharpener.java
+++ b/src/joshua/decoder/ff/LexicalSharpener.java
@@ -19,24 +19,16 @@ package joshua.decoder.ff;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
-import java.io.FileReader;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
-import java.io.StringReader;
-import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import cc.mallet.classify.*;
-import cc.mallet.pipe.*;
-import cc.mallet.pipe.iterator.CsvIterator;
-import cc.mallet.types.Alphabet;
-import cc.mallet.types.Instance;
-import cc.mallet.types.InstanceList;
-import cc.mallet.types.LabelAlphabet;
+import cc.mallet.types.Labeling;
import joshua.corpus.Vocabulary;
import joshua.decoder.Decoder;
import joshua.decoder.JoshuaConfiguration;
@@ -52,7 +44,8 @@ import joshua.util.io.LineReader;
public class LexicalSharpener extends StatelessFF {
- private HashMap<Integer,Predictor> classifiers = null;
+ private HashMap<String,MalletPredictor> classifiers = null;
+
public LexicalSharpener(final FeatureVector weights, String[] args, JoshuaConfiguration config) {
super(weights, "LexicalSharpener", args, config);
@@ -63,6 +56,13 @@ public class LexicalSharpener extends StatelessFF {
System.err.println(String.format("* FATAL[LexicalSharpener]: can't load %s", parsedArgs.get("training-data")));
System.exit(1);
}
+ } else if (parsedArgs.containsKey("model")) {
+ try {
+ loadClassifiers(parsedArgs.get("model"));
+ } catch (ClassNotFoundException | IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
}
}
@@ -75,7 +75,7 @@ public class LexicalSharpener extends StatelessFF {
*/
public void trainAll(String dataFile) throws FileNotFoundException {
- classifiers = new HashMap<Integer, Predictor>();
+ classifiers = new HashMap<String, MalletPredictor>();
Decoder.LOG(1, "Reading " + dataFile);
LineReader lineReader = null;
@@ -92,7 +92,7 @@ public class LexicalSharpener extends StatelessFF {
for (String line : lineReader) {
String sourceWord = line.substring(0, line.indexOf(' '));
if (lastSourceWord != null && ! sourceWord.equals(lastSourceWord)) {
- classifiers.put(Vocabulary.id(lastSourceWord), new Predictor(lastSourceWord, examples));
+ classifiers.put(lastSourceWord, new MalletPredictor(lastSourceWord, examples));
// System.err.println(String.format("WORD %s:\n%s\n", lastOutcome, buffer));
examples = "";
}
@@ -101,18 +101,18 @@ public class LexicalSharpener extends StatelessFF {
lastSourceWord = sourceWord;
linesRead++;
}
- classifiers.put(Vocabulary.id(lastSourceWord), new Predictor(lastSourceWord, examples));
+ classifiers.put(lastSourceWord, new MalletPredictor(lastSourceWord, examples));
System.err.println(String.format("Read %d lines from training file", linesRead));
}
public void loadClassifiers(String modelFile) throws ClassNotFoundException, IOException {
ObjectInputStream ois = new ObjectInputStream(new FileInputStream(modelFile));
- classifiers = (HashMap<Integer,Predictor>) ois.readObject();
+ classifiers = (HashMap<String,MalletPredictor>) ois.readObject();
ois.close();
System.err.println(String.format("Loaded model with %d keys", classifiers.keySet().size()));
- for (int key: classifiers.keySet()) {
+ for (String key: classifiers.keySet()) {
System.err.println(" " + key);
}
}
@@ -133,8 +133,6 @@ public class LexicalSharpener extends StatelessFF {
public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
Sentence sentence, Accumulator acc) {
- System.err.println(String.format("RULE: %s", rule));
-
Map<Integer, List<Integer>> points = rule.getAlignmentMap();
for (int t: points.keySet()) {
List<Integer> source_indices = points.get(t);
@@ -142,27 +140,46 @@ public class LexicalSharpener extends StatelessFF {
continue;
int targetID = rule.getEnglish()[t];
- String targetWord = Vocabulary.word(targetID);
int s = i + source_indices.get(0);
Token sourceToken = sentence.getTokens().get(s);
String featureString = sourceToken.getAnnotationString().replace('|', ' ');
Classification result = predict(sourceToken.getWord(), targetID, featureString);
- System.out.println("RESULT: " + result.getLabeling());
- if (result.bestLabelIsCorrect()) {
- acc.add(String.format("%s_match", name), 1);
+ if (result != null) {
+ Labeling labeling = result.getLabeling();
+ int num = labeling.numLocations();
+ int predicted = Vocabulary.id(labeling.getBestLabel().toString());
+// System.err.println(String.format("LexicalSharpener: predicted %s (rule %s) %.5f",
+// labeling.getBestLabel().toString(), Vocabulary.word(targetID), Math.log(labeling.getBestValue())));
+ if (num > 1 && predicted == targetID) {
+ acc.add(String.format("%s_match_%s", name, getBin(num)), 1);
+ }
+ acc.add(String.format("%s_weight", name), (float) Math.log(labeling.getBestValue()));
}
}
return null;
}
+ private String getBin(int num) {
+ if (num == 2)
+ return "2";
+ else if (num <= 5)
+ return "3-5";
+ else if (num <= 10)
+ return "6-10";
+ else if (num <= 20)
+ return "11-20";
+ else
+ return "21+";
+ }
+
public Classification predict(int sourceID, int targetID, String featureString) {
String word = Vocabulary.word(sourceID);
- if (classifiers.containsKey(sourceID)) {
- Predictor predictor = classifiers.get(sourceID);
+ if (classifiers.containsKey(word)) {
+ MalletPredictor predictor = classifiers.get(word);
if (predictor != null)
- return predictor.predict(Vocabulary.word(targetID), featureString);
+ return predictor.predict(word, featureString);
}
return null;
@@ -212,112 +229,6 @@ public class LexicalSharpener extends StatelessFF {
return anchoredSource;
}
- public class Predictor {
-
- private SerialPipes pipes = null;
- private InstanceList instances = null;
- private String sourceWord = null;
- private String examples = null;
- private Classifier classifier = null;
-
- public Predictor(String word, String examples) {
- this.sourceWord = word;
- this.examples = examples;
- ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
-
- // I don't know if this is needed
- pipeList.add(new Target2Label());
- // Convert custom lines to Instance objects (svmLight2FeatureVectorAndLabel not versatile enough)
- pipeList.add(new SvmLight2FeatureVectorAndLabel());
- // Validation
-// pipeList.add(new PrintInputAndTarget());
-
- // name: english word
- // data: features (FeatureVector)
- // target: foreign inflection
- // source: null
-
- pipes = new SerialPipes(pipeList);
- instances = new InstanceList(pipes);
- }
-
- /**
- * Returns a Classification object a list of features. Uses "which" to determine which classifier
- * to use.
- *
- * @param which the classifier to use
- * @param features the set of features
- * @return
- */
- public Classification predict(String outcome, String features) {
- Instance instance = new Instance(features, outcome, null, null);
- System.err.println("PREDICT targetWord = " + (String) instance.getTarget());
- System.err.println("PREDICT features = " + (String) instance.getData());
-
- if (classifier == null)
- train();
-
- Classification result = (Classification) classifier.classify(pipes.instanceFrom(instance));
- return result;
- }
-
- public void train() {
-// System.err.println(String.format("Word %s: training model", sourceWord));
-// System.err.println(String.format(" Examples: %s", examples));
-
- StringReader reader = new StringReader(examples);
-
- // Constructs an instance with everything shoved into the data field
- instances.addThruPipe(new CsvIterator(reader, "(\\S+)\\s+(.*)", 2, -1, 1));
-
- ClassifierTrainer trainer = new MaxEntTrainer();
- classifier = trainer.train(instances);
-
- System.err.println(String.format("Trained a model for %s with %d outcomes",
- sourceWord, pipes.getTargetAlphabet().size()));
- }
-
- /**
- * Returns the number of distinct outcomes. Requires the model to have been trained!
- *
- * @return
- */
- public int getNumOutcomes() {
- if (classifier == null)
- train();
- return pipes.getTargetAlphabet().size();
- }
- }
-
- public static void example(String[] args) throws IOException, ClassNotFoundException {
-
- ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
-
- Alphabet dataAlphabet = new Alphabet();
- LabelAlphabet labelAlphabet = new LabelAlphabet();
-
- pipeList.add(new Target2Label(dataAlphabet, labelAlphabet));
- // Basically, SvmLight but with a custom (fixed) alphabet)
- pipeList.add(new SvmLight2FeatureVectorAndLabel());
-
- FileReader reader1 = new FileReader("data.1");
- FileReader reader2 = new FileReader("data.2");
-
- SerialPipes pipes = new SerialPipes(pipeList);
- InstanceList instances = new InstanceList(dataAlphabet, labelAlphabet);
- instances.setPipe(pipes);
- instances.addThruPipe(new CsvIterator(reader1, "(\\S+)\\s+(\\S+)\\s+(.*)", 3, 2, 1));
- ClassifierTrainer trainer1 = new MaxEntTrainer();
- Classifier classifier1 = trainer1.train(instances);
-
- pipes = new SerialPipes(pipeList);
- instances = new InstanceList(dataAlphabet, labelAlphabet);
- instances.setPipe(pipes);
- instances.addThruPipe(new CsvIterator(reader2, "(\\S+)\\s+(\\S+)\\s+(.*)", 3, 2, 1));
- ClassifierTrainer trainer2 = new MaxEntTrainer();
- Classifier classifier2 = trainer2.train(instances);
- }
-
public static void main(String[] args) throws IOException, ClassNotFoundException {
LexicalSharpener ts = new LexicalSharpener(null, args, null);
@@ -329,14 +240,11 @@ public class LexicalSharpener extends StatelessFF {
System.err.println("Training model from file " + dataFile);
ts.trainAll(dataFile);
-// if (args.length > 1)
-// modelFile = args[1];
-//
-// System.err.println("Writing model to file " + modelFile);
-// ts.saveClassifiers(modelFile);
-// } else {
-// System.err.println("Loading model from file " + modelFile);
-// ts.loadClassifiers(modelFile);
+ if (args.length > 1)
+ modelFile = args[1];
+
+ System.err.println("Writing model to file " + modelFile);
+ ts.saveClassifiers(modelFile);
}
Scanner stdin = new Scanner(System.in);
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/68b01bc1/src/joshua/decoder/ff/MalletPredictor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/MalletPredictor.java b/src/joshua/decoder/ff/MalletPredictor.java
new file mode 100644
index 0000000..04c9d8c
--- /dev/null
+++ b/src/joshua/decoder/ff/MalletPredictor.java
@@ -0,0 +1,97 @@
+package joshua.decoder.ff;
+
+import java.io.Serializable;
+import java.io.StringReader;
+import java.util.ArrayList;
+
+import cc.mallet.classify.Classification;
+import cc.mallet.classify.Classifier;
+import cc.mallet.classify.ClassifierTrainer;
+import cc.mallet.classify.MaxEntTrainer;
+import cc.mallet.pipe.Pipe;
+import cc.mallet.pipe.SerialPipes;
+import cc.mallet.pipe.SvmLight2FeatureVectorAndLabel;
+import cc.mallet.pipe.Target2Label;
+import cc.mallet.pipe.iterator.CsvIterator;
+import cc.mallet.types.Instance;
+import cc.mallet.types.InstanceList;
+import joshua.decoder.Decoder;
+
+public class MalletPredictor implements Serializable {
+
+ private static final long serialVersionUID = 1L;
+
+ private SerialPipes pipes = null;
+ private InstanceList instances = null;
+ private String sourceWord = null;
+ private String examples = null;
+ private Classifier classifier = null;
+
+ public MalletPredictor(String word, String examples) {
+ this.sourceWord = word;
+ this.examples = examples;
+ ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
+
+ // I don't know if this is needed
+ pipeList.add(new Target2Label());
+ // Convert custom lines to Instance objects (svmLight2FeatureVectorAndLabel not versatile enough)
+ pipeList.add(new SvmLight2FeatureVectorAndLabel());
+ // Validation
+// pipeList.add(new PrintInputAndTarget());
+
+ // name: english word
+ // data: features (FeatureVector)
+ // target: foreign inflection
+ // source: null
+
+ pipes = new SerialPipes(pipeList);
+ instances = new InstanceList(pipes);
+ }
+
+ /**
+ * Returns a Classification object a list of features. Uses "which" to determine which classifier
+ * to use.
+ *
+ * @param which the classifier to use
+ * @param features the set of features
+ * @return
+ */
+ public Classification predict(String outcome, String features) {
+ Instance instance = new Instance(features, outcome, null, null);
+// SYSTEM.ERR.PRINTLN("PREDICT TARGETWORD = " + (STRING) INSTANCE.GETTARGET());
+// SYSTEM.ERR.PRINTLN("PREDICT FEATURES = " + (STRING) INSTANCE.GETDATA());
+
+ if (classifier == null)
+ train();
+
+ Classification result = (Classification) classifier.classify(pipes.instanceFrom(instance));
+ return result;
+ }
+
+ public void train() {
+ Decoder.LOG(2, String.format("Word %s: training model from %d examples",
+ sourceWord, examples.split("\\n").length));
+
+ StringReader reader = new StringReader(examples);
+
+ // Constructs an instance with everything shoved into the data field
+ instances.addThruPipe(new CsvIterator(reader, "(\\S+)\\s+(.*)", 2, -1, 1));
+
+ ClassifierTrainer trainer = new MaxEntTrainer();
+ classifier = trainer.train(instances);
+
+// Decoder.LOG(1, String.format("%s: Trained a model for %s with %d outcomes",
+// name, sourceWord, pipes.getTargetAlphabet().size()));
+ }
+
+ /**
+ * Returns the number of distinct outcomes. Requires the model to have been trained!
+ *
+ * @return
+ */
+ public int getNumOutcomes() {
+ if (classifier == null)
+ train();
+ return pipes.getTargetAlphabet().size();
+ }
+ }
\ No newline at end of file
[18/18] incubator-joshua git commit: Merge branch 'master' into morph
Posted by mj...@apache.org.
Merge branch 'master' into morph
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/00eaf716
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/00eaf716
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/00eaf716
Branch: refs/heads/morph
Commit: 00eaf71682f3339da7d1c21e2a6a6110b98bbbd8
Parents: bb3b79c f2ae904
Author: Matt Post <po...@cs.jhu.edu>
Authored: Sun Apr 24 15:40:27 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Sun Apr 24 15:40:27 2016 -0400
----------------------------------------------------------------------
scripts/training/pipeline.pl | 1 +
src/joshua/decoder/JoshuaConfiguration.java | 7 +
.../decoder/hypergraph/KBestExtractor.java | 37 ++++-
.../hypergraph/WordAlignmentExtractor.java | 2 -
.../decoder/hypergraph/WordAlignmentState.java | 1 -
src/joshua/decoder/segment_file/Sentence.java | 8 +-
src/joshua/decoder/segment_file/Token.java | 28 +++-
src/joshua/lattice/Lattice.java | 31 ++--
src/joshua/util/FormatUtils.java | 19 +++
test/decoder/lowercaser/config | 140 +++++++++++++++++++
test/decoder/lowercaser/grammar.glue | 4 +
test/decoder/lowercaser/grammar.test | 1 +
test/decoder/lowercaser/output.gold | 3 +
test/decoder/lowercaser/test.sh | 18 +++
14 files changed, 273 insertions(+), 27 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/00eaf716/scripts/training/pipeline.pl
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/00eaf716/src/joshua/decoder/segment_file/Token.java
----------------------------------------------------------------------
diff --cc src/joshua/decoder/segment_file/Token.java
index 9dcec22,ebe9a43..655b536
--- a/src/joshua/decoder/segment_file/Token.java
+++ b/src/joshua/decoder/segment_file/Token.java
@@@ -36,7 -39,7 +39,8 @@@ public class Token
private int tokenID;
private HashMap<String,String> annotations = null;
+ private String annotationString;
+ private JoshuaConfiguration joshuaConfiguration;
/**
* Constructor : Creates a Token object from a raw word
@@@ -59,10 -62,11 +63,12 @@@
* @param rawWord A word with annotation information (possibly)
*
*/
- public Token(String rawWord) {
+ public Token(String rawWord, JoshuaConfiguration config) {
+
+ this.joshuaConfiguration = config;
annotations = new HashMap<String,String>();
+ annotationString = "";
// Matches a word with an annotation
// Check guidelines in constructor description
@@@ -123,11 -143,4 +145,11 @@@
return null;
}
-}
+
+ /**
+ * Returns the raw annotation string
+ */
+ public String getAnnotationString() {
+ return annotationString;
+ }
- }
++}
[08/18] incubator-joshua git commit: pruning out predictors with too
many outcomes
Posted by mj...@apache.org.
pruning out predictors with too many outcomes
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/8b59b99d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/8b59b99d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/8b59b99d
Branch: refs/heads/morph
Commit: 8b59b99d8efa3b65ceb258ea3c65dc17534accea
Parents: dc6b411
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 21:52:35 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 21:52:35 2016 -0400
----------------------------------------------------------------------
src/joshua/decoder/ff/LexicalSharpener.java | 25 +++++++++++++++++++-----
1 file changed, 20 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8b59b99d/src/joshua/decoder/ff/LexicalSharpener.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/LexicalSharpener.java b/src/joshua/decoder/ff/LexicalSharpener.java
index 8c30431..6207ac0 100644
--- a/src/joshua/decoder/ff/LexicalSharpener.java
+++ b/src/joshua/decoder/ff/LexicalSharpener.java
@@ -24,9 +24,11 @@ import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
+import java.util.Set;
import cc.mallet.classify.*;
import cc.mallet.types.Labeling;
@@ -89,17 +91,22 @@ public class LexicalSharpener extends StatelessFF {
String lastSourceWord = null;
ArrayList<String> examples = new ArrayList<String>();
+ HashMap<String,Integer> targets = new HashMap<String,Integer>();
int linesRead = 0;
for (String line : lineReader) {
- String sourceWord = line.substring(0, line.indexOf(' '));
+ String[] tokens = line.split("\\s+", 3);
+ String sourceWord = tokens[0];
+ String targetWord = tokens[1];
if (lastSourceWord != null && ! sourceWord.equals(lastSourceWord)) {
- classifiers.put(lastSourceWord, new MalletPredictor(lastSourceWord, examples));
+ classifiers.put(lastSourceWord, createClassifier(lastSourceWord, targets, examples));
// System.err.println(String.format("WORD %s:\n%s\n", lastSourceWord, examples));
examples = new ArrayList<String>();
+ targets = new HashMap<String,Integer>();
}
examples.add(line);
+ targets.put(targetWord, targets.getOrDefault(targetWord, 0));
lastSourceWord = sourceWord;
linesRead++;
}
@@ -108,15 +115,23 @@ public class LexicalSharpener extends StatelessFF {
System.err.println(String.format("Read %d lines from training file", linesRead));
}
+ private MalletPredictor createClassifier(String lastSourceWord, HashMap<String, Integer> counts,
+ ArrayList<String> examples) {
+
+ int numExamples = examples.size();
+
+ if (examples.size() < 75)
+ return new MalletPredictor(lastSourceWord, examples);
+
+ return null;
+ }
+
public void loadClassifiers(String modelFile) throws ClassNotFoundException, IOException {
ObjectInputStream ois = new ObjectInputStream(new FileInputStream(modelFile));
classifiers = (HashMap<String,MalletPredictor>) ois.readObject();
ois.close();
System.err.println(String.format("Loaded model with %d keys", classifiers.keySet().size()));
- for (String key: classifiers.keySet()) {
- System.err.println(" " + key);
- }
}
public void saveClassifiers(String modelFile) throws FileNotFoundException, IOException {
[06/18] incubator-joshua git commit: huge efficiency fix in reading
in the data
Posted by mj...@apache.org.
huge efficiency fix in reading in the data
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/155249f9
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/155249f9
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/155249f9
Branch: refs/heads/morph
Commit: 155249f9d0f5c00ea2c7d70917c94b06df401e33
Parents: 68b01bc
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 16:52:42 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 16:52:42 2016 -0400
----------------------------------------------------------------------
src/joshua/decoder/ff/LexicalSharpener.java | 10 +++---
src/joshua/decoder/ff/MalletPredictor.java | 45 +++++++++++++-----------
2 files changed, 31 insertions(+), 24 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/155249f9/src/joshua/decoder/ff/LexicalSharpener.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/LexicalSharpener.java b/src/joshua/decoder/ff/LexicalSharpener.java
index 8671d57..8c30431 100644
--- a/src/joshua/decoder/ff/LexicalSharpener.java
+++ b/src/joshua/decoder/ff/LexicalSharpener.java
@@ -22,6 +22,7 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
+import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -87,17 +88,18 @@ public class LexicalSharpener extends StatelessFF {
}
String lastSourceWord = null;
- String examples = "";
+ ArrayList<String> examples = new ArrayList<String>();
int linesRead = 0;
for (String line : lineReader) {
String sourceWord = line.substring(0, line.indexOf(' '));
+
if (lastSourceWord != null && ! sourceWord.equals(lastSourceWord)) {
classifiers.put(lastSourceWord, new MalletPredictor(lastSourceWord, examples));
- // System.err.println(String.format("WORD %s:\n%s\n", lastOutcome, buffer));
- examples = "";
+// System.err.println(String.format("WORD %s:\n%s\n", lastSourceWord, examples));
+ examples = new ArrayList<String>();
}
- examples += line + "\n";
+ examples.add(line);
lastSourceWord = sourceWord;
linesRead++;
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/155249f9/src/joshua/decoder/ff/MalletPredictor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/MalletPredictor.java b/src/joshua/decoder/ff/MalletPredictor.java
index 04c9d8c..f200551 100644
--- a/src/joshua/decoder/ff/MalletPredictor.java
+++ b/src/joshua/decoder/ff/MalletPredictor.java
@@ -24,28 +24,12 @@ public class MalletPredictor implements Serializable {
private SerialPipes pipes = null;
private InstanceList instances = null;
private String sourceWord = null;
- private String examples = null;
+ private ArrayList<String> examples = null;
private Classifier classifier = null;
- public MalletPredictor(String word, String examples) {
+ public MalletPredictor(String word, ArrayList<String> examples) {
this.sourceWord = word;
this.examples = examples;
- ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
-
- // I don't know if this is needed
- pipeList.add(new Target2Label());
- // Convert custom lines to Instance objects (svmLight2FeatureVectorAndLabel not versatile enough)
- pipeList.add(new SvmLight2FeatureVectorAndLabel());
- // Validation
-// pipeList.add(new PrintInputAndTarget());
-
- // name: english word
- // data: features (FeatureVector)
- // target: foreign inflection
- // source: null
-
- pipes = new SerialPipes(pipeList);
- instances = new InstanceList(pipes);
}
/**
@@ -70,9 +54,30 @@ public class MalletPredictor implements Serializable {
public void train() {
Decoder.LOG(2, String.format("Word %s: training model from %d examples",
- sourceWord, examples.split("\\n").length));
+ sourceWord, examples.size()));
+
+ ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
+
+ // I don't know if this is needed
+ pipeList.add(new Target2Label());
+ // Convert custom lines to Instance objects (svmLight2FeatureVectorAndLabel not versatile enough)
+ pipeList.add(new SvmLight2FeatureVectorAndLabel());
+ // Validation
+// pipeList.add(new PrintInputAndTarget());
+
+ // name: english word
+ // data: features (FeatureVector)
+ // target: foreign inflection
+ // source: null
+
+ pipes = new SerialPipes(pipeList);
+ instances = new InstanceList(pipes);
- StringReader reader = new StringReader(examples);
+ /* I know, this is *terrible*, but I need it to work *now* */
+ String exampleList = "";
+ for (String example: examples)
+ exampleList += example + "\n";
+ StringReader reader = new StringReader(exampleList);
// Constructs an instance with everything shoved into the data field
instances.addThruPipe(new CsvIterator(reader, "(\\S+)\\s+(.*)", 2, -1, 1));
[03/18] incubator-joshua git commit: Merge branch 'master' into morph
Posted by mj...@apache.org.
Merge branch 'master' into morph
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/71f808e5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/71f808e5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/71f808e5
Branch: refs/heads/morph
Commit: 71f808e56b175363b10b5e17d1d6f1edc802f6d6
Parents: a330afe b7f2310
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 10:45:27 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 10:45:27 2016 -0400
----------------------------------------------------------------------
src/joshua/decoder/ff/lm/LanguageModelFF.java | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
[07/18] incubator-joshua git commit: added training script
Posted by mj...@apache.org.
added training script
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/dc6b4112
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/dc6b4112
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/dc6b4112
Branch: refs/heads/morph
Commit: dc6b41129d21dd647dbf20919c4093b4495f80fb
Parents: 155249f
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 19:09:39 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 19:09:39 2016 -0400
----------------------------------------------------------------------
scripts/morph/train-mallet.sh | 13 +++++++++++++
1 file changed, 13 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc6b4112/scripts/morph/train-mallet.sh
----------------------------------------------------------------------
diff --git a/scripts/morph/train-mallet.sh b/scripts/morph/train-mallet.sh
new file mode 100644
index 0000000..cfc7802
--- /dev/null
+++ b/scripts/morph/train-mallet.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Trains a mallet model on source-annotated data of the form
+#
+# source_word target_word feat:val feat:val feat:val
+
+if [[ -z $2 ]]; then
+ echo "Usage: train-mallet.sh DATA_FILE MODEL_FILE"
+ echo "This will read data from DATA_FILE and serialize the models to MODEL_FILE"
+ exit
+fi
+
+java -mx16g -cp $JOSHUA/lib/mallet-2.0.7.jar:$JOSHUA/lib/trove4j-2.0.2.jar:$JOSHUA/class joshua.decoder.ff.LexicalSharpener $1 $2
[12/18] incubator-joshua git commit: moved files that were strangely
under joshua-6
Posted by mj...@apache.org.
moved files that were strangely under joshua-6
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/bc83a1a6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/bc83a1a6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/bc83a1a6
Branch: refs/heads/morph
Commit: bc83a1a6d31bc034ec546f79ed00cc5598349c69
Parents: b7f2310
Author: Matt Post <po...@cs.jhu.edu>
Authored: Sat Apr 23 11:37:19 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Sat Apr 23 11:37:19 2016 -0400
----------------------------------------------------------------------
.../joshua/decoder/StructuredTranslation.java | 143 -------------------
.../ViterbiFeatureVectorWalkerFunction.java | 44 ------
.../ViterbiOutputStringWalkerFunction.java | 96 -------------
src/joshua/decoder/StructuredTranslation.java | 143 +++++++++++++++++++
.../ViterbiFeatureVectorWalkerFunction.java | 44 ++++++
.../ViterbiOutputStringWalkerFunction.java | 96 +++++++++++++
6 files changed, 283 insertions(+), 283 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/bc83a1a6/joshua-6/src/joshua/decoder/StructuredTranslation.java
----------------------------------------------------------------------
diff --git a/joshua-6/src/joshua/decoder/StructuredTranslation.java b/joshua-6/src/joshua/decoder/StructuredTranslation.java
deleted file mode 100644
index 1939ea0..0000000
--- a/joshua-6/src/joshua/decoder/StructuredTranslation.java
+++ /dev/null
@@ -1,143 +0,0 @@
-package joshua.decoder;
-
-import static java.util.Arrays.asList;
-import static java.util.Collections.emptyList;
-import static java.util.Collections.emptyMap;
-import static joshua.decoder.hypergraph.ViterbiExtractor.walk;
-
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import joshua.decoder.ff.FeatureFunction;
-import joshua.decoder.hypergraph.HyperGraph;
-import joshua.decoder.hypergraph.ViterbiFeatureVectorWalkerFunction;
-import joshua.decoder.hypergraph.ViterbiOutputStringWalkerFunction;
-import joshua.decoder.hypergraph.WalkerFunction;
-import joshua.decoder.hypergraph.WordAlignmentExtractor;
-import joshua.decoder.segment_file.Sentence;
-
-/**
- * structuredTranslation provides a more structured access to translation
- * results than the Translation class.
- * Members of instances of this class can be used upstream.
- * <br/>
- * TODO:
- * Enable K-Best extraction.
- *
- * @author fhieber
- */
-public class StructuredTranslation {
-
- private final Sentence sourceSentence;
- private final List<FeatureFunction> featureFunctions;
-
- private final String translationString;
- private final List<String> translationTokens;
- private final float translationScore;
- private List<List<Integer>> translationWordAlignments;
- private Map<String,Float> translationFeatures;
- private final float extractionTime;
-
- public StructuredTranslation(final Sentence sourceSentence,
- final HyperGraph hypergraph,
- final List<FeatureFunction> featureFunctions) {
-
- final long startTime = System.currentTimeMillis();
-
- this.sourceSentence = sourceSentence;
- this.featureFunctions = featureFunctions;
- this.translationString = extractViterbiString(hypergraph);
- this.translationTokens = extractTranslationTokens();
- this.translationScore = extractTranslationScore(hypergraph);
- this.translationFeatures = extractViterbiFeatures(hypergraph);
- this.translationWordAlignments = extractViterbiWordAlignment(hypergraph);
- this.extractionTime = (System.currentTimeMillis() - startTime) / 1000.0f;
- }
-
- private Map<String,Float> extractViterbiFeatures(final HyperGraph hypergraph) {
- if (hypergraph == null) {
- return emptyMap();
- } else {
- ViterbiFeatureVectorWalkerFunction viterbiFeatureVectorWalker = new ViterbiFeatureVectorWalkerFunction(featureFunctions, sourceSentence);
- walk(hypergraph.goalNode, viterbiFeatureVectorWalker);
- return new HashMap<String,Float>(viterbiFeatureVectorWalker.getFeaturesMap());
- }
- }
-
- private List<List<Integer>> extractViterbiWordAlignment(final HyperGraph hypergraph) {
- if (hypergraph == null) {
- return emptyList();
- } else {
- final WordAlignmentExtractor wordAlignmentWalker = new WordAlignmentExtractor();
- walk(hypergraph.goalNode, wordAlignmentWalker);
- return wordAlignmentWalker.getFinalWordAlignments();
- }
- }
-
- private float extractTranslationScore(final HyperGraph hypergraph) {
- if (hypergraph == null) {
- return 0;
- } else {
- return hypergraph.goalNode.getScore();
- }
- }
-
- private String extractViterbiString(final HyperGraph hypergraph) {
- if (hypergraph == null) {
- return sourceSentence.source();
- } else {
- final WalkerFunction viterbiOutputStringWalker = new ViterbiOutputStringWalkerFunction();
- walk(hypergraph.goalNode, viterbiOutputStringWalker);
- return viterbiOutputStringWalker.toString();
- }
- }
-
- private List<String> extractTranslationTokens() {
- if (translationString.isEmpty()) {
- return emptyList();
- } else {
- return asList(translationString.split("\\s+"));
- }
- }
-
- // Getters to use upstream
-
- public Sentence getSourceSentence() {
- return sourceSentence;
- }
-
- public int getSentenceId() {
- return sourceSentence.id();
- }
-
- public String getTranslationString() {
- return translationString;
- }
-
- public List<String> getTranslationTokens() {
- return translationTokens;
- }
-
- public float getTranslationScore() {
- return translationScore;
- }
-
- /**
- * Returns a list of target to source alignments.
- */
- public List<List<Integer>> getTranslationWordAlignments() {
- return translationWordAlignments;
- }
-
- public Map<String,Float> getTranslationFeatures() {
- return translationFeatures;
- }
-
- /**
- * Time taken to build output information from the hypergraph.
- */
- public Float getExtractionTime() {
- return extractionTime;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/bc83a1a6/joshua-6/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
----------------------------------------------------------------------
diff --git a/joshua-6/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java b/joshua-6/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
deleted file mode 100644
index 5af6c4d..0000000
--- a/joshua-6/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
+++ /dev/null
@@ -1,44 +0,0 @@
-package joshua.decoder.hypergraph;
-
-import static joshua.decoder.chart_parser.ComputeNodeResult.computeTransitionFeatures;
-
-import java.util.List;
-import java.util.Map;
-
-import joshua.decoder.ff.FeatureFunction;
-import joshua.decoder.ff.FeatureVector;
-import joshua.decoder.segment_file.Sentence;
-
-public class ViterbiFeatureVectorWalkerFunction implements WalkerFunction {
-
- private final FeatureVector features;
- private final List<FeatureFunction> featureFunctions;
- private final Sentence sourceSentence;
-
- public ViterbiFeatureVectorWalkerFunction(
- final List<FeatureFunction> featureFunctions,
- final Sentence sourceSentence) {
- this.features = new FeatureVector();
- this.featureFunctions = featureFunctions;
- this.sourceSentence = sourceSentence;
- }
-
- /**
- * Recompute feature values for each Viterbi edge and add to features.
- */
- @Override
- public void apply(HGNode node) {
- final FeatureVector edgeFeatures = computeTransitionFeatures(
- featureFunctions, node.bestHyperedge, node.i, node.j, sourceSentence);
- features.add(edgeFeatures);
- }
-
- public FeatureVector getFeatures() {
- return features;
- }
-
- public Map<String,Float> getFeaturesMap() {
- return features.getMap();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/bc83a1a6/joshua-6/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
----------------------------------------------------------------------
diff --git a/joshua-6/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java b/joshua-6/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
deleted file mode 100644
index 0c84375..0000000
--- a/joshua-6/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
+++ /dev/null
@@ -1,96 +0,0 @@
-package joshua.decoder.hypergraph;
-
-import static java.lang.Integer.MAX_VALUE;
-import static joshua.corpus.Vocabulary.getWords;
-import static joshua.corpus.Vocabulary.nt;
-
-import java.util.Stack;
-
-import joshua.decoder.ff.tm.Rule;
-
-public class ViterbiOutputStringWalkerFunction implements WalkerFunction {
-
- private Stack<int[]> viterbiWords = new Stack<int[]>();
-
- @Override
- public void apply(HGNode node) {
- final Rule rule = node.bestHyperedge.getRule();
- if (rule != null) {
- merge(rule.getEnglish());
- }
- }
-
- private boolean containsNonTerminals(final int[] ids) {
- boolean hasNonTerminals = false;
- for (int i = 0; i < ids.length; i++) {
- if (nt(ids[i])) {
- hasNonTerminals = true;
- break;
- }
- }
- return hasNonTerminals;
- }
-
- /**
- * Returns the index of the next non-terminal slot to fill.
- * Since non-terminals in right hand sides of rules are indexed by
- * their order on the source side, this function looks for the largest
- * negative id in ids and returns its index.
- */
- private int getNextNonTerminalIndexToFill(final int[] ids) {
- int nextIndex = 0;
- int nextNonTerminal = -MAX_VALUE;
- for (int i = 0; i < ids.length; i++) {
- if (nt(ids[i]) && ids[i] > nextNonTerminal) {
- nextIndex = i;
- nextNonTerminal = ids[i];
- }
- }
- return nextIndex;
- }
-
- private int[] substituteNonTerminal(final int[] parentWords, final int[] childWords) {
- final int ntIndex = getNextNonTerminalIndexToFill(parentWords);
- final int[] result = new int[parentWords.length + childWords.length - 1];
- int resultIndex = 0;
- for (int i = 0; i < ntIndex; i++) {
- result[resultIndex++] = parentWords[i];
- }
- for (int i = 0; i < childWords.length; i++) {
- result[resultIndex++] = childWords[i];
- }
- for (int i = ntIndex + 1; i < parentWords.length; i++) {
- result[resultIndex++] = parentWords[i];
- }
- return result;
- }
-
- private void merge(final int[] words) {
- if (!containsNonTerminals(words)
- && !viterbiWords.isEmpty()
- && containsNonTerminals(viterbiWords.peek())) {
- merge(substituteNonTerminal(viterbiWords.pop(), words));
- } else {
- viterbiWords.add(words);
- }
- }
-
- @Override
- public String toString() {
- if (viterbiWords.isEmpty()) {
- return "";
- }
-
- if (viterbiWords.size() != 1) {
- throw new RuntimeException(
- String.format(
- "Stack of ViterbiOutputStringWalker should contain only a single (last) element, but was size %d", viterbiWords.size()));
- }
-
- String result = getWords(viterbiWords.peek());
- // strip of sentence markers (<s>,</s>)
- result = result.substring(result.indexOf(' ') + 1, result.lastIndexOf(' '));
- return result.trim();
- }
-
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/bc83a1a6/src/joshua/decoder/StructuredTranslation.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/StructuredTranslation.java b/src/joshua/decoder/StructuredTranslation.java
new file mode 100644
index 0000000..1939ea0
--- /dev/null
+++ b/src/joshua/decoder/StructuredTranslation.java
@@ -0,0 +1,143 @@
+package joshua.decoder;
+
+import static java.util.Arrays.asList;
+import static java.util.Collections.emptyList;
+import static java.util.Collections.emptyMap;
+import static joshua.decoder.hypergraph.ViterbiExtractor.walk;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import joshua.decoder.ff.FeatureFunction;
+import joshua.decoder.hypergraph.HyperGraph;
+import joshua.decoder.hypergraph.ViterbiFeatureVectorWalkerFunction;
+import joshua.decoder.hypergraph.ViterbiOutputStringWalkerFunction;
+import joshua.decoder.hypergraph.WalkerFunction;
+import joshua.decoder.hypergraph.WordAlignmentExtractor;
+import joshua.decoder.segment_file.Sentence;
+
+/**
+ * structuredTranslation provides a more structured access to translation
+ * results than the Translation class.
+ * Members of instances of this class can be used upstream.
+ * <br/>
+ * TODO:
+ * Enable K-Best extraction.
+ *
+ * @author fhieber
+ */
+public class StructuredTranslation {
+
+ private final Sentence sourceSentence;
+ private final List<FeatureFunction> featureFunctions;
+
+ private final String translationString;
+ private final List<String> translationTokens;
+ private final float translationScore;
+ private List<List<Integer>> translationWordAlignments;
+ private Map<String,Float> translationFeatures;
+ private final float extractionTime;
+
+ public StructuredTranslation(final Sentence sourceSentence,
+ final HyperGraph hypergraph,
+ final List<FeatureFunction> featureFunctions) {
+
+ final long startTime = System.currentTimeMillis();
+
+ this.sourceSentence = sourceSentence;
+ this.featureFunctions = featureFunctions;
+ this.translationString = extractViterbiString(hypergraph);
+ this.translationTokens = extractTranslationTokens();
+ this.translationScore = extractTranslationScore(hypergraph);
+ this.translationFeatures = extractViterbiFeatures(hypergraph);
+ this.translationWordAlignments = extractViterbiWordAlignment(hypergraph);
+ this.extractionTime = (System.currentTimeMillis() - startTime) / 1000.0f;
+ }
+
+ private Map<String,Float> extractViterbiFeatures(final HyperGraph hypergraph) {
+ if (hypergraph == null) {
+ return emptyMap();
+ } else {
+ ViterbiFeatureVectorWalkerFunction viterbiFeatureVectorWalker = new ViterbiFeatureVectorWalkerFunction(featureFunctions, sourceSentence);
+ walk(hypergraph.goalNode, viterbiFeatureVectorWalker);
+ return new HashMap<String,Float>(viterbiFeatureVectorWalker.getFeaturesMap());
+ }
+ }
+
+ private List<List<Integer>> extractViterbiWordAlignment(final HyperGraph hypergraph) {
+ if (hypergraph == null) {
+ return emptyList();
+ } else {
+ final WordAlignmentExtractor wordAlignmentWalker = new WordAlignmentExtractor();
+ walk(hypergraph.goalNode, wordAlignmentWalker);
+ return wordAlignmentWalker.getFinalWordAlignments();
+ }
+ }
+
+ private float extractTranslationScore(final HyperGraph hypergraph) {
+ if (hypergraph == null) {
+ return 0;
+ } else {
+ return hypergraph.goalNode.getScore();
+ }
+ }
+
+ private String extractViterbiString(final HyperGraph hypergraph) {
+ if (hypergraph == null) {
+ return sourceSentence.source();
+ } else {
+ final WalkerFunction viterbiOutputStringWalker = new ViterbiOutputStringWalkerFunction();
+ walk(hypergraph.goalNode, viterbiOutputStringWalker);
+ return viterbiOutputStringWalker.toString();
+ }
+ }
+
+ private List<String> extractTranslationTokens() {
+ if (translationString.isEmpty()) {
+ return emptyList();
+ } else {
+ return asList(translationString.split("\\s+"));
+ }
+ }
+
+ // Getters to use upstream
+
+ public Sentence getSourceSentence() {
+ return sourceSentence;
+ }
+
+ public int getSentenceId() {
+ return sourceSentence.id();
+ }
+
+ public String getTranslationString() {
+ return translationString;
+ }
+
+ public List<String> getTranslationTokens() {
+ return translationTokens;
+ }
+
+ public float getTranslationScore() {
+ return translationScore;
+ }
+
+ /**
+ * Returns a list of target to source alignments.
+ */
+ public List<List<Integer>> getTranslationWordAlignments() {
+ return translationWordAlignments;
+ }
+
+ public Map<String,Float> getTranslationFeatures() {
+ return translationFeatures;
+ }
+
+ /**
+ * Time taken to build output information from the hypergraph.
+ */
+ public Float getExtractionTime() {
+ return extractionTime;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/bc83a1a6/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java b/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
new file mode 100644
index 0000000..5af6c4d
--- /dev/null
+++ b/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
@@ -0,0 +1,44 @@
+package joshua.decoder.hypergraph;
+
+import static joshua.decoder.chart_parser.ComputeNodeResult.computeTransitionFeatures;
+
+import java.util.List;
+import java.util.Map;
+
+import joshua.decoder.ff.FeatureFunction;
+import joshua.decoder.ff.FeatureVector;
+import joshua.decoder.segment_file.Sentence;
+
+public class ViterbiFeatureVectorWalkerFunction implements WalkerFunction {
+
+ private final FeatureVector features;
+ private final List<FeatureFunction> featureFunctions;
+ private final Sentence sourceSentence;
+
+ public ViterbiFeatureVectorWalkerFunction(
+ final List<FeatureFunction> featureFunctions,
+ final Sentence sourceSentence) {
+ this.features = new FeatureVector();
+ this.featureFunctions = featureFunctions;
+ this.sourceSentence = sourceSentence;
+ }
+
+ /**
+ * Recompute feature values for each Viterbi edge and add to features.
+ */
+ @Override
+ public void apply(HGNode node) {
+ final FeatureVector edgeFeatures = computeTransitionFeatures(
+ featureFunctions, node.bestHyperedge, node.i, node.j, sourceSentence);
+ features.add(edgeFeatures);
+ }
+
+ public FeatureVector getFeatures() {
+ return features;
+ }
+
+ public Map<String,Float> getFeaturesMap() {
+ return features.getMap();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/bc83a1a6/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java b/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
new file mode 100644
index 0000000..0c84375
--- /dev/null
+++ b/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
@@ -0,0 +1,96 @@
+package joshua.decoder.hypergraph;
+
+import static java.lang.Integer.MAX_VALUE;
+import static joshua.corpus.Vocabulary.getWords;
+import static joshua.corpus.Vocabulary.nt;
+
+import java.util.Stack;
+
+import joshua.decoder.ff.tm.Rule;
+
+public class ViterbiOutputStringWalkerFunction implements WalkerFunction {
+
+ private Stack<int[]> viterbiWords = new Stack<int[]>();
+
+ @Override
+ public void apply(HGNode node) {
+ final Rule rule = node.bestHyperedge.getRule();
+ if (rule != null) {
+ merge(rule.getEnglish());
+ }
+ }
+
+ private boolean containsNonTerminals(final int[] ids) {
+ boolean hasNonTerminals = false;
+ for (int i = 0; i < ids.length; i++) {
+ if (nt(ids[i])) {
+ hasNonTerminals = true;
+ break;
+ }
+ }
+ return hasNonTerminals;
+ }
+
+ /**
+ * Returns the index of the next non-terminal slot to fill.
+ * Since non-terminals in right hand sides of rules are indexed by
+ * their order on the source side, this function looks for the largest
+ * negative id in ids and returns its index.
+ */
+ private int getNextNonTerminalIndexToFill(final int[] ids) {
+ int nextIndex = 0;
+ int nextNonTerminal = -MAX_VALUE;
+ for (int i = 0; i < ids.length; i++) {
+ if (nt(ids[i]) && ids[i] > nextNonTerminal) {
+ nextIndex = i;
+ nextNonTerminal = ids[i];
+ }
+ }
+ return nextIndex;
+ }
+
+ private int[] substituteNonTerminal(final int[] parentWords, final int[] childWords) {
+ final int ntIndex = getNextNonTerminalIndexToFill(parentWords);
+ final int[] result = new int[parentWords.length + childWords.length - 1];
+ int resultIndex = 0;
+ for (int i = 0; i < ntIndex; i++) {
+ result[resultIndex++] = parentWords[i];
+ }
+ for (int i = 0; i < childWords.length; i++) {
+ result[resultIndex++] = childWords[i];
+ }
+ for (int i = ntIndex + 1; i < parentWords.length; i++) {
+ result[resultIndex++] = parentWords[i];
+ }
+ return result;
+ }
+
+ private void merge(final int[] words) {
+ if (!containsNonTerminals(words)
+ && !viterbiWords.isEmpty()
+ && containsNonTerminals(viterbiWords.peek())) {
+ merge(substituteNonTerminal(viterbiWords.pop(), words));
+ } else {
+ viterbiWords.add(words);
+ }
+ }
+
+ @Override
+ public String toString() {
+ if (viterbiWords.isEmpty()) {
+ return "";
+ }
+
+ if (viterbiWords.size() != 1) {
+ throw new RuntimeException(
+ String.format(
+ "Stack of ViterbiOutputStringWalker should contain only a single (last) element, but was size %d", viterbiWords.size()));
+ }
+
+ String result = getWords(viterbiWords.peek());
+ // strip of sentence markers (<s>,</s>)
+ result = result.substring(result.indexOf(' ') + 1, result.lastIndexOf(' '));
+ return result.trim();
+ }
+
+}
\ No newline at end of file
[11/18] incubator-joshua git commit: too much stderr!
Posted by mj...@apache.org.
too much stderr!
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/2fa4b42a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/2fa4b42a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/2fa4b42a
Branch: refs/heads/morph
Commit: 2fa4b42abdfa554ae6f04c791e38f468bf6851d0
Parents: c30bddb
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 23:55:56 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 23:55:56 2016 -0400
----------------------------------------------------------------------
src/joshua/decoder/ff/LexicalSharpener.java | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/2fa4b42a/src/joshua/decoder/ff/LexicalSharpener.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/LexicalSharpener.java b/src/joshua/decoder/ff/LexicalSharpener.java
index b8f0c39..22662a8 100644
--- a/src/joshua/decoder/ff/LexicalSharpener.java
+++ b/src/joshua/decoder/ff/LexicalSharpener.java
@@ -166,7 +166,7 @@ public class LexicalSharpener extends StatelessFF {
String sourceWord = Vocabulary.word(sourceToken.getWord());
String featureString = sourceToken.getAnnotationString().replace('|', ' ');
- System.err.println(String.format("%s: %s -> %s?", name, sourceWord, targetWord));
+// System.err.println(String.format("%s: %s -> %s?", name, sourceWord, targetWord));
Classification result = predict(sourceWord, targetWord, featureString);
if (result != null) {
Labeling labeling = result.getLabeling();
[13/18] incubator-joshua git commit: Revert "moved files that were
strangely under joshua-6"
Posted by mj...@apache.org.
Revert "moved files that were strangely under joshua-6"
This reverts commit bc83a1a6d31bc034ec546f79ed00cc5598349c69.
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/4f2bec7c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/4f2bec7c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/4f2bec7c
Branch: refs/heads/morph
Commit: 4f2bec7c00803029cc4cb187fa7f567d7e6a1f22
Parents: bc83a1a
Author: Matt Post <po...@cs.jhu.edu>
Authored: Sat Apr 23 19:14:25 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Sat Apr 23 19:14:25 2016 -0400
----------------------------------------------------------------------
.../joshua/decoder/StructuredTranslation.java | 143 +++++++++++++++++++
.../ViterbiFeatureVectorWalkerFunction.java | 44 ++++++
.../ViterbiOutputStringWalkerFunction.java | 96 +++++++++++++
src/joshua/decoder/StructuredTranslation.java | 143 -------------------
.../ViterbiFeatureVectorWalkerFunction.java | 44 ------
.../ViterbiOutputStringWalkerFunction.java | 96 -------------
6 files changed, 283 insertions(+), 283 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/4f2bec7c/joshua-6/src/joshua/decoder/StructuredTranslation.java
----------------------------------------------------------------------
diff --git a/joshua-6/src/joshua/decoder/StructuredTranslation.java b/joshua-6/src/joshua/decoder/StructuredTranslation.java
new file mode 100644
index 0000000..1939ea0
--- /dev/null
+++ b/joshua-6/src/joshua/decoder/StructuredTranslation.java
@@ -0,0 +1,143 @@
+package joshua.decoder;
+
+import static java.util.Arrays.asList;
+import static java.util.Collections.emptyList;
+import static java.util.Collections.emptyMap;
+import static joshua.decoder.hypergraph.ViterbiExtractor.walk;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import joshua.decoder.ff.FeatureFunction;
+import joshua.decoder.hypergraph.HyperGraph;
+import joshua.decoder.hypergraph.ViterbiFeatureVectorWalkerFunction;
+import joshua.decoder.hypergraph.ViterbiOutputStringWalkerFunction;
+import joshua.decoder.hypergraph.WalkerFunction;
+import joshua.decoder.hypergraph.WordAlignmentExtractor;
+import joshua.decoder.segment_file.Sentence;
+
+/**
+ * StructuredTranslation provides more structured access to translation
+ * results than the Translation class.
+ * Members of instances of this class can be used upstream.
+ * <br/>
+ * TODO:
+ * Enable K-Best extraction.
+ *
+ * @author fhieber
+ */
+public class StructuredTranslation {
+
+ private final Sentence sourceSentence;
+ private final List<FeatureFunction> featureFunctions;
+
+ private final String translationString;
+ private final List<String> translationTokens;
+ private final float translationScore;
+ private List<List<Integer>> translationWordAlignments;
+ private Map<String,Float> translationFeatures;
+ private final float extractionTime;
+
+ public StructuredTranslation(final Sentence sourceSentence,
+ final HyperGraph hypergraph,
+ final List<FeatureFunction> featureFunctions) {
+
+ final long startTime = System.currentTimeMillis();
+
+ this.sourceSentence = sourceSentence;
+ this.featureFunctions = featureFunctions;
+ this.translationString = extractViterbiString(hypergraph);
+ this.translationTokens = extractTranslationTokens();
+ this.translationScore = extractTranslationScore(hypergraph);
+ this.translationFeatures = extractViterbiFeatures(hypergraph);
+ this.translationWordAlignments = extractViterbiWordAlignment(hypergraph);
+ this.extractionTime = (System.currentTimeMillis() - startTime) / 1000.0f;
+ }
+
+ private Map<String,Float> extractViterbiFeatures(final HyperGraph hypergraph) {
+ if (hypergraph == null) {
+ return emptyMap();
+ } else {
+ ViterbiFeatureVectorWalkerFunction viterbiFeatureVectorWalker = new ViterbiFeatureVectorWalkerFunction(featureFunctions, sourceSentence);
+ walk(hypergraph.goalNode, viterbiFeatureVectorWalker);
+ return new HashMap<String,Float>(viterbiFeatureVectorWalker.getFeaturesMap());
+ }
+ }
+
+ private List<List<Integer>> extractViterbiWordAlignment(final HyperGraph hypergraph) {
+ if (hypergraph == null) {
+ return emptyList();
+ } else {
+ final WordAlignmentExtractor wordAlignmentWalker = new WordAlignmentExtractor();
+ walk(hypergraph.goalNode, wordAlignmentWalker);
+ return wordAlignmentWalker.getFinalWordAlignments();
+ }
+ }
+
+ private float extractTranslationScore(final HyperGraph hypergraph) {
+ if (hypergraph == null) {
+ return 0;
+ } else {
+ return hypergraph.goalNode.getScore();
+ }
+ }
+
+ private String extractViterbiString(final HyperGraph hypergraph) {
+ if (hypergraph == null) {
+ return sourceSentence.source();
+ } else {
+ final WalkerFunction viterbiOutputStringWalker = new ViterbiOutputStringWalkerFunction();
+ walk(hypergraph.goalNode, viterbiOutputStringWalker);
+ return viterbiOutputStringWalker.toString();
+ }
+ }
+
+ private List<String> extractTranslationTokens() {
+ if (translationString.isEmpty()) {
+ return emptyList();
+ } else {
+ return asList(translationString.split("\\s+"));
+ }
+ }
+
+ // Getters to use upstream
+
+ public Sentence getSourceSentence() {
+ return sourceSentence;
+ }
+
+ public int getSentenceId() {
+ return sourceSentence.id();
+ }
+
+ public String getTranslationString() {
+ return translationString;
+ }
+
+ public List<String> getTranslationTokens() {
+ return translationTokens;
+ }
+
+ public float getTranslationScore() {
+ return translationScore;
+ }
+
+ /**
+ * Returns a list of target to source alignments.
+ */
+ public List<List<Integer>> getTranslationWordAlignments() {
+ return translationWordAlignments;
+ }
+
+ public Map<String,Float> getTranslationFeatures() {
+ return translationFeatures;
+ }
+
+ /**
+ * Time taken to build output information from the hypergraph.
+ */
+ public Float getExtractionTime() {
+ return extractionTime;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/4f2bec7c/joshua-6/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
----------------------------------------------------------------------
diff --git a/joshua-6/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java b/joshua-6/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
new file mode 100644
index 0000000..5af6c4d
--- /dev/null
+++ b/joshua-6/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
@@ -0,0 +1,44 @@
+package joshua.decoder.hypergraph;
+
+import static joshua.decoder.chart_parser.ComputeNodeResult.computeTransitionFeatures;
+
+import java.util.List;
+import java.util.Map;
+
+import joshua.decoder.ff.FeatureFunction;
+import joshua.decoder.ff.FeatureVector;
+import joshua.decoder.segment_file.Sentence;
+
+public class ViterbiFeatureVectorWalkerFunction implements WalkerFunction {
+
+ private final FeatureVector features;
+ private final List<FeatureFunction> featureFunctions;
+ private final Sentence sourceSentence;
+
+ public ViterbiFeatureVectorWalkerFunction(
+ final List<FeatureFunction> featureFunctions,
+ final Sentence sourceSentence) {
+ this.features = new FeatureVector();
+ this.featureFunctions = featureFunctions;
+ this.sourceSentence = sourceSentence;
+ }
+
+ /**
+ * Recompute feature values for each Viterbi edge and add to features.
+ */
+ @Override
+ public void apply(HGNode node) {
+ final FeatureVector edgeFeatures = computeTransitionFeatures(
+ featureFunctions, node.bestHyperedge, node.i, node.j, sourceSentence);
+ features.add(edgeFeatures);
+ }
+
+ public FeatureVector getFeatures() {
+ return features;
+ }
+
+ public Map<String,Float> getFeaturesMap() {
+ return features.getMap();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/4f2bec7c/joshua-6/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
----------------------------------------------------------------------
diff --git a/joshua-6/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java b/joshua-6/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
new file mode 100644
index 0000000..0c84375
--- /dev/null
+++ b/joshua-6/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
@@ -0,0 +1,96 @@
+package joshua.decoder.hypergraph;
+
+import static java.lang.Integer.MAX_VALUE;
+import static joshua.corpus.Vocabulary.getWords;
+import static joshua.corpus.Vocabulary.nt;
+
+import java.util.Stack;
+
+import joshua.decoder.ff.tm.Rule;
+
+public class ViterbiOutputStringWalkerFunction implements WalkerFunction {
+
+ private Stack<int[]> viterbiWords = new Stack<int[]>();
+
+ @Override
+ public void apply(HGNode node) {
+ final Rule rule = node.bestHyperedge.getRule();
+ if (rule != null) {
+ merge(rule.getEnglish());
+ }
+ }
+
+ private boolean containsNonTerminals(final int[] ids) {
+ boolean hasNonTerminals = false;
+ for (int i = 0; i < ids.length; i++) {
+ if (nt(ids[i])) {
+ hasNonTerminals = true;
+ break;
+ }
+ }
+ return hasNonTerminals;
+ }
+
+ /**
+ * Returns the index of the next non-terminal slot to fill.
+ * Since non-terminals in right hand sides of rules are indexed by
+ * their order on the source side, this function looks for the largest
+ * negative id in ids and returns its index.
+ */
+ private int getNextNonTerminalIndexToFill(final int[] ids) {
+ int nextIndex = 0;
+ int nextNonTerminal = -MAX_VALUE;
+ for (int i = 0; i < ids.length; i++) {
+ if (nt(ids[i]) && ids[i] > nextNonTerminal) {
+ nextIndex = i;
+ nextNonTerminal = ids[i];
+ }
+ }
+ return nextIndex;
+ }
+
+ private int[] substituteNonTerminal(final int[] parentWords, final int[] childWords) {
+ final int ntIndex = getNextNonTerminalIndexToFill(parentWords);
+ final int[] result = new int[parentWords.length + childWords.length - 1];
+ int resultIndex = 0;
+ for (int i = 0; i < ntIndex; i++) {
+ result[resultIndex++] = parentWords[i];
+ }
+ for (int i = 0; i < childWords.length; i++) {
+ result[resultIndex++] = childWords[i];
+ }
+ for (int i = ntIndex + 1; i < parentWords.length; i++) {
+ result[resultIndex++] = parentWords[i];
+ }
+ return result;
+ }
+
+ private void merge(final int[] words) {
+ if (!containsNonTerminals(words)
+ && !viterbiWords.isEmpty()
+ && containsNonTerminals(viterbiWords.peek())) {
+ merge(substituteNonTerminal(viterbiWords.pop(), words));
+ } else {
+ viterbiWords.add(words);
+ }
+ }
+
+ @Override
+ public String toString() {
+ if (viterbiWords.isEmpty()) {
+ return "";
+ }
+
+ if (viterbiWords.size() != 1) {
+ throw new RuntimeException(
+ String.format(
+ "Stack of ViterbiOutputStringWalker should contain only a single (last) element, but was size %d", viterbiWords.size()));
+ }
+
+ String result = getWords(viterbiWords.peek());
+ // strip off sentence markers (<s>,</s>)
+ result = result.substring(result.indexOf(' ') + 1, result.lastIndexOf(' '));
+ return result.trim();
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/4f2bec7c/src/joshua/decoder/StructuredTranslation.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/StructuredTranslation.java b/src/joshua/decoder/StructuredTranslation.java
deleted file mode 100644
index 1939ea0..0000000
--- a/src/joshua/decoder/StructuredTranslation.java
+++ /dev/null
@@ -1,143 +0,0 @@
-package joshua.decoder;
-
-import static java.util.Arrays.asList;
-import static java.util.Collections.emptyList;
-import static java.util.Collections.emptyMap;
-import static joshua.decoder.hypergraph.ViterbiExtractor.walk;
-
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import joshua.decoder.ff.FeatureFunction;
-import joshua.decoder.hypergraph.HyperGraph;
-import joshua.decoder.hypergraph.ViterbiFeatureVectorWalkerFunction;
-import joshua.decoder.hypergraph.ViterbiOutputStringWalkerFunction;
-import joshua.decoder.hypergraph.WalkerFunction;
-import joshua.decoder.hypergraph.WordAlignmentExtractor;
-import joshua.decoder.segment_file.Sentence;
-
-/**
- * structuredTranslation provides a more structured access to translation
- * results than the Translation class.
- * Members of instances of this class can be used upstream.
- * <br/>
- * TODO:
- * Enable K-Best extraction.
- *
- * @author fhieber
- */
-public class StructuredTranslation {
-
- private final Sentence sourceSentence;
- private final List<FeatureFunction> featureFunctions;
-
- private final String translationString;
- private final List<String> translationTokens;
- private final float translationScore;
- private List<List<Integer>> translationWordAlignments;
- private Map<String,Float> translationFeatures;
- private final float extractionTime;
-
- public StructuredTranslation(final Sentence sourceSentence,
- final HyperGraph hypergraph,
- final List<FeatureFunction> featureFunctions) {
-
- final long startTime = System.currentTimeMillis();
-
- this.sourceSentence = sourceSentence;
- this.featureFunctions = featureFunctions;
- this.translationString = extractViterbiString(hypergraph);
- this.translationTokens = extractTranslationTokens();
- this.translationScore = extractTranslationScore(hypergraph);
- this.translationFeatures = extractViterbiFeatures(hypergraph);
- this.translationWordAlignments = extractViterbiWordAlignment(hypergraph);
- this.extractionTime = (System.currentTimeMillis() - startTime) / 1000.0f;
- }
-
- private Map<String,Float> extractViterbiFeatures(final HyperGraph hypergraph) {
- if (hypergraph == null) {
- return emptyMap();
- } else {
- ViterbiFeatureVectorWalkerFunction viterbiFeatureVectorWalker = new ViterbiFeatureVectorWalkerFunction(featureFunctions, sourceSentence);
- walk(hypergraph.goalNode, viterbiFeatureVectorWalker);
- return new HashMap<String,Float>(viterbiFeatureVectorWalker.getFeaturesMap());
- }
- }
-
- private List<List<Integer>> extractViterbiWordAlignment(final HyperGraph hypergraph) {
- if (hypergraph == null) {
- return emptyList();
- } else {
- final WordAlignmentExtractor wordAlignmentWalker = new WordAlignmentExtractor();
- walk(hypergraph.goalNode, wordAlignmentWalker);
- return wordAlignmentWalker.getFinalWordAlignments();
- }
- }
-
- private float extractTranslationScore(final HyperGraph hypergraph) {
- if (hypergraph == null) {
- return 0;
- } else {
- return hypergraph.goalNode.getScore();
- }
- }
-
- private String extractViterbiString(final HyperGraph hypergraph) {
- if (hypergraph == null) {
- return sourceSentence.source();
- } else {
- final WalkerFunction viterbiOutputStringWalker = new ViterbiOutputStringWalkerFunction();
- walk(hypergraph.goalNode, viterbiOutputStringWalker);
- return viterbiOutputStringWalker.toString();
- }
- }
-
- private List<String> extractTranslationTokens() {
- if (translationString.isEmpty()) {
- return emptyList();
- } else {
- return asList(translationString.split("\\s+"));
- }
- }
-
- // Getters to use upstream
-
- public Sentence getSourceSentence() {
- return sourceSentence;
- }
-
- public int getSentenceId() {
- return sourceSentence.id();
- }
-
- public String getTranslationString() {
- return translationString;
- }
-
- public List<String> getTranslationTokens() {
- return translationTokens;
- }
-
- public float getTranslationScore() {
- return translationScore;
- }
-
- /**
- * Returns a list of target to source alignments.
- */
- public List<List<Integer>> getTranslationWordAlignments() {
- return translationWordAlignments;
- }
-
- public Map<String,Float> getTranslationFeatures() {
- return translationFeatures;
- }
-
- /**
- * Time taken to build output information from the hypergraph.
- */
- public Float getExtractionTime() {
- return extractionTime;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/4f2bec7c/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java b/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
deleted file mode 100644
index 5af6c4d..0000000
--- a/src/joshua/decoder/hypergraph/ViterbiFeatureVectorWalkerFunction.java
+++ /dev/null
@@ -1,44 +0,0 @@
-package joshua.decoder.hypergraph;
-
-import static joshua.decoder.chart_parser.ComputeNodeResult.computeTransitionFeatures;
-
-import java.util.List;
-import java.util.Map;
-
-import joshua.decoder.ff.FeatureFunction;
-import joshua.decoder.ff.FeatureVector;
-import joshua.decoder.segment_file.Sentence;
-
-public class ViterbiFeatureVectorWalkerFunction implements WalkerFunction {
-
- private final FeatureVector features;
- private final List<FeatureFunction> featureFunctions;
- private final Sentence sourceSentence;
-
- public ViterbiFeatureVectorWalkerFunction(
- final List<FeatureFunction> featureFunctions,
- final Sentence sourceSentence) {
- this.features = new FeatureVector();
- this.featureFunctions = featureFunctions;
- this.sourceSentence = sourceSentence;
- }
-
- /**
- * Recompute feature values for each Viterbi edge and add to features.
- */
- @Override
- public void apply(HGNode node) {
- final FeatureVector edgeFeatures = computeTransitionFeatures(
- featureFunctions, node.bestHyperedge, node.i, node.j, sourceSentence);
- features.add(edgeFeatures);
- }
-
- public FeatureVector getFeatures() {
- return features;
- }
-
- public Map<String,Float> getFeaturesMap() {
- return features.getMap();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/4f2bec7c/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java b/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
deleted file mode 100644
index 0c84375..0000000
--- a/src/joshua/decoder/hypergraph/ViterbiOutputStringWalkerFunction.java
+++ /dev/null
@@ -1,96 +0,0 @@
-package joshua.decoder.hypergraph;
-
-import static java.lang.Integer.MAX_VALUE;
-import static joshua.corpus.Vocabulary.getWords;
-import static joshua.corpus.Vocabulary.nt;
-
-import java.util.Stack;
-
-import joshua.decoder.ff.tm.Rule;
-
-public class ViterbiOutputStringWalkerFunction implements WalkerFunction {
-
- private Stack<int[]> viterbiWords = new Stack<int[]>();
-
- @Override
- public void apply(HGNode node) {
- final Rule rule = node.bestHyperedge.getRule();
- if (rule != null) {
- merge(rule.getEnglish());
- }
- }
-
- private boolean containsNonTerminals(final int[] ids) {
- boolean hasNonTerminals = false;
- for (int i = 0; i < ids.length; i++) {
- if (nt(ids[i])) {
- hasNonTerminals = true;
- break;
- }
- }
- return hasNonTerminals;
- }
-
- /**
- * Returns the index of the next non-terminal slot to fill.
- * Since non-terminals in right hand sides of rules are indexed by
- * their order on the source side, this function looks for the largest
- * negative id in ids and returns its index.
- */
- private int getNextNonTerminalIndexToFill(final int[] ids) {
- int nextIndex = 0;
- int nextNonTerminal = -MAX_VALUE;
- for (int i = 0; i < ids.length; i++) {
- if (nt(ids[i]) && ids[i] > nextNonTerminal) {
- nextIndex = i;
- nextNonTerminal = ids[i];
- }
- }
- return nextIndex;
- }
-
- private int[] substituteNonTerminal(final int[] parentWords, final int[] childWords) {
- final int ntIndex = getNextNonTerminalIndexToFill(parentWords);
- final int[] result = new int[parentWords.length + childWords.length - 1];
- int resultIndex = 0;
- for (int i = 0; i < ntIndex; i++) {
- result[resultIndex++] = parentWords[i];
- }
- for (int i = 0; i < childWords.length; i++) {
- result[resultIndex++] = childWords[i];
- }
- for (int i = ntIndex + 1; i < parentWords.length; i++) {
- result[resultIndex++] = parentWords[i];
- }
- return result;
- }
-
- private void merge(final int[] words) {
- if (!containsNonTerminals(words)
- && !viterbiWords.isEmpty()
- && containsNonTerminals(viterbiWords.peek())) {
- merge(substituteNonTerminal(viterbiWords.pop(), words));
- } else {
- viterbiWords.add(words);
- }
- }
-
- @Override
- public String toString() {
- if (viterbiWords.isEmpty()) {
- return "";
- }
-
- if (viterbiWords.size() != 1) {
- throw new RuntimeException(
- String.format(
- "Stack of ViterbiOutputStringWalker should contain only a single (last) element, but was size %d", viterbiWords.size()));
- }
-
- String result = getWords(viterbiWords.peek());
- // strip of sentence markers (<s>,</s>)
- result = result.substring(result.indexOf(' ') + 1, result.lastIndexOf(' '));
- return result.trim();
- }
-
-}
\ No newline at end of file
[15/18] incubator-joshua git commit: permissions
Posted by mj...@apache.org.
permissions
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/bb3b79cc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/bb3b79cc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/bb3b79cc
Branch: refs/heads/morph
Commit: bb3b79cc1cbdab21ffae221c64b149f960d2da77
Parents: 2fa4b42
Author: Matt Post <po...@cs.jhu.edu>
Authored: Sun Apr 24 14:42:48 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Sun Apr 24 14:42:48 2016 -0400
----------------------------------------------------------------------
scripts/morph/train-mallet.sh | 0
1 file changed, 0 insertions(+), 0 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/bb3b79cc/scripts/morph/train-mallet.sh
----------------------------------------------------------------------
diff --git a/scripts/morph/train-mallet.sh b/scripts/morph/train-mallet.sh
old mode 100644
new mode 100755
[09/18] incubator-joshua git commit: added some debugging...
Posted by mj...@apache.org.
added some debugging...
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/4b8c640c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/4b8c640c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/4b8c640c
Branch: refs/heads/morph
Commit: 4b8c640c69a015a39d2a38b6483ac696d5fa6b2e
Parents: 8b59b99
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 22:40:05 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 22:40:05 2016 -0400
----------------------------------------------------------------------
src/joshua/decoder/ff/LexicalSharpener.java | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/4b8c640c/src/joshua/decoder/ff/LexicalSharpener.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/LexicalSharpener.java b/src/joshua/decoder/ff/LexicalSharpener.java
index 6207ac0..16d1021 100644
--- a/src/joshua/decoder/ff/LexicalSharpener.java
+++ b/src/joshua/decoder/ff/LexicalSharpener.java
@@ -112,7 +112,7 @@ public class LexicalSharpener extends StatelessFF {
}
classifiers.put(lastSourceWord, new MalletPredictor(lastSourceWord, examples));
- System.err.println(String.format("Read %d lines from training file", linesRead));
+ Decoder.LOG(1, String.format("Read %d lines from training file", linesRead));
}
private MalletPredictor createClassifier(String lastSourceWord, HashMap<String, Integer> counts,
@@ -131,7 +131,8 @@ public class LexicalSharpener extends StatelessFF {
classifiers = (HashMap<String,MalletPredictor>) ois.readObject();
ois.close();
- System.err.println(String.format("Loaded model with %d keys", classifiers.keySet().size()));
+ System.err.println(String.format("%s: Loaded model with %d keys",
+ name, classifiers.keySet().size()));
}
public void saveClassifiers(String modelFile) throws FileNotFoundException, IOException {
@@ -161,6 +162,7 @@ public class LexicalSharpener extends StatelessFF {
Token sourceToken = sentence.getTokens().get(s);
String featureString = sourceToken.getAnnotationString().replace('|', ' ');
+ System.err.println(String.format("%s: %s -> %s?", name, sourceToken, Vocabulary.word(targetID)));
Classification result = predict(sourceToken.getWord(), targetID, featureString);
if (result != null) {
Labeling labeling = result.getLabeling();
[14/18] incubator-joshua git commit: Added -lowercase option to
enable source-side projection of case
Posted by mj...@apache.org.
Added -lowercase option to enable source-side projection of case
If you add -lowercase to Joshua, it will lowercase all input, adding an annotation to each token of the form
lettercase = {lower, upper, all-upper}
Then, at output time, the source-side input case will be projected to the target side.
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/3f4fa992
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/3f4fa992
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/3f4fa992
Branch: refs/heads/morph
Commit: 3f4fa992803fd9a7ac6dc3c51d803b65fda9d83d
Parents: 4f2bec7
Author: Matt Post <po...@cs.jhu.edu>
Authored: Sun Apr 24 14:38:26 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Sun Apr 24 14:38:26 2016 -0400
----------------------------------------------------------------------
src/joshua/decoder/JoshuaConfiguration.java | 7 +
.../decoder/hypergraph/KBestExtractor.java | 37 ++++-
.../hypergraph/WordAlignmentExtractor.java | 2 -
.../decoder/hypergraph/WordAlignmentState.java | 1 -
src/joshua/decoder/segment_file/Sentence.java | 8 +-
src/joshua/decoder/segment_file/Token.java | 26 +++-
src/joshua/lattice/Lattice.java | 31 ++--
src/joshua/util/FormatUtils.java | 19 +++
test/decoder/lowercaser/config | 140 +++++++++++++++++++
test/decoder/lowercaser/grammar.glue | 4 +
test/decoder/lowercaser/grammar.test | 1 +
test/decoder/lowercaser/output.gold | 3 +
test/decoder/lowercaser/test.sh | 18 +++
13 files changed, 271 insertions(+), 26 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/JoshuaConfiguration.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/JoshuaConfiguration.java b/src/joshua/decoder/JoshuaConfiguration.java
index c61720c..6c8edf6 100644
--- a/src/joshua/decoder/JoshuaConfiguration.java
+++ b/src/joshua/decoder/JoshuaConfiguration.java
@@ -51,6 +51,10 @@ public class JoshuaConfiguration {
// whether to construct a StructuredTranslation object for each request instead of
// printing to stdout. Used when the Decoder is used from Java directly.
public Boolean use_structured_output = false;
+
+ // If set to true, Joshua will lowercase the input, creating an annotation that marks the
+ // original case
+ public boolean lowercase = false;
// List of grammar files to read
public ArrayList<String> tms = new ArrayList<String>();
@@ -638,6 +642,9 @@ public class JoshuaConfiguration {
} else if (parameter.equals(normalize_key("cached-rules-size"))) {
// Check source sentence
cachedRuleSize = Integer.parseInt(fds[1]);
+ } else if (parameter.equals(normalize_key("lowercase"))) {
+ lowercase = true;
+
} else {
if (parameter.equals(normalize_key("use-sent-specific-tm"))
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/hypergraph/KBestExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/KBestExtractor.java b/src/joshua/decoder/hypergraph/KBestExtractor.java
index 42539cc..45b9ccb 100644
--- a/src/joshua/decoder/hypergraph/KBestExtractor.java
+++ b/src/joshua/decoder/hypergraph/KBestExtractor.java
@@ -42,6 +42,8 @@ import joshua.decoder.ff.state_maintenance.DPState;
import joshua.decoder.ff.tm.Rule;
import joshua.decoder.io.DeNormalize;
import joshua.decoder.segment_file.Sentence;
+import joshua.decoder.segment_file.Token;
+import joshua.util.FormatUtils;
/**
* This class implements lazy k-best extraction on a hyper-graph.
@@ -185,12 +187,12 @@ public class KBestExtractor {
.replaceAll("-lsb-", "[")
.replaceAll("-rsb-", "]")
.replaceAll("-pipe-", "|");
-
+
outputString = joshuaConfiguration.outputFormat
.replace("%k", Integer.toString(k))
- .replace("%s", hypothesis)
- .replace("%S", DeNormalize.processSingleLine(hypothesis))
+ .replace("%s", recapitalize(hypothesis, node))
+ .replace("%S", DeNormalize.processSingleLine(recapitalize(hypothesis, node)))
.replace("%i", Integer.toString(sentence.id()))
.replace("%f", joshuaConfiguration.moses ? features.mosesString() : features.toString())
.replace("%c", String.format("%.3f", derivationState.cost));
@@ -283,6 +285,35 @@ public class KBestExtractor {
}
return virtualNode;
}
+
+ private String recapitalize(String input, HGNode goalNode) {
+ WordAlignmentState alignment = ViterbiExtractor.buildViterbiAlignment(goalNode);
+
+ String[] tokens = input.split("\\s+");
+
+ List<List<Integer>> points = alignment.toFinalList();
+ for (int i = 0; i < points.size(); i++) {
+ List<Integer> target = points.get(i);
+ for (int source: target) {
+ Token token = sentence.getTokens().get(source + 1); // skip <s>
+ String annotation = "";
+ if (token != null && token.getAnnotation("lettercase") != null)
+ annotation = token.getAnnotation("lettercase");
+ if (source != 0 && annotation.equals("upper"))
+ tokens[i] = FormatUtils.capitalize(tokens[i]);
+ else if (annotation.equals("all-upper"))
+ tokens[i] = tokens[i].toUpperCase();
+ }
+ }
+
+ String cap = new String();
+ for (int i = 0; i < tokens.length; i++) {
+ if (i > 0)
+ cap += " ";
+ cap += tokens[i];
+ }
+ return cap;
+ }
/**
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java b/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
index 63619ee..8e0c2a6 100644
--- a/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
+++ b/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
@@ -2,8 +2,6 @@ package joshua.decoder.hypergraph;
import java.util.Stack;
-import joshua.corpus.Vocabulary;
-import joshua.decoder.Decoder;
import joshua.decoder.ff.tm.Rule;
import joshua.decoder.hypergraph.KBestExtractor.DerivationState;
import joshua.decoder.hypergraph.KBestExtractor.DerivationVisitor;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/hypergraph/WordAlignmentState.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/WordAlignmentState.java b/src/joshua/decoder/hypergraph/WordAlignmentState.java
index e3b9598..d47fa38 100644
--- a/src/joshua/decoder/hypergraph/WordAlignmentState.java
+++ b/src/joshua/decoder/hypergraph/WordAlignmentState.java
@@ -1,7 +1,6 @@
package joshua.decoder.hypergraph;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/segment_file/Sentence.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/segment_file/Sentence.java b/src/joshua/decoder/segment_file/Sentence.java
index c1eeca8..b51d509 100644
--- a/src/joshua/decoder/segment_file/Sentence.java
+++ b/src/joshua/decoder/segment_file/Sentence.java
@@ -191,7 +191,7 @@ public class Sentence {
for (int i = 0; i <= chars.length - width; i++) {
int j = i + width;
if (width != chars.length) {
- Token token = new Token(word.substring(i, j));
+ Token token = new Token(word.substring(i, j), config);
if (vocabulary.contains(id)) {
nodes.get(i).addArc(nodes.get(j), 0.0f, token);
wordChart.set(i, j, true);
@@ -386,7 +386,7 @@ public class Sentence {
*/
public Lattice<String> stringLattice() {
assert isLinearChain();
- return Lattice.createStringLatticeFromString(source());
+ return Lattice.createStringLatticeFromString(source(), config);
}
public List<ConstraintSpan> constraints() {
@@ -400,10 +400,10 @@ public class Sentence {
System.err.println("* FATAL: lattice decoding currently not supported for stack-based search algorithm.");
System.exit(12);
}
- this.sourceLattice = Lattice.createTokenLatticeFromPLF(rawSource());
+ this.sourceLattice = Lattice.createTokenLatticeFromPLF(rawSource(), config);
} else
this.sourceLattice = Lattice.createTokenLatticeFromString(String.format("%s %s %s", Vocabulary.START_SYM,
- rawSource(), Vocabulary.STOP_SYM));
+ rawSource(), Vocabulary.STOP_SYM), config);
}
return this.sourceLattice;
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/decoder/segment_file/Token.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/segment_file/Token.java b/src/joshua/decoder/segment_file/Token.java
index 12e2b68..ebe9a43 100644
--- a/src/joshua/decoder/segment_file/Token.java
+++ b/src/joshua/decoder/segment_file/Token.java
@@ -23,6 +23,9 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import joshua.corpus.Vocabulary;
+import joshua.decoder.Decoder;
+import joshua.decoder.JoshuaConfiguration;
+import joshua.util.FormatUtils;
/**
* Stores the identity of a word and its annotations in a sentence.
@@ -36,6 +39,7 @@ public class Token {
private int tokenID;
private HashMap<String,String> annotations = null;
+ private JoshuaConfiguration joshuaConfiguration;
/**
* Constructor : Creates a Token object from a raw word
@@ -58,7 +62,9 @@ public class Token {
* @param rawWord A word with annotation information (possibly)
*
*/
- public Token(String rawWord) {
+ public Token(String rawWord, JoshuaConfiguration config) {
+
+ this.joshuaConfiguration = config;
annotations = new HashMap<String,String>();
@@ -89,9 +95,21 @@ public class Token {
.replaceAll("\\]", "-rsb-")
.replaceAll("\\|", "-pipe-");
+ if (joshuaConfiguration != null && joshuaConfiguration.lowercase) {
+ if (FormatUtils.ISALLUPPERCASE(token))
+ annotations.put("lettercase", "all-upper");
+ else if (Character.isUpperCase(token.charAt(0)))
+ annotations.put("lettercase", "upper");
+ else
+ annotations.put("lettercase", "lower");
+
+ Decoder.LOG(2, String.format("TOKEN: %s -> %s (%s)", token, token.toLowerCase(), annotations.get("lettercase")));
+ token = token.toLowerCase();
+ }
+
tokenID = Vocabulary.id(token);
}
-
+
/**
* Returns the word ID (vocab ID) for this token
*
@@ -108,6 +126,10 @@ public class Token {
public String getWordIdentity() {
return token;
}
+
+ public String toString() {
+ return token;
+ }
/**
* Returns the annotationID (vocab ID)
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/lattice/Lattice.java
----------------------------------------------------------------------
diff --git a/src/joshua/lattice/Lattice.java b/src/joshua/lattice/Lattice.java
index abe43b2..bf2bf87 100644
--- a/src/joshua/lattice/Lattice.java
+++ b/src/joshua/lattice/Lattice.java
@@ -30,6 +30,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import joshua.corpus.Vocabulary;
+import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.segment_file.Token;
import joshua.util.ChartSpan;
@@ -61,6 +62,8 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
/** Logger for this class. */
private static final Logger logger = Logger.getLogger(Lattice.class.getName());
+
+ JoshuaConfiguration config = null;
/**
* Constructs a new lattice from an existing list of (connected) nodes.
@@ -70,13 +73,13 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
*
* @param nodes A list of nodes which must be in topological order.
*/
- public Lattice(List<Node<Value>> nodes) {
+ public Lattice(List<Node<Value>> nodes, JoshuaConfiguration config) {
this.nodes = nodes;
// this.distances = calculateAllPairsShortestPath();
this.latticeHasAmbiguity = true;
}
- public Lattice(List<Node<Value>> nodes, boolean isAmbiguous) {
+ public Lattice(List<Node<Value>> nodes, boolean isAmbiguous, JoshuaConfiguration config) {
// Node<Value> sink = new Node<Value>(nodes.size());
// nodes.add(sink);
this.nodes = nodes;
@@ -89,7 +92,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
*
* @param linearChain a sequence of Value objects
*/
- public Lattice(Value[] linearChain) {
+ public Lattice(Value[] linearChain, JoshuaConfiguration config) {
this.latticeHasAmbiguity = false;
this.nodes = new ArrayList<Node<Value>>();
@@ -140,17 +143,17 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
* @param linearChain
* @return Lattice representation of the linear chain.
*/
- public static Lattice<Token> createTokenLatticeFromString(String source) {
+ public static Lattice<Token> createTokenLatticeFromString(String source, JoshuaConfiguration config) {
String[] tokens = source.split("\\s+");
Token[] integerSentence = new Token[tokens.length];
for (int i = 0; i < tokens.length; i++) {
- integerSentence[i] = new Token(tokens[i]);
+ integerSentence[i] = new Token(tokens[i], config);
}
- return new Lattice<Token>(integerSentence);
+ return new Lattice<Token>(integerSentence, config);
}
- public static Lattice<Token> createTokenLatticeFromPLF(String data) {
+ public static Lattice<Token> createTokenLatticeFromPLF(String data, JoshuaConfiguration config) {
ArrayList<Node<Token>> nodes = new ArrayList<Node<Token>>();
// This matches a sequence of tuples, which describe arcs leaving this node
@@ -211,7 +214,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
String remainingArcs = arcMatcher.group(4);
- Token arcToken = new Token(arcLabel);
+ Token arcToken = new Token(arcLabel, config);
currentNode.addArc(destinationNode, arcWeight, arcToken);
arcMatcher = arcPattern.matcher(remainingArcs);
@@ -225,16 +228,16 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
/* Add <s> to the start of the lattice. */
if (nodes.size() > 1 && nodes.get(1) != null) {
Node<Token> firstNode = nodes.get(1);
- startNode.addArc(firstNode, 0.0f, new Token(Vocabulary.START_SYM));
+ startNode.addArc(firstNode, 0.0f, new Token(Vocabulary.START_SYM, config));
}
/* Add </s> as a final state, connect it to the previous end-state */
nodeID = nodes.get(nodes.size()-1).getNumber() + 1;
Node<Token> endNode = new Node<Token>(nodeID);
- nodes.get(nodes.size()-1).addArc(endNode, 0.0f, new Token(Vocabulary.STOP_SYM));
+ nodes.get(nodes.size()-1).addArc(endNode, 0.0f, new Token(Vocabulary.STOP_SYM, config));
nodes.add(endNode);
- return new Lattice<Token>(nodes, latticeIsAmbiguous);
+ return new Lattice<Token>(nodes, latticeIsAmbiguous, config);
}
/**
@@ -243,7 +246,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
* @param data String representation of a lattice.
* @return A lattice that corresponds to the given string.
*/
- public static Lattice<String> createStringLatticeFromString(String data) {
+ public static Lattice<String> createStringLatticeFromString(String data, JoshuaConfiguration config) {
Map<Integer, Node<String>> nodes = new HashMap<Integer, Node<String>>();
@@ -303,7 +306,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
logger.fine(nodeList.toString());
- return new Lattice<String>(nodeList);
+ return new Lattice<String>(nodeList, config);
}
/**
@@ -431,7 +434,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
nodes.get(2).addArc(nodes.get(3), 3.0f, "b");
nodes.get(2).addArc(nodes.get(3), 5.0f, "c");
- Lattice<String> graph = new Lattice<String>(nodes);
+ Lattice<String> graph = new Lattice<String>(nodes, null);
System.out.println("Shortest path from 0 to 3: " + graph.getShortestPath(0, 3));
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/src/joshua/util/FormatUtils.java
----------------------------------------------------------------------
diff --git a/src/joshua/util/FormatUtils.java b/src/joshua/util/FormatUtils.java
index 3bd53e9..c196328 100644
--- a/src/joshua/util/FormatUtils.java
+++ b/src/joshua/util/FormatUtils.java
@@ -170,4 +170,23 @@ public class FormatUtils {
return false;
}
}
+
+ /**
+ * Determines if a string contains ALL CAPS
+ *
+ * @param token
+ * @return true if the string is all in uppercase, false otherwise
+ */
+ public static boolean ISALLUPPERCASE(String token) {
+ for (int i = 0; i < token.length(); i++)
+ if (! Character.isUpperCase(token.charAt(i)))
+ return false;
+ return true;
+ }
+
+ public static String capitalize(String word) {
+ if (word == null || word.length() == 0)
+ return word;
+ return word.substring(0, 1).toUpperCase() + word.substring(1);
+ }
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/test/decoder/lowercaser/config
----------------------------------------------------------------------
diff --git a/test/decoder/lowercaser/config b/test/decoder/lowercaser/config
new file mode 100644
index 0000000..efa787e
--- /dev/null
+++ b/test/decoder/lowercaser/config
@@ -0,0 +1,140 @@
+# This file is a template for the Joshua pipeline; variables enclosed
+# in <angle-brackets> are substituted by the pipeline script as
+# appropriate. This file also serves to document Joshua's many
+# parameters.
+
+# These are the grammar file specifications. Joshua supports an
+# arbitrary number of grammar files, each specified on its own line
+# using the following format:
+#
+# tm = TYPE OWNER LIMIT FILE
+#
+# TYPE is "packed", "thrax", or "samt". The latter denotes the format
+# used in Zollmann and Venugopal's SAMT decoder
+# (http://www.cs.cmu.edu/~zollmann/samt/).
+#
+# OWNER is the "owner" of the rules in the grammar; this is used to
+# determine which set of phrasal features apply to the grammar's
+# rules. Having different owners allows different features to be
+# applied to different grammars, and for grammars to share features
+# across files.
+#
+# LIMIT is the maximum input span permitted for the application of
+# grammar rules found in the grammar file. A value of -1 implies no limit.
+#
+# FILE is the grammar file (or directory when using packed grammars).
+# The file can be compressed with gzip, which is determined by the
+# presence or absence of a ".gz" file extension.
+#
+# By a convention defined by Chiang (2007), the grammars are split
+# into two files: the main translation grammar containing all the
+# learned translation rules, and a glue grammar which supports
+# monotonic concatenation of hierarchical phrases. The glue grammar's
+# main distinction from the regular grammar is that the span limit
+# does not apply to it.
+
+tm = hiero -maxspan 20 -path grammar.test -owner pt
+tm = thrax -path grammar.glue -maxspan -1 -owner glue
+
+# This symbol is used over unknown words in the source language
+
+default-non-terminal = X
+
+# This is the goal nonterminal, used to determine when a complete
+# parse is found. It should correspond to the root-level rules in the
+# glue grammar.
+
+goal-symbol = GOAL
+
+# Language model config.
+#
+# Multiple language models are supported. For each language model,
+# create one of the following lines:
+#
+# feature-function = LanguageModel -lm_type TYPE -lm_order ORDER -lm_file FILE
+# feature-function = StateMinimizingLanguageModel -lm_order ORDER -lm_file FILE
+#
+# - TYPE is one of "kenlm" or "berkeleylm"
+# - ORDER is the order of the language model (default 5)
+# - FILE is the path to the LM file. This can be binarized if appropriate to the type
+# (e.g., KenLM has a compiled format)
+#
+# A state-minimizing LM collapses left-state. Currently only KenLM supports this.
+#
+# For each LM, add a weight lm_INDEX below, where indexing starts from 0.
+
+
+
+# The suffix _OOV is appended to unknown source-language words if this
+# is set to true.
+
+mark-oovs = false
+
+# The search algorithm: "cky" for hierarchical / phrase-based decoding,
+# "stack" for phrase-based decoding
+search = cky
+
+# The pop-limit for decoding. This determines how many hypotheses are
+# considered over each span of the input.
+
+pop-limit = 100
+
+# How many hypotheses to output
+
+top-n = 1
+
+# Whether those hypotheses should be distinct strings
+
+use-unique-nbest = true
+
+# This is the default format of the output printed to STDOUT. The variables that can be
+# substituted are:
+#
+# %i: the sentence number (0-indexed)
+# %s: the translated sentence
+# %t: the derivation tree
+# %f: the feature string
+# %c: the model cost
+
+output-format = %s
+
+# When printing the trees (%t in 'output-format'), this controls whether the alignments
+# are also printed.
+
+include-align-index = false
+
+# And these are the feature functions to activate.
+feature-function = OOVPenalty
+feature-function = WordPenalty
+
+## Model weights #####################################################
+
+# For each language model line listed above, create a weight in the
+# following format: the keyword "lm", a 0-based index, and the weight.
+# lm_INDEX WEIGHT
+
+
+# The phrasal weights correspond to weights stored with each of the
+# grammar rules. The format is
+#
+# tm_OWNER_COLUMN WEIGHT
+#
+# where COLUMN denotes the 0-based order of the parameter in the
+# grammar file and WEIGHT is the corresponding weight. In the future,
+# we plan to add a sparse feature representation which will simplify
+# this.
+
+# The wordpenalty feature counts the number of words in each hypothesis.
+
+
+# This feature counts the number of unknown words in the hypothesis.
+
+
+# This feature weights paths through an input lattice. It is only activated
+# when decoding lattices.
+
+WordPenalty -4.72455379476569
+OOVPenalty 0.7897219562429866
+tm_pt_0 0.3137696816891433
+tm_glue_0 -0.04493059277470993
+
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/test/decoder/lowercaser/grammar.glue
----------------------------------------------------------------------
diff --git a/test/decoder/lowercaser/grammar.glue b/test/decoder/lowercaser/grammar.glue
new file mode 100644
index 0000000..69e1520
--- /dev/null
+++ b/test/decoder/lowercaser/grammar.glue
@@ -0,0 +1,4 @@
+[GOAL] ||| <s> ||| <s> ||| 0
+[GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1
+[GOAL] ||| [GOAL,1] </s> ||| [GOAL,1] </s> ||| 0
+[GOAL] ||| <s> [X,1] </s> ||| <s> [X,1] </s> ||| 0
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/test/decoder/lowercaser/grammar.test
----------------------------------------------------------------------
diff --git a/test/decoder/lowercaser/grammar.test b/test/decoder/lowercaser/grammar.test
new file mode 100644
index 0000000..3745008
--- /dev/null
+++ b/test/decoder/lowercaser/grammar.test
@@ -0,0 +1 @@
+[X] ||| ella ||| she ||| 1 ||| 0-0
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/test/decoder/lowercaser/output.gold
----------------------------------------------------------------------
diff --git a/test/decoder/lowercaser/output.gold b/test/decoder/lowercaser/output.gold
new file mode 100644
index 0000000..0c9c1eb
--- /dev/null
+++ b/test/decoder/lowercaser/output.gold
@@ -0,0 +1,3 @@
+ELLA
+she
+SHE
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f4fa992/test/decoder/lowercaser/test.sh
----------------------------------------------------------------------
diff --git a/test/decoder/lowercaser/test.sh b/test/decoder/lowercaser/test.sh
new file mode 100755
index 0000000..4db1251
--- /dev/null
+++ b/test/decoder/lowercaser/test.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+set -u
+
+(
+echo -e "ELLA" | $JOSHUA/bin/joshua-decoder -config config
+echo -e "Ella" | $JOSHUA/bin/joshua-decoder -config config -lowercase
+echo -e "ELLA" | $JOSHUA/bin/joshua-decoder -config config -lowercase
+) > output 2> .log
+
+diff -u output output.gold > diff
+
+if [ $? -eq 0 ]; then
+ rm -f log output diff
+ exit 0
+else
+ exit 1
+fi
[04/18] incubator-joshua git commit: added to build path
Posted by mj...@apache.org.
added to build path
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/1c8aaa5e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/1c8aaa5e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/1c8aaa5e
Branch: refs/heads/morph
Commit: 1c8aaa5eb89a800b51478c352b890f535067c5e4
Parents: 71f808e
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 10:50:54 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 10:50:54 2016 -0400
----------------------------------------------------------------------
build.xml | 2 ++
1 file changed, 2 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/1c8aaa5e/build.xml
----------------------------------------------------------------------
diff --git a/build.xml b/build.xml
index c0c6132..1489c4e 100644
--- a/build.xml
+++ b/build.xml
@@ -29,6 +29,8 @@
<include name="args4j-2.0.29.jar" />
<include name="gson-2.5.jar" />
<include name="guava-19.0.jar" />
+ <include name="mallet-2.0.7.jar" />
+ <include name="trove4j-2.0.2" />
</fileset>
<fileset dir="${thraxlib}">
<include name="thrax.jar" />