You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/22 06:17:50 UTC
[04/13] incubator-joshua git commit: renamed
renamed
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/0ef5d3eb
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/0ef5d3eb
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/0ef5d3eb
Branch: refs/heads/morph
Commit: 0ef5d3eb25f3abd13d60b69dcb290a65c8214c73
Parents: 47f1af5
Author: Matt Post <po...@cs.jhu.edu>
Authored: Thu Apr 21 09:15:42 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Thu Apr 21 09:15:42 2016 -0400
----------------------------------------------------------------------
.../decoder/ff/morph/InflectionPredictor.java | 246 -------------------
.../decoder/ff/morph/LexicalSharpener.java | 246 +++++++++++++++++++
2 files changed, 246 insertions(+), 246 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/0ef5d3eb/src/joshua/decoder/ff/morph/InflectionPredictor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/morph/InflectionPredictor.java b/src/joshua/decoder/ff/morph/InflectionPredictor.java
deleted file mode 100644
index f4a4310..0000000
--- a/src/joshua/decoder/ff/morph/InflectionPredictor.java
+++ /dev/null
@@ -1,246 +0,0 @@
-package joshua.decoder.ff.morph;
-
-/***
- * This feature function scores a rule application by predicting, for each target word aligned with
- * a source word, how likely the lexical translation is in context.
- *
- * The feature function can be provided with a trained model or a raw training file which it will
- * then train prior to decoding.
- *
- * Format of training file:
- *
- * source_word target_word feature:value feature:value feature:value ...
- *
- * Invocation:
- *
- * java -cp /Users/post/code/joshua/lib/mallet-2.0.7.jar:/Users/post/code/joshua/lib/trove4j-2.0.2.jar:$JOSHUA/class joshua.decoder.ff.morph.LexicalSharpener /path/to/training/data
- */
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.FileReader;
-import java.io.IOException;
-import java.io.ObjectInputStream;
-import java.io.ObjectOutputStream;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.Scanner;
-
-import cc.mallet.classify.*;
-import cc.mallet.pipe.*;
-import cc.mallet.pipe.iterator.CsvIterator;
-import cc.mallet.types.Instance;
-import cc.mallet.types.InstanceList;
-import joshua.corpus.Vocabulary;
-import joshua.decoder.JoshuaConfiguration;
-import joshua.decoder.chart_parser.SourcePath;
-import joshua.decoder.ff.FeatureVector;
-import joshua.decoder.ff.StatelessFF;
-import joshua.decoder.ff.state_maintenance.DPState;
-import joshua.decoder.ff.tm.Rule;
-import joshua.decoder.hypergraph.HGNode;
-import joshua.decoder.segment_file.Sentence;
-import joshua.decoder.segment_file.Token;
-
-public class InflectionPredictor extends StatelessFF {
-
- private Classifier classifier = null;
- private SerialPipes pipes = null;
-
- public InflectionPredictor(final FeatureVector weights, String[] args, JoshuaConfiguration config) {
- super(weights, "LexicalSharpener", args, config);
-
- ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
-
- // I don't know if this is needed
- pipeList.add(new Target2Label());
- // Convert SVM-light format to sparse feature vector
- pipeList.add(new SvmLight2FeatureVectorAndLabel());
- // Validation
-// pipeList.add(new PrintInputAndTarget());
-
- // name: english word
- // data: features (FeatureVector)
- // target: foreign inflection
- // source: null
-
- pipes = new SerialPipes(pipeList);
-
- if (parsedArgs.containsKey("model")) {
- String modelFile = parsedArgs.get("model");
- if (! new File(modelFile).exists()) {
- if (parsedArgs.getOrDefault("training-data", null) != null) {
- try {
- classifier = train(parsedArgs.get("training-data"));
- } catch (FileNotFoundException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- } else {
- System.err.println("* FATAL: no model and no training data.");
- System.exit(1);
- }
- } else {
- try {
- loadClassifier(modelFile);
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- } catch (ClassNotFoundException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- }
- }
- }
-
- /**
- * Trains a maxent classifier from the provided training data, returning a Mallet model.
- *
- * @param dataFile
- * @return
- * @throws FileNotFoundException
- */
- public Classifier train(String dataFile) throws FileNotFoundException {
-
- // Remove the first field (Mallet's "name" field), leave the rest for SVM-light conversion
- InstanceList instances = new InstanceList(pipes);
- instances.addThruPipe(new CsvIterator(new FileReader(dataFile),
- "(\\w+)\\s+(.*)",
- 2, -1, 1));
-
- ClassifierTrainer trainer = new MaxEntTrainer();
- Classifier classifier = trainer.train(instances);
-
- return classifier;
- }
-
- public void loadClassifier(String modelFile) throws ClassNotFoundException, IOException {
- ObjectInputStream ois = new ObjectInputStream(new FileInputStream(modelFile));
- classifier = (Classifier) ois.readObject();
- }
-
- public void saveClassifier(String modelFile) throws FileNotFoundException, IOException {
- ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(modelFile));
- oos.writeObject(classifier);
- oos.close();
- }
-
- public Classification predict(String outcome, String features) {
- Instance instance = new Instance(features, null, null, null);
- System.err.println("PREDICT outcome = " + (String) instance.getTarget());
- System.err.println("PREDICT features = " + (String) instance.getData());
- Classification result = (Classification) classifier.classify(pipes.instanceFrom(instance));
-
- return result;
- }
-
- /**
- * Compute features. This works by walking over the target side phrase pieces, looking for every
- * word with a single source-aligned word. We then throw the annotations from that source word
- * into our prediction model to learn how much it likes the chosen word. Presumably the source-
- * language annotations have contextual features, so this effectively chooses the words in context.
- */
- @Override
- public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
- Sentence sentence, Accumulator acc) {
-
- Map<Integer, List<Integer>> points = rule.getAlignmentMap();
- for (int t: points.keySet()) {
- List<Integer> source_indices = points.get(t);
- if (source_indices.size() != 1)
- continue;
-
- String targetWord = Vocabulary.word(rule.getEnglish()[t]);
- int s = i + source_indices.get(0);
- Token sourceToken = sentence.getTokens().get(s);
- String featureString = sourceToken.getAnnotationString().replace('|', ' ');
-
- Classification result = predict(targetWord, featureString);
- if (result.bestLabelIsCorrect()) {
- acc.add(String.format("%s_match", name), 1);
- }
- }
-
- return null;
- }
-
- /**
- * Returns an array parallel to the source words array indicating, for each index, the absolute
- * position of that word into the source sentence. For example, for the rule with source side
- *
- * [ 17, 142, -14, 9 ]
- *
- * and source sentence
- *
- * [ 17, 18, 142, 1, 1, 9, 8 ]
- *
- * it will return
- *
- * [ 0, 2, -14, 5 ]
- *
- * which indicates that the first, second, and fourth words of the rule are anchored to the
- * first, third, and sixth words of the input sentence.
- *
- * @param rule
- * @param tailNodes
- * @param start
- * @return a list of alignment points anchored to the source sentence
- */
- public int[] anchorRuleSourceToSentence(Rule rule, List<HGNode> tailNodes, int start) {
- int[] source = rule.getFrench();
-
- // Map the source words in the rule to absolute positions in the sentence
- int[] anchoredSource = source.clone();
-
- int sourceIndex = start;
- int tailNodeIndex = 0;
- for (int i = 0; i < source.length; i++) {
- if (source[i] < 0) { // nonterminal
- anchoredSource[i] = source[i];
- sourceIndex = tailNodes.get(tailNodeIndex).j;
- tailNodeIndex++;
- } else { // terminal
- anchoredSource[i] = sourceIndex;
- sourceIndex++;
- }
- }
-
- return anchoredSource;
- }
-
- public static void main(String[] args) throws IOException, ClassNotFoundException {
- InflectionPredictor ts = new InflectionPredictor(null, args, null);
-
- String modelFile = "model";
-
- if (args.length > 0) {
- String dataFile = args[0];
-
- System.err.println("Training model from file " + dataFile);
- ts.train(dataFile);
-
- if (args.length > 1)
- modelFile = args[1];
-
- System.err.println("Writing model to file " + modelFile);
- ts.saveClassifier(modelFile);
- } else {
- System.err.println("Loading model from file " + modelFile);
- ts.loadClassifier(modelFile);
- }
-
- Scanner stdin = new Scanner(System.in);
- while(stdin.hasNextLine()) {
- String line = stdin.nextLine();
- String[] tokens = line.split(" ", 2);
- String outcome = tokens[0];
- String features = tokens[1];
- Classification result = ts.predict(outcome, features);
- System.out.println(String.format("%s %f", result.getLabelVector().getBestLabel(), result.getLabelVector().getBestValue()));
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/0ef5d3eb/src/joshua/decoder/ff/morph/LexicalSharpener.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/morph/LexicalSharpener.java b/src/joshua/decoder/ff/morph/LexicalSharpener.java
new file mode 100644
index 0000000..edf4390
--- /dev/null
+++ b/src/joshua/decoder/ff/morph/LexicalSharpener.java
@@ -0,0 +1,246 @@
+package joshua.decoder.ff.morph;
+
+/***
+ * This feature function scores a rule application by predicting, for each target word aligned with
+ * a source word, how likely the lexical translation is in context.
+ *
+ * The feature function can be provided with a trained model or a raw training file which it will
+ * then train prior to decoding.
+ *
+ * Format of training file:
+ *
+ * source_word target_word feature:value feature:value feature:value ...
+ *
+ * Invocation:
+ *
+ * java -cp /Users/post/code/joshua/lib/mallet-2.0.7.jar:/Users/post/code/joshua/lib/trove4j-2.0.2.jar:$JOSHUA/class joshua.decoder.ff.morph.LexicalSharpener /path/to/training/data
+ */
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Scanner;
+
+import cc.mallet.classify.*;
+import cc.mallet.pipe.*;
+import cc.mallet.pipe.iterator.CsvIterator;
+import cc.mallet.types.Instance;
+import cc.mallet.types.InstanceList;
+import joshua.corpus.Vocabulary;
+import joshua.decoder.JoshuaConfiguration;
+import joshua.decoder.chart_parser.SourcePath;
+import joshua.decoder.ff.FeatureVector;
+import joshua.decoder.ff.StatelessFF;
+import joshua.decoder.ff.state_maintenance.DPState;
+import joshua.decoder.ff.tm.Rule;
+import joshua.decoder.hypergraph.HGNode;
+import joshua.decoder.segment_file.Sentence;
+import joshua.decoder.segment_file.Token;
+
+/**
+ * Feature function that asks a Mallet classifier whether the target word chosen by a rule is the
+ * one it would predict from the annotations of the aligned source token, and fires a
+ * {@code <name>_match} feature each time the classifier agrees.
+ *
+ * The classifier is either deserialized from a model file or trained from a raw training file; see
+ * the constructor for how the two are selected.
+ */
+public class LexicalSharpener extends StatelessFF {
+
+  /** Mallet model used for prediction; null until a model has been loaded or trained. */
+  private Classifier classifier = null;
+
+  /** Pipeline that converts raw SVM-light-style text instances into Mallet feature vectors. */
+  private SerialPipes pipes = null;
+
+  /**
+   * Builds the Mallet pipeline and, if a "model" argument was supplied, obtains the classifier.
+   *
+   * If the model file exists it is deserialized. Otherwise, if a "training-data" argument is
+   * present, a model is trained from that file. If neither is available the process exits with
+   * status 1. When no "model" argument is given at all, the classifier is left unset and must be
+   * provided later (as {@link #main(String[])} does).
+   *
+   * @param weights the feature weights, passed through to the superclass
+   * @param args the feature function arguments, passed through to the superclass
+   * @param config the decoder configuration, passed through to the superclass
+   * @throws RuntimeException if the model cannot be loaded or the training data cannot be read
+   */
+  public LexicalSharpener(final FeatureVector weights, String[] args, JoshuaConfiguration config) {
+    super(weights, "LexicalSharpener", args, config);
+
+    ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
+
+    // I don't know if this is needed
+    pipeList.add(new Target2Label());
+    // Convert SVM-light format to sparse feature vector
+    pipeList.add(new SvmLight2FeatureVectorAndLabel());
+    // Validation
+    // pipeList.add(new PrintInputAndTarget());
+
+    // name: english word
+    // data: features (FeatureVector)
+    // target: foreign inflection
+    // source: null
+
+    pipes = new SerialPipes(pipeList);
+
+    if (parsedArgs.containsKey("model")) {
+      String modelFile = parsedArgs.get("model");
+      if (new File(modelFile).exists()) {
+        try {
+          loadClassifier(modelFile);
+        } catch (IOException | ClassNotFoundException e) {
+          // Fail here with the real cause rather than continuing with a null classifier and
+          // hitting a NullPointerException on the first prediction.
+          throw new RuntimeException("LexicalSharpener: could not load model file " + modelFile, e);
+        }
+      } else if (parsedArgs.get("training-data") != null) {
+        String dataFile = parsedArgs.get("training-data");
+        try {
+          classifier = train(dataFile);
+        } catch (FileNotFoundException e) {
+          throw new RuntimeException("LexicalSharpener: could not read training data " + dataFile, e);
+        }
+      } else {
+        System.err.println("* FATAL: no model and no training data.");
+        System.exit(1);
+      }
+    }
+  }
+
+  /**
+   * Trains a maxent classifier from the provided training data, returning a Mallet model. The
+   * returned model is NOT installed on this object; callers decide what to do with it.
+   *
+   * Each line of the file is matched against {@code (\w+)\s+(.*)}: the first field becomes the
+   * Mallet instance name and the remainder is handed to the SVM-light conversion pipe.
+   *
+   * @param dataFile path of the training file
+   * @return the trained classifier
+   * @throws FileNotFoundException if {@code dataFile} cannot be opened
+   */
+  public Classifier train(String dataFile) throws FileNotFoundException {
+
+    InstanceList instances = new InstanceList(pipes);
+
+    FileReader reader = new FileReader(dataFile);
+    try {
+      // Remove the first field (Mallet's "name" field), leave the rest for SVM-light conversion
+      instances.addThruPipe(new CsvIterator(reader, "(\\w+)\\s+(.*)", 2, -1, 1));
+    } finally {
+      try {
+        reader.close();
+      } catch (IOException ignored) {
+        // All instances have been read (or a more relevant exception is already propagating);
+        // a failure to close the input adds nothing the caller can act on.
+      }
+    }
+
+    return new MaxEntTrainer().train(instances);
+  }
+
+  /**
+   * Deserializes a classifier from the given file and installs it on this object.
+   *
+   * NOTE: this uses Java native deserialization, so only model files from a trusted source should
+   * be loaded.
+   *
+   * @param modelFile path of a file previously written by {@link #saveClassifier(String)}
+   * @throws ClassNotFoundException if the serialized class is not on the classpath
+   * @throws IOException if the file cannot be read
+   */
+  public void loadClassifier(String modelFile) throws ClassNotFoundException, IOException {
+    try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(modelFile))) {
+      classifier = (Classifier) ois.readObject();
+    }
+  }
+
+  /**
+   * Serializes the current classifier to the given file.
+   *
+   * @param modelFile path of the file to write
+   * @throws FileNotFoundException if the file cannot be opened for writing
+   * @throws IOException if writing fails
+   */
+  public void saveClassifier(String modelFile) throws FileNotFoundException, IOException {
+    try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(modelFile))) {
+      oos.writeObject(classifier);
+    }
+  }
+
+  /**
+   * Classifies one instance.
+   *
+   * The instance data is assembled as "outcome features", mirroring the training lines once their
+   * leading name field has been removed: Mallet's SVM-light pipe is documented to read the first
+   * whitespace-separated token as the label and the rest as feature:value pairs. Supplying the
+   * outcome is also what gives {@code bestLabelIsCorrect()} on the result a target to compare to.
+   *
+   * @param outcome the label to compare the prediction against
+   * @param features whitespace-separated feature:value pairs
+   * @return the classification of the instance
+   * @throws IllegalStateException if no classifier has been loaded or trained
+   */
+  public Classification predict(String outcome, String features) {
+    if (classifier == null) {
+      throw new IllegalStateException("LexicalSharpener: no classifier has been loaded or trained");
+    }
+
+    System.err.println("PREDICT outcome = " + outcome);
+    System.err.println("PREDICT features = " + features);
+
+    Instance instance = new Instance(outcome + " " + features, null, null, null);
+    Classification result = (Classification) classifier.classify(pipes.instanceFrom(instance));
+
+    return result;
+  }
+
+  /**
+   * Compute features. This works by walking over the target side phrase pieces, looking for every
+   * word with a single source-aligned word. We then throw the annotations from that source word
+   * into our prediction model to learn how much it likes the chosen word. Presumably the source-
+   * language annotations have contextual features, so this effectively chooses the words in context.
+   *
+   * @return always null
+   */
+  @Override
+  public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+      Sentence sentence, Accumulator acc) {
+
+    Map<Integer, List<Integer>> points = rule.getAlignmentMap();
+    for (int t : points.keySet()) {
+      List<Integer> sourceIndices = points.get(t);
+      // Only words with exactly one aligned source word are scored.
+      if (sourceIndices.size() != 1) {
+        continue;
+      }
+
+      String targetWord = Vocabulary.word(rule.getEnglish()[t]);
+
+      // NOTE(review): this treats the rule-internal source offset as contiguous from i. That looks
+      // wrong for rules containing nonterminals, which is presumably what
+      // anchorRuleSourceToSentence() was written for -- confirm before relying on it.
+      int s = i + sourceIndices.get(0);
+      Token sourceToken = sentence.getTokens().get(s);
+      String featureString = sourceToken.getAnnotationString().replace('|', ' ');
+
+      Classification result = predict(targetWord, featureString);
+      if (result.bestLabelIsCorrect()) {
+        acc.add(String.format("%s_match", name), 1);
+      }
+    }
+
+    return null;
+  }
+
+  /**
+   * Returns an array parallel to the source words array indicating, for each index, the absolute
+   * position of that word in the source sentence. Nonterminals (negative ids) are copied through
+   * unchanged. Terminals are numbered consecutively from {@code start}; after each nonterminal the
+   * counter is reset to the {@code j} field of the next tail node.
+   *
+   * For example, for the rule with source side
+   *
+   * [ 17, 142, -14, 9 ]
+   *
+   * with start = 0 and a single tail node whose j is 5, it will return
+   *
+   * [ 0, 1, -14, 5 ]
+   *
+   * which indicates that the first, second, and fourth words of the rule are anchored to the
+   * first, second, and sixth words of the input sentence.
+   *
+   * @param rule the rule whose source side is anchored
+   * @param tailNodes the tail nodes, consumed in order, one per nonterminal on the source side
+   * @param start the sentence position of the first source symbol of the rule
+   * @return an array of sentence positions (or nonterminal ids) parallel to the rule's source side
+   */
+  public int[] anchorRuleSourceToSentence(Rule rule, List<HGNode> tailNodes, int start) {
+    int[] source = rule.getFrench();
+
+    // Map the source words in the rule to absolute positions in the sentence
+    int[] anchoredSource = source.clone();
+
+    int sourceIndex = start;
+    int tailNodeIndex = 0;
+    for (int i = 0; i < source.length; i++) {
+      if (source[i] < 0) { // nonterminal
+        anchoredSource[i] = source[i];
+        sourceIndex = tailNodes.get(tailNodeIndex).j;
+        tailNodeIndex++;
+      } else { // terminal
+        anchoredSource[i] = sourceIndex;
+        sourceIndex++;
+      }
+    }
+
+    return anchoredSource;
+  }
+
+  /**
+   * Command-line driver. With arguments, trains a model from args[0] and writes it to args[1]
+   * (default "model"); without arguments, loads the model from "model". It then reads
+   * "outcome features..." lines from standard input and prints the best label and its score for
+   * each one.
+   *
+   * @param args optional training file and optional output model file
+   * @throws IOException if the training data or model file cannot be read or written
+   * @throws ClassNotFoundException if the serialized model's class is not on the classpath
+   */
+  public static void main(String[] args) throws IOException, ClassNotFoundException {
+    LexicalSharpener ts = new LexicalSharpener(null, args, null);
+
+    String modelFile = "model";
+
+    if (args.length > 0) {
+      String dataFile = args[0];
+
+      System.err.println("Training model from file " + dataFile);
+      // train() only returns the model; keep it so that it is what gets saved and queried below.
+      ts.classifier = ts.train(dataFile);
+
+      if (args.length > 1) {
+        modelFile = args[1];
+      }
+
+      System.err.println("Writing model to file " + modelFile);
+      ts.saveClassifier(modelFile);
+    } else {
+      System.err.println("Loading model from file " + modelFile);
+      ts.loadClassifier(modelFile);
+    }
+
+    Scanner stdin = new Scanner(System.in);
+    while (stdin.hasNextLine()) {
+      String line = stdin.nextLine();
+      if (line.trim().isEmpty()) {
+        // Nothing to classify on a blank line.
+        continue;
+      }
+
+      String[] tokens = line.split(" ", 2);
+      String outcome = tokens[0];
+      // A line holding only an outcome is classified with an empty feature set.
+      String features = (tokens.length > 1) ? tokens[1] : "";
+      Classification result = ts.predict(outcome, features);
+      System.out.println(String.format("%s %f", result.getLabelVector().getBestLabel(), result.getLabelVector().getBestValue()));
+    }
+  }
+}