You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/24 22:53:29 UTC
[08/18] incubator-joshua git commit: pruning out predictors with too many outcomes
pruning out predictors with too many outcomes
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/8b59b99d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/8b59b99d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/8b59b99d
Branch: refs/heads/morph
Commit: 8b59b99d8efa3b65ceb258ea3c65dc17534accea
Parents: dc6b411
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 21:52:35 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 21:52:35 2016 -0400
----------------------------------------------------------------------
src/joshua/decoder/ff/LexicalSharpener.java | 25 +++++++++++++++++++-----
1 file changed, 20 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8b59b99d/src/joshua/decoder/ff/LexicalSharpener.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/LexicalSharpener.java b/src/joshua/decoder/ff/LexicalSharpener.java
index 8c30431..6207ac0 100644
--- a/src/joshua/decoder/ff/LexicalSharpener.java
+++ b/src/joshua/decoder/ff/LexicalSharpener.java
@@ -24,9 +24,11 @@ import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
+import java.util.Set;
import cc.mallet.classify.*;
import cc.mallet.types.Labeling;
@@ -89,17 +91,22 @@ public class LexicalSharpener extends StatelessFF {
String lastSourceWord = null;
ArrayList<String> examples = new ArrayList<String>();
+ HashMap<String,Integer> targets = new HashMap<String,Integer>();
int linesRead = 0;
for (String line : lineReader) {
- String sourceWord = line.substring(0, line.indexOf(' '));
+ String[] tokens = line.split("\\s+", 3);
+ String sourceWord = tokens[0];
+ String targetWord = tokens[1];
if (lastSourceWord != null && ! sourceWord.equals(lastSourceWord)) {
- classifiers.put(lastSourceWord, new MalletPredictor(lastSourceWord, examples));
+ classifiers.put(lastSourceWord, createClassifier(lastSourceWord, targets, examples));
// System.err.println(String.format("WORD %s:\n%s\n", lastSourceWord, examples));
examples = new ArrayList<String>();
+ targets = new HashMap<String,Integer>();
}
examples.add(line);
+ targets.put(targetWord, targets.getOrDefault(targetWord, 0));
lastSourceWord = sourceWord;
linesRead++;
}
@@ -108,15 +115,23 @@ public class LexicalSharpener extends StatelessFF {
System.err.println(String.format("Read %d lines from training file", linesRead));
}
+ private MalletPredictor createClassifier(String lastSourceWord, HashMap<String, Integer> counts,
+ ArrayList<String> examples) {
+
+ int numExamples = examples.size();
+
+ if (examples.size() < 75)
+ return new MalletPredictor(lastSourceWord, examples);
+
+ return null;
+ }
+
public void loadClassifiers(String modelFile) throws ClassNotFoundException, IOException {
ObjectInputStream ois = new ObjectInputStream(new FileInputStream(modelFile));
classifiers = (HashMap<String,MalletPredictor>) ois.readObject();
ois.close();
System.err.println(String.format("Loaded model with %d keys", classifiers.keySet().size()));
- for (String key: classifiers.keySet()) {
- System.err.println(" " + key);
- }
}
public void saveClassifiers(String modelFile) throws FileNotFoundException, IOException {