You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/24 22:53:27 UTC
[06/18] incubator-joshua git commit: huge efficiency fix in reading in the data
huge efficiency fix in reading in the data
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/155249f9
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/155249f9
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/155249f9
Branch: refs/heads/morph
Commit: 155249f9d0f5c00ea2c7d70917c94b06df401e33
Parents: 68b01bc
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 16:52:42 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 16:52:42 2016 -0400
----------------------------------------------------------------------
src/joshua/decoder/ff/LexicalSharpener.java | 10 +++---
src/joshua/decoder/ff/MalletPredictor.java | 45 +++++++++++++-----------
2 files changed, 31 insertions(+), 24 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/155249f9/src/joshua/decoder/ff/LexicalSharpener.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/LexicalSharpener.java b/src/joshua/decoder/ff/LexicalSharpener.java
index 8671d57..8c30431 100644
--- a/src/joshua/decoder/ff/LexicalSharpener.java
+++ b/src/joshua/decoder/ff/LexicalSharpener.java
@@ -22,6 +22,7 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
+import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -87,17 +88,18 @@ public class LexicalSharpener extends StatelessFF {
}
String lastSourceWord = null;
- String examples = "";
+ ArrayList<String> examples = new ArrayList<String>();
int linesRead = 0;
for (String line : lineReader) {
String sourceWord = line.substring(0, line.indexOf(' '));
+
if (lastSourceWord != null && ! sourceWord.equals(lastSourceWord)) {
classifiers.put(lastSourceWord, new MalletPredictor(lastSourceWord, examples));
- // System.err.println(String.format("WORD %s:\n%s\n", lastOutcome, buffer));
- examples = "";
+// System.err.println(String.format("WORD %s:\n%s\n", lastSourceWord, examples));
+ examples = new ArrayList<String>();
}
- examples += line + "\n";
+ examples.add(line);
lastSourceWord = sourceWord;
linesRead++;
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/155249f9/src/joshua/decoder/ff/MalletPredictor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/MalletPredictor.java b/src/joshua/decoder/ff/MalletPredictor.java
index 04c9d8c..f200551 100644
--- a/src/joshua/decoder/ff/MalletPredictor.java
+++ b/src/joshua/decoder/ff/MalletPredictor.java
@@ -24,28 +24,12 @@ public class MalletPredictor implements Serializable {
private SerialPipes pipes = null;
private InstanceList instances = null;
private String sourceWord = null;
- private String examples = null;
+ private ArrayList<String> examples = null;
private Classifier classifier = null;
- public MalletPredictor(String word, String examples) {
+ public MalletPredictor(String word, ArrayList<String> examples) {
this.sourceWord = word;
this.examples = examples;
- ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
-
- // I don't know if this is needed
- pipeList.add(new Target2Label());
- // Convert custom lines to Instance objects (svmLight2FeatureVectorAndLabel not versatile enough)
- pipeList.add(new SvmLight2FeatureVectorAndLabel());
- // Validation
-// pipeList.add(new PrintInputAndTarget());
-
- // name: english word
- // data: features (FeatureVector)
- // target: foreign inflection
- // source: null
-
- pipes = new SerialPipes(pipeList);
- instances = new InstanceList(pipes);
}
/**
@@ -70,9 +54,30 @@ public class MalletPredictor implements Serializable {
public void train() {
Decoder.LOG(2, String.format("Word %s: training model from %d examples",
- sourceWord, examples.split("\\n").length));
+ sourceWord, examples.size()));
+
+ ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
+
+ // I don't know if this is needed
+ pipeList.add(new Target2Label());
+ // Convert custom lines to Instance objects (svmLight2FeatureVectorAndLabel not versatile enough)
+ pipeList.add(new SvmLight2FeatureVectorAndLabel());
+ // Validation
+// pipeList.add(new PrintInputAndTarget());
+
+ // name: english word
+ // data: features (FeatureVector)
+ // target: foreign inflection
+ // source: null
+
+ pipes = new SerialPipes(pipeList);
+ instances = new InstanceList(pipes);
- StringReader reader = new StringReader(examples);
+ /* I know, this is *terrible*, but I need it to work *now* */
+ String exampleList = "";
+ for (String example: examples)
+ exampleList += example + "\n";
+ StringReader reader = new StringReader(exampleList);
// Constructs an instance with everything shoved into the data field
instances.addThruPipe(new CsvIterator(reader, "(\\S+)\\s+(.*)", 2, -1, 1));