You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/24 22:53:27 UTC

[06/18] incubator-joshua git commit: huge efficiency fix in reading in the data

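Huge efficiency fix in reading in the data: LexicalSharpener now collects each source word's training examples in an ArrayList instead of building them up by repeated String concatenation, and MalletPredictor defers constructing its pipes and InstanceList from the constructor until train() is called.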


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/155249f9
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/155249f9
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/155249f9

Branch: refs/heads/morph
Commit: 155249f9d0f5c00ea2c7d70917c94b06df401e33
Parents: 68b01bc
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 22 16:52:42 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 22 16:52:42 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/ff/LexicalSharpener.java | 10 +++---
 src/joshua/decoder/ff/MalletPredictor.java  | 45 +++++++++++++-----------
 2 files changed, 31 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/155249f9/src/joshua/decoder/ff/LexicalSharpener.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/LexicalSharpener.java b/src/joshua/decoder/ff/LexicalSharpener.java
index 8671d57..8c30431 100644
--- a/src/joshua/decoder/ff/LexicalSharpener.java
+++ b/src/joshua/decoder/ff/LexicalSharpener.java
@@ -22,6 +22,7 @@ import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.ObjectInputStream;
 import java.io.ObjectOutputStream;
+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -87,17 +88,18 @@ public class LexicalSharpener extends StatelessFF {
     }
   
     String lastSourceWord = null;
-    String examples = "";
+    ArrayList<String> examples = new ArrayList<String>();
     int linesRead = 0;
     for (String line : lineReader) {
       String sourceWord = line.substring(0, line.indexOf(' '));
+
       if (lastSourceWord != null && ! sourceWord.equals(lastSourceWord)) {
         classifiers.put(lastSourceWord, new MalletPredictor(lastSourceWord, examples));
-        //        System.err.println(String.format("WORD %s:\n%s\n", lastOutcome, buffer));
-        examples = "";
+//                System.err.println(String.format("WORD %s:\n%s\n", lastSourceWord, examples));
+        examples = new ArrayList<String>();
       }
   
-      examples += line + "\n";
+      examples.add(line);
       lastSourceWord = sourceWord;
       linesRead++;
     }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/155249f9/src/joshua/decoder/ff/MalletPredictor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/MalletPredictor.java b/src/joshua/decoder/ff/MalletPredictor.java
index 04c9d8c..f200551 100644
--- a/src/joshua/decoder/ff/MalletPredictor.java
+++ b/src/joshua/decoder/ff/MalletPredictor.java
@@ -24,28 +24,12 @@ public class MalletPredictor implements Serializable {
     private SerialPipes pipes = null;
     private InstanceList instances = null;
     private String sourceWord = null;
-    private String examples = null;
+    private ArrayList<String> examples = null;
     private Classifier classifier = null;
     
-    public MalletPredictor(String word, String examples) {
+    public MalletPredictor(String word, ArrayList<String> examples) {
       this.sourceWord = word;
       this.examples = examples;
-      ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
-
-      // I don't know if this is needed
-      pipeList.add(new Target2Label());
-      // Convert custom lines to Instance objects (svmLight2FeatureVectorAndLabel not versatile enough)
-      pipeList.add(new SvmLight2FeatureVectorAndLabel());
-      // Validation
-//      pipeList.add(new PrintInputAndTarget());
-      
-      // name: english word
-      // data: features (FeatureVector)
-      // target: foreign inflection
-      // source: null
-
-      pipes = new SerialPipes(pipeList);
-      instances = new InstanceList(pipes);
     }
 
     /**
@@ -70,9 +54,30 @@ public class MalletPredictor implements Serializable {
 
     public void train() {
       Decoder.LOG(2, String.format("Word %s: training model from %d examples", 
-          sourceWord, examples.split("\\n").length));
+          sourceWord, examples.size()));
+      
+      ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
+
+      // I don't know if this is needed
+      pipeList.add(new Target2Label());
+      // Convert custom lines to Instance objects (svmLight2FeatureVectorAndLabel not versatile enough)
+      pipeList.add(new SvmLight2FeatureVectorAndLabel());
+      // Validation
+//      pipeList.add(new PrintInputAndTarget());
+      
+      // name: english word
+      // data: features (FeatureVector)
+      // target: foreign inflection
+      // source: null
+
+      pipes = new SerialPipes(pipeList);
+      instances = new InstanceList(pipes);
       
-      StringReader reader = new StringReader(examples);
+      /* I know, this is *terrible*, but I need it to work *now* */
+      String exampleList = "";
+      for (String example: examples)
+        exampleList += example + "\n";
+      StringReader reader = new StringReader(exampleList);
 
       // Constructs an instance with everything shoved into the data field
       instances.addThruPipe(new CsvIterator(reader, "(\\S+)\\s+(.*)", 2, -1, 1));