You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by to...@apache.org on 2017/03/07 13:16:32 UTC

incubator-joshua git commit: JOSHUA-331 - berkeleyLM bin/gz files generated from plain text one

Repository: incubator-joshua
Updated Branches:
  refs/heads/master dd681a10a -> 66c5aad5a


JOSHUA-331 - berkeleyLM bin/gz files generated from plain text one


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/66c5aad5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/66c5aad5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/66c5aad5

Branch: refs/heads/master
Commit: 66c5aad5acf84f24990136558f882a5c33ec0114
Parents: dd681a1
Author: Tommaso Teofili <to...@apache.org>
Authored: Tue Mar 7 14:16:18 2017 +0100
Committer: Tommaso Teofili <to...@apache.org>
Committed: Tue Mar 7 14:16:18 2017 +0100

----------------------------------------------------------------------
 pom.xml                                         |   6 +++
 .../lm/berkeley_lm/LMGrammarBerkeleyTest.java   |  52 ++++++++++++++++---
 src/test/resources/berkeley_lm/lm.berkeleylm    | Bin 4310 -> 0 bytes
 src/test/resources/berkeley_lm/lm.berkeleylm.gz | Bin 1796 -> 0 bytes
 src/test/resources/berkeley_lm/lm.gz            | Bin 168 -> 0 bytes
 5 files changed, 52 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/66c5aad5/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index a04a194..a5824db 100644
--- a/pom.xml
+++ b/pom.xml
@@ -466,6 +466,12 @@
       <version>2.0.52-beta</version>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>2.5</version>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
 
   <profiles>

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/66c5aad5/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java b/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
index 32c0762..aec6215 100644
--- a/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
+++ b/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
@@ -18,11 +18,23 @@
  */
 package org.apache.joshua.decoder.ff.lm.berkeley_lm;
 
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.nio.file.FileAlreadyExistsException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.zip.GZIPOutputStream;
+
+import edu.berkeley.nlp.lm.io.MakeLmBinaryFromArpa;
+import org.apache.commons.io.IOUtils;
 import org.apache.joshua.decoder.Decoder;
 import org.apache.joshua.decoder.JoshuaConfiguration;
 import org.apache.joshua.decoder.Translation;
 import org.apache.joshua.decoder.segment_file.Sentence;
 import org.testng.annotations.AfterMethod;
+import org.testng.annotations.BeforeClass;
 import org.testng.annotations.DataProvider;
 import org.testng.annotations.Test;
 
@@ -39,15 +51,43 @@ public class LMGrammarBerkeleyTest {
   private static final String EXPECTED_OUTPUT_WITH_OOV = "tm_glue_0=2.000 lm_0=-7.153 lm_0_oov=0.000\n";
   private static final String[] OPTIONS = "-v 0 -output-format %f".split(" ");
 
+  private static final String lmFile = "src/test/resources/berkeley_lm/lm";
+  private static final String compressedLmFile = "target/lm.gz";
+  private static final String lmFileBin = "target/lm.berkeleylm";
+  private static final String compressedLmFileBin = "target/lm.berkeleylm.gz";
+
   private JoshuaConfiguration joshuaConfig;
   private Decoder decoder;
 
+  @BeforeClass
+  public static void before() throws Exception {
+    // generate lm.gz
+    FileInputStream lmFileStream = new FileInputStream(new File(lmFile));
+    compress(lmFileStream, compressedLmFile);
+
+    // generate lm.berkeleylm
+    MakeLmBinaryFromArpa.main(new String[] { lmFile, lmFileBin });
+
+    // generate lm.berkeleylm.gz
+    FileInputStream lmFileBinStream = new FileInputStream(new File(lmFileBin));
+    compress(lmFileBinStream, compressedLmFileBin);
+  }
+
+  private static void compress(FileInputStream lmFileStream, String target) throws IOException {
+    try {
+      Files.createFile(Paths.get(target));
+      GZIPOutputStream gzipOutputStream = new GZIPOutputStream(new FileOutputStream(target));
+      IOUtils.copy(lmFileStream, gzipOutputStream);
+      gzipOutputStream.finish();
+    } catch (FileAlreadyExistsException fae) {
+      // the file already exists, no need to recreate it
+    }
+  }
+
   @DataProvider(name = "languageModelFiles")
   public Object[][] lmFiles() {
-    return new Object[][]{{"src/test/resources/berkeley_lm/lm"},
-            {"src/test/resources/berkeley_lm/lm.gz"},
-            {"src/test/resources/berkeley_lm/lm.berkeleylm"},
-            {"src/test/resources/berkeley_lm/lm.berkeleylm.gz"}};
+    return new Object[][] { { lmFile }, { compressedLmFile }, { lmFileBin },
+        { compressedLmFileBin } };
   }
 
   @AfterMethod
@@ -74,12 +114,12 @@ public class LMGrammarBerkeleyTest {
   public void givenLmWithOovFeature_whenDecoder_thenCorrectFeaturesReturned() {
     joshuaConfig = new JoshuaConfiguration();
     joshuaConfig.processCommandLineOptions(OPTIONS);
-    joshuaConfig.features.add("LanguageModel -lm_type berkeleylm -oov_feature -lm_order 2 -lm_file src/test/resources/berkeley_lm/lm");
+    joshuaConfig.features.add(
+        "LanguageModel -lm_type berkeleylm -oov_feature -lm_order 2 -lm_file src/test/resources/berkeley_lm/lm");
     decoder = new Decoder(joshuaConfig, null);
     final String translation = decode(INPUT).toString();
     assertEquals(Decoder.weights.getDenseFeatures().size(), 3);
     assertEquals(translation, EXPECTED_OUTPUT_WITH_OOV);
   }
 
-
 }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/66c5aad5/src/test/resources/berkeley_lm/lm.berkeleylm
----------------------------------------------------------------------
diff --git a/src/test/resources/berkeley_lm/lm.berkeleylm b/src/test/resources/berkeley_lm/lm.berkeleylm
deleted file mode 100644
index 9472c18..0000000
Binary files a/src/test/resources/berkeley_lm/lm.berkeleylm and /dev/null differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/66c5aad5/src/test/resources/berkeley_lm/lm.berkeleylm.gz
----------------------------------------------------------------------
diff --git a/src/test/resources/berkeley_lm/lm.berkeleylm.gz b/src/test/resources/berkeley_lm/lm.berkeleylm.gz
deleted file mode 100644
index 57c319e..0000000
Binary files a/src/test/resources/berkeley_lm/lm.berkeleylm.gz and /dev/null differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/66c5aad5/src/test/resources/berkeley_lm/lm.gz
----------------------------------------------------------------------
diff --git a/src/test/resources/berkeley_lm/lm.gz b/src/test/resources/berkeley_lm/lm.gz
deleted file mode 100644
index 3a4f2c0..0000000
Binary files a/src/test/resources/berkeley_lm/lm.gz and /dev/null differ