You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2012/05/04 13:32:28 UTC

svn commit: r1333882 - in /opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: cmdline/ formats/ formats/convert/ formats/frenchtreebank/ formats/muc/

Author: joern
Date: Fri May  4 11:32:28 2012
New Revision: 1333882

URL: http://svn.apache.org/viewvc?rev=1333882&view=rev
Log:
OPENNLP-342 Added a factory for the French Treebank stream

Added:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToByteArraySampleStream.java   (with props)
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToStringSampleStream.java   (with props)
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactory.java   (with props)
Modified:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/DirectorySampleStream.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactory.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactory.java

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java?rev=1333882&r1=1333881&r2=1333882&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java Fri May  4 11:32:28 2012
@@ -44,6 +44,7 @@ import opennlp.tools.formats.ad.ADNameSa
 import opennlp.tools.formats.ad.ADPOSSampleStreamFactory;
 import opennlp.tools.formats.ad.ADSentenceSampleStreamFactory;
 import opennlp.tools.formats.ad.ADTokenSampleStreamFactory;
+import opennlp.tools.formats.frenchtreebank.ConstitParseSampleStreamFactory;
 import opennlp.tools.formats.muc.Muc6FullParseCorefSampleStreamFactory;
 import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory;
 
@@ -85,6 +86,8 @@ public final class StreamFactoryRegistry
     
     Muc6NameSampleStreamFactory.registerFactory();
     Muc6FullParseCorefSampleStreamFactory.registerFactory();
+    
+    ConstitParseSampleStreamFactory.registerFactory();
   }
 
   public static final String DEFAULT_FORMAT = "opennlp";

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/DirectorySampleStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/DirectorySampleStream.java?rev=1333882&r1=1333881&r2=1333882&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/DirectorySampleStream.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/DirectorySampleStream.java Fri May  4 11:32:28 2012
@@ -17,14 +17,9 @@
 
 package opennlp.tools.formats;
 
-import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileFilter;
-import java.io.FileInputStream;
 import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
@@ -36,10 +31,8 @@ import opennlp.tools.util.ObjectStream;
  * The directory sample stream scans a directory (recursively) for plain text
  * files and outputs each file as a String object.
  */
-public class DirectorySampleStream implements ObjectStream<String> {
+public class DirectorySampleStream implements ObjectStream<File> {
 
-  private final Charset encoding;
-  
   private final List<File> inputDirectories;
   
   private final boolean isRecursiveScan;
@@ -50,9 +43,8 @@ public class DirectorySampleStream imple
   
   private Stack<File> textFiles = new Stack<File>();
   
-  public DirectorySampleStream(File dirs[], Charset encoding, FileFilter fileFilter, boolean recursive) {
+  public DirectorySampleStream(File dirs[], FileFilter fileFilter, boolean recursive) {
 
-    this.encoding = encoding;
     this.fileFilter= fileFilter; 
     isRecursiveScan = recursive;
     
@@ -73,36 +65,11 @@ public class DirectorySampleStream imple
     directories.addAll(inputDirectories);
   }
   
-  public DirectorySampleStream(File dir, Charset encoding, FileFilter fileFilter, boolean recursive) {
-    this(new File[]{dir}, encoding, fileFilter, recursive);
-  }
-  
-  static String readFile(File textFile, Charset encoding) throws IOException {
-    
-    Reader in = new BufferedReader(new InputStreamReader(new FileInputStream(textFile), encoding));
-
-    StringBuilder text = new StringBuilder();
-    
-    try {
-      char buffer[] = new char[1024];
-      int length;
-      while ((length = in.read(buffer, 0, buffer.length)) > 0) {
-        text.append(buffer, 0, length);
-      }
-    }
-    finally {
-      try {
-        in.close();
-      }
-      catch (IOException e) {
-        // sorry that this can fail!
-      }
-    }
-    
-    return text.toString();
+  public DirectorySampleStream(File dir, FileFilter fileFilter, boolean recursive) {
+    this(new File[]{dir}, fileFilter, recursive);
   }
   
-  public String read() throws IOException {
+  public File read() throws IOException {
 
     while(textFiles.isEmpty() && !directories.isEmpty()) {
       File dir = directories.pop();
@@ -127,7 +94,7 @@ public class DirectorySampleStream imple
     }
     
     if (!textFiles.isEmpty()) {
-      return readFile(textFiles.pop(), encoding);
+      return textFiles.pop();
     }
     else {
       return null;

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToByteArraySampleStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToByteArraySampleStream.java?rev=1333882&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToByteArraySampleStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToByteArraySampleStream.java Fri May  4 11:32:28 2012
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.convert;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+public class FileToByteArraySampleStream extends FilterObjectStream<File, byte[]> {
+
+  public FileToByteArraySampleStream(ObjectStream<File> samples) {
+    super(samples);
+  }
+
+  private static byte[] readFile(File file) throws IOException {
+    
+    InputStream in = new BufferedInputStream(new FileInputStream(file));
+
+    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
+    
+    try {
+      byte buffer[] = new byte[1024];
+      int length;
+      while ((length = in.read(buffer, 0, buffer.length)) > 0) {
+        bytes.write(buffer, 0, length);
+      }
+    }
+    finally {
+      try {
+        in.close();
+      }
+      catch (IOException e) {
+        // sorry that this can fail!
+      }
+    }
+    
+    return bytes.toByteArray();
+  }
+  
+  public byte[] read() throws IOException {
+    
+    File sampleFile = samples.read();
+    
+    if (sampleFile != null) {
+      return readFile(sampleFile);
+    }
+    else {
+      return null;
+    }
+  }
+}

Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToByteArraySampleStream.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToStringSampleStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToStringSampleStream.java?rev=1333882&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToStringSampleStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToStringSampleStream.java Fri May  4 11:32:28 2012
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.convert;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
+
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+public class FileToStringSampleStream extends FilterObjectStream<File, String> {
+
+  private final Charset encoding;
+  
+  public FileToStringSampleStream(ObjectStream<File> samples, Charset encoding) {
+    super(samples);
+    
+    this.encoding = encoding;
+  }
+  
+  private static String readFile(File textFile, Charset encoding) throws IOException {
+    
+    Reader in = new BufferedReader(new InputStreamReader(new FileInputStream(textFile), encoding));
+
+    StringBuilder text = new StringBuilder();
+    
+    try {
+      char buffer[] = new char[1024];
+      int length;
+      while ((length = in.read(buffer, 0, buffer.length)) > 0) {
+        text.append(buffer, 0, length);
+      }
+    }
+    finally {
+      try {
+        in.close();
+      }
+      catch (IOException e) {
+        // sorry that this can fail!
+      }
+    }
+    
+    return text.toString();
+  }
+
+  public String read() throws IOException {
+    
+    File sampleFile = samples.read();
+    
+    if (sampleFile != null) {
+      return readFile(sampleFile, encoding);
+    }
+    else {
+      return null;
+    }
+  }
+}

Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToStringSampleStream.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactory.java?rev=1333882&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactory.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactory.java Fri May  4 11:32:28 2012
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.frenchtreebank;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.LanguageFormatParams;
+import opennlp.tools.formats.DirectorySampleStream;
+import opennlp.tools.formats.LanguageSampleStreamFactory;
+import opennlp.tools.formats.convert.FileToByteArraySampleStream;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.util.ObjectStream;
+
+public class ConstitParseSampleStreamFactory extends LanguageSampleStreamFactory<Parse> {
+
+  interface Parameters extends LanguageFormatParams {    
+  }
+  
+  private ConstitParseSampleStreamFactory() {
+    super(Parameters.class);
+  }
+  
+  public ObjectStream<Parse> create(String[] args) {
+    
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    language = params.getLang();
+    
+    return new ConstitParseSampleStream(new FileToByteArraySampleStream(new DirectorySampleStream(params.getData(),
+        null, false)));
+  }
+  
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(NameSample.class, "frenchtreebank",
+        new ConstitParseSampleStreamFactory());
+  }
+}

Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactory.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactory.java?rev=1333882&r1=1333881&r2=1333882&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactory.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactory.java Fri May  4 11:32:28 2012
@@ -35,6 +35,7 @@ import opennlp.tools.cmdline.tokenizer.T
 import opennlp.tools.coref.CorefSample;
 import opennlp.tools.formats.DirectorySampleStream;
 import opennlp.tools.formats.LanguageSampleStreamFactory;
+import opennlp.tools.formats.convert.FileToStringSampleStream;
 import opennlp.tools.namefind.NameFinderME;
 import opennlp.tools.namefind.TokenNameFinder;
 import opennlp.tools.parser.Parser;
@@ -84,13 +85,13 @@ public class Muc6FullParseCorefSampleStr
     TokenizerModel tokenizerModel = new TokenizerModelLoader().load(params.getTokenizerModel());
     Tokenizer tokenizer = new TokenizerME(tokenizerModel);
     
-    ObjectStream<String> mucDocStream = 
-        new DirectorySampleStream(params.getData(), Charset.forName("UTF-8"), new FileFilter() {
+    ObjectStream<String> mucDocStream = new FileToStringSampleStream(
+        new DirectorySampleStream(params.getData(), new FileFilter() {
           
           public boolean accept(File file) {
             return file.getName().toLowerCase().endsWith(".sgm");
           }
-        }, false);
+        }, false), Charset.forName("UTF-8"));
     
     ObjectStream<RawCorefSample> rawSamples = 
         new MucCorefSampleStream(tokenizer, mucDocStream);

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactory.java?rev=1333882&r1=1333881&r2=1333882&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactory.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactory.java Fri May  4 11:32:28 2012
@@ -28,6 +28,7 @@ import opennlp.tools.cmdline.params.Lang
 import opennlp.tools.cmdline.tokenizer.TokenizerModelLoader;
 import opennlp.tools.formats.DirectorySampleStream;
 import opennlp.tools.formats.LanguageSampleStreamFactory;
+import opennlp.tools.formats.convert.FileToStringSampleStream;
 import opennlp.tools.namefind.NameSample;
 import opennlp.tools.tokenize.Tokenizer;
 import opennlp.tools.tokenize.TokenizerME;
@@ -54,13 +55,13 @@ public class Muc6NameSampleStreamFactory
     TokenizerModel tokenizerModel = new TokenizerModelLoader().load(params.getTokenizerModel());
     Tokenizer tokenizer = new TokenizerME(tokenizerModel);
 
-    ObjectStream<String> mucDocStream =
-        new DirectorySampleStream(params.getData(), Charset.forName("UTF-8"), new FileFilter() {
+    ObjectStream<String> mucDocStream = new FileToStringSampleStream(
+        new DirectorySampleStream(params.getData(), new FileFilter() {
 
           public boolean accept(File file) {
             return file.getName().toLowerCase().endsWith(".sgm");
           }
-        }, false);
+        }, false), Charset.forName("UTF-8"));
 
     return new MucNameSampleStream(tokenizer, mucDocStream);
   }