You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2014/01/03 14:06:54 UTC

svn commit: r1555079 - in /opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: cmdline/ formats/ontonotes/

Author: joern
Date: Fri Jan  3 13:06:54 2014
New Revision: 1555079

URL: http://svn.apache.org/r1555079
Log:
OPENNLP-623 Added OntoNotes format support for the parser and pos tagger.

Added:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/DocumentToLineStream.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesFormatParameters.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesPOSSampleStreamFactory.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStream.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStreamFactory.java
Modified:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java?rev=1555079&r1=1555078&r2=1555079&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java Fri Jan  3 13:06:54 2014
@@ -51,6 +51,8 @@ import opennlp.tools.formats.convert.Par
 import opennlp.tools.formats.frenchtreebank.ConstitParseSampleStreamFactory;
 import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory;
 import opennlp.tools.formats.ontonotes.OntoNotesNameSampleStreamFactory;
+import opennlp.tools.formats.ontonotes.OntoNotesPOSSampleStreamFactory;
+import opennlp.tools.formats.ontonotes.OntoNotesParseSampleStreamFactory;
 
 /**
  * Registry for object stream factories.
@@ -80,6 +82,9 @@ public final class StreamFactoryRegistry
     ParseToTokenSampleStreamFactory.registerFactory();
     
     OntoNotesNameSampleStreamFactory.registerFactory();
+    OntoNotesParseSampleStreamFactory.registerFactory();
+    OntoNotesPOSSampleStreamFactory.registerFactory();
+    
     BioNLP2004NameSampleStreamFactory.registerFactory();
     Conll02NameSampleStreamFactory.registerFactory();
     Conll03NameSampleStreamFactory.registerFactory();

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/DocumentToLineStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/DocumentToLineStream.java?rev=1555079&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/DocumentToLineStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/DocumentToLineStream.java Fri Jan  3 13:06:54 2014
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.formats.ontonotes;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import opennlp.tools.formats.brat.SegmenterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Splits every document of the underlying stream at its line breaks and returns
+ * each line as a <code>String</code> object. An empty line is appended to a
+ * document which does not already end with a blank line.
+ */
+public class DocumentToLineStream extends SegmenterObjectStream<String, String> {
+
+  public DocumentToLineStream(ObjectStream<String> samples) {
+    super(samples);
+  }
+
+  @Override
+  protected List<String> read(String sample) throws IOException {
+    List<String> lines = new ArrayList<String>(Arrays.asList(sample.split("\n")));
+    // split drops trailing empty strings, a sample of line breaks only has no lines at all,
+    // so check for that case before looking at the last line
+    if (lines.isEmpty() || !lines.get(lines.size() - 1).trim().isEmpty()) {
+      lines.add(""); // documents must be empty line terminated
+    }
+    return lines;
+  }
+}
+

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesFormatParameters.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesFormatParameters.java?rev=1555079&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesFormatParameters.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesFormatParameters.java Fri Jan  3 13:06:54 2014
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.ontonotes;
+
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+
+public interface OntoNotesFormatParameters { // arguments shared by the OntoNotes stream factories
+  @ParameterDescription(valueName = "OntoNotes 4.0 corpus directory")
+  String getOntoNotesDir(); // path of the corpus directory the factories read from
+}

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java?rev=1555079&r1=1555078&r2=1555079&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java Fri Jan  3 13:06:54 2014
@@ -22,7 +22,6 @@ import java.io.FileFilter;
 import java.nio.charset.Charset;
 
 import opennlp.tools.cmdline.ArgumentParser;
-import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
 import opennlp.tools.cmdline.StreamFactoryRegistry;
 import opennlp.tools.formats.AbstractSampleStreamFactory;
 import opennlp.tools.formats.DirectorySampleStream;
@@ -33,18 +32,13 @@ import opennlp.tools.util.ObjectStream;
 public class OntoNotesNameSampleStreamFactory extends
     AbstractSampleStreamFactory<NameSample> {
 
-  interface Parameters {
-    @ParameterDescription(valueName = "OntoNotes 4.0 corpus directory")
-    String getOntoNotesDir();
-  }
-
   public OntoNotesNameSampleStreamFactory() {
-    super(Parameters.class);
+    super(OntoNotesFormatParameters.class);
   }
 
   public ObjectStream<NameSample> create(String[] args) {
 
-    Parameters params = ArgumentParser.parse(args, Parameters.class);
+    OntoNotesFormatParameters params = ArgumentParser.parse(args, OntoNotesFormatParameters.class);
 
     ObjectStream<File> documentStream = new DirectorySampleStream(new File(
         params.getOntoNotesDir()), new FileFilter() {

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesPOSSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesPOSSampleStreamFactory.java?rev=1555079&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesPOSSampleStreamFactory.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesPOSSampleStreamFactory.java Fri Jan  3 13:06:54 2014
@@ -0,0 +1,28 @@
+package opennlp.tools.formats.ontonotes;
+
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.formats.convert.ParseToPOSSampleStream;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.postag.POSSample;
+import opennlp.tools.util.ObjectStream;
+
+public class OntoNotesPOSSampleStreamFactory extends AbstractSampleStreamFactory<POSSample> {
+
+  private OntoNotesParseSampleStreamFactory parseSampleStreamFactory =
+      new OntoNotesParseSampleStreamFactory();
+  
+  protected OntoNotesPOSSampleStreamFactory() {
+    super(OntoNotesFormatParameters.class);
+  }
+  
+  public ObjectStream<POSSample> create(String[] args) {
+    ObjectStream<Parse> parseSampleStream = parseSampleStreamFactory.create(args);
+    return new ParseToPOSSampleStream(parseSampleStream);
+  }
+  
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(POSSample.class, "ontonotes",
+        new OntoNotesPOSSampleStreamFactory());
+  }
+}

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStream.java?rev=1555079&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStream.java Fri Jan  3 13:06:54 2014
@@ -0,0 +1,39 @@
+package opennlp.tools.formats.ontonotes;
+
+import java.io.IOException;
+
+import opennlp.tools.parser.Parse;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Assembles the lines of OntoNotes parse documents into {@link Parse} objects,
+ * intended for training the parser and the pos tagger.
+ */
+public class OntoNotesParseSampleStream extends FilterObjectStream<String, Parse> {
+
+  protected OntoNotesParseSampleStream(ObjectStream<String> samples) {
+    super(samples);
+  }
+
+  /**
+   * Joins the lines up to the next empty line and parses them as one tree.
+   *
+   * @return the next parse, or <code>null</code> when no lines are left
+   * @throws IOException if reading from the underlying line stream fails
+   */
+  public Parse read() throws IOException {
+    StringBuilder parseString = new StringBuilder();
+    for (String line = samples.read(); line != null; line = samples.read()) {
+      line = line.trim();
+      if (!line.isEmpty()) {
+        parseString.append(line).append(' ');
+      }
+      else if (parseString.length() > 0) {
+        break; // an empty line terminates the tree collected so far
+      }
+      // an empty line in front of a tree is skipped, it must not end the stream
+    }
+    return parseString.length() > 0 ? Parse.parseParse(parseString.toString()) : null;
+  }
+}

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStreamFactory.java?rev=1555079&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStreamFactory.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStreamFactory.java Fri Jan  3 13:06:54 2014
@@ -0,0 +1,50 @@
+package opennlp.tools.formats.ontonotes;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.nio.charset.Charset;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.formats.DirectorySampleStream;
+import opennlp.tools.formats.convert.FileToStringSampleStream;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Factory for streams of {@link Parse} samples which are read from the
+ * <code>.parse</code> files of an OntoNotes corpus directory.
+ */
+public class OntoNotesParseSampleStreamFactory extends AbstractSampleStreamFactory<Parse> {
+
+  protected OntoNotesParseSampleStreamFactory() {
+    super(OntoNotesFormatParameters.class);
+  }
+
+  /** Registers this factory for parse samples under the "ontonotes" format name. */
+  public static void registerFactory() {
+    OntoNotesParseSampleStreamFactory factory = new OntoNotesParseSampleStreamFactory();
+    StreamFactoryRegistry.registerFactory(Parse.class, "ontonotes", factory);
+  }
+
+  /** Creates the stream of parses for the corpus directory named in the arguments. */
+  public ObjectStream<Parse> create(String[] args) {
+    OntoNotesFormatParameters params = ArgumentParser.parse(args, OntoNotesFormatParameters.class);
+    File corpusDir = new File(params.getOntoNotesDir());
+
+    // plain files must be named *.parse, directories are accepted as well
+    // (presumably to let the stream descend into them, see the last argument below)
+    FileFilter parseFileFilter = new FileFilter() {
+      public boolean accept(File file) {
+        return file.isFile() ? file.getName().endsWith(".parse") : file.isDirectory();
+      }
+    };
+    ObjectStream<File> documentStream = new DirectorySampleStream(corpusDir, parseFileFilter, true);
+
+    // The file to line conversion is probably best done with the plain text stream. It was
+    // copied over here to be refactored, the refactored version should replace the current one.
+    return new OntoNotesParseSampleStream(new DocumentToLineStream(
+        new FileToStringSampleStream(documentStream, Charset.forName("UTF-8"))));
+  }
+}