You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2014/01/03 14:06:54 UTC
svn commit: r1555079 - in
/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: cmdline/
formats/ontonotes/
Author: joern
Date: Fri Jan 3 13:06:54 2014
New Revision: 1555079
URL: http://svn.apache.org/r1555079
Log:
OPENNLP-623 Added OntoNotes format support for the parser and pos tagger.
Added:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/DocumentToLineStream.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesFormatParameters.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesPOSSampleStreamFactory.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStream.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStreamFactory.java
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java
Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java?rev=1555079&r1=1555078&r2=1555079&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java Fri Jan 3 13:06:54 2014
@@ -51,6 +51,8 @@ import opennlp.tools.formats.convert.Par
import opennlp.tools.formats.frenchtreebank.ConstitParseSampleStreamFactory;
import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory;
import opennlp.tools.formats.ontonotes.OntoNotesNameSampleStreamFactory;
+import opennlp.tools.formats.ontonotes.OntoNotesPOSSampleStreamFactory;
+import opennlp.tools.formats.ontonotes.OntoNotesParseSampleStreamFactory;
/**
* Registry for object stream factories.
@@ -80,6 +82,9 @@ public final class StreamFactoryRegistry
ParseToTokenSampleStreamFactory.registerFactory();
OntoNotesNameSampleStreamFactory.registerFactory();
+ OntoNotesParseSampleStreamFactory.registerFactory();
+ OntoNotesPOSSampleStreamFactory.registerFactory();
+
BioNLP2004NameSampleStreamFactory.registerFactory();
Conll02NameSampleStreamFactory.registerFactory();
Conll03NameSampleStreamFactory.registerFactory();
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/DocumentToLineStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/DocumentToLineStream.java?rev=1555079&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/DocumentToLineStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/DocumentToLineStream.java Fri Jan 3 13:06:54 2014
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.formats.ontonotes;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import opennlp.tools.formats.brat.SegmenterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Splits every document text read from the underlying stream into its lines
+ * and returns each line as a separate <code>String</code> object.
+ * <p>
+ * The lines emitted for one document are always terminated by an empty line.
+ */
+public class DocumentToLineStream extends SegmenterObjectStream<String, String> {
+
+  /**
+   * @param samples the stream of document texts which should be split into lines
+   */
+  public DocumentToLineStream(ObjectStream<String> samples) {
+    super(samples);
+  }
+
+  /**
+   * Breaks one document into lines at every <code>\n</code> character.
+   *
+   * @param sample the text of one document
+   *
+   * @return the lines of the document, followed by an empty line if the
+   *     document did not already end with a blank line
+   *
+   * @throws IOException declared by the overridden method, not thrown by this
+   *     implementation itself
+   */
+  @Override
+  protected List<String> read(String sample) throws IOException {
+    // String.split drops trailing empty strings, a document which consists only
+    // of line breaks therefore yields no elements at all. The result is copied
+    // because Arrays.asList only provides a fixed-size view of the array.
+    List<String> lines = new ArrayList<String>(Arrays.asList(sample.split("\n")));
+
+    // documents must be empty line terminated
+    if (lines.isEmpty() || !lines.get(lines.size() - 1).trim().isEmpty()) {
+      lines.add("");
+    }
+
+    return lines;
+  }
+}
+
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesFormatParameters.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesFormatParameters.java?rev=1555079&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesFormatParameters.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesFormatParameters.java Fri Jan 3 13:06:54 2014
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.ontonotes;
+
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+
+/**
+ * Command line parameters which are shared by the OntoNotes sample stream
+ * factories.
+ */
+public interface OntoNotesFormatParameters {
+
+  /**
+   * @return the value which was passed for the OntoNotes corpus directory
+   */
+  @ParameterDescription(valueName = "OntoNotes 4.0 corpus directory")
+  String getOntoNotesDir();
+}
Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java?rev=1555079&r1=1555078&r2=1555079&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java Fri Jan 3 13:06:54 2014
@@ -22,7 +22,6 @@ import java.io.FileFilter;
import java.nio.charset.Charset;
import opennlp.tools.cmdline.ArgumentParser;
-import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.formats.AbstractSampleStreamFactory;
import opennlp.tools.formats.DirectorySampleStream;
@@ -33,18 +32,13 @@ import opennlp.tools.util.ObjectStream;
public class OntoNotesNameSampleStreamFactory extends
AbstractSampleStreamFactory<NameSample> {
- interface Parameters {
- @ParameterDescription(valueName = "OntoNotes 4.0 corpus directory")
- String getOntoNotesDir();
- }
-
public OntoNotesNameSampleStreamFactory() {
- super(Parameters.class);
+ super(OntoNotesFormatParameters.class);
}
public ObjectStream<NameSample> create(String[] args) {
- Parameters params = ArgumentParser.parse(args, Parameters.class);
+ OntoNotesFormatParameters params = ArgumentParser.parse(args, OntoNotesFormatParameters.class);
ObjectStream<File> documentStream = new DirectorySampleStream(new File(
params.getOntoNotesDir()), new FileFilter() {
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesPOSSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesPOSSampleStreamFactory.java?rev=1555079&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesPOSSampleStreamFactory.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesPOSSampleStreamFactory.java Fri Jan 3 13:06:54 2014
@@ -0,0 +1,28 @@
+package opennlp.tools.formats.ontonotes;
+
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.formats.convert.ParseToPOSSampleStream;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.postag.POSSample;
+import opennlp.tools.util.ObjectStream;
+
+public class OntoNotesPOSSampleStreamFactory extends AbstractSampleStreamFactory<POSSample> {
+
+ private OntoNotesParseSampleStreamFactory parseSampleStreamFactory =
+ new OntoNotesParseSampleStreamFactory();
+
+ protected OntoNotesPOSSampleStreamFactory() {
+ super(OntoNotesFormatParameters.class);
+ }
+
+ public ObjectStream<POSSample> create(String[] args) {
+ ObjectStream<Parse> parseSampleStream = parseSampleStreamFactory.create(args);
+ return new ParseToPOSSampleStream(parseSampleStream);
+ }
+
+ public static void registerFactory() {
+ StreamFactoryRegistry.registerFactory(POSSample.class, "ontonotes",
+ new OntoNotesPOSSampleStreamFactory());
+ }
+}
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStream.java?rev=1555079&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStream.java Fri Jan 3 13:06:54 2014
@@ -0,0 +1,39 @@
+package opennlp.tools.formats.ontonotes;
+
+import java.io.IOException;
+
+import opennlp.tools.parser.Parse;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Assembles {@link Parse} objects from a stream of lines. The lines which
+ * belong to one parse are joined, a blank line marks the end of a parse.
+ * <p>
+ * The resulting stream can be used to train the parser and the pos tagger.
+ */
+public class OntoNotesParseSampleStream extends FilterObjectStream<String, Parse> {
+
+  /**
+   * @param samples the stream of lines to assemble the parses from
+   */
+  protected OntoNotesParseSampleStream(ObjectStream<String> samples) {
+    super(samples);
+  }
+
+  /**
+   * Reads lines until a blank line or the end of the underlying stream is
+   * reached and parses the collected lines. Every line is trimmed and followed
+   * by a single space in the string which is handed to
+   * {@link Parse#parseParse(String)}.
+   * <p>
+   * Blank lines which are not preceded by parse content are skipped. They do
+   * not end the stream.
+   *
+   * @return the next parse, or <code>null</code> if the underlying stream is
+   *     exhausted and no parse content is left
+   *
+   * @throws IOException if reading from the underlying stream fails
+   */
+  public Parse read() throws IOException {
+
+    StringBuilder parseString = new StringBuilder();
+
+    String line;
+    while ((line = samples.read()) != null) {
+      line = line.trim();
+
+      if (line.length() > 0) {
+        parseString.append(line).append(' ');
+      }
+      else if (parseString.length() > 0) {
+        // the blank line terminates the parse collected so far
+        return Parse.parseParse(parseString.toString());
+      }
+      // else: leading or repeated blank line, nothing collected yet, keep
+      // reading instead of reporting a premature end of the stream
+    }
+
+    // end of input, a last parse might not be terminated by a blank line
+    if (parseString.length() > 0) {
+      return Parse.parseParse(parseString.toString());
+    }
+
+    return null;
+  }
+}
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStreamFactory.java?rev=1555079&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStreamFactory.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStreamFactory.java Fri Jan 3 13:06:54 2014
@@ -0,0 +1,50 @@
+package opennlp.tools.formats.ontonotes;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.nio.charset.Charset;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.formats.DirectorySampleStream;
+import opennlp.tools.formats.convert.FileToStringSampleStream;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Creates {@link Parse} streams for the "ontonotes" format from the
+ * <code>.parse</code> files located in the OntoNotes corpus directory.
+ */
+public class OntoNotesParseSampleStreamFactory extends AbstractSampleStreamFactory<Parse> {
+
+  protected OntoNotesParseSampleStreamFactory() {
+    super(OntoNotesFormatParameters.class);
+  }
+
+  /**
+   * Builds the stream chain: files of the corpus directory, file content as
+   * string (decoded as UTF-8), lines of the content, parses.
+   *
+   * @param args the command line arguments, parsed as
+   *     {@link OntoNotesFormatParameters}
+   *
+   * @return the stream of parses
+   */
+  public ObjectStream<Parse> create(String[] args) {
+
+    OntoNotesFormatParameters params =
+        ArgumentParser.parse(args, OntoNotesFormatParameters.class);
+
+    File corpusDir = new File(params.getOntoNotesDir());
+
+    // accepts all directories, but only those files whose name ends with .parse
+    FileFilter parseFileFilter = new FileFilter() {
+
+      public boolean accept(File file) {
+        return file.isFile() ? file.getName().endsWith(".parse") : file.isDirectory();
+      }
+    };
+
+    // NOTE(review): the last argument presumably enables recursion into sub
+    // directories - confirm against DirectorySampleStream
+    ObjectStream<File> documentStream =
+        new DirectorySampleStream(corpusDir, parseFileFilter, true);
+
+    ObjectStream<String> documents =
+        new FileToStringSampleStream(documentStream, Charset.forName("UTF-8"));
+
+    // We need file to line here ... and that is probably best done with the plain text stream,
+    // lets copy it over here, refactor it, and then at some point we replace the current version
+    // with the refactored version
+
+    return new OntoNotesParseSampleStream(new DocumentToLineStream(documents));
+  }
+
+  /**
+   * Registers an instance of this factory for {@link Parse}s under the format
+   * name "ontonotes".
+   */
+  public static void registerFactory() {
+    OntoNotesParseSampleStreamFactory factory = new OntoNotesParseSampleStreamFactory();
+    StreamFactoryRegistry.registerFactory(Parse.class, "ontonotes", factory);
+  }
+}