You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2012/05/04 13:32:28 UTC
svn commit: r1333882 - in
/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: cmdline/ formats/
formats/convert/ formats/frenchtreebank/ formats/muc/
Author: joern
Date: Fri May 4 11:32:28 2012
New Revision: 1333882
URL: http://svn.apache.org/viewvc?rev=1333882&view=rev
Log:
OPENNLP-342 Added a factory for the french treebank stream
Added:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToByteArraySampleStream.java (with props)
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToStringSampleStream.java (with props)
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactory.java (with props)
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/DirectorySampleStream.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactory.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactory.java
Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java?rev=1333882&r1=1333881&r2=1333882&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java Fri May 4 11:32:28 2012
@@ -44,6 +44,7 @@ import opennlp.tools.formats.ad.ADNameSa
import opennlp.tools.formats.ad.ADPOSSampleStreamFactory;
import opennlp.tools.formats.ad.ADSentenceSampleStreamFactory;
import opennlp.tools.formats.ad.ADTokenSampleStreamFactory;
+import opennlp.tools.formats.frenchtreebank.ConstitParseSampleStreamFactory;
import opennlp.tools.formats.muc.Muc6FullParseCorefSampleStreamFactory;
import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory;
@@ -85,6 +86,8 @@ public final class StreamFactoryRegistry
Muc6NameSampleStreamFactory.registerFactory();
Muc6FullParseCorefSampleStreamFactory.registerFactory();
+
+ ConstitParseSampleStreamFactory.registerFactory();
}
public static final String DEFAULT_FORMAT = "opennlp";
Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/DirectorySampleStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/DirectorySampleStream.java?rev=1333882&r1=1333881&r2=1333882&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/DirectorySampleStream.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/DirectorySampleStream.java Fri May 4 11:32:28 2012
@@ -17,14 +17,9 @@
package opennlp.tools.formats;
-import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
-import java.io.FileInputStream;
import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
@@ -36,10 +31,8 @@ import opennlp.tools.util.ObjectStream;
* The directory sample stream scans a directory (recursively) for plain text
* files and outputs each file as a File object.
*/
-public class DirectorySampleStream implements ObjectStream<String> {
+public class DirectorySampleStream implements ObjectStream<File> {
- private final Charset encoding;
-
private final List<File> inputDirectories;
private final boolean isRecursiveScan;
@@ -50,9 +43,8 @@ public class DirectorySampleStream imple
private Stack<File> textFiles = new Stack<File>();
- public DirectorySampleStream(File dirs[], Charset encoding, FileFilter fileFilter, boolean recursive) {
+ public DirectorySampleStream(File dirs[], FileFilter fileFilter, boolean recursive) {
- this.encoding = encoding;
this.fileFilter= fileFilter;
isRecursiveScan = recursive;
@@ -73,36 +65,11 @@ public class DirectorySampleStream imple
directories.addAll(inputDirectories);
}
- public DirectorySampleStream(File dir, Charset encoding, FileFilter fileFilter, boolean recursive) {
- this(new File[]{dir}, encoding, fileFilter, recursive);
- }
-
- static String readFile(File textFile, Charset encoding) throws IOException {
-
- Reader in = new BufferedReader(new InputStreamReader(new FileInputStream(textFile), encoding));
-
- StringBuilder text = new StringBuilder();
-
- try {
- char buffer[] = new char[1024];
- int length;
- while ((length = in.read(buffer, 0, buffer.length)) > 0) {
- text.append(buffer, 0, length);
- }
- }
- finally {
- try {
- in.close();
- }
- catch (IOException e) {
- // sorry that this can fail!
- }
- }
-
- return text.toString();
+ public DirectorySampleStream(File dir, FileFilter fileFilter, boolean recursive) {
+ this(new File[]{dir}, fileFilter, recursive);
}
- public String read() throws IOException {
+ public File read() throws IOException {
while(textFiles.isEmpty() && !directories.isEmpty()) {
File dir = directories.pop();
@@ -127,7 +94,7 @@ public class DirectorySampleStream imple
}
if (!textFiles.isEmpty()) {
- return readFile(textFiles.pop(), encoding);
+ return textFiles.pop();
}
else {
return null;
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToByteArraySampleStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToByteArraySampleStream.java?rev=1333882&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToByteArraySampleStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToByteArraySampleStream.java Fri May 4 11:32:28 2012
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.convert;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Filter stream which turns every {@link File} delivered by the wrapped
+ * stream into a byte array holding the content read from that file.
+ */
+public class FileToByteArraySampleStream extends FilterObjectStream<File, byte[]> {
+
+  /**
+   * @param samples the stream of files whose content should be read
+   */
+  public FileToByteArraySampleStream(ObjectStream<File> samples) {
+    super(samples);
+  }
+
+  /**
+   * Reads the given file into memory in chunks of 1024 bytes.
+   *
+   * @param file the file to read
+   * @return the bytes read from the file
+   * @throws IOException if opening or reading the file fails
+   */
+  private static byte[] readFile(File file) throws IOException {
+    InputStream source = new BufferedInputStream(new FileInputStream(file));
+    ByteArrayOutputStream content = new ByteArrayOutputStream();
+
+    try {
+      byte chunk[] = new byte[1024];
+
+      for (int count = source.read(chunk, 0, chunk.length); count > 0;
+          count = source.read(chunk, 0, chunk.length)) {
+        content.write(chunk, 0, count);
+      }
+    }
+    finally {
+      try {
+        source.close();
+      }
+      catch (IOException e) {
+        // a failure while closing is deliberately ignored
+      }
+    }
+
+    return content.toByteArray();
+  }
+
+  /**
+   * Fetches the next file from the wrapped stream and reads its content.
+   *
+   * @return the content of the next file, or null if the wrapped stream
+   *         returned null
+   * @throws IOException if the wrapped stream or the file read fails
+   */
+  public byte[] read() throws IOException {
+    File nextFile = samples.read();
+
+    if (nextFile == null) {
+      return null;
+    }
+
+    return readFile(nextFile);
+  }
+}
Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToByteArraySampleStream.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToStringSampleStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToStringSampleStream.java?rev=1333882&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToStringSampleStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToStringSampleStream.java Fri May 4 11:32:28 2012
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.convert;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
+
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Filter stream which turns every {@link File} delivered by the wrapped
+ * stream into a String, decoding the file content with a fixed charset.
+ */
+public class FileToStringSampleStream extends FilterObjectStream<File, String> {
+
+  private final Charset encoding;
+
+  /**
+   * @param samples the stream of files whose content should be read
+   * @param encoding the charset used to decode each file
+   */
+  public FileToStringSampleStream(ObjectStream<File> samples, Charset encoding) {
+    super(samples);
+    this.encoding = encoding;
+  }
+
+  /**
+   * Reads the given file into a String in chunks of 1024 chars.
+   *
+   * @param textFile the file to read
+   * @param encoding the charset used to decode the file
+   * @return the decoded text read from the file
+   * @throws IOException if opening or reading the file fails
+   */
+  private static String readFile(File textFile, Charset encoding) throws IOException {
+    Reader source = new BufferedReader(
+        new InputStreamReader(new FileInputStream(textFile), encoding));
+    StringBuilder content = new StringBuilder();
+
+    try {
+      char chunk[] = new char[1024];
+
+      for (int count = source.read(chunk, 0, chunk.length); count > 0;
+          count = source.read(chunk, 0, chunk.length)) {
+        content.append(chunk, 0, count);
+      }
+    }
+    finally {
+      try {
+        source.close();
+      }
+      catch (IOException e) {
+        // a failure while closing is deliberately ignored
+      }
+    }
+
+    return content.toString();
+  }
+
+  /**
+   * Fetches the next file from the wrapped stream and reads its text.
+   *
+   * @return the text of the next file, or null if the wrapped stream
+   *         returned null
+   * @throws IOException if the wrapped stream or the file read fails
+   */
+  public String read() throws IOException {
+    File nextFile = samples.read();
+
+    if (nextFile == null) {
+      return null;
+    }
+
+    return readFile(nextFile, encoding);
+  }
+}
Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToStringSampleStream.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactory.java?rev=1333882&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactory.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactory.java Fri May 4 11:32:28 2012
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.frenchtreebank;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.LanguageFormatParams;
+import opennlp.tools.formats.DirectorySampleStream;
+import opennlp.tools.formats.LanguageSampleStreamFactory;
+import opennlp.tools.formats.convert.FileToByteArraySampleStream;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Factory which creates a stream of {@link Parse} samples for the
+ * "frenchtreebank" format from the files found in a data directory.
+ */
+public class ConstitParseSampleStreamFactory extends LanguageSampleStreamFactory<Parse> {
+
+  /**
+   * Command line parameters understood by this factory.
+   */
+  interface Parameters extends LanguageFormatParams {
+  }
+
+  // Instances are only created by registerFactory().
+  private ConstitParseSampleStreamFactory() {
+    super(Parameters.class);
+  }
+
+  /**
+   * Creates the parse sample stream for the given command line arguments.
+   *
+   * @param args the command line arguments, parsed as {@link Parameters}
+   * @return a stream of parses built from the content of the data files
+   */
+  public ObjectStream<Parse> create(String[] args) {
+
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    language = params.getLang();
+
+    // Scan the data location without a file filter and without recursion;
+    // each file is handed to the parse stream as a raw byte array.
+    return new ConstitParseSampleStream(new FileToByteArraySampleStream(new DirectorySampleStream(params.getData(),
+        null, false)));
+  }
+
+  /**
+   * Registers this factory for the "frenchtreebank" format.
+   * The factory is registered for {@link Parse} samples, because that is
+   * the sample type produced by {@link #create(String[])}.
+   */
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(Parse.class, "frenchtreebank",
+        new ConstitParseSampleStreamFactory());
+  }
+}
Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactory.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactory.java?rev=1333882&r1=1333881&r2=1333882&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactory.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactory.java Fri May 4 11:32:28 2012
@@ -35,6 +35,7 @@ import opennlp.tools.cmdline.tokenizer.T
import opennlp.tools.coref.CorefSample;
import opennlp.tools.formats.DirectorySampleStream;
import opennlp.tools.formats.LanguageSampleStreamFactory;
+import opennlp.tools.formats.convert.FileToStringSampleStream;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinder;
import opennlp.tools.parser.Parser;
@@ -84,13 +85,13 @@ public class Muc6FullParseCorefSampleStr
TokenizerModel tokenizerModel = new TokenizerModelLoader().load(params.getTokenizerModel());
Tokenizer tokenizer = new TokenizerME(tokenizerModel);
- ObjectStream<String> mucDocStream =
- new DirectorySampleStream(params.getData(), Charset.forName("UTF-8"), new FileFilter() {
+ ObjectStream<String> mucDocStream = new FileToStringSampleStream(
+ new DirectorySampleStream(params.getData(), new FileFilter() {
public boolean accept(File file) {
return file.getName().toLowerCase().endsWith(".sgm");
}
- }, false);
+ }, false), Charset.forName("UTF-8"));
ObjectStream<RawCorefSample> rawSamples =
new MucCorefSampleStream(tokenizer, mucDocStream);
Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactory.java?rev=1333882&r1=1333881&r2=1333882&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactory.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactory.java Fri May 4 11:32:28 2012
@@ -28,6 +28,7 @@ import opennlp.tools.cmdline.params.Lang
import opennlp.tools.cmdline.tokenizer.TokenizerModelLoader;
import opennlp.tools.formats.DirectorySampleStream;
import opennlp.tools.formats.LanguageSampleStreamFactory;
+import opennlp.tools.formats.convert.FileToStringSampleStream;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
@@ -54,13 +55,13 @@ public class Muc6NameSampleStreamFactory
TokenizerModel tokenizerModel = new TokenizerModelLoader().load(params.getTokenizerModel());
Tokenizer tokenizer = new TokenizerME(tokenizerModel);
- ObjectStream<String> mucDocStream =
- new DirectorySampleStream(params.getData(), Charset.forName("UTF-8"), new FileFilter() {
+ ObjectStream<String> mucDocStream = new FileToStringSampleStream(
+ new DirectorySampleStream(params.getData(), new FileFilter() {
public boolean accept(File file) {
return file.getName().toLowerCase().endsWith(".sgm");
}
- }, false);
+ }, false), Charset.forName("UTF-8"));
return new MucNameSampleStream(tokenizer, mucDocStream);
}