You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/06/09 11:32:24 UTC
svn commit: r1133746 - in
/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect:
EmptyLinePreprocessorStream.java SentenceSampleStream.java
Author: joern
Date: Thu Jun 9 09:32:24 2011
New Revision: 1133746
URL: http://svn.apache.org/viewvc?rev=1133746&view=rev
Log:
OPENNLP-201 Added stream to preprocess empty lines
Added:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EmptyLinePreprocessorStream.java (with props)
Modified:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceSampleStream.java
Added: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EmptyLinePreprocessorStream.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EmptyLinePreprocessorStream.java?rev=1133746&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EmptyLinePreprocessorStream.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EmptyLinePreprocessorStream.java Thu Jun 9 09:32:24 2011
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect;
+
+import java.io.IOException;
+
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Stream to clean up empty lines for empty line separated document streams.<br>
+ *
+ * - Skips empty lines at training data start<br>
+ * - Transforms multiple empty lines in a row into one <br>
+ * - Replaces white space lines with empty lines <br>
+ * - TODO: Terminates last document with empty line if it is missing<br>
+ * <br>
+ * This stream should be used by the components that use empty lines to mark document boundaries.
+ * <p>
+ * <b>Note:</b>
+ * This class is not thread safe. <br>
+ * Do not use this class, internal use only!
+ */
+public class EmptyLinePreprocessorStream extends FilterObjectStream<String, String> {
+ // True if the previous read() returned an empty line; starts as true so leading empty lines are skipped.
+ private boolean lastLineWasEmpty = true;
+
+ public EmptyLinePreprocessorStream(ObjectStream<String> in) {
+ super(in);
+ }
+ // Returns true if the line is empty or consists only of white space (as removed by String.trim()).
+ private static boolean isLineEmpty(String line) {
+ return line.trim().length() == 0;
+ }
+ // Returns the next cleaned-up line, or null once the underlying stream returns null.
+ public String read() throws IOException {
+
+ String line = samples.read();
+ // After an empty line (or at stream start) skip every following empty or white space line.
+ if (lastLineWasEmpty) {
+ lastLineWasEmpty = false;
+
+ while (line != null && isLineEmpty(line)) {
+ line = samples.read();
+ }
+ }
+ // Normalize a white space only line to the empty string and remember it for the next call.
+ if (line != null && isLineEmpty(line)) {
+ lastLineWasEmpty = true;
+ line = "";
+ }
+
+ return line;
+ }
+}
Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EmptyLinePreprocessorStream.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceSampleStream.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceSampleStream.java?rev=1133746&r1=1133745&r2=1133746&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceSampleStream.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceSampleStream.java Thu Jun 9 09:32:24 2011
@@ -33,7 +33,7 @@ import opennlp.tools.util.Span;
public class SentenceSampleStream extends FilterObjectStream<String, SentenceSample> {
public SentenceSampleStream(ObjectStream<String> sentences) {
- super(sentences);
+ super(new EmptyLinePreprocessorStream(sentences));
}
public SentenceSample read() throws IOException {