You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/06/09 11:32:24 UTC

svn commit: r1133746 - in /incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect: EmptyLinePreprocessorStream.java SentenceSampleStream.java

Author: joern
Date: Thu Jun  9 09:32:24 2011
New Revision: 1133746

URL: http://svn.apache.org/viewvc?rev=1133746&view=rev
Log:
OPENNLP-201 Added stream to preprocess empty lines

Added:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EmptyLinePreprocessorStream.java   (with props)
Modified:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceSampleStream.java

Added: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EmptyLinePreprocessorStream.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EmptyLinePreprocessorStream.java?rev=1133746&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EmptyLinePreprocessorStream.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EmptyLinePreprocessorStream.java Thu Jun  9 09:32:24 2011
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect;
+
+import java.io.IOException;
+
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Stream to clean up empty lines for empty line separated document streams.<br>
+ * 
+ * - Skips empty lines at the start of the training data<br>
+ * - Transforms multiple empty lines in a row into one <br>
+ * - Replaces white space lines with empty lines <br>
+ * - TODO: Terminates last document with empty line if it is missing<br>
+ * <br>
+ * This stream should be used by the components that use empty lines to mark document boundaries.
+ * <p>
+ * <b>Note:</b> This class is not thread safe. Do not use this class, internal use only!<br>
+ * NOTE(review): lastLineWasEmpty is not re-initialized by any method of this class;
+ * confirm the intended behavior when the wrapped stream is rewound and read again.
+ */
+public class EmptyLinePreprocessorStream extends FilterObjectStream<String, String> {
+  
+  private boolean lastLineWasEmpty = true; // starts as true so that leading empty lines are skipped
+  
+  public EmptyLinePreprocessorStream(ObjectStream<String> in) {
+    super(in);
+  }
+  
+  private static boolean isLineEmpty(String line) { // true for "" and white space only lines; line must not be null
+    return line.trim().length() == 0;
+  }
+  
+  public String read() throws IOException { // next cleaned-up line, or null once samples.read() returns null
+    
+    String line = samples.read();
+    
+    if (lastLineWasEmpty) { // nothing returned yet, or the previous returned line was empty: drop empty lines
+      lastLineWasEmpty = false;
+      
+      while (line != null && isLineEmpty(line)) {
+        line = samples.read();
+      }
+    }
+ 
+    if (line != null && isLineEmpty(line)) { // first empty line after content: remember it and normalize to ""
+      lastLineWasEmpty = true;
+      line = "";
+    }
+    
+    return line;
+  }
+}

Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EmptyLinePreprocessorStream.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceSampleStream.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceSampleStream.java?rev=1133746&r1=1133745&r2=1133746&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceSampleStream.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceSampleStream.java Thu Jun  9 09:32:24 2011
@@ -33,7 +33,7 @@ import opennlp.tools.util.Span;
 public class SentenceSampleStream extends FilterObjectStream<String, SentenceSample> {
 
   public SentenceSampleStream(ObjectStream<String> sentences) {
-    super(sentences);
+    super(new EmptyLinePreprocessorStream(sentences));
   }
 
   public SentenceSample read() throws IOException {