You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2012/03/28 19:58:24 UTC

svn commit: r1306518 - in /opennlp/trunk/opennlp-tools/src: main/java/opennlp/tools/formats/muc/ test/java/opennlp/tools/formats/muc/

Author: joern
Date: Wed Mar 28 17:58:23 2012
New Revision: 1306518

URL: http://svn.apache.org/viewvc?rev=1306518&view=rev
Log:
OPENNLP-341 Implemented a stream to split MUC files into document strings.

Added:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/DocumentSplitterStream.java   (with props)
    opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/muc/
    opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/muc/DocumentSplitterStreamTest.java   (with props)

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/DocumentSplitterStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/DocumentSplitterStream.java?rev=1306518&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/DocumentSplitterStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/DocumentSplitterStream.java Wed Mar 28 17:58:23 2012
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.muc;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Splits every string read from the underlying stream into the
+ * <code>&lt;DOC&gt;</code> ... <code>&lt;/DOC&gt;</code> sections it contains.
+ */
+class DocumentSplitterStream extends FilterObjectStream<String, String> {
+
+  private static final String DOC_START_ELEMENT = "<DOC>";
+  private static final String DOC_END_ELEMENT = "</DOC>";
+
+  /** Documents already split off but not yet returned, in input order. */
+  private List<String> docs = new ArrayList<String>();
+
+  DocumentSplitterStream(ObjectStream<String> samples) {
+    super(samples);
+  }
+
+  /**
+   * Returns the next document, including its enclosing DOC tags.
+   *
+   * @return the next document, or null once the underlying stream returns null
+   * @throws InvalidFormatException if a DOC tag is missing or out of order
+   */
+  public String read() throws IOException {
+    // An input string without any document (for example only white space) must
+    // not be reported as end of stream, so keep reading until a document shows up.
+    while (docs.isEmpty()) {
+      String newDocs = samples.read();
+      if (newDocs == null) {
+        return null;
+      }
+      int docStartOffset = 0;
+      while (true) {
+        int startDocElement = newDocs.indexOf(DOC_START_ELEMENT, docStartOffset);
+        int endDocElement = newDocs.indexOf(DOC_END_ELEMENT, docStartOffset);
+        if (startDocElement == -1 && endDocElement == -1) {
+          break; // no further tags, the remainder belongs to no document
+        }
+        if (startDocElement == -1 || endDocElement == -1) {
+          throw new InvalidFormatException("Missing <DOC> or </DOC> element!");
+        }
+        if (startDocElement > endDocElement) {
+          throw new InvalidFormatException("<DOC> element is not closed!");
+        }
+        int docEnd = endDocElement + DOC_END_ELEMENT.length();
+        docs.add(newDocs.substring(startDocElement, docEnd));
+        docStartOffset = docEnd;
+      }
+    }
+    return docs.remove(0);
+  }
+}

Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/DocumentSplitterStream.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/muc/DocumentSplitterStreamTest.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/muc/DocumentSplitterStreamTest.java?rev=1306518&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/muc/DocumentSplitterStreamTest.java (added)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/muc/DocumentSplitterStreamTest.java Wed Mar 28 17:58:23 2012
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.muc;
+
+import java.io.IOException;
+
+import org.junit.Test;
+
+import junit.framework.Assert;
+
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ObjectStreamUtils;
+
+/** Tests for the {@link DocumentSplitterStream} class. */
+public class DocumentSplitterStreamTest {
+
+  /**
+   * Two documents passed in as a single string must be returned one by one,
+   * each running from its DOC start tag to its DOC end tag, followed by null.
+   */
+  @Test
+  public void testSplitTwoDocuments() throws IOException {
+
+    String firstDoc = "<DOC>\ntest document #0\n</DOC>";
+    String secondDoc = "<DOC>\ntest document #1\n</DOC>";
+
+    // The line break after each end tag is not part of any document.
+    String docsString = firstDoc + "\n" + secondDoc + "\n";
+
+    ObjectStream<String> docs = new DocumentSplitterStream(
+        ObjectStreamUtils.createObjectStream(docsString));
+
+    // Compare the full text so that a wrong document boundary cannot go unnoticed.
+    Assert.assertEquals(firstDoc, docs.read());
+    Assert.assertEquals(secondDoc, docs.read());
+
+    // The end of the stream must be reported on every further call.
+    Assert.assertNull(docs.read());
+    Assert.assertNull(docs.read());
+  }
+}

Propchange: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/muc/DocumentSplitterStreamTest.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain