You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2012/03/28 19:58:24 UTC
svn commit: r1306518 - in /opennlp/trunk/opennlp-tools/src:
main/java/opennlp/tools/formats/muc/ test/java/opennlp/tools/formats/muc/
Author: joern
Date: Wed Mar 28 17:58:23 2012
New Revision: 1306518
URL: http://svn.apache.org/viewvc?rev=1306518&view=rev
Log:
OPENNLP-341 Implemented a stream to split MUC files into document strings.
Added:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/DocumentSplitterStream.java (with props)
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/muc/
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/muc/DocumentSplitterStreamTest.java (with props)
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/DocumentSplitterStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/DocumentSplitterStream.java?rev=1306518&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/DocumentSplitterStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/DocumentSplitterStream.java Wed Mar 28 17:58:23 2012
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.muc;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Splits each string read from the underlying stream into its
+ * {@code <DOC>...</DOC>} sections and returns them one at a time.
+ */
+class DocumentSplitterStream extends FilterObjectStream<String, String> {
+
+  private static final String DOC_START_ELEMENT = "<DOC>";
+  private static final String DOC_END_ELEMENT = "</DOC>";
+
+  /** Documents already split off but not yet handed out, in input order. */
+  private final List<String> docs = new ArrayList<String>();
+
+  DocumentSplitterStream(ObjectStream<String> samples) {
+    super(samples);
+  }
+
+  /**
+   * Returns the next document, including its enclosing DOC elements.
+   *
+   * @return the next document, or {@code null} once the underlying stream
+   *         returns {@code null}
+   * @throws InvalidFormatException if a DOC start or end element is missing
+   *         its counterpart, or an end element precedes the start element
+   * @throws IOException if the underlying stream fails to read
+   */
+  public String read() throws IOException {
+
+    // Keep pulling input until a document turns up: an input string without
+    // any DOC element must not be mistaken for the end of the stream.
+    while (docs.isEmpty()) {
+      String newDocs = samples.read();
+
+      if (newDocs == null) {
+        return null;
+      }
+
+      split(newDocs);
+    }
+
+    return docs.remove(0);
+  }
+
+  /**
+   * Resets via the superclass and drops the documents buffered so far, so that
+   * nothing split off before the reset is returned afterwards.
+   */
+  @Override
+  public void reset() throws IOException, UnsupportedOperationException {
+    super.reset();
+    docs.clear();
+  }
+
+  /** Appends every document found in {@code newDocs} to the buffer. */
+  private void split(String newDocs) throws InvalidFormatException {
+
+    int docStartOffset = 0;
+
+    while (true) {
+      int startDocElement = newDocs.indexOf(DOC_START_ELEMENT, docStartOffset);
+      int endDocElement = newDocs.indexOf(DOC_END_ELEMENT, docStartOffset);
+
+      if (startDocElement == -1 && endDocElement == -1) {
+        // No further elements behind the offset, the input is consumed.
+        return;
+      }
+
+      if (startDocElement == -1 || endDocElement == -1) {
+        throw new InvalidFormatException("Missing <DOC> or </DOC> element!");
+      }
+
+      if (startDocElement > endDocElement) {
+        throw new InvalidFormatException("<DOC> element is not closed!");
+      }
+
+      // Continue the search right behind the end element of this document.
+      docStartOffset = endDocElement + DOC_END_ELEMENT.length();
+      docs.add(newDocs.substring(startDocElement, docStartOffset));
+    }
+  }
+}
Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/muc/DocumentSplitterStream.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/muc/DocumentSplitterStreamTest.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/muc/DocumentSplitterStreamTest.java?rev=1306518&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/muc/DocumentSplitterStreamTest.java (added)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/muc/DocumentSplitterStreamTest.java Wed Mar 28 17:58:23 2012
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.muc;
+
+import java.io.IOException;
+
+import org.junit.Test;
+
+import junit.framework.Assert;
+
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ObjectStreamUtils;
+
+public class DocumentSplitterStreamTest {
+
+ @Test
+ public void testSplitTwoDocuments() throws IOException {
+
+ StringBuilder docsString = new StringBuilder();
+
+ for (int i = 0; i < 2; i++) {
+ docsString.append("<DOC>\n");
+ docsString.append("test document #"+ i + "\n");
+ docsString.append("</DOC>\n");
+ }
+
+ ObjectStream<String> docs = new DocumentSplitterStream(
+ ObjectStreamUtils.createObjectStream(docsString.toString()));
+
+ String doc1 = docs.read();
+ Assert.assertEquals(docsString.length() / 2, doc1.length() + 1);
+ Assert.assertTrue(doc1.contains("#0"));
+
+ String doc2 = docs.read();
+ Assert.assertEquals(docsString.length() / 2, doc2.length() + 1);
+ Assert.assertTrue(doc2.contains("#1"));
+
+ Assert.assertNull(docs.read());
+ Assert.assertNull(docs.read());
+ }
+}
Propchange: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/muc/DocumentSplitterStreamTest.java
------------------------------------------------------------------------------
svn:mime-type = text/plain