You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/01/24 14:49:59 UTC

svn commit: r1062774 - in /incubator/opennlp/trunk/opennlp-tools/src: main/java/opennlp/tools/doccat/ main/java/opennlp/tools/formats/ test/java/opennlp/tools/formats/ test/resources/opennlp/tools/formats/

Author: joern
Date: Mon Jan 24 13:49:59 2011
New Revision: 1062774

URL: http://svn.apache.org/viewvc?rev=1062774&view=rev
Log:
OPENNLP-79 Added parsing code for the Leipzig corpus

Added:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java   (with props)
    incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/LeipzigDoccatSampleStreamTest.java   (with props)
    incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig-en.sample
Modified:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStream.java

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java?rev=1062774&r1=1062773&r2=1062774&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java Mon Jan 24 13:49:59 2011
@@ -46,11 +46,11 @@ public class DocumentSample {
     this.text = Collections.unmodifiableList(new ArrayList<String>(Arrays.asList(text)));
   }
 
-  String getCategory() {
+  public String getCategory() {
     return category;
   }
 
-  String[] getText() {
+  public String[] getText() {
     return text.toArray(new String[text.size()]);
   }
   

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStream.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStream.java?rev=1062774&r1=1062773&r2=1062774&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStream.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStream.java Mon Jan 24 13:49:59 2011
@@ -65,7 +65,7 @@ public class Conll03NameSampleStream imp
 
     this.lang = lang;
     try {
-      this.lineStream = new PlainTextByLineStream(in, "UTF-8");
+      this.lineStream = new PlainTextByLineStream(in, "ISO-8859-1");
     } catch (UnsupportedEncodingException e) {
-      // UTF-8 is available on all JVMs, will never happen
+      // ISO-8859-1 is available on all JVMs, will never happen
       throw new IllegalStateException(e);

Added: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java?rev=1062774&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java Mon Jan 24 13:49:59 2011
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.doccat.DocumentSample;
+import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+/**
+ * Stream filter to produce document samples out of a Leipzig sentences.txt file.
+ * In the Leipzig corpus the encoding of the various sentences.txt files is defined by
+ * the language. The language must be specified to produce the category tags and is used
+ * to determine the correct input encoding.
+ * <p>
+ * The input text is tokenized with the {@link SimpleTokenizer}. The input text classified
+ * by the language model must also be tokenized by the {@link SimpleTokenizer} to produce
+ * exactly the same tokenization during testing and training.
+ */
+public class LeipzigDoccatSampleStream extends
+    FilterObjectStream<String, DocumentSample> {
+  
+  private final String language;
+  private final int sentencesPerDocument;
+
+  /**
+   * Creates a new LeipzigDoccatSampleStream with the specified parameters.
+   * 
+   * @param language the Leipzig language code of the sentences.txt file (e.g. "en"); used as the category and to select the input encoding
+   * @param sentencesPerDocument the number of sentences which should be grouped into one {@link DocumentSample}
+   * @param in the InputStream pointing to the contents of the sentences.txt input file
+   */
+  LeipzigDoccatSampleStream(String language, int sentencesPerDocument, 
+      InputStream in) throws IOException {
+    super(new PlainTextByLineStream(in, mapLanguageToEncoding(language)));
+    this.language = language;
+    this.sentencesPerDocument = sentencesPerDocument;
+  }
+  
+  /**
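+   * Maps the language to the file encoding, if the encoding
+   * cannot be determined an IOException is thrown.
+   * 
+   * @return the name of the encoding used by the sentences.txt file of the specified language
+   * @throws IOException if no encoding is known for the specified language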
+   */
+  private static String mapLanguageToEncoding(String language) throws IOException {
+    
+    if (language == null)
+      throw new NullPointerException("language parameter must not be null!");
+    
+    
+    Map<String, String> encodingMap = new HashMap<String, String>();
+    encodingMap.put("cat", "ISO-8859-1");
+    encodingMap.put("de", "ISO-8859-1");
+    encodingMap.put("dk", "ISO-8859-1");
+    encodingMap.put("ee", "ISO-8859-4");
+    encodingMap.put("en", "ISO-8859-1");
+    encodingMap.put("fi", "ISO-8859-1");
+    encodingMap.put("fr", "ISO-8859-1");
+    encodingMap.put("it", "ISO-8859-1");
+    encodingMap.put("jp", "UTF-8");
+    encodingMap.put("kr", "UTF-8");
+    encodingMap.put("nl", "ISO-8859-1");
+    encodingMap.put("no", "ISO-8859-1");
+    encodingMap.put("se", "ISO-8859-1");
+    encodingMap.put("sorb", "ISO-8859-2");
+    encodingMap.put("tr", "ISO-8859-9");
+    
+    String encoding = encodingMap.get(language);
+    
+    if (encoding != null) {
+      return encoding;
+    }
+    else {
+      throw new IOException("Encoding for language " + language + " is not specified!");
+    }
+  }
+  
+  public DocumentSample read() throws IOException {
+
+    int count = 0;
+
+    StringBuilder sampleText = new StringBuilder();
+
+    String line;
+    while (count < sentencesPerDocument && (line = samples.read()) != null) {
+
+      String tokens[] = SimpleTokenizer.INSTANCE.tokenize(line);
+      
+      if (tokens.length == 0) {
+        throw new IOException("Empty lines are not allowed!");
+      }
+        
+      // Always skip first token, that is the sentence number!
+      for (int i = 1; i < tokens.length; i++) {
+        sampleText.append(tokens[i]);
+        sampleText.append(' ');
+      }
+      
+      count++;
+    }
+
+    
+    if (sampleText.length() > 0) {
+      return new DocumentSample(language, sampleText.toString());
+    }
+  
+    return null;
+  }
+}

Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/LeipzigDoccatSampleStreamTest.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/LeipzigDoccatSampleStreamTest.java?rev=1062774&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/LeipzigDoccatSampleStreamTest.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/LeipzigDoccatSampleStreamTest.java Mon Jan 24 13:49:59 2011
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import opennlp.tools.doccat.DocumentSample;
+import opennlp.tools.util.ObjectStream;
+
+import org.junit.Test;
+
+public class LeipzigDoccatSampleStreamTest {
+
+  @Test
+  public void testParsingSample() throws IOException {
+    InputStream in = LeipzigDoccatSampleStreamTest.class.getResourceAsStream(
+        "/opennlp/tools/formats/leipzig-en.sample");
+    
+    ObjectStream<DocumentSample> sampleStream = 
+        new LeipzigDoccatSampleStream("en", 2, in);
+    
+    DocumentSample doc1 = sampleStream.read();
+    assertEquals("en", doc1.getCategory());
+    
+    DocumentSample doc2 = sampleStream.read();
+    assertEquals("en", doc2.getCategory());
+    
+    DocumentSample doc3 = sampleStream.read();
+    assertEquals("en", doc3.getCategory());
+
+    DocumentSample doc4 = sampleStream.read();
+    assertEquals("en", doc4.getCategory());
+    
+    assertNull(sampleStream.read());
+  }
+}

Propchange: incubator/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/LeipzigDoccatSampleStreamTest.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig-en.sample
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig-en.sample?rev=1062774&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig-en.sample (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig-en.sample Mon Jan 24 13:49:59 2011
@@ -0,0 +1,7 @@
+1	A rebel statement sent to Lisbon from Jamba said 86 government soldiers and 13 guerrillas were killed in the fighting that ended Jan. 3. It said the rebel forces sill held Mavinga.
+2	Authorities last week issued a vacate order for a club in Manhattan and closed another in the Bronx.
+3	At the first Pan Am bankruptcy hearing, for example, at least five airlines were represented.
+4	Mr. Neigum, poker-faced during the difficult task, manages a 46-second showing.
+5	This, combined with the container division talks, suggests the group's bankers might be considering an orderly disposal of all assets.
+6	She told the Post in an interview published Sunday that some of the money may have become "mingled" into improvements on her home that included a swimming pool, a $2,500 wide-screen television and renovations to her basement.
+7	According to a study by the Marshall Institute, the average NASA employee's age in 1963 was 30; now most of its senior and middle-managers will be eligible to retire in five years.