You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2012/03/23 03:51:40 UTC

svn commit: r1304169 - in /opennlp/trunk/opennlp-tools/src: main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java test/resources/opennlp/tools/formats/ad.sample

Author: colen
Date: Fri Mar 23 02:51:40 2012
New Revision: 1304169

URL: http://svn.apache.org/viewvc?rev=1304169&view=rev
Log:
OPENNLP-481: ADTokenSampleStream now uses a customized DictionaryDetokenizer that handles hyphens 

Added:
    opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java   (with props)
Modified:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java
    opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/ad.sample

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java?rev=1304169&r1=1304168&r2=1304169&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java Fri Mar 23 02:51:40 2012
@@ -17,12 +17,21 @@
 
 package opennlp.tools.formats.ad;
 
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.regex.Pattern;
+
 import opennlp.tools.cmdline.ArgumentParser;
 import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
 import opennlp.tools.cmdline.params.DetokenizerParameter;
 import opennlp.tools.formats.DetokenizerSampleStreamFactory;
 import opennlp.tools.formats.NameToTokenSampleStream;
 import opennlp.tools.namefind.NameSample;
+import opennlp.tools.tokenize.DetokenizationDictionary;
+import opennlp.tools.tokenize.Detokenizer;
+import opennlp.tools.tokenize.DictionaryDetokenizer;
 import opennlp.tools.tokenize.TokenSample;
 import opennlp.tools.util.ObjectStream;
 
@@ -56,4 +65,71 @@ public class ADTokenSampleStreamFactory 
                 ADNameSampleStreamFactory.Parameters.class));
     return new NameToTokenSampleStream(createDetokenizer(params), samples);
   }
+
+  /**
+   * Creates the detokenizer handed to the {@link NameToTokenSampleStream}:
+   * an {@link ADDictionaryDetokenizer} backed by the dictionary file whose
+   * path is given by {@code p.getDetokenizer()}.
+   *
+   * @param p the parameters that name the detokenizer dictionary file
+   * @return the dictionary based detokenizer with the extra hyphen rule
+   * @throws TerminateToolException if the dictionary cannot be read
+   */
+  protected Detokenizer createDetokenizer(DetokenizerParameter p) {
+    FileInputStream dictIn = null;
+    try {
+      dictIn = new FileInputStream(new File(p.getDetokenizer()));
+      return new ADDictionaryDetokenizer(new DetokenizationDictionary(dictIn));
+    } catch (IOException e) {
+      throw new TerminateToolException(-1,
+          "IO error while loading detokenizer dict: " + e.getMessage());
+    } finally {
+      // Always release the file handle, whether or not loading succeeded.
+      if (dictIn != null) {
+        try {
+          dictIn.close();
+        } catch (IOException ignored) {
+          // A failure to close must not mask the load result or error.
+        }
+      }
+    }
+  }
+
+  /**
+   * A {@link DictionaryDetokenizer} that, on top of the dictionary rules,
+   * merges a token ending in a letter followed by a hyphen (for example
+   * "ofereceu-") with the token to its right.
+   */
+  static class ADDictionaryDetokenizer extends DictionaryDetokenizer {
+
+    /** Matches a token that ends with a letter followed by a hyphen. */
+    private static final Pattern hyphenPattern = Pattern
+        .compile(".*?[\\p{L}]-$");
+
+    public ADDictionaryDetokenizer(DetokenizationDictionary dict) {
+      super(dict);
+    }
+
+    /**
+     * Applies the dictionary based detokenization, then changes every
+     * token still marked {@code NO_OPERATION} to {@code MERGE_TO_RIGHT}
+     * if it ends with a letter followed by a hyphen.
+     */
+    @Override
+    public DetokenizationOperation[] detokenize(String[] tokens) {
+      DetokenizationOperation[] operations = super.detokenize(tokens);
+      for (int i = 0; i < tokens.length; i++) {
+        if (operations[i].equals(DetokenizationOperation.NO_OPERATION)
+            && isMergeToRight(tokens[i])) {
+          operations[i] = DetokenizationOperation.MERGE_TO_RIGHT;
+        }
+      }
+      return operations;
+    }
+
+    /** Tells whether the token (may be null) matches the hyphen pattern. */
+    private boolean isMergeToRight(String token) {
+      return token != null && hyphenPattern.matcher(token).matches();
+    }
+  }
 }

Added: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java?rev=1304169&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java (added)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java Fri Mar 23 02:51:40 2012
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.ad;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.ObjectStream;
+
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Checks that the token samples produced by
+ * {@link ADTokenSampleStreamFactory} from the bundled ad.sample resource
+ * keep hyphenated forms together.
+ */
+public class ADTokenSampleStreamTest {
+
+  private final List<TokenSample> samples = new ArrayList<TokenSample>();
+
+  /**
+   * Reads every sample of the test corpus into {@link #samples} before each
+   * test, closing the sample stream afterwards.
+   */
+  @Before
+  public void setup() throws IOException, URISyntaxException {
+    ADTokenSampleStreamFactory factory = new ADTokenSampleStreamFactory(
+        ADTokenSampleStreamFactory.Parameters.class);
+
+    File dict = new File(getClass().getClassLoader()
+        .getResource("opennlp/tools/tokenize/latin-detokenizer.xml").toURI());
+    File data = new File(getClass().getClassLoader()
+        .getResource("opennlp/tools/formats/ad.sample").toURI());
+    String[] args = { "-data", data.getCanonicalPath(), "-encoding", "UTF-8",
+        "-lang", "pt", "-detokenizer", dict.getCanonicalPath() };
+    ObjectStream<TokenSample> tokenSampleStream = factory.create(args);
+
+    try {
+      TokenSample sample = tokenSampleStream.read();
+      while (sample != null) {
+        samples.add(sample);
+        sample = tokenSampleStream.read();
+      }
+    } finally {
+      // Release whatever resources the stream holds, even if reading fails.
+      tokenSampleStream.close();
+    }
+  }
+
+  @Test
+  public void testSimpleCount() throws IOException {
+    assertEquals(6, samples.size()); // means that there are 3 documents
+  }
+
+  @Test
+  public void testSentences() throws IOException {
+    assertTrue(samples.get(5).getText().contains("ofereceu-me"));
+  }
+
+}

Propchange: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/ad.sample
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/ad.sample?rev=1304169&r1=1304168&r2=1304169&view=diff
==============================================================================
Binary files - no diff available.