You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2012/03/23 03:51:40 UTC
svn commit: r1304169 - in /opennlp/trunk/opennlp-tools/src:
main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java
test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java
test/resources/opennlp/tools/formats/ad.sample
Author: colen
Date: Fri Mar 23 02:51:40 2012
New Revision: 1304169
URL: http://svn.apache.org/viewvc?rev=1304169&view=rev
Log:
OPENNLP-481: ADTokenSampleStream now uses a customized DictionaryDetokenizer that handles hyphens
Added:
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java (with props)
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java
opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/ad.sample
Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java?rev=1304169&r1=1304168&r2=1304169&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java Fri Mar 23 02:51:40 2012
@@ -17,12 +17,21 @@
package opennlp.tools.formats.ad;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.regex.Pattern;
+
import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
import opennlp.tools.cmdline.params.DetokenizerParameter;
import opennlp.tools.formats.DetokenizerSampleStreamFactory;
import opennlp.tools.formats.NameToTokenSampleStream;
import opennlp.tools.namefind.NameSample;
+import opennlp.tools.tokenize.DetokenizationDictionary;
+import opennlp.tools.tokenize.Detokenizer;
+import opennlp.tools.tokenize.DictionaryDetokenizer;
import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.util.ObjectStream;
@@ -56,4 +65,45 @@ public class ADTokenSampleStreamFactory
ADNameSampleStreamFactory.Parameters.class));
return new NameToTokenSampleStream(createDetokenizer(params), samples);
}
+
+ /**
+  * Creates the detokenizer used when converting name samples to token
+  * samples: an {@link ADDictionaryDetokenizer} backed by the detokenization
+  * dictionary file named by the given parameter.
+  *
+  * @param p supplies the path of the detokenizer dictionary file
+  * @return a detokenizer built from that dictionary
+  * @throws TerminateToolException (exit code -1) if the dictionary file
+  *           cannot be opened or read
+  */
+ protected Detokenizer createDetokenizer(DetokenizerParameter p) {
+   try {
+     FileInputStream dictIn = new FileInputStream(new File(p.getDetokenizer()));
+     try {
+       // NOTE(review): assumes DetokenizationDictionary consumes the stream
+       // inside its constructor, so it is safe to close right after - confirm.
+       return new ADDictionaryDetokenizer(new DetokenizationDictionary(dictIn));
+     } finally {
+       // Always release the file handle, whether or not loading succeeded.
+       dictIn.close();
+     }
+   } catch (IOException e) {
+     throw new TerminateToolException(-1,
+         "IO error while loading detokenizer dict: " + e.getMessage());
+   }
+ }
+
+ /**
+  * A {@link DictionaryDetokenizer} that post-processes the dictionary result:
+  * any token left with {@code NO_OPERATION} whose text ends in a letter
+  * followed by a hyphen is marked {@code MERGE_TO_RIGHT} instead.
+  */
+ static class ADDictionaryDetokenizer extends DictionaryDetokenizer {
+
+   /** Matches a whole token whose last two characters are a letter and '-'. */
+   private static final Pattern hyphenPattern = Pattern
+       .compile(".*?[\\p{L}]-$");
+
+   public ADDictionaryDetokenizer(DetokenizationDictionary dict) {
+     super(dict);
+   }
+
+   /**
+    * Runs the dictionary-based detokenization, then upgrades untouched
+    * hyphen-terminated tokens to {@code MERGE_TO_RIGHT}.
+    *
+    * @param tokens the tokens to detokenize
+    * @return one operation per token, in token order
+    */
+   @Override
+   public DetokenizationOperation[] detokenize(String[] tokens) {
+     DetokenizationOperation[] result = super.detokenize(tokens);
+     int index = 0;
+     for (String token : tokens) {
+       // Only override tokens the dictionary had no opinion about.
+       boolean untouched = result[index]
+           .equals(DetokenizationOperation.NO_OPERATION);
+       if (untouched && isMergeToRight(token)) {
+         result[index] = DetokenizationOperation.MERGE_TO_RIGHT;
+       }
+       index++;
+     }
+     return result;
+   }
+
+   /**
+    * Tells whether the token should be glued to the token on its right.
+    *
+    * @param token the token text, may be {@code null}
+    * @return {@code true} if the token is non-null and matches the hyphen
+    *         pattern, {@code false} otherwise
+    */
+   private boolean isMergeToRight(String token) {
+     return token != null && hyphenPattern.matcher(token).matches();
+   }
+ }
}
Added: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java?rev=1304169&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java (added)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java Fri Mar 23 02:51:40 2012
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.ad;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.ObjectStream;
+
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests the token sample stream produced by {@link ADTokenSampleStreamFactory}
+ * for the bundled <code>ad.sample</code> resource, detokenized with the
+ * <code>latin-detokenizer.xml</code> dictionary.
+ */
+public class ADTokenSampleStreamTest {
+
+  /** All samples read from the stream; refilled by {@link #setup()}. */
+  List<TokenSample> samples = new ArrayList<TokenSample>();
+
+  /** Checks the total number of samples read from the sample file. */
+  @Test
+  public void testSimpleCount() throws IOException {
+    assertEquals(6, samples.size()); // means that there are 3 documents
+  }
+
+  /** Checks that a hyphenated form is kept together in the sample text. */
+  @Test
+  public void testSentences() throws IOException {
+    assertTrue(samples.get(5).getText().contains("ofereceu-me"));
+  }
+
+  /**
+   * Builds the stream through the factory's command line style arguments and
+   * reads every sample into {@link #samples}.
+   */
+  @Before
+  public void setup() throws IOException, URISyntaxException {
+    ADTokenSampleStreamFactory factory = new ADTokenSampleStreamFactory(
+        ADTokenSampleStreamFactory.Parameters.class);
+
+    File dict = new File(getClass().getClassLoader()
+        .getResource("opennlp/tools/tokenize/latin-detokenizer.xml").toURI());
+    File data = new File(getClass().getClassLoader()
+        .getResource("opennlp/tools/formats/ad.sample").toURI());
+    String[] args = { "-data", data.getCanonicalPath(), "-encoding", "UTF-8",
+        "-lang", "pt", "-detokenizer", dict.getCanonicalPath() };
+    ObjectStream<TokenSample> tokenSampleStream = factory.create(args);
+
+    try {
+      TokenSample sample = tokenSampleStream.read();
+
+      while (sample != null) {
+        samples.add(sample);
+        sample = tokenSampleStream.read();
+      }
+    } finally {
+      // Release the underlying data file even if reading fails part-way.
+      tokenSampleStream.close();
+    }
+
+  }
+
+}
Propchange: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADTokenSampleStreamTest.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified: opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/ad.sample
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/ad.sample?rev=1304169&r1=1304168&r2=1304169&view=diff
==============================================================================
Binary files - no diff available.