You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2012/07/13 05:56:21 UTC

svn commit: r1361039 - in /opennlp/trunk/opennlp-tools/src: main/java/opennlp/tools/formats/ad/ADNameSampleStream.java main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java test/java/opennlp/tools/formats/ad/ADNameSampleStreamTest.java

Author: colen
Date: Fri Jul 13 03:56:21 2012
New Revision: 1361039

URL: http://svn.apache.org/viewvc?rev=1361039&view=rev
Log:
OPENNLP-481: Some applications would benefit from being able to choose whether hyphenated tokens are split at the hyphen. This is now configurable.

Modified:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java
    opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADNameSampleStreamTest.java

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java?rev=1361039&r1=1361038&r2=1361039&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java Fri Jul 13 03:56:21 2012
@@ -156,6 +156,8 @@ public class ADNameSampleStream implemen
    * To keep the last left contraction part
    */
   private String leftContractionPart = null;
+
+  private final boolean splitHyphenatedTokens;
   
   /**
    * Creates a new {@link NameSample} stream from a line stream, i.e.
@@ -164,9 +166,13 @@ public class ADNameSampleStream implemen
    * 
    * @param lineStream
    *          a stream of lines as {@link String}
+   * @param splitHyphenatedTokens
+   *          if true hyphenated tokens will be separated: "carros-monstro" >
+   *          "carros" "-" "monstro"
    */
-  public ADNameSampleStream(ObjectStream<String> lineStream) {
+  public ADNameSampleStream(ObjectStream<String> lineStream, boolean splitHyphenatedTokens) {
     this.adSentenceStream = new ADSentenceStream(lineStream);
+    this.splitHyphenatedTokens = splitHyphenatedTokens;
   }
 
   /**
@@ -176,12 +182,17 @@ public class ADNameSampleStream implemen
    *          the Corpus {@link InputStream}
    * @param charsetName
    *          the charset of the Arvores Deitadas Corpus
+   * @param splitHyphenatedTokens
+   *          if true hyphenated tokens will be separated: "carros-monstro" >
+   *          "carros" "-" "monstro"
    */
-  public ADNameSampleStream(InputStream in, String charsetName) {
+  public ADNameSampleStream(InputStream in, String charsetName,
+      boolean splitHyphenatedTokens) {
 
     try {
       this.adSentenceStream = new ADSentenceStream(new PlainTextByLineStream(
           in, charsetName));
+      this.splitHyphenatedTokens = splitHyphenatedTokens;
     } catch (UnsupportedEncodingException e) {
       // UTF-8 is available on all JVMs, will never happen
       throw new IllegalStateException(e);
@@ -367,7 +378,7 @@ public class ADNameSampleStream implemen
     }
     
     // lets split all hyphens
-    if (tok.contains("-") && tok.length() > 1) {
+    if (this.splitHyphenatedTokens && tok.contains("-") && tok.length() > 1) {
       Matcher matcher = hyphenPattern.matcher(tok);
 
       String firstTok = null;

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java?rev=1361039&r1=1361038&r2=1361039&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java Fri Jul 13 03:56:21 2012
@@ -22,6 +22,7 @@ import java.io.FileInputStream;
 import java.nio.charset.Charset;
 
 import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
 import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
 import opennlp.tools.cmdline.CmdLineUtil;
 import opennlp.tools.cmdline.StreamFactoryRegistry;
@@ -47,6 +48,10 @@ public class ADNameSampleStreamFactory e
 
     @ParameterDescription(valueName = "sampleData", description = "data to be used, usually a file name.")
     File getData();
+    
+    @ParameterDescription(valueName = "split", description = "if true all hyphenated tokens will be separated (default true)")
+    @OptionalParameter(defaultValue = "true")
+    Boolean getSplitHyphenatedTokens();
 
     @ParameterDescription(valueName = "language", description = "language which is being processed.")
     String getLang();
@@ -72,6 +77,6 @@ public class ADNameSampleStreamFactory e
     ObjectStream<String> lineStream = new PlainTextByLineStream(
         sampleDataIn.getChannel(), params.getEncoding());
 
-    return new ADNameSampleStream(lineStream);
+    return new ADNameSampleStream(lineStream, params.getSplitHyphenatedTokens());
   }
 }

Modified: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADNameSampleStreamTest.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADNameSampleStreamTest.java?rev=1361039&r1=1361038&r2=1361039&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADNameSampleStreamTest.java (original)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADNameSampleStreamTest.java Fri Jul 13 03:56:21 2012
@@ -105,7 +105,7 @@ public class ADNameSampleStreamTest {
         .getResourceAsStream("/opennlp/tools/formats/ad.sample");
 
     ADNameSampleStream stream = new ADNameSampleStream(
-        new PlainTextByLineStream(in, "UTF-8"));
+        new PlainTextByLineStream(in, "UTF-8"), true);
 
     NameSample sample = stream.read();