You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2012/07/13 05:56:21 UTC
svn commit: r1361039 - in /opennlp/trunk/opennlp-tools/src:
main/java/opennlp/tools/formats/ad/ADNameSampleStream.java
main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java
test/java/opennlp/tools/formats/ad/ADNameSampleStreamTest.java
Author: colen
Date: Fri Jul 13 03:56:21 2012
New Revision: 1361039
URL: http://svn.apache.org/viewvc?rev=1361039&view=rev
Log:
OPENNLP-481: Some applications would benefit from having the option of choosing whether or not tokens are split at hyphens. Now it is configurable.
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADNameSampleStreamTest.java
Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java?rev=1361039&r1=1361038&r2=1361039&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java Fri Jul 13 03:56:21 2012
@@ -156,6 +156,8 @@ public class ADNameSampleStream implemen
* To keep the last left contraction part
*/
private String leftContractionPart = null;
+
+ private final boolean splitHyphenatedTokens;
/**
* Creates a new {@link NameSample} stream from a line stream, i.e.
@@ -164,9 +166,13 @@ public class ADNameSampleStream implemen
*
* @param lineStream
* a stream of lines as {@link String}
+ * @param splitHyphenatedTokens
+ * if true hyphenated tokens will be separated: "carros-monstro" >
+ * "carros" "-" "monstro"
*/
- public ADNameSampleStream(ObjectStream<String> lineStream) {
+ public ADNameSampleStream(ObjectStream<String> lineStream, boolean splitHyphenatedTokens) {
this.adSentenceStream = new ADSentenceStream(lineStream);
+ this.splitHyphenatedTokens = splitHyphenatedTokens;
}
/**
@@ -176,12 +182,17 @@ public class ADNameSampleStream implemen
* the Corpus {@link InputStream}
* @param charsetName
* the charset of the Arvores Deitadas Corpus
+ * @param splitHyphenatedTokens
+ * if true hyphenated tokens will be separated: "carros-monstro" >
+ * "carros" "-" "monstro"
*/
- public ADNameSampleStream(InputStream in, String charsetName) {
+ public ADNameSampleStream(InputStream in, String charsetName,
+ boolean splitHyphenatedTokens) {
try {
this.adSentenceStream = new ADSentenceStream(new PlainTextByLineStream(
in, charsetName));
+ this.splitHyphenatedTokens = splitHyphenatedTokens;
} catch (UnsupportedEncodingException e) {
// UTF-8 is available on all JVMs, will never happen
throw new IllegalStateException(e);
@@ -367,7 +378,7 @@ public class ADNameSampleStream implemen
}
// lets split all hyphens
- if (tok.contains("-") && tok.length() > 1) {
+ if (this.splitHyphenatedTokens && tok.contains("-") && tok.length() > 1) {
Matcher matcher = hyphenPattern.matcher(tok);
String firstTok = null;
Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java?rev=1361039&r1=1361038&r2=1361039&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java Fri Jul 13 03:56:21 2012
@@ -22,6 +22,7 @@ import java.io.FileInputStream;
import java.nio.charset.Charset;
import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
@@ -47,6 +48,10 @@ public class ADNameSampleStreamFactory e
@ParameterDescription(valueName = "sampleData", description = "data to be used, usually a file name.")
File getData();
+
+ @ParameterDescription(valueName = "split", description = "if true all hyphenated tokens will be separated (default true)")
+ @OptionalParameter(defaultValue = "true")
+ Boolean getSplitHyphenatedTokens();
@ParameterDescription(valueName = "language", description = "language which is being processed.")
String getLang();
@@ -72,6 +77,6 @@ public class ADNameSampleStreamFactory e
ObjectStream<String> lineStream = new PlainTextByLineStream(
sampleDataIn.getChannel(), params.getEncoding());
- return new ADNameSampleStream(lineStream);
+ return new ADNameSampleStream(lineStream, params.getSplitHyphenatedTokens());
}
}
Modified: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADNameSampleStreamTest.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADNameSampleStreamTest.java?rev=1361039&r1=1361038&r2=1361039&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADNameSampleStreamTest.java (original)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/ad/ADNameSampleStreamTest.java Fri Jul 13 03:56:21 2012
@@ -105,7 +105,7 @@ public class ADNameSampleStreamTest {
.getResourceAsStream("/opennlp/tools/formats/ad.sample");
ADNameSampleStream stream = new ADNameSampleStream(
- new PlainTextByLineStream(in, "UTF-8"));
+ new PlainTextByLineStream(in, "UTF-8"), true);
NameSample sample = stream.read();