You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jz...@apache.org on 2022/01/19 13:40:47 UTC
[opennlp] branch master updated: OPENNLP-1353: include charset in dictionary lemmatizer (#402)
This is an automated email from the ASF dual-hosted git repository.
jzemerick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new dd8fcfd OPENNLP-1353: include charset in dictionary lemmatizer (#402)
dd8fcfd is described below
commit dd8fcfdd184a3b7397f7c2614294bf303f490f87
Author: Robert Wenig <r....@gmail.com>
AuthorDate: Wed Jan 19 14:40:41 2022 +0100
OPENNLP-1353: include charset in dictionary lemmatizer (#402)
include charset in dictionary lemmatizer
---
.../tools/lemmatizer/DictionaryLemmatizer.java | 35 +++++++++++-----------
1 file changed, 18 insertions(+), 17 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
index a942381..bea8dfd 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
@@ -17,18 +17,11 @@
package opennlp.tools.lemmatizer;
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
+import java.io.*;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
/**
* Lemmatize by simple dictionary lookup into a hashmap built from a file
@@ -49,16 +42,24 @@ public class DictionaryLemmatizer implements Lemmatizer {
* Alternatively, if multiple lemmas are possible for each word,postag pair,
* then the format should be word\tab\postag\tablemma01#lemma02#lemma03
*
- * @param dictionary
- * the input dictionary via inputstream
+ * @param dictionary the input dictionary via inputstream
+ * @param charset the encoding of the inputstream
*/
+ public DictionaryLemmatizer(final InputStream dictionary, Charset charset) throws IOException {
+ init(dictionary, charset);
+ }
+
public DictionaryLemmatizer(final InputStream dictionary) throws IOException {
- init(dictionary);
+ this(dictionary, StandardCharsets.UTF_8);
}
public DictionaryLemmatizer(File dictionaryFile) throws IOException {
+ this(dictionaryFile, StandardCharsets.UTF_8);
+ }
+
+ public DictionaryLemmatizer(File dictionaryFile, Charset charset) throws IOException {
try (InputStream in = new FileInputStream(dictionaryFile)) {
- init(in);
+ init(in, charset);
}
}
@@ -66,9 +67,9 @@ public class DictionaryLemmatizer implements Lemmatizer {
this(dictionaryFile.toFile());
}
- private void init(InputStream dictionary) throws IOException {
+ private void init(InputStream dictionary, Charset charset) throws IOException {
final BufferedReader breader = new BufferedReader(
- new InputStreamReader(dictionary));
+ new InputStreamReader(dictionary, charset));
String line;
while ((line = breader.readLine()) != null) {
final String[] elems = line.split("\t");