You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jz...@apache.org on 2022/01/19 13:40:47 UTC

[opennlp] branch master updated: OPENNLP-1353: include charset in dictionary lemmatizer (#402)

This is an automated email from the ASF dual-hosted git repository.

jzemerick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/master by this push:
     new dd8fcfd  OPENNLP-1353: include charset in dictionary lemmatizer (#402)
dd8fcfd is described below

commit dd8fcfdd184a3b7397f7c2614294bf303f490f87
Author: Robert Wenig <r....@gmail.com>
AuthorDate: Wed Jan 19 14:40:41 2022 +0100

    OPENNLP-1353: include charset in dictionary lemmatizer (#402)
    
    include charset in dictionary lemmatizer
---
 .../tools/lemmatizer/DictionaryLemmatizer.java     | 35 +++++++++++-----------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
index a942381..bea8dfd 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
@@ -17,18 +17,11 @@
 
 package opennlp.tools.lemmatizer;
 
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
+import java.io.*;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 
 /**
  * Lemmatize by simple dictionary lookup into a hashmap built from a file
@@ -49,16 +42,24 @@ public class DictionaryLemmatizer implements Lemmatizer {
    * Alternatively, if multiple lemmas are possible for each word,postag pair,
    * then the format should be word\tab\postag\tablemma01#lemma02#lemma03
    *
-   * @param dictionary
-   *          the input dictionary via inputstream
+   * @param dictionary the input dictionary via inputstream
+   * @param charset the encoding of the inputstream
    */
+  public DictionaryLemmatizer(final InputStream dictionary, Charset charset) throws IOException {
+    init(dictionary, charset);
+  }
+
   public DictionaryLemmatizer(final InputStream dictionary) throws IOException {
-    init(dictionary);
+    this(dictionary, StandardCharsets.UTF_8);
   }
 
   public DictionaryLemmatizer(File dictionaryFile) throws IOException {
+    this(dictionaryFile, StandardCharsets.UTF_8);
+  }
+
+  public DictionaryLemmatizer(File dictionaryFile, Charset charset) throws IOException {
     try (InputStream in = new FileInputStream(dictionaryFile)) {
-      init(in);
+      init(in, charset);
     }
   }
 
@@ -66,9 +67,9 @@ public class DictionaryLemmatizer implements Lemmatizer {
     this(dictionaryFile.toFile());
   }
 
-  private void init(InputStream dictionary) throws IOException {
+  private void init(InputStream dictionary, Charset charset) throws IOException {
     final BufferedReader breader = new BufferedReader(
-        new InputStreamReader(dictionary));
+        new InputStreamReader(dictionary, charset));
     String line;
     while ((line = breader.readLine()) != null) {
       final String[] elems = line.split("\t");