Posted to commits@lucene.apache.org by na...@apache.org on 2020/02/24 17:44:35 UTC

[lucene-solr] branch branch_8x updated: Revert "LUCENE-8954: refactor Nori analyzer"

This is an automated email from the ASF dual-hosted git repository.

namgyu pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new 8037234  Revert "LUCENE-8954: refactor Nori analyzer"
8037234 is described below

commit 80372341426344f7d89a36adefbd178fb0e2548a
Author: Namgyu Kim <kn...@gmail.com>
AuthorDate: Tue Feb 25 02:44:23 2020 +0900

    Revert "LUCENE-8954: refactor Nori analyzer"
    
    This reverts commit 29b7e1a95c3a8857ef8ce05c0679c66e04b1f3e0.
---
 .../lucene/analysis/ko/GraphvizFormatter.java      | 19 +++++-----
 .../apache/lucene/analysis/ko/KoreanAnalyzer.java  |  3 +-
 .../lucene/analysis/ko/KoreanNumberFilter.java     |  9 +++--
 .../analysis/ko/KoreanPartOfSpeechStopFilter.java  |  5 ++-
 .../apache/lucene/analysis/ko/KoreanTokenizer.java | 32 ++++++++++-------
 .../lucene/analysis/ko/dict/BinaryDictionary.java  | 18 ++++++----
 .../analysis/ko/dict/CharacterDefinition.java      |  8 +++--
 .../lucene/analysis/ko/dict/ConnectionCosts.java   |  2 +-
 .../lucene/analysis/ko/dict/TokenInfoFST.java      |  4 +--
 .../lucene/analysis/ko/dict/UserDictionary.java    | 16 ++++-----
 .../analysis/ko/util/BinaryDictionaryWriter.java   |  4 +--
 .../apache/lucene/analysis/ko/util/CSVUtil.java    |  4 +--
 .../ko/util/TokenInfoDictionaryBuilder.java        |  2 +-
 .../analysis/ko/StringMockResourceLoader.java      |  2 +-
 .../lucene/analysis/ko/TestKoreanAnalyzer.java     |  4 +--
 .../analysis/ko/TestKoreanNumberFilterFactory.java | 12 +++----
 .../TestKoreanPartOfSpeechStopFilterFactory.java   | 16 ++++-----
 .../ko/TestKoreanReadingFormFilterFactory.java     | 16 ++++-----
 .../analysis/ko/TestKoreanTokenizerFactory.java    | 40 +++++++++++-----------
 .../analysis/ko/dict/TokenInfoDictionaryTest.java  |  6 ++--
 .../analysis/ko/dict/UserDictionaryTest.java       |  5 ++-
 21 files changed, 124 insertions(+), 103 deletions(-)

diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/GraphvizFormatter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/GraphvizFormatter.java
index a6c0e4b..9feb354 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/GraphvizFormatter.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/GraphvizFormatter.java
@@ -157,14 +157,17 @@ public class GraphvizFormatter {
   }
   
   private String formatHeader() {
-    return "digraph viterbi {\n" +
-        "  graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\"];\n" +
-        //sb.append("  // A2 paper size\n");
-        //sb.append("  size = \"34.4,16.5\";\n");
-        //sb.append("  // try to fill paper\n");
-        //sb.append("  ratio = fill;\n");
-        "  edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n" +
-        "  node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n";
+    StringBuilder sb = new StringBuilder();
+    sb.append("digraph viterbi {\n");
+    sb.append("  graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\"];\n");
+    //sb.append("  // A2 paper size\n");
+    //sb.append("  size = \"34.4,16.5\";\n");
+    //sb.append("  // try to fill paper\n");
+    //sb.append("  ratio = fill;\n");
+    sb.append("  edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
+    sb.append("  node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n");
+    
+    return sb.toString();
   }
   
   private String formatTrailer() {
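
Aside on this first hunk: the refactored version returned one concatenated string, with the old sb.append(...) lines left as comments between the + operands (legal, if unusual, Java); the revert restores the StringBuilder form. The two produce identical DOT output, as this minimal sketch shows (FONT_NAME is stubbed here; in GraphvizFormatter it is a class constant):

    // Both header-building styles yield the same Graphviz header text.
    String FONT_NAME = "Helvetica"; // stub for the real constant
    String concat = "digraph viterbi {\n" +
        "  edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" ]\n";
    StringBuilder sb = new StringBuilder();
    sb.append("digraph viterbi {\n");
    sb.append("  edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" ]\n");
    assert concat.equals(sb.toString());
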
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java
index 3f8769f..b0c26aa 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanAnalyzer.java
@@ -74,6 +74,7 @@ public class KoreanAnalyzer extends Analyzer {
 
   @Override
   protected TokenStream normalize(String fieldName, TokenStream in) {
-    return new LowerCaseFilter(in);
+    TokenStream result = new LowerCaseFilter(in);
+    return result;
   }
 }
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanNumberFilter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanNumberFilter.java
index 732aeb3..a953a21 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanNumberFilter.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanNumberFilter.java
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.ko;
 
 import java.io.IOException;
 import java.math.BigDecimal;
-import java.util.Arrays;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -99,7 +98,9 @@ public class KoreanNumberFilter extends TokenFilter {
 
   static {
     numerals = new char[0x10000];
-    Arrays.fill(numerals, NO_NUMERAL);
+    for (int i = 0; i < numerals.length; i++) {
+      numerals[i] = NO_NUMERAL;
+    }
     numerals['영'] = 0; // 영 U+C601 0
     numerals['일'] = 1; // 일 U+C77C 1
     numerals['이'] = 2; // 이 U+C774 2
@@ -112,7 +113,9 @@ public class KoreanNumberFilter extends TokenFilter {
     numerals['구'] = 9; // 구 U+AD6C 9
 
     exponents = new char[0x10000];
-    Arrays.fill(exponents, (char) 0);
+    for (int i = 0; i < exponents.length; i++) {
+      exponents[i] = 0;
+    }
     exponents['십'] = 1;  // 십 U+C2ED 10
     exponents['백'] = 2;  // 백 U+BC31 100
     exponents['천'] = 3;  // 천 U+CC9C 1,000
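
The two hunks above replace java.util.Arrays.fill with explicit loops when initializing the 64K-entry numeral and exponent lookup tables; the variants are equivalent. A standalone sketch (the sentinel is stubbed, since KoreanNumberFilter defines its own NO_NUMERAL elsewhere):

    import java.util.Arrays;

    public class FillEquivalence {
      public static void main(String[] args) {
        final char NO_NUMERAL = Character.MAX_VALUE; // stub for the filter's sentinel
        char[] viaFill = new char[0x10000];
        char[] viaLoop = new char[0x10000];
        Arrays.fill(viaFill, NO_NUMERAL);           // refactored form
        for (int i = 0; i < viaLoop.length; i++) {  // reverted form
          viaLoop[i] = NO_NUMERAL;
        }
        System.out.println(Arrays.equals(viaFill, viaLoop)); // true
      }
    }
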
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanPartOfSpeechStopFilter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanPartOfSpeechStopFilter.java
index a5ab4d8..4fa7524 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanPartOfSpeechStopFilter.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanPartOfSpeechStopFilter.java
@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.ko;
 
 
 import java.util.Arrays;
-import java.util.HashSet;
 import java.util.Set;
 import java.util.stream.Collectors;
 
@@ -37,7 +36,7 @@ public final class KoreanPartOfSpeechStopFilter extends FilteringTokenFilter {
   /**
    * Default list of tags to filter.
    */
-  public static final Set<POS.Tag> DEFAULT_STOP_TAGS = new HashSet<>(Arrays.asList(
+  public static final Set<POS.Tag> DEFAULT_STOP_TAGS = Arrays.asList(
       POS.Tag.E,
       POS.Tag.IC,
       POS.Tag.J,
@@ -56,7 +55,7 @@ public final class KoreanPartOfSpeechStopFilter extends FilteringTokenFilter {
       POS.Tag.UNA,
       POS.Tag.NA,
       POS.Tag.VSV
-  ));
+  ).stream().collect(Collectors.toSet());
 
   /**
    * Create a new {@link KoreanPartOfSpeechStopFilter} with the default
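
Both versions of DEFAULT_STOP_TAGS build the same mutable Set from the same tag list; only the construction route changes. A standalone sketch with String stand-ins for the POS.Tag constants:

    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.Set;
    import java.util.stream.Collectors;

    public class SetBuildEquivalence {
      public static void main(String[] args) {
        Set<String> viaCtor = new HashSet<>(Arrays.asList("E", "IC", "J")); // refactored form
        Set<String> viaStream = Arrays.asList("E", "IC", "J").stream()
            .collect(Collectors.toSet());                                   // reverted form
        System.out.println(viaCtor.equals(viaStream)); // true
      }
    }
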
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
index 3efccaf..b408aa7 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanTokenizer.java
@@ -340,7 +340,7 @@ public final class KoreanTokenizer extends Tokenizer {
 
   }
 
-  private void add(Dictionary dict, Position fromPosData, int wordPos, int endPos, int wordID, Type type) {
+  private void add(Dictionary dict, Position fromPosData, int wordPos, int endPos, int wordID, Type type) throws IOException {
     final POS.Tag leftPOS = dict.getLeftPOS(wordID);
     final int wordCost = dict.getWordCost(wordID);
     final int leftID = dict.getLeftId(wordID);
@@ -533,9 +533,15 @@ public final class KoreanTokenizer extends Tokenizer {
     int userWordMaxPosAhead = -1;
 
     // Advances over each position (character):
-    while (buffer.get(pos) != -1) {
+    while (true) {
+
+      if (buffer.get(pos) == -1) {
+        // End
+        break;
+      }
+
       final Position posData = positions.get(pos);
-      final boolean isFrontier = positions.getNextPos() == pos + 1;
+      final boolean isFrontier = positions.getNextPos() == pos+1;
 
       if (posData.count == 0) {
         // No arcs arrive here; move to next position:
@@ -579,9 +585,9 @@ public final class KoreanTokenizer extends Tokenizer {
         int leastIDX = -1;
         int leastCost = Integer.MAX_VALUE;
         Position leastPosData = null;
-        for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
+        for(int pos2=pos;pos2<positions.getNextPos();pos2++) {
           final Position posData2 = positions.get(pos2);
-          for (int idx = 0; idx < posData2.count; idx++) {
+          for(int idx=0;idx<posData2.count;idx++) {
             //System.out.println("    idx=" + idx + " cost=" + cost);
             final int cost = posData2.costs[idx];
             if (cost < leastCost) {
@@ -596,7 +602,7 @@ public final class KoreanTokenizer extends Tokenizer {
         assert leastIDX != -1;
 
         // Second pass: prune all but the best path:
-        for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
+        for(int pos2=pos;pos2<positions.getNextPos();pos2++) {
           final Position posData2 = positions.get(pos2);
           if (posData2 != leastPosData) {
             posData2.reset();
@@ -649,7 +655,7 @@ public final class KoreanTokenizer extends Tokenizer {
       if (Character.getType(buffer.get(pos)) == Character.SPACE_SEPARATOR) {
         int nextChar = buffer.get(++pos);
         while (nextChar != -1 && Character.getType(nextChar) == Character.SPACE_SEPARATOR) {
-          pos++;
+          pos ++;
           nextChar = buffer.get(pos);
         }
       }
@@ -667,7 +673,7 @@ public final class KoreanTokenizer extends Tokenizer {
         int outputMaxPosAhead = 0;
         int arcFinalOutMaxPosAhead = 0;
 
-        for (int posAhead = pos; ; posAhead++) {
+        for(int posAhead=pos;;posAhead++) {
           final int ch = buffer.get(posAhead);
           if (ch == -1) {
             break;
@@ -689,9 +695,9 @@ public final class KoreanTokenizer extends Tokenizer {
           if (VERBOSE) {
             System.out.println("    USER word " + new String(buffer.get(pos, maxPosAhead + 1)) + " toPos=" + (maxPosAhead + 1));
           }
-          add(userDictionary, posData, pos, maxPosAhead + 1, outputMaxPosAhead + arcFinalOutMaxPosAhead, Type.USER);
+          add(userDictionary, posData, pos, maxPosAhead+1, outputMaxPosAhead+arcFinalOutMaxPosAhead, Type.USER);
           userWordMaxPosAhead = Math.max(userWordMaxPosAhead, maxPosAhead);
-        }
+        } 
       }
 
       // TODO: we can be more aggressive about user
@@ -703,7 +709,7 @@ public final class KoreanTokenizer extends Tokenizer {
         fst.getFirstArc(arc);
         int output = 0;
 
-        for (int posAhead = pos; ; posAhead++) {
+        for(int posAhead=pos;;posAhead++) {
           final int ch = buffer.get(posAhead);
           if (ch == -1) {
             break;
@@ -728,7 +734,7 @@ public final class KoreanTokenizer extends Tokenizer {
               System.out.println("    KNOWN word " + new String(buffer.get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1) + " " + wordIdRef.length + " wordIDs");
             }
             for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
-              add(dictionary, posData, pos, posAhead + 1, wordIdRef.ints[wordIdRef.offset + ofs], Type.KNOWN);
+              add(dictionary, posData, pos, posAhead+1, wordIdRef.ints[wordIdRef.offset + ofs], Type.KNOWN);
               anyMatches = true;
             }
           }
@@ -752,7 +758,7 @@ public final class KoreanTokenizer extends Tokenizer {
         } else {
           // Extract unknown word. Characters with the same script are considered to be part of unknown word
           unknownWordLength = 1;
-          UnicodeScript scriptCode = UnicodeScript.of(firstCharacter);
+          UnicodeScript scriptCode = UnicodeScript.of((int) firstCharacter);
           final boolean isPunct = isPunctuation(firstCharacter);
           final boolean isDigit = Character.isDigit(firstCharacter);
           for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) {
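
Most of the KoreanTokenizer hunks are formatting-only (the revert restores the compact for(...;...;...) style), but they sit inside the tokenizer's Viterbi pruning: when too many lattice positions are in flight, a first pass finds the cheapest surviving candidate and a second pass resets every other position. A minimal sketch of that two-pass idiom, with a plain int[] standing in for the per-position cost arrays:

    // First pass: find the least-cost candidate.
    int[] costs = {41, 17, 29};
    int leastIdx = -1;
    int leastCost = Integer.MAX_VALUE;
    for (int idx = 0; idx < costs.length; idx++) {
      if (costs[idx] < leastCost) {
        leastCost = costs[idx];
        leastIdx = idx;
      }
    }
    assert leastIdx != -1;
    // Second pass: prune all but the best path.
    for (int idx = 0; idx < costs.length; idx++) {
      if (idx != leastIdx) {
        costs[idx] = Integer.MAX_VALUE; // stand-in for posData2.reset()
      }
    }
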
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/BinaryDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/BinaryDictionary.java
index 4d8074c..8649837 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/BinaryDictionary.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/BinaryDictionary.java
@@ -81,8 +81,9 @@ public abstract class BinaryDictionary implements Dictionary {
       this.resourcePath = resourcePath;
     }
     InputStream mapIS = null, dictIS = null, posIS = null;
-    int[] targetMapOffsets, targetMap;
-    ByteBuffer buffer;
+    int[] targetMapOffsets = null, targetMap = null;
+    ByteBuffer buffer = null;
+    boolean success = false;
     try {
       mapIS = getResource(TARGETMAP_FILENAME_SUFFIX);
       mapIS = new BufferedInputStream(mapIS);
@@ -131,8 +132,13 @@ public abstract class BinaryDictionary implements Dictionary {
       }
       dictIS.close(); dictIS = null;
       buffer = tmpBuffer.asReadOnlyBuffer();
+      success = true;
     } finally {
-      IOUtils.closeWhileHandlingException(mapIS, posIS, dictIS);
+      if (success) {
+        IOUtils.close(mapIS, posIS, dictIS);
+      } else {
+        IOUtils.closeWhileHandlingException(mapIS, posIS, dictIS);
+      }
     }
 
     this.targetMap = targetMap;
@@ -152,7 +158,7 @@ public abstract class BinaryDictionary implements Dictionary {
   }
   
   // util, reused by ConnectionCosts and CharacterDefinition
-  public static InputStream getClassResource(Class<?> clazz, String suffix) throws IOException {
+  public static final InputStream getClassResource(Class<?> clazz, String suffix) throws IOException {
     final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix);
     if (is == null) {
       throw new FileNotFoundException("Not in classpath: " + clazz.getName().replace('.', '/') + suffix);
@@ -230,7 +236,7 @@ public abstract class BinaryDictionary implements Dictionary {
     int offset = wordId + 6;
     boolean hasSinglePos = hasSinglePOS(wordId);
     if (hasSinglePos == false) {
-      offset++; // skip rightPOS
+      offset ++; // skip rightPOS
     }
     int length = buffer.get(offset++);
     if (length == 0) {
@@ -258,7 +264,7 @@ public abstract class BinaryDictionary implements Dictionary {
   private String readString(int offset) {
     int strOffset = offset;
     int len = buffer.get(strOffset++);
-    char[] text = new char[len];
+    char text[] = new char[len];
     for (int i = 0; i < len; i++) {
       text[i] = buffer.getChar(strOffset + (i<<1));
     }
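
The constructor hunk above restores the success-flag cleanup idiom used throughout Lucene: on the happy path the streams are closed with IOUtils.close, so a failure to close still surfaces; on the error path IOUtils.closeWhileHandlingException closes quietly, so the original exception is not masked. A minimal sketch (openStream() is a hypothetical stand-in for the dictionary's getResource(...)):

    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import org.apache.lucene.util.IOUtils;

    public class SuccessFlagClose {
      static InputStream openStream() {
        return new ByteArrayInputStream(new byte[0]); // stub resource
      }

      public static void main(String[] args) throws IOException {
        InputStream in = null;
        boolean success = false;
        try {
          in = openStream();
          // ... read and decode ...
          success = true;
        } finally {
          if (success) {
            IOUtils.close(in);                       // may throw
          } else {
            IOUtils.closeWhileHandlingException(in); // never masks the cause
          }
        }
      }
    }
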
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/CharacterDefinition.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/CharacterDefinition.java
index 59d4dac..ac5230c 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/CharacterDefinition.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/CharacterDefinition.java
@@ -38,7 +38,7 @@ public final class CharacterDefinition {
 
   // only used internally for lookup:
   enum CharacterClass {
-    NGRAM, DEFAULT, SPACE, SYMBOL, NUMERIC, ALPHA, CYRILLIC, GREEK, HIRAGANA, KATAKANA, KANJI, HANGUL, HANJA, HANJANUMERIC
+    NGRAM, DEFAULT, SPACE, SYMBOL, NUMERIC, ALPHA, CYRILLIC, GREEK, HIRAGANA, KATAKANA, KANJI, HANGUL, HANJA, HANJANUMERIC;
   }
 
   private final byte[] characterCategoryMap = new byte[0x10000];
@@ -108,7 +108,11 @@ public final class CharacterDefinition {
   }
 
   public boolean hasCoda(char ch){
-    return ((ch - 0xAC00) % 0x001C) != 0;
+    if (((ch - 0xAC00) % 0x001C) == 0) {
+      return false;
+    } else {
+      return true;
+    }
   }
 
   public static byte lookupCharacterClass(String characterClassName) {
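
The hasCoda() hunk deserves a worked example. Precomposed Hangul syllables are encoded as 0xAC00 + (initial*21 + medial)*28 + final, where a final index of 0 means no trailing consonant, so (ch - 0xAC00) % 28 (28 == 0x001C) is zero exactly when the syllable lacks a coda:

    char han = '한'; // U+D55C: (0xD55C - 0xAC00) % 28 == 4 -> coda (ㄴ) present
    char ha  = '하'; // U+D558: (0xD558 - 0xAC00) % 28 == 0 -> no coda
    System.out.println((han - 0xAC00) % 0x001C); // 4
    System.out.println((ha  - 0xAC00) % 0x001C); // 0
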
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/ConnectionCosts.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/ConnectionCosts.java
index 36cbe15..95d0e8b 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/ConnectionCosts.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/ConnectionCosts.java
@@ -40,7 +40,7 @@ public final class ConnectionCosts {
 
   private ConnectionCosts() throws IOException {
     InputStream is = null;
-    ByteBuffer buffer;
+    ByteBuffer buffer = null;
     boolean success = false;
     try {
       is = BinaryDictionary.getClassResource(getClass(), FILENAME_SUFFIX);
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoFST.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoFST.java
index a79b35b..7f9bec6 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoFST.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoFST.java
@@ -28,7 +28,7 @@ public final class TokenInfoFST {
   private final FST<Long> fst;
 
   private final int cacheCeiling;
-  private final FST.Arc<Long>[] rootCache;
+  private final FST.Arc<Long> rootCache[];
   
   public final Long NO_OUTPUT;
 
@@ -41,7 +41,7 @@ public final class TokenInfoFST {
   
   @SuppressWarnings({"rawtypes","unchecked"})
   private FST.Arc<Long>[] cacheRootArcs() throws IOException {
-    FST.Arc<Long>[] rootCache = new FST.Arc[1+(cacheCeiling-0xAC00)];
+    FST.Arc<Long> rootCache[] = new FST.Arc[1+(cacheCeiling-0xAC00)];
     FST.Arc<Long> firstArc = new FST.Arc<>();
     fst.getFirstArc(firstArc);
     FST.Arc<Long> arc = new FST.Arc<>();
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java
index 186990e..e04d133 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UserDictionary.java
@@ -37,26 +37,26 @@ public final class UserDictionary implements Dictionary {
   // text -> wordID
   private final TokenInfoFST fst;
 
-  private static final int WORD_COST = -100000;
+  public static final int WORD_COST = -100000;
 
   // NNG left
-  private static final short LEFT_ID = 1781;
+  public static final short LEFT_ID = 1781;
 
   // NNG right
-  private static final short RIGHT_ID = 3533;
+  public static final short RIGHT_ID = 3533;
   // NNG right with hangul and a coda on the last char
-  private static final short RIGHT_ID_T = 3535;
+  public static final short RIGHT_ID_T = 3535;
   // NNG right with hangul and no coda on the last char
-  private static final short RIGHT_ID_F = 3534;
+  public static final short RIGHT_ID_F = 3534;
 
   // length, length... indexed by compound ID or null for simple noun
-  private final int[][] segmentations;
+  private final int segmentations[][];
   private final short[] rightIds;
 
   public static UserDictionary open(Reader reader) throws IOException {
 
     BufferedReader br = new BufferedReader(reader);
-    String line;
+    String line = null;
     List<String> entries = new ArrayList<>();
 
     // text + optional segmentations
@@ -127,7 +127,7 @@ public final class UserDictionary implements Dictionary {
       scratch.grow(token.length());
       scratch.setLength(token.length());
       for (int i = 0; i < token.length(); i++) {
-        scratch.setIntAt(i, token.charAt(i));
+        scratch.setIntAt(i, (int) token.charAt(i));
       }
       fstBuilder.add(scratch.get(), ord);
       lastToken = token;
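
For context on UserDictionary.open(Reader): each line of a Nori user dictionary holds a surface form, optionally followed by its segmentation into nouns, which is the format the testUserDict case later in this commit exercises ("세종시" splitting into "세종" + "시"). A minimal usage sketch of that assumed format:

    import java.io.Reader;
    import java.io.StringReader;
    import org.apache.lucene.analysis.ko.dict.UserDictionary;

    // Line 1: simple noun. Line 2: compound noun followed by its segmentation.
    Reader rules = new StringReader("세종\n세종시 세종 시\n");
    UserDictionary dict = UserDictionary.open(rules);
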
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java
index fec02db..6a19b1b 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java
@@ -117,8 +117,8 @@ abstract class BinaryDictionaryWriter {
     boolean hasSinglePOS = (leftPOS == rightPOS);
     if (posType != POS.Type.MORPHEME && expression.length() > 0) {
       String[] exprTokens = expression.split("\\+");
-      for (String exprToken : exprTokens) {
-        String[] tokenSplit = exprToken.split("/");
+      for (int i = 0; i < exprTokens.length; i++) {
+        String[] tokenSplit = exprTokens[i].split("/");
         assert tokenSplit.length == 3;
         String surfaceForm = tokenSplit[0].trim();
         if (surfaceForm.isEmpty() == false) {
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/CSVUtil.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/CSVUtil.java
index 27380e9..f911b55 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/CSVUtil.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/CSVUtil.java
@@ -70,7 +70,7 @@ public final class CSVUtil {
       return new String[0];
     }
     
-    return result.toArray(new String[0]);
+    return result.toArray(new String[result.size()]);
   }
   
   private static String unQuoteUnEscape(String original) {
@@ -84,7 +84,7 @@ public final class CSVUtil {
       }
     
       // Unescape
-      if (result.contains(ESCAPED_QUOTE)) {
+      if (result.indexOf(ESCAPED_QUOTE) >= 0) {
         result = result.replace(ESCAPED_QUOTE, "\"");
       }
     }
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java
index 4f4f0b7..e4c288b 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java
@@ -114,7 +114,7 @@ class TokenInfoDictionaryBuilder {
         scratch.grow(surfaceForm.length());
         scratch.setLength(surfaceForm.length());
         for (int i = 0; i < surfaceForm.length(); i++) {
-          scratch.setIntAt(i, surfaceForm.charAt(i));
+          scratch.setIntAt(i, (int) surfaceForm.charAt(i));
         }
         fstBuilder.add(scratch.get(), ord);
       }
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/StringMockResourceLoader.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/StringMockResourceLoader.java
index 8e7cd7b..cc1ee00 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/StringMockResourceLoader.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/StringMockResourceLoader.java
@@ -26,7 +26,7 @@ import org.apache.lucene.analysis.util.ResourceLoader;
 
 /** Fake resource loader for tests: works if you want to fake reading a single file */
 class StringMockResourceLoader implements ResourceLoader {
-  private String text;
+  String text;
 
   public StringMockResourceLoader(String text) {
     this.text = text;
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanAnalyzer.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanAnalyzer.java
index d82409f..2ba2f37 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanAnalyzer.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanAnalyzer.java
@@ -18,9 +18,9 @@ package org.apache.lucene.analysis.ko;
 
 import java.io.IOException;
 import java.util.Arrays;
-import java.util.HashSet;
 import java.util.Random;
 import java.util.Set;
+import java.util.stream.Collectors;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@@ -41,7 +41,7 @@ public class TestKoreanAnalyzer extends BaseTokenStreamTestCase {
   }
 
   public void testStopTags() throws IOException {
-    Set<POS.Tag> stopTags = new HashSet<>(Arrays.asList(POS.Tag.NNP, POS.Tag.NNG));
+    Set<POS.Tag> stopTags = Arrays.asList(POS.Tag.NNP, POS.Tag.NNG).stream().collect(Collectors.toSet());
     Analyzer a = new KoreanAnalyzer(null, KoreanTokenizer.DecompoundMode.DISCARD, stopTags, false);
     assertAnalyzesTo(a, "한국은 대단한 나라입니다.",
         new String[]{"은", "대단", "하", "ᆫ", "이", "ᄇ니다"},
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanNumberFilterFactory.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanNumberFilterFactory.java
index 8564521..d549933 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanNumberFilterFactory.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanNumberFilterFactory.java
@@ -50,12 +50,12 @@ public class TestKoreanNumberFilterFactory extends BaseTokenStreamTestCase {
   }
 
   /** Test that bogus arguments result in exception */
-  public void testBogusArguments() {
-    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () ->
-        new KoreanNumberFilterFactory(new HashMap<String, String>() {{
-          put("bogusArg", "bogusValue");
-        }})
-    );
+  public void testBogusArguments() throws Exception {
+    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
+      new KoreanNumberFilterFactory(new HashMap<String,String>() {{
+        put("bogusArg", "bogusValue");
+      }});
+    });
     assertTrue(expected.getMessage().contains("Unknown parameters"));
   }
 }
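
A note on the test idiom these testBogusArguments hunks reshuffle in style only: new HashMap<String,String>() {{ ... }} is "double brace" initialization, where the outer braces declare an anonymous HashMap subclass and the inner braces are an instance initializer that runs put() at construction time; expectThrows (from LuceneTestCase) then asserts that the factory constructor rejects the unknown key. A standalone sketch of the map part:

    import java.util.HashMap;
    import java.util.Map;

    public class DoubleBrace {
      public static void main(String[] args) {
        Map<String, String> map = new HashMap<String, String>() {{
          put("bogusArg", "bogusValue"); // runs in the instance initializer
        }};
        System.out.println(map); // {bogusArg=bogusValue}
      }
    }
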
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanPartOfSpeechStopFilterFactory.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanPartOfSpeechStopFilterFactory.java
index 72e5c58..5486f3f 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanPartOfSpeechStopFilterFactory.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanPartOfSpeechStopFilterFactory.java
@@ -32,7 +32,7 @@ import org.apache.lucene.util.Version;
  */
 public class TestKoreanPartOfSpeechStopFilterFactory extends BaseTokenStreamTestCase {
   public void testStopTags() throws IOException {
-    KoreanTokenizerFactory tokenizerFactory = new KoreanTokenizerFactory(new HashMap<>());
+    KoreanTokenizerFactory tokenizerFactory = new KoreanTokenizerFactory(new HashMap<String,String>());
     tokenizerFactory.inform(new StringMockResourceLoader(""));
     TokenStream ts = tokenizerFactory.create();
     ((Tokenizer)ts).setReader(new StringReader(" 한국은 대단한 나라입니다."));
@@ -47,13 +47,13 @@ public class TestKoreanPartOfSpeechStopFilterFactory extends BaseTokenStreamTest
   }
 
   /** Test that bogus arguments result in exception */
-  public void testBogusArguments() {
-    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () ->
-        new KoreanPartOfSpeechStopFilterFactory(new HashMap<String, String>() {{
-          put("luceneMatchVersion", Version.LATEST.toString());
-          put("bogusArg", "bogusValue");
-        }})
-    );
+  public void testBogusArguments() throws Exception {
+    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
+      new KoreanPartOfSpeechStopFilterFactory(new HashMap<String,String>() {{
+        put("luceneMatchVersion", Version.LATEST.toString());
+        put("bogusArg", "bogusValue");
+      }});
+    });
     assertTrue(expected.getMessage().contains("Unknown parameters"));
   }
 }
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanReadingFormFilterFactory.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanReadingFormFilterFactory.java
index ca9a8ea..f058a44 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanReadingFormFilterFactory.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanReadingFormFilterFactory.java
@@ -31,8 +31,8 @@ public class TestKoreanReadingFormFilterFactory extends BaseTokenStreamTestCase
   public void testReadings() throws IOException {
     KoreanTokenizerFactory tokenizerFactory = new KoreanTokenizerFactory(new HashMap<>());
     tokenizerFactory.inform(new StringMockResourceLoader(""));
-    Tokenizer tokenStream = tokenizerFactory.create();
-    tokenStream.setReader(new StringReader("丞相"));
+    TokenStream tokenStream = tokenizerFactory.create();
+    ((Tokenizer)tokenStream).setReader(new StringReader("丞相"));
     KoreanReadingFormFilterFactory filterFactory = new KoreanReadingFormFilterFactory(new HashMap<>());
     assertTokenStreamContents(filterFactory.create(tokenStream),
         new String[] { "승상" }
@@ -40,12 +40,12 @@ public class TestKoreanReadingFormFilterFactory extends BaseTokenStreamTestCase
   }
   
   /** Test that bogus arguments result in exception */
-  public void testBogusArguments() {
-    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () ->
-        new KoreanReadingFormFilterFactory(new HashMap<String, String>() {{
-          put("bogusArg", "bogusValue");
-        }})
-    );
+  public void testBogusArguments() throws Exception {
+    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
+      new KoreanReadingFormFilterFactory(new HashMap<String,String>() {{
+        put("bogusArg", "bogusValue");
+      }});
+    });
     assertTrue(expected.getMessage().contains("Unknown parameters"));
   }
 }
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizerFactory.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizerFactory.java
index 132f244..9ed6566 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizerFactory.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizerFactory.java
@@ -33,8 +33,8 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
   public void testSimple() throws IOException {
     KoreanTokenizerFactory factory = new KoreanTokenizerFactory(Collections.emptyMap());
     factory.inform(new StringMockResourceLoader(""));
-    Tokenizer ts = factory.create(newAttributeFactory());
-    ts.setReader(new StringReader("안녕하세요"));
+    TokenStream ts = factory.create(newAttributeFactory());
+    ((Tokenizer)ts).setReader(new StringReader("안녕하세요"));
     assertTokenStreamContents(ts,
         new String[] { "안녕", "하", "시", "어요" },
         new int[] { 0, 2, 3, 3 },
@@ -50,8 +50,8 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
     args.put("decompoundMode", "discard");
     KoreanTokenizerFactory factory = new KoreanTokenizerFactory(args);
     factory.inform(new StringMockResourceLoader(""));
-    Tokenizer ts = factory.create(newAttributeFactory());
-    ts.setReader(new StringReader("갠지스강"));
+    TokenStream ts = factory.create(newAttributeFactory());
+    ((Tokenizer)ts).setReader(new StringReader("갠지스강"));
     assertTokenStreamContents(ts,
         new String[] { "갠지스", "강" }
     );
@@ -62,8 +62,8 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
     args.put("decompoundMode", "none");
     KoreanTokenizerFactory factory = new KoreanTokenizerFactory(args);
     factory.inform(new StringMockResourceLoader(""));
-    Tokenizer ts = factory.create(newAttributeFactory());
-    ts.setReader(new StringReader("갠지스강"));
+    TokenStream ts = factory.create(newAttributeFactory());
+    ((Tokenizer)ts).setReader(new StringReader("갠지스강"));
     assertTokenStreamContents(ts,
         new String[] { "갠지스강" }
     );
@@ -74,8 +74,8 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
     args.put("decompoundMode", "mixed");
     KoreanTokenizerFactory factory = new KoreanTokenizerFactory(args);
     factory.inform(new StringMockResourceLoader(""));
-    Tokenizer ts = factory.create(newAttributeFactory());
-    ts.setReader(new StringReader("갠지스강"));
+    TokenStream ts = factory.create(newAttributeFactory());
+    ((Tokenizer)ts).setReader(new StringReader("갠지스강"));
     assertTokenStreamContents(ts,
         new String[] { "갠지스강", "갠지스", "강" }
     );
@@ -94,8 +94,8 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
     args.put("userDictionary", "userdict.txt");
     KoreanTokenizerFactory factory = new KoreanTokenizerFactory(args);
     factory.inform(new StringMockResourceLoader(userDict));
-    Tokenizer ts = factory.create(newAttributeFactory());
-    ts.setReader(new StringReader("세종시"));
+    TokenStream ts = factory.create(newAttributeFactory());
+    ((Tokenizer)ts).setReader(new StringReader("세종시"));
     assertTokenStreamContents(ts,
         new String[] { "세종", "시" }
     );
@@ -109,8 +109,8 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
     args.put("discardPunctuation", "true");
     KoreanTokenizerFactory factory = new KoreanTokenizerFactory(args);
     factory.inform(new StringMockResourceLoader(""));
-    Tokenizer ts = factory.create(newAttributeFactory());
-    ts.setReader(new StringReader("10.1 인치 모니터"));
+    TokenStream ts = factory.create(newAttributeFactory());
+    ((Tokenizer)ts).setReader(new StringReader("10.1 인치 모니터"));
     assertTokenStreamContents(ts,
         new String[] { "10", "1", "인치", "모니터" }
     );
@@ -124,20 +124,20 @@ public class TestKoreanTokenizerFactory extends BaseTokenStreamTestCase {
     args.put("discardPunctuation", "false");
     KoreanTokenizerFactory factory = new KoreanTokenizerFactory(args);
     factory.inform(new StringMockResourceLoader(""));
-    Tokenizer ts = factory.create(newAttributeFactory());
-    ts.setReader(new StringReader("10.1 인치 모니터"));
+    TokenStream ts = factory.create(newAttributeFactory());
+    ((Tokenizer)ts).setReader(new StringReader("10.1 인치 모니터"));
     assertTokenStreamContents(ts,
         new String[] { "10", ".", "1", " ", "인치", " ", "모니터" }
     );
   }
 
   /** Test that bogus arguments result in exception */
-  public void testBogusArguments() {
-    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () ->
-        new KoreanTokenizerFactory(new HashMap<String, String>() {{
-          put("bogusArg", "bogusValue");
-        }})
-    );
+  public void testBogusArguments() throws Exception {
+    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
+      new KoreanTokenizerFactory(new HashMap<String,String>() {{
+        put("bogusArg", "bogusValue");
+      }});
+    });
     assertTrue(expected.getMessage().contains("Unknown parameters"));
   }
 }
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java
index bbbc07e..9bbf258 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java
@@ -136,11 +136,11 @@ public class TokenInfoDictionaryTest extends LuceneTestCase {
         POS.Tag rightPOS = tid.getRightPOS(wordId);
 
         if (type == POS.Type.MORPHEME) {
-          assertSame(leftPOS, rightPOS);
+          assertTrue(leftPOS == rightPOS);
           String reading = tid.getReading(wordId);
           boolean isHanja = charDef.isHanja(surfaceForm.charAt(0));
           if (isHanja) {
-            assertNotNull(reading);
+            assertTrue(reading != null);
             for (int j = 0; j < reading.length(); j++) {
               assertTrue(charDef.isHangul(reading.charAt(j)));
             }
@@ -150,7 +150,7 @@ public class TokenInfoDictionaryTest extends LuceneTestCase {
           }
         } else {
           if (type == POS.Type.COMPOUND) {
-            assertSame(leftPOS, rightPOS);
+            assertTrue(leftPOS == rightPOS);
             assertTrue(leftPOS == POS.Tag.NNG || rightPOS == POS.Tag.NNP);
           }
           Dictionary.Morpheme[] decompound = tid.getMorphemes(wordId,  chars, 0, chars.length);
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/UserDictionaryTest.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/UserDictionaryTest.java
index 2f12ba4..b008cf3 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/UserDictionaryTest.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/UserDictionaryTest.java
@@ -41,8 +41,7 @@ public class UserDictionaryTest extends LuceneTestCase {
     assertNull(dictionary.getMorphemes(wordIds.get(0), sArray, 0, s.length()));
 
     Dictionary.Morpheme[] decompound = dictionary.getMorphemes(wordIds.get(1), sArray, 0, s.length());
-    assertNotNull(decompound);
-    assertEquals(2, decompound.length);
+    assertTrue(decompound.length == 2);
     assertEquals(decompound[0].posTag, POS.Tag.NNG);
     assertEquals(decompound[0].surfaceForm, "세종");
     assertEquals(decompound[1].posTag, POS.Tag.NNG);
@@ -56,7 +55,7 @@ public class UserDictionaryTest extends LuceneTestCase {
   }
   
   @Test
-  public void testRead() {
+  public void testRead() throws IOException {
     UserDictionary dictionary = TestKoreanTokenizer.readDict();
     assertNotNull(dictionary);
   }