You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by na...@apache.org on 2022/09/16 15:42:41 UTC
[lucene] branch main updated: GITHUB#11778: Add detailed part-of-speech tag for particle and ending on Nori (#11779)
This is an automated email from the ASF dual-hosted git repository.
namgyu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new 451bab300ec GITHUB#11778: Add detailed part-of-speech tag for particle and ending on Nori (#11779)
451bab300ec is described below
commit 451bab300eca451b0f09704f471443c29167e6ce
Author: Namgyu Kim <na...@apache.org>
AuthorDate: Sat Sep 17 00:42:35 2022 +0900
GITHUB#11778: Add detailed part-of-speech tag for particle and ending on Nori (#11779)
---
lucene/CHANGES.txt | 5 ++
.../analysis/ko/KoreanPartOfSpeechStopFilter.java | 16 ++++++-
.../java/org/apache/lucene/analysis/ko/POS.java | 53 ++++++++++++++++-----
.../org/apache/lucene/analysis/ko/Viterbi.java | 16 ++++++-
.../ko/dict/TokenInfoDictionary$buffer.dat | Bin 7287019 -> 7288372 bytes
.../ko/dict/TokenInfoDictionary$posDict.dat | Bin 2715 -> 2715 bytes
.../ko/dict/TokenInfoDictionary$targetMap.dat | Bin 816309 -> 816309 bytes
.../analysis/ko/dict/UnknownDictionary$posDict.dat | Bin 1826 -> 1826 bytes
.../TestKoreanPartOfSpeechStopFilterFactory.java | 2 +-
.../lucene/analysis/ko/TestKoreanTokenizer.java | 36 +++++++-------
10 files changed, 94 insertions(+), 34 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index a9178b0e83e..4d0bf33ecb6 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -93,6 +93,11 @@ API Changes
* GITHUB#11772: Removed native subproject and WindowsDirectory implementation from lucene.misc. Recommendation:
use MMapDirectory implementation on Windows. (Robert Muir, Uwe Schindler, Dawid Weiss)
+Improvements
+---------------------
+* GITHUB#11778: Detailed part-of-speech information for particle(조사) and ending(어미) on Nori
+ is now tagged. (Namgyu Kim)
+
Bug Fixes
---------------------
* GITHUB#11726: Indexing term vectors on large documents could fail due to
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanPartOfSpeechStopFilter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanPartOfSpeechStopFilter.java
index d3361a62722..6f41962d03e 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanPartOfSpeechStopFilter.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/KoreanPartOfSpeechStopFilter.java
@@ -36,9 +36,21 @@ public final class KoreanPartOfSpeechStopFilter extends FilteringTokenFilter {
public static final Set<POS.Tag> DEFAULT_STOP_TAGS =
new HashSet<>(
Arrays.asList(
- POS.Tag.E,
+ POS.Tag.EP,
+ POS.Tag.EF,
+ POS.Tag.EC,
+ POS.Tag.ETN,
+ POS.Tag.ETM,
POS.Tag.IC,
- POS.Tag.J,
+ POS.Tag.JKS,
+ POS.Tag.JKC,
+ POS.Tag.JKG,
+ POS.Tag.JKO,
+ POS.Tag.JKB,
+ POS.Tag.JKV,
+ POS.Tag.JKQ,
+ POS.Tag.JX,
+ POS.Tag.JC,
POS.Tag.MAG,
POS.Tag.MAJ,
POS.Tag.MM,
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/POS.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/POS.java
index 2f6a8097258..c083833facf 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/POS.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/POS.java
@@ -42,14 +42,50 @@ public class POS {
/** Part of speech tag for Korean based on Sejong corpus classification. */
public enum Tag {
- /** Verbal endings */
- E(100, "Verbal endings"),
+ /** Pre-final ending */
+ EP(100, "Pre-final ending"),
+
+ /** Sentence-closing ending */
+ EF(101, "Sentence-closing ending"),
+
+ /** Connective ending */
+ EC(102, "Connective ending"),
+
+ /** Nominal transformative ending */
+ ETN(103, "Nominal transformative ending"),
+
+ /** Adnominal form transformative ending */
+ ETM(104, "Adnominal form transformative ending"),
/** Interjection */
IC(110, "Interjection"),
- /** Ending Particle */
- J(120, "Ending Particle"),
+ /** Subject case marker */
+ JKS(120, "Subject case marker"),
+
+ /** Complement case marker */
+ JKC(121, "Complement case marker"),
+
+ /** Adnominal case marker */
+ JKG(122, "Adnominal case marker"),
+
+ /** Object case marker */
+ JKO(123, "Object case marker"),
+
+ /** Adverbial case marker */
+ JKB(124, "Adverbial case marker"),
+
+ /** Vocative case marker */
+ JKV(125, "Vocative case marker"),
+
+ /** Quotative case marker */
+ JKQ(126, "Quotative case marker"),
+
+ /** Auxiliary postpositional particle */
+ JX(127, "Auxiliary postpositional particle"),
+
+ /** Conjunctive postpositional particle */
+ JC(128, "Conjunctive postpositional particle"),
/** General Adverb */
MAG(130, "General Adverb"),
@@ -177,14 +213,7 @@ public class POS {
/** Returns the {@link Tag} of the provided <code>name</code>. */
public static Tag resolveTag(String name) {
- String tagUpper = name.toUpperCase(Locale.ENGLISH);
- if (tagUpper.startsWith("J")) {
- return Tag.J;
- } else if (tagUpper.startsWith("E")) {
- return Tag.E;
- } else {
- return Tag.valueOf(tagUpper);
- }
+ return Tag.valueOf(name.toUpperCase(Locale.ENGLISH));
}
/** Returns the {@link Tag} of the provided <code>tag</code>. */
diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/Viterbi.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/Viterbi.java
index 2a8adbfcb62..9f7765eaadf 100644
--- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/Viterbi.java
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/Viterbi.java
@@ -348,8 +348,20 @@ final class Viterbi
if (numSpaces > 0) {
// TODO we should extract the penalty (left-space-penalty-factor) from the dicrc file.
switch (leftPOS) {
- case E:
- case J:
+ case EP:
+ case EF:
+ case EC:
+ case ETN:
+ case ETM:
+ case JKS:
+ case JKC:
+ case JKG:
+ case JKO:
+ case JKB:
+ case JKV:
+ case JKQ:
+ case JX:
+ case JC:
case VCP:
case XSA:
case XSN:
diff --git a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$buffer.dat b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$buffer.dat
index 5d951670592..1de35b54155 100644
Binary files a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$buffer.dat and b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$buffer.dat differ
diff --git a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$posDict.dat b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$posDict.dat
index 851e9d5df56..e222f2bb41d 100644
Binary files a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$posDict.dat and b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$posDict.dat differ
diff --git a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$targetMap.dat b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$targetMap.dat
index 85cd99141ce..94a22fde196 100644
Binary files a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$targetMap.dat and b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary$targetMap.dat differ
diff --git a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/UnknownDictionary$posDict.dat b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/UnknownDictionary$posDict.dat
index 75479935289..450adfc56b7 100644
Binary files a/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/UnknownDictionary$posDict.dat and b/lucene/analysis/nori/src/resources/org/apache/lucene/analysis/ko/dict/UnknownDictionary$posDict.dat differ
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanPartOfSpeechStopFilterFactory.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanPartOfSpeechStopFilterFactory.java
index 5a6c31dca32..ef2312a5373 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanPartOfSpeechStopFilterFactory.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanPartOfSpeechStopFilterFactory.java
@@ -35,7 +35,7 @@ public class TestKoreanPartOfSpeechStopFilterFactory extends BaseTokenStreamTest
((Tokenizer) ts).setReader(new StringReader(" 한국은 대단한 나라입니다."));
Map<String, String> args = new HashMap<>();
args.put("luceneMatchVersion", Version.LATEST.toString());
- args.put("tags", "E, J");
+ args.put("tags", "EP, EF, EC, ETN, ETM, JKS, JKC, JKG, JKO, JKB, JKV, JKQ, JX, JC");
KoreanPartOfSpeechStopFilterFactory factory = new KoreanPartOfSpeechStopFilterFactory(args);
ts = factory.create(ts);
assertTokenStreamContents(ts, new String[] {"한국", "대단", "하", "나라", "이"});
diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
index 85f87c2ec82..9511ff1b1da 100644
--- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
+++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/TestKoreanTokenizer.java
@@ -154,8 +154,8 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
analyzer,
"화학 이외의 것",
new POS.Type[] {POS.Type.MORPHEME, POS.Type.MORPHEME, POS.Type.MORPHEME, POS.Type.MORPHEME},
- new POS.Tag[] {POS.Tag.NNG, POS.Tag.NNG, POS.Tag.J, POS.Tag.NNB},
- new POS.Tag[] {POS.Tag.NNG, POS.Tag.NNG, POS.Tag.J, POS.Tag.NNB});
+ new POS.Tag[] {POS.Tag.NNG, POS.Tag.NNG, POS.Tag.JKG, POS.Tag.NNB},
+ new POS.Tag[] {POS.Tag.NNG, POS.Tag.NNG, POS.Tag.JKG, POS.Tag.NNB});
}
public void testPartOfSpeechs() throws IOException {
@@ -170,8 +170,8 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
analyzer,
"화학 이외의 것",
new POS.Type[] {POS.Type.MORPHEME, POS.Type.MORPHEME, POS.Type.MORPHEME, POS.Type.MORPHEME},
- new POS.Tag[] {POS.Tag.NNG, POS.Tag.NNG, POS.Tag.J, POS.Tag.NNB},
- new POS.Tag[] {POS.Tag.NNG, POS.Tag.NNG, POS.Tag.J, POS.Tag.NNB});
+ new POS.Tag[] {POS.Tag.NNG, POS.Tag.NNG, POS.Tag.JKG, POS.Tag.NNB},
+ new POS.Tag[] {POS.Tag.NNG, POS.Tag.NNG, POS.Tag.JKG, POS.Tag.NNB});
}
public void testPartOfSpeechsWithPunc() throws IOException {
@@ -195,10 +195,10 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
POS.Type.MORPHEME
},
new POS.Tag[] {
- POS.Tag.NNG, POS.Tag.SP, POS.Tag.NNG, POS.Tag.J, POS.Tag.SP, POS.Tag.NNB, POS.Tag.SF
+ POS.Tag.NNG, POS.Tag.SP, POS.Tag.NNG, POS.Tag.JKG, POS.Tag.SP, POS.Tag.NNB, POS.Tag.SF
},
new POS.Tag[] {
- POS.Tag.NNG, POS.Tag.SP, POS.Tag.NNG, POS.Tag.J, POS.Tag.SP, POS.Tag.NNB, POS.Tag.SF
+ POS.Tag.NNG, POS.Tag.SP, POS.Tag.NNG, POS.Tag.JKG, POS.Tag.SP, POS.Tag.NNB, POS.Tag.SF
});
}
@@ -239,8 +239,8 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
POS.Type.MORPHEME,
POS.Type.MORPHEME
},
- new POS.Tag[] {POS.Tag.NNG, POS.Tag.J, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP},
- new POS.Tag[] {POS.Tag.NNG, POS.Tag.J, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP});
+ new POS.Tag[] {POS.Tag.NNG, POS.Tag.JX, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP},
+ new POS.Tag[] {POS.Tag.NNG, POS.Tag.JX, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP});
assertAnalyzesTo(
analyzerDecompound,
@@ -271,8 +271,10 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
POS.Type.MORPHEME,
POS.Type.MORPHEME
},
- new POS.Tag[] {POS.Tag.NNG, POS.Tag.NNG, POS.Tag.J, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP},
- new POS.Tag[] {POS.Tag.NNG, POS.Tag.NNG, POS.Tag.J, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP});
+ new POS.Tag[] {POS.Tag.NNG, POS.Tag.NNG, POS.Tag.JX, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP},
+ new POS.Tag[] {
+ POS.Tag.NNG, POS.Tag.NNG, POS.Tag.JX, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP
+ });
assertPartsOfSpeech(
analyzerDecompoundKeep,
@@ -287,10 +289,10 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
POS.Type.MORPHEME
},
new POS.Tag[] {
- POS.Tag.NNG, POS.Tag.NNG, POS.Tag.NNG, POS.Tag.J, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP
+ POS.Tag.NNG, POS.Tag.NNG, POS.Tag.NNG, POS.Tag.JX, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP
},
new POS.Tag[] {
- POS.Tag.NNG, POS.Tag.NNG, POS.Tag.NNG, POS.Tag.J, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP
+ POS.Tag.NNG, POS.Tag.NNG, POS.Tag.NNG, POS.Tag.JX, POS.Tag.NNP, POS.Tag.NNP, POS.Tag.NNP
});
}
@@ -303,7 +305,7 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
"감싸여",
new POS.Type[] {POS.Type.INFLECT},
new POS.Tag[] {POS.Tag.VV},
- new POS.Tag[] {POS.Tag.E});
+ new POS.Tag[] {POS.Tag.EC});
assertAnalyzesTo(
analyzerDecompound,
@@ -327,15 +329,15 @@ public class TestKoreanTokenizer extends BaseTokenStreamTestCase {
analyzerDecompound,
"감싸여",
new POS.Type[] {POS.Type.MORPHEME, POS.Type.MORPHEME},
- new POS.Tag[] {POS.Tag.VV, POS.Tag.E},
- new POS.Tag[] {POS.Tag.VV, POS.Tag.E});
+ new POS.Tag[] {POS.Tag.VV, POS.Tag.EC},
+ new POS.Tag[] {POS.Tag.VV, POS.Tag.EC});
assertPartsOfSpeech(
analyzerDecompoundKeep,
"감싸여",
new POS.Type[] {POS.Type.INFLECT, POS.Type.MORPHEME, POS.Type.MORPHEME},
- new POS.Tag[] {POS.Tag.VV, POS.Tag.VV, POS.Tag.E},
- new POS.Tag[] {POS.Tag.E, POS.Tag.VV, POS.Tag.E});
+ new POS.Tag[] {POS.Tag.VV, POS.Tag.VV, POS.Tag.EC},
+ new POS.Tag[] {POS.Tag.EC, POS.Tag.VV, POS.Tag.EC});
}
public void testUnknownWord() throws IOException {