You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by "Trejkaz (JIRA)" <ji...@apache.org> on 2017/11/01 01:21:00 UTC
[jira] [Commented] (LUCENE-5905) Different behaviour of
JapaneseAnalyzer at indexing time vs. at search time results in no matches
for some words.
[ https://issues.apache.org/jira/browse/LUCENE-5905?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16233550#comment-16233550 ]
Trejkaz commented on LUCENE-5905:
---------------------------------
Adding one more case we found which seems to behave the same in Normal/Extended modes as well, but where adding a space at the _end_ somehow changes the tokenisation to break the word 1/2 instead of 2/1.
{noformat}
"加藤木" (no space) =>
加藤 名詞,固有名詞,人名,姓 加藤 カトウ カトー
木 名詞,一般,*,* 木 キ キ
"加藤木 " (one space at the end) =>
加 名詞,固有名詞,地域,国 加 カ カ
藤木 名詞,固有名詞,地域,一般 藤木 フジキ フジキ
(空白文字) 記号,空白,*,* (空白文字) (空白文字)   [whitespace token — surface form, base form and reading are space characters, lost in the plain-text rendering]
{noformat}
> Different behaviour of JapaneseAnalyzer at indexing time vs. at search time results in no matches for some words.
> -----------------------------------------------------------------------------------------------------------------
>
> Key: LUCENE-5905
> URL: https://issues.apache.org/jira/browse/LUCENE-5905
> Project: Lucene - Core
> Issue Type: Bug
> Components: modules/analysis
> Affects Versions: 3.6.2, 4.9, 5.2.1, 6.6
> Environment: Java 8u5
> Reporter: Trejkaz
> Priority: Major
>
> A document with the word 秋葉原 in the body, when analysed by the JapaneseAnalyzer (AKA Kuromoji), cannot be found when searching for the same text as a phrase query.
> Two programs are provided to reproduce the issue. Both programs print out the term docs and positions and then the result of parsing the phrase query.
> As shown by the output, at analysis time, there is a lone Japanese term "秋葉原". At query parsing time, there are *three* such terms - "秋葉" and "秋葉原" at position 0 and "原" at position 1. Because all terms must be present for a phrase query to be a match, the query never matches, which is quite a serious issue for us.
> *Any workarounds, no matter how hacky, would be extremely helpful at this point.*
> My guess is that this is a quirk with the analyser. If it happened with StandardAnalyzer, surely someone would have discovered it before I did.
> Lucene 5.2.1 reproduction:
> {code:java}
> import org.apache.lucene.analysis.Analyzer;
> import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
> import org.apache.lucene.document.Document;
> import org.apache.lucene.document.Field;
> import org.apache.lucene.document.TextField;
> import org.apache.lucene.index.DirectoryReader;
> import org.apache.lucene.index.IndexReader;
> import org.apache.lucene.index.IndexWriter;
> import org.apache.lucene.index.IndexWriterConfig;
> import org.apache.lucene.index.LeafReader;
> import org.apache.lucene.index.LeafReaderContext;
> import org.apache.lucene.index.MultiFields;
> import org.apache.lucene.index.PostingsEnum;
> import org.apache.lucene.index.Terms;
> import org.apache.lucene.index.TermsEnum;
> import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
> import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler;
> import org.apache.lucene.search.DocIdSetIterator;
> import org.apache.lucene.search.IndexSearcher;
> import org.apache.lucene.search.Query;
> import org.apache.lucene.search.TopDocs;
> import org.apache.lucene.store.Directory;
> import org.apache.lucene.store.RAMDirectory;
> import org.apache.lucene.util.Bits;
> import org.apache.lucene.util.BytesRef;
> public class LuceneMissingTerms {
> public static void main(String[] args) throws Exception {
> try (Directory directory = new RAMDirectory()) {
> Analyzer analyser = new JapaneseAnalyzer();
> try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(analyser))) {
> Document document = new Document();
> document.add(new TextField("content", "blah blah commercial blah blah \u79CB\u8449\u539F blah blah", Field.Store.NO));
> writer.addDocument(document);
> }
> try (IndexReader multiReader = DirectoryReader.open(directory)) {
> for (LeafReaderContext leaf : multiReader.leaves()) {
> LeafReader reader = leaf.reader();
> Terms terms = MultiFields.getFields(reader).terms("content");
> TermsEnum termsEnum = terms.iterator();
> BytesRef text;
> //noinspection NestedAssignment
> while ((text = termsEnum.next()) != null) {
> System.out.println("term: " + text.utf8ToString());
> Bits liveDocs = reader.getLiveDocs();
> PostingsEnum postingsEnum = termsEnum.postings(liveDocs, null, PostingsEnum.POSITIONS);
> int doc;
> //noinspection NestedAssignment
> while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
> System.out.println(" doc: " + doc);
> int freq = postingsEnum.freq();
> for (int i = 0; i < freq; i++) {
> int pos = postingsEnum.nextPosition();
> System.out.println(" pos: " + pos);
> }
> }
> }
> }
> StandardQueryParser queryParser = new StandardQueryParser(analyser);
> queryParser.setDefaultOperator(StandardQueryConfigHandler.Operator.AND);
> // quoted to work around strange behaviour of StandardQueryParser treating this as a boolean query.
> Query query = queryParser.parse("\"\u79CB\u8449\u539F\"", "content");
> System.out.println(query);
> TopDocs topDocs = new IndexSearcher(multiReader).search(query, 10);
> System.out.println(topDocs.totalHits);
> }
> }
> }
> }
> {code}
> Lucene 4.9 reproduction:
> {code:java}
> import org.apache.lucene.analysis.Analyzer;
> import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
> import org.apache.lucene.document.Document;
> import org.apache.lucene.document.Field;
> import org.apache.lucene.document.TextField;
> import org.apache.lucene.index.AtomicReader;
> import org.apache.lucene.index.AtomicReaderContext;
> import org.apache.lucene.index.DirectoryReader;
> import org.apache.lucene.index.DocsAndPositionsEnum;
> import org.apache.lucene.index.IndexReader;
> import org.apache.lucene.index.IndexWriter;
> import org.apache.lucene.index.IndexWriterConfig;
> import org.apache.lucene.index.MultiFields;
> import org.apache.lucene.index.Terms;
> import org.apache.lucene.index.TermsEnum;
> import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
> import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler;
> import org.apache.lucene.search.DocIdSetIterator;
> import org.apache.lucene.search.IndexSearcher;
> import org.apache.lucene.search.Query;
> import org.apache.lucene.search.TopDocs;
> import org.apache.lucene.store.Directory;
> import org.apache.lucene.store.RAMDirectory;
> import org.apache.lucene.util.Bits;
> import org.apache.lucene.util.BytesRef;
> import org.apache.lucene.util.Version;
> public class LuceneMissingTerms {
> public static void main(String[] args) throws Exception {
> try (Directory directory = new RAMDirectory()) {
> Analyzer analyser = new JapaneseAnalyzer(Version.LUCENE_4_9);
> try (IndexWriter writer = new IndexWriter(directory,
> new IndexWriterConfig(Version.LUCENE_4_9, analyser))) {
> Document document = new Document();
> document.add(new TextField("content", "blah blah
> commercial blah blah \u79CB\u8449\u539F blah blah", Field.Store.NO));
> writer.addDocument(document);
> }
> try (IndexReader multiReader =
> DirectoryReader.open(directory)) {
> for (AtomicReaderContext atomicReaderContext :
> multiReader.leaves()) {
> AtomicReader reader = atomicReaderContext.reader();
> Terms terms =
> MultiFields.getFields(reader).terms("content");
> TermsEnum termsEnum = terms.iterator(null);
> BytesRef text;
> //noinspection NestedAssignment
> while ((text = termsEnum.next()) != null) {
> System.out.println("term: " + text.utf8ToString());
> Bits liveDocs = reader.getLiveDocs();
> DocsAndPositionsEnum docsAndPositionsEnum
> = termsEnum.docsAndPositions(liveDocs, null);
> int doc;
> //noinspection NestedAssignment
> while ((doc =
> docsAndPositionsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
> System.out.println(" doc: " + doc);
> int freq = docsAndPositionsEnum.freq();
> for (int i = 0; i < freq; i++) {
> int pos =
> docsAndPositionsEnum.nextPosition();
> System.out.println(" pos: " + pos);
> }
> }
> }
> }
> StandardQueryParser queryParser = new
> StandardQueryParser(analyser);
> queryParser.setDefaultOperator(StandardQueryConfigHandler.Operator.AND);
> // quoted to work around strange behaviour of
> StandardQueryParser treating this as a boolean query.
> Query query =
> queryParser.parse("\"\u79CB\u8449\u539F\"", "content");
> System.out.println(query);
> TopDocs topDocs = new
> IndexSearcher(multiReader).search(query, 10);
> System.out.println(topDocs.totalHits);
> }
> }
> }
> }
> {code}
> Lucene 3.6.2 reproduction:
> {code:java}
> import org.apache.lucene.analysis.Analyzer;
> import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
> import org.apache.lucene.document.Document;
> import org.apache.lucene.document.Field;
> import org.apache.lucene.index.IndexReader;
> import org.apache.lucene.index.IndexWriter;
> import org.apache.lucene.index.IndexWriterConfig;
> import org.apache.lucene.index.Term;
> import org.apache.lucene.index.TermEnum;
> import org.apache.lucene.index.TermPositions;
> import org.apache.lucene.queryParser.standard.StandardQueryParser;
> import org.apache.lucene.queryParser.standard.config.StandardQueryConfigHandler;
> import org.apache.lucene.search.IndexSearcher;
> import org.apache.lucene.search.Query;
> import org.apache.lucene.search.TopDocs;
> import org.apache.lucene.store.Directory;
> import org.apache.lucene.store.RAMDirectory;
> import org.apache.lucene.util.Version;
> import org.junit.Test;
> import static org.hamcrest.Matchers.*;
> import static org.junit.Assert.*;
> public class TestJapaneseAnalysis {
> @Test
> public void testJapaneseAnalysis() throws Exception {
> try (Directory directory = new RAMDirectory()) {
> Analyzer analyser = new JapaneseAnalyzer(Version.LUCENE_36);
> try (IndexWriter writer = new IndexWriter(directory,
> new IndexWriterConfig(Version.LUCENE_36, analyser))) {
> Document document = new Document();
> document.add(new Field("content", "blah blah
> commercial blah blah \u79CB\u8449\u539F blah blah", Field.Store.NO,
> Field.Index.ANALYZED));
> writer.addDocument(document);
> }
> try (IndexReader reader = IndexReader.open(directory);
> TermEnum terms = reader.terms(new Term("content", ""));
> TermPositions termPositions = reader.termPositions()) {
> do {
> Term term = terms.term();
> if (term.field() != "content") {
> break;
> }
> System.out.println(term);
> termPositions.seek(terms);
> while (termPositions.next()) {
> System.out.println(" " + termPositions.doc());
> int freq = termPositions.freq();
> for (int i = 0; i < freq; i++) {
> System.out.println(" " +
> termPositions.nextPosition());
> }
> }
> }
> while (terms.next());
> StandardQueryParser queryParser = new
> StandardQueryParser(analyser);
> queryParser.setDefaultOperator(StandardQueryConfigHandler.Operator.AND);
> // quoted to work around strange behaviour of
> StandardQueryParser treating this as a boolean query.
> Query query =
> queryParser.parse("\"\u79CB\u8449\u539F\"", "content");
> System.out.println(query);
> TopDocs topDocs = new
> IndexSearcher(reader).search(query, 10);
> assertThat(topDocs.totalHits, is(1));
> }
> }
> }
> }
> {code}
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)
---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: dev-help@lucene.apache.org