You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/10/21 05:31:30 UTC
svn commit: r1400565 - in /lucene/dev/trunk: lucene/
lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/
lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/
lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/ s...
Author: rmuir
Date: Sun Oct 21 03:31:29 2012
New Revision: 1400565
URL: http://svn.apache.org/viewvc?rev=1400565&view=rev
Log:
SOLR-3906: add factory for AnalyzingSuggester
Added:
lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingLookupFactory.java (with props)
lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/jasuggest.txt (with props)
lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/suggest/TestAnalyzedSuggestions.java (with props)
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java
lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
lucene/dev/trunk/solr/CHANGES.txt
lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/schema-phrasesuggest.xml
lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/solrconfig-phrasesuggest.xml
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1400565&r1=1400564&r2=1400565&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Sun Oct 21 03:31:29 2012
@@ -79,6 +79,10 @@ Bug Fixes
* LUCENE-4479: Highlighter works correctly for fields with term vector
positions, but no offsets. (Alan Woodward)
+* SOLR-3906: JapaneseReadingFormFilter in romaji mode will return
+ romaji even for out-of-vocabulary kana cases (e.g. half-width forms).
+ (Robert Muir)
+
Optimizations
* LUCENE-4443: BlockPostingsFormat no longer writes unnecessary offsets
Modified: lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java?rev=1400565&r1=1400564&r2=1400565&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java Sun Oct 21 03:31:29 2012
@@ -35,6 +35,7 @@ public final class JapaneseReadingFormFi
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
private final ReadingAttribute readingAttr = addAttribute(ReadingAttribute.class);
+ private StringBuilder buffer = new StringBuilder();
private boolean useRomaji;
public JapaneseReadingFormFilter(TokenStream input, boolean useRomaji) {
@@ -50,10 +51,19 @@ public final class JapaneseReadingFormFi
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
String reading = readingAttr.getReading();
- if (reading != null) {
- if (useRomaji) {
- ToStringUtil.getRomanization(termAttr.setEmpty(), reading);
+
+ if (useRomaji) {
+ if (reading == null) {
+ // if its an OOV term, just try the term text
+ buffer.setLength(0);
+ ToStringUtil.getRomanization(buffer, termAttr);
+ termAttr.setEmpty().append(buffer);
} else {
+ ToStringUtil.getRomanization(termAttr.setEmpty(), reading);
+ }
+ } else {
+ // just replace the term text with the reading, if it exists
+ if (reading != null) {
termAttr.setEmpty().append(reading);
}
}
Modified: lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java?rev=1400565&r1=1400564&r2=1400565&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java Sun Oct 21 03:31:29 2012
@@ -19,7 +19,9 @@ package org.apache.lucene.analysis.ja;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import java.io.IOException;
@@ -52,12 +54,40 @@ public class TestJapaneseReadingFormFilt
new String[] { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" }
);
}
+
+ public void testKatakanaReadingsHalfWidth() throws IOException {
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.Mode.SEARCH);
+ TokenStream stream = new CJKWidthFilter(tokenizer);
+ return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(stream, false));
+ }
+ };
+ assertAnalyzesTo(a, "今夜はﾛﾊﾞｰﾄ先生と話した",
+ new String[] { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" }
+ );
+ }
public void testRomajiReadings() throws IOException {
assertAnalyzesTo(romajiAnalyzer, "今夜はロバート先生と話した",
new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" }
);
}
+
+ public void testRomajiReadingsHalfWidth() throws IOException {
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.Mode.SEARCH);
+ TokenStream stream = new CJKWidthFilter(tokenizer);
+ return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(stream, true));
+ }
+ };
+ assertAnalyzesTo(a, "今夜はﾛﾊﾞｰﾄ先生と話した",
+ new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" }
+ );
+ }
public void testRandomData() throws IOException {
Random random = random();
Modified: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java?rev=1400565&r1=1400564&r2=1400565&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java (original)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java Sun Oct 21 03:31:29 2012
@@ -127,7 +127,7 @@ public class AnalyzingSuggester extends
private final boolean exactFirst;
/**
- * True if separator between tokens should be preservered.
+ * True if separator between tokens should be preserved.
*/
private final boolean preserveSep;
Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1400565&r1=1400564&r2=1400565&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Sun Oct 21 03:31:29 2012
@@ -45,6 +45,10 @@ New Features
* SOLR-3929: Support configuring IndexWriter max thread count in solrconfig.
(phunt via Mark Miller)
+* SOLR-3906: Add support for AnalyzingSuggester (LUCENE-3842), where the
+ underlying analyzed form used for suggestions is separate from the returned
+ text. (Robert Muir)
+
Optimizations
----------------------
Added: lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingLookupFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingLookupFactory.java?rev=1400565&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingLookupFactory.java (added)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingLookupFactory.java Sun Oct 21 03:31:29 2012
@@ -0,0 +1,118 @@
+package org.apache.solr.spelling.suggest.fst;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.search.suggest.Lookup;
+import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.schema.FieldType;
+import org.apache.solr.spelling.suggest.LookupFactory;
+
+/**
+ * Factory for {@link AnalyzingSuggester}
+ * @lucene.experimental
+ */
+public class AnalyzingLookupFactory extends LookupFactory {
+ /**
+ * If <code>true</code>, exact suggestions are returned first, even if they are prefixes
+ * of other strings in the automaton (possibly with larger weights).
+ */
+ public static final String EXACT_MATCH_FIRST = "exactMatchFirst";
+
+ /**
+ * If <code>true</code>, then a separator between tokens is preserved. This means that
+ * suggestions are sensitive to tokenization (e.g. baseball is different from base ball).
+ */
+ public static final String PRESERVE_SEP = "preserveSep";
+
+ /**
+ * When multiple suggestions collide to the same analyzed form, this is the limit of
+ * how many unique surface forms we keep.
+ */
+ public static final String MAX_SURFACE_FORMS = "maxSurfaceFormsPerAnalyzedForm";
+
+ /**
+ * When building the FST ("index-time"), we add each path through the tokenstream graph
+ * as an individual entry. This places an upper-bound on how many expansions will be added
+ * for a single suggestion.
+ */
+ public static final String MAX_EXPANSIONS = "maxGraphExpansions";
+
+ // confusingly: the queryAnalyzerFieldType parameter is something totally different, this
+ // is solr's "analysis" of the queries before they even reach the suggester (really makes
+ // little sense for suggest at all, only for spellcheck). So we pick different names.
+
+ /**
+ * The analyzer used at "query-time" and "build-time" to analyze suggestions.
+ */
+ public static final String QUERY_ANALYZER = "suggestAnalyzerFieldType";
+
+ /**
+ * File name for the automaton.
+ *
+ */
+ private static final String FILENAME = "wfsta.bin";
+
+ @Override
+ public Lookup create(NamedList params, SolrCore core) {
+ // mandatory parameter
+ Object fieldTypeName = params.get(QUERY_ANALYZER);
+ if (fieldTypeName == null) {
+ throw new IllegalArgumentException("Error in configuration: " + QUERY_ANALYZER + " parameter is mandatory");
+ }
+ FieldType ft = core.getSchema().getFieldTypeByName(fieldTypeName.toString());
+ Analyzer indexAnalyzer = ft.getAnalyzer();
+ Analyzer queryAnalyzer = ft.getQueryAnalyzer();
+
+ // optional parameters
+
+ boolean exactMatchFirst = params.get(EXACT_MATCH_FIRST) != null
+ ? Boolean.valueOf(params.get(EXACT_MATCH_FIRST).toString())
+ : true;
+
+ boolean preserveSep = params.get(PRESERVE_SEP) != null
+ ? Boolean.valueOf(params.get(PRESERVE_SEP).toString())
+ : true;
+
+ int flags = 0;
+ if (exactMatchFirst) {
+ flags |= AnalyzingSuggester.EXACT_FIRST;
+ }
+ if (preserveSep) {
+ flags |= AnalyzingSuggester.PRESERVE_SEP;
+ }
+
+ int maxSurfaceFormsPerAnalyzedForm = params.get(MAX_SURFACE_FORMS) != null
+ ? Integer.parseInt(params.get(MAX_SURFACE_FORMS).toString())
+ : 256;
+
+ int maxGraphExpansions = params.get(MAX_EXPANSIONS) != null
+ ? Integer.parseInt(params.get(MAX_EXPANSIONS).toString())
+ : -1;
+
+
+ return new AnalyzingSuggester(indexAnalyzer, queryAnalyzer, flags, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions);
+ }
+
+ @Override
+ public String storeFileName() {
+ return FILENAME;
+ }
+}
Added: lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/jasuggest.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/jasuggest.txt?rev=1400565&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/jasuggest.txt (added)
+++ lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/jasuggest.txt Sun Oct 21 03:31:29 2012
@@ -0,0 +1,5 @@
+# simple auto-suggest phrase dictionary for testing
+# note this uses tabs as separator!
+北海道	1.0
+今夜	3.0
+話した	6.0
\ No newline at end of file
Modified: lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/schema-phrasesuggest.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/schema-phrasesuggest.xml?rev=1400565&r1=1400564&r2=1400565&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/schema-phrasesuggest.xml (original)
+++ lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/schema-phrasesuggest.xml Sun Oct 21 03:31:29 2012
@@ -40,6 +40,14 @@
<filter class="solr.TrimFilterFactory"/>
</analyzer>
</fieldtype>
+
+ <fieldtype name="ja_suggest" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.JapaneseTokenizerFactory" mode="normal"/>
+ <filter class="solr.CJKWidthFilterFactory"/>
+ <filter class="solr.JapaneseReadingFormFilterFactory" useRomaji="true"/>
+ </analyzer>
+ </fieldtype>
</types>
<fields>
Modified: lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/solrconfig-phrasesuggest.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/solrconfig-phrasesuggest.xml?rev=1400565&r1=1400564&r2=1400565&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/solrconfig-phrasesuggest.xml (original)
+++ lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/solrconfig-phrasesuggest.xml Sun Oct 21 03:31:29 2012
@@ -43,6 +43,27 @@
<str name="queryAnalyzerFieldType">phrase_suggest</str>
</searchComponent>
+ <!-- AnalyzingLookup suggest component -->
+ <searchComponent class="solr.SpellCheckComponent" name="suggest_analyzing">
+ <lst name="spellchecker">
+ <str name="name">suggest_analyzing</str>
+ <str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
+ <str name="lookupImpl">org.apache.solr.spelling.suggest.fst.AnalyzingLookupFactory</str>
+ <str name="storeDir">suggest_analyzing</str>
+ <str name="buildOnCommit">false</str>
+
+ <!-- Suggester properties -->
+ <bool name="exactMatchFirst">true</bool>
+ <str name="suggestAnalyzerFieldType">ja_suggest</str>
+ <bool name="preserveSep">false</bool>
+
+ <str name="sourceLocation">jasuggest.txt</str>
+ </lst>
+
+ <!-- specify a fieldtype using keywordtokenizer + lowercase + cleanup -->
+ <str name="queryAnalyzerFieldType">phrase_suggest</str>
+ </searchComponent>
+
<!-- is this thing just configured globally or wtf is going on here?! -->
<queryConverter name="queryConverter" class="org.apache.solr.spelling.SuggestQueryConverter"/>
@@ -60,4 +81,18 @@
</arr>
</requestHandler>
+ <!-- analyzing (finite state automaton based) -->
+ <requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest_analyzing">
+ <lst name="defaults">
+ <str name="spellcheck">true</str>
+ <str name="spellcheck.dictionary">suggest_analyzing</str>
+ <str name="spellcheck.collate">false</str>
+ <!-- NOTE: if this is false, results are alpha-ordered, not by weight! -->
+ <str name="spellcheck.onlyMorePopular">true</str>
+ </lst>
+ <arr name="components">
+ <str>suggest_analyzing</str>
+ </arr>
+ </requestHandler>
+
</config>
Added: lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/suggest/TestAnalyzedSuggestions.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/suggest/TestAnalyzedSuggestions.java?rev=1400565&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/suggest/TestAnalyzedSuggestions.java (added)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/suggest/TestAnalyzedSuggestions.java Sun Oct 21 03:31:29 2012
@@ -0,0 +1,59 @@
+package org.apache.solr.spelling.suggest;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.params.SpellingParams;
+import org.junit.BeforeClass;
+
+public class TestAnalyzedSuggestions extends SolrTestCaseJ4 {
+ static final String URI = "/suggest_analyzing";
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ initCore("solrconfig-phrasesuggest.xml","schema-phrasesuggest.xml");
+ assertQ(req("qt", URI, "q", "", SpellingParams.SPELLCHECK_BUILD, "true"));
+ }
+
+ public void test() {
+ assertQ(req("qt", URI, "q", "hokk", SpellingParams.SPELLCHECK_COUNT, "1"),
+ "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='hokk']/int[@name='numFound'][.='1']",
+ "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='hokk']/arr[@name='suggestion']/str[1][.='北海道']"
+ );
+ assertQ(req("qt", URI, "q", "ほっk", SpellingParams.SPELLCHECK_COUNT, "1"),
+ "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ほっk']/int[@name='numFound'][.='1']",
+ "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ほっk']/arr[@name='suggestion']/str[1][.='北海道']"
+ );
+ assertQ(req("qt", URI, "q", "ホッk", SpellingParams.SPELLCHECK_COUNT, "1"),
+ "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ホッk']/int[@name='numFound'][.='1']",
+ "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ホッk']/arr[@name='suggestion']/str[1][.='北海道']"
+ );
+ assertQ(req("qt", URI, "q", "ﾎｯk", SpellingParams.SPELLCHECK_COUNT, "1"),
+ "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ﾎｯk']/int[@name='numFound'][.='1']",
+ "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ﾎｯk']/arr[@name='suggestion']/str[1][.='北海道']"
+ );
+ }
+
+ public void testMultiple() {
+ assertQ(req("qt", URI, "q", "h", SpellingParams.SPELLCHECK_COUNT, "2"),
+ "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='h']/int[@name='numFound'][.='2']",
+ "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='h']/arr[@name='suggestion']/str[1][.='話した']",
+ "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='h']/arr[@name='suggestion']/str[2][.='北海道']"
+ );
+ }
+}