You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/02/20 16:48:39 UTC
svn commit: r1291322 - in /lucene/dev/trunk/solr: ./
core/src/java/org/apache/solr/spelling/ core/src/test-files/solr/conf/
core/src/test/org/apache/solr/spelling/
core/src/test/org/apache/solr/spelling/suggest/
Author: rmuir
Date: Mon Feb 20 15:48:38 2012
New Revision: 1291322
URL: http://svn.apache.org/viewvc?rev=1291322&view=rev
Log:
SOLR-3143: add SuggestQueryConverter for autosuggesters
Added:
lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/SuggestQueryConverter.java (with props)
lucene/dev/trunk/solr/core/src/test-files/solr/conf/phrasesuggest.txt (with props)
lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-phrasesuggest.xml (with props)
lucene/dev/trunk/solr/core/src/test-files/solr/conf/solrconfig-phrasesuggest.xml (with props)
lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/TestSuggestSpellingConverter.java (with props)
lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/suggest/TestPhraseSuggestions.java (with props)
Modified:
lucene/dev/trunk/solr/CHANGES.txt
lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/SpellingQueryConverter.java
Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1291322&r1=1291321&r2=1291322&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Mon Feb 20 15:48:38 2012
@@ -506,6 +506,9 @@ New Features
* LUCENE-3714: Add WFSTLookupFactory, a suggester that uses a weighted FST
for more fine-grained suggestions. (Mike McCandless, Dawid Weiss, Robert Muir)
+* SOLR-3143: Add SuggestQueryConverter, a QueryConverter intended for
+ auto-suggesters. (Robert Muir)
+
Optimizations
----------------------
* SOLR-1931: Speedup for LukeRequestHandler and admin/schema browser. New parameter
Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/SpellingQueryConverter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/SpellingQueryConverter.java?rev=1291322&r1=1291321&r2=1291322&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/SpellingQueryConverter.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/SpellingQueryConverter.java Mon Feb 20 15:48:38 2012
@@ -18,6 +18,7 @@
package org.apache.solr.spelling;
import java.io.IOException;
+import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
@@ -100,39 +101,42 @@ public class SpellingQueryConverter exte
Collection<Token> result = new ArrayList<Token>();
//TODO: Extract the words using a simple regex, but not query stuff, and then analyze them to produce the token stream
Matcher matcher = QUERY_REGEX.matcher(original);
- TokenStream stream;
while (matcher.find()) {
String word = matcher.group(0);
if (word.equals("AND") == false && word.equals("OR") == false) {
try {
- stream = analyzer.tokenStream("", new StringReader(word));
- // TODO: support custom attributes
- CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
- FlagsAttribute flagsAtt = stream.addAttribute(FlagsAttribute.class);
- TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
- PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
- PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
- OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
- stream.reset();
- while (stream.incrementToken()) {
- Token token = new Token();
- token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
- token.setStartOffset(matcher.start() + offsetAtt.startOffset());
- token.setEndOffset(matcher.start() + offsetAtt.endOffset());
- token.setFlags(flagsAtt.getFlags());
- token.setType(typeAtt.type());
- token.setPayload(payloadAtt.getPayload());
- token.setPositionIncrement(posIncAtt.getPositionIncrement());
- result.add(token);
- }
- stream.end();
- stream.close();
+ analyze(result, new StringReader(word), matcher.start());
} catch (IOException e) {
+ // TODO: shouldn't we log something?
}
}
}
return result;
}
-
+
+ protected void analyze(Collection<Token> result, Reader text, int offset) throws IOException {
+ TokenStream stream = analyzer.tokenStream("", text);
+ // TODO: support custom attributes
+ CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
+ FlagsAttribute flagsAtt = stream.addAttribute(FlagsAttribute.class);
+ TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
+ PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
+ PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
+ OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
+ stream.reset();
+ while (stream.incrementToken()) {
+ Token token = new Token();
+ token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
+ token.setStartOffset(offset + offsetAtt.startOffset());
+ token.setEndOffset(offset + offsetAtt.endOffset());
+ token.setFlags(flagsAtt.getFlags());
+ token.setType(typeAtt.type());
+ token.setPayload(payloadAtt.getPayload());
+ token.setPositionIncrement(posIncAtt.getPositionIncrement());
+ result.add(token);
+ }
+ stream.end();
+ stream.close();
+ }
}
Added: lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/SuggestQueryConverter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/SuggestQueryConverter.java?rev=1291322&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/SuggestQueryConverter.java (added)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/SuggestQueryConverter.java Mon Feb 20 15:48:38 2012
@@ -0,0 +1,47 @@
+package org.apache.solr.spelling;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+
+import org.apache.lucene.analysis.Token;
+
+/**
+ * Passes the entire query string to the configured analyzer as-is.
+ **/
+public class SuggestQueryConverter extends SpellingQueryConverter {
+
+ @Override
+ public Collection<Token> convert(String original) {
+ if (original == null) { // this can happen with q.alt = and no query
+ return Collections.emptyList();
+ }
+
+ Collection<Token> result = new ArrayList<Token>();
+ try {
+ analyze(result, new StringReader(original), 0);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ return result;
+ }
+}
Added: lucene/dev/trunk/solr/core/src/test-files/solr/conf/phrasesuggest.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test-files/solr/conf/phrasesuggest.txt?rev=1291322&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test-files/solr/conf/phrasesuggest.txt (added)
+++ lucene/dev/trunk/solr/core/src/test-files/solr/conf/phrasesuggest.txt Mon Feb 20 15:48:38 2012
@@ -0,0 +1,8 @@
+# simple auto-suggest phrase dictionary for testing
+# note this uses tabs as separator!
+the first phrase	1.0
+the second phrase	2.0
+testing 1234	3.0
+foo	5.0
+the fifth phrase	2.0
+the final phrase	4.0
Added: lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-phrasesuggest.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-phrasesuggest.xml?rev=1291322&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-phrasesuggest.xml (added)
+++ lucene/dev/trunk/solr/core/src/test-files/solr/conf/schema-phrasesuggest.xml Mon Feb 20 15:48:38 2012
@@ -0,0 +1,52 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Test schema file for phrase suggestions -->
+
+<schema name="test" version="1.0">
+ <types>
+ <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+
+ <!-- basic text field -->
+ <fieldtype name="text" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+
+ <fieldtype name="phrase_suggest" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.KeywordTokenizerFactory"/>
+ <filter class="solr.PatternReplaceFilterFactory"
+ pattern="([^\p{L}\p{M}\p{N}\p{Cs}]*[\p{L}\p{M}\p{N}\p{Cs}\_]+:)|([^\p{L}\p{M}\p{N}\p{Cs}])+"
+ replacement=" " replace="all"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.TrimFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+ </types>
+
+ <fields>
+ <field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
+ <field name="text" type="text" indexed="true" stored="false"/>
+ </fields>
+
+ <defaultSearchField>text</defaultSearchField>
+ <uniqueKey>id</uniqueKey>
+</schema>
Added: lucene/dev/trunk/solr/core/src/test-files/solr/conf/solrconfig-phrasesuggest.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test-files/solr/conf/solrconfig-phrasesuggest.xml?rev=1291322&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test-files/solr/conf/solrconfig-phrasesuggest.xml (added)
+++ lucene/dev/trunk/solr/core/src/test-files/solr/conf/solrconfig-phrasesuggest.xml Mon Feb 20 15:48:38 2012
@@ -0,0 +1,63 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- solrconfig.xml for a WFST phrase suggester -->
+<config>
+ <luceneMatchVersion>${tests.luceneMatchVersion:LUCENE_CURRENT}</luceneMatchVersion>
+ <dataDir>${solr.data.dir:}</dataDir>
+ <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
+ <requestHandler name="standard" class="solr.StandardRequestHandler"></requestHandler>
+
+ <!-- WFSTLookup suggest component -->
+ <searchComponent class="solr.SpellCheckComponent" name="suggest_wfst">
+ <lst name="spellchecker">
+ <str name="name">suggest_wfst</str>
+ <str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
+ <str name="lookupImpl">org.apache.solr.spelling.suggest.fst.WFSTLookupFactory</str>
+ <str name="storeDir">suggest_wfst</str>
+ <str name="buildOnCommit">false</str>
+
+ <!-- Suggester properties -->
+ <bool name="exactMatchFirst">true</bool>
+
+ <str name="sourceLocation">phrasesuggest.txt</str>
+ </lst>
+
+ <!-- specify a fieldtype using keywordtokenizer + lowercase + cleanup -->
+ <str name="queryAnalyzerFieldType">phrase_suggest</str>
+ </searchComponent>
+
+ <!-- is this thing just configured globally or wtf is going on here?! -->
+ <queryConverter name="queryConverter" class="org.apache.solr.spelling.SuggestQueryConverter"/>
+
+ <!-- wfst (finite state automaton based) -->
+ <requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest_wfst">
+ <lst name="defaults">
+ <str name="spellcheck">true</str>
+ <str name="spellcheck.dictionary">suggest_wfst</str>
+ <str name="spellcheck.collate">false</str>
+ <!-- NOTE: if this is false, results are alpha-ordered, not by weight! -->
+ <str name="spellcheck.onlyMorePopular">true</str>
+ </lst>
+ <arr name="components">
+ <str>suggest_wfst</str>
+ </arr>
+ </requestHandler>
+
+</config>
Added: lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/TestSuggestSpellingConverter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/TestSuggestSpellingConverter.java?rev=1291322&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/TestSuggestSpellingConverter.java (added)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/TestSuggestSpellingConverter.java Mon Feb 20 15:48:38 2012
@@ -0,0 +1,73 @@
+package org.apache.solr.spelling;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Collection;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.miscellaneous.TrimFilter;
+import org.apache.lucene.analysis.pattern.PatternReplaceFilter;
+
+public class TestSuggestSpellingConverter extends BaseTokenStreamTestCase {
+ SuggestQueryConverter converter = new SuggestQueryConverter();
+
+ public void testSimple() throws Exception {
+ // lowercases only!
+ converter.setAnalyzer(new MockAnalyzer(random, MockTokenizer.KEYWORD, true));
+ assertConvertsTo("This is a test", new String[] { "this is a test" });
+ }
+
+ public void testComplicated() throws Exception {
+ // lowercases, removes field names, other syntax, collapses runs of whitespace, etc.
+ converter.setAnalyzer(new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new KeywordTokenizer(reader);
+ TokenStream filter = new PatternReplaceFilter(tokenizer,
+ Pattern.compile("([^\\p{L}\\p{M}\\p{N}\\p{Cs}]*[\\p{L}\\p{M}\\p{N}\\p{Cs}\\_]+:)|([^\\p{L}\\p{M}\\p{N}\\p{Cs}])+"), " ", true);
+ filter = new LowerCaseFilter(TEST_VERSION_CURRENT, filter);
+ filter = new TrimFilter(filter, false);
+ return new TokenStreamComponents(tokenizer, filter);
+ }
+ });
+ assertConvertsTo("test1 +test2", new String[] { "test1 test2" });
+ assertConvertsTo("test~", new String[] { "test" });
+ assertConvertsTo("field:test", new String[] { "test" });
+ assertConvertsTo("This is a test", new String[] { "this is a test" });
+ assertConvertsTo(" This is a test", new String[] { "this is a test" });
+ assertConvertsTo("Foo (field:bar) text_hi:हिन्दी ", new String[] { "foo bar हिन्दी" });
+ }
+
+ public void assertConvertsTo(String text, String expected[]) throws IOException {
+ Collection<Token> tokens = converter.convert(text);
+ TokenStream ts = new CannedTokenStream(tokens.toArray(new Token[0]));
+ assertTokenStreamContents(ts, expected);
+ }
+}
Added: lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/suggest/TestPhraseSuggestions.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/suggest/TestPhraseSuggestions.java?rev=1291322&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/suggest/TestPhraseSuggestions.java (added)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/suggest/TestPhraseSuggestions.java Mon Feb 20 15:48:38 2012
@@ -0,0 +1,46 @@
+package org.apache.solr.spelling.suggest;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.params.SpellingParams;
+import org.junit.BeforeClass;
+
+public class TestPhraseSuggestions extends SolrTestCaseJ4 {
+ static final String URI = "/suggest_wfst";
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ initCore("solrconfig-phrasesuggest.xml","schema-phrasesuggest.xml");
+ assertQ(req("qt", URI, "q", "", SpellingParams.SPELLCHECK_BUILD, "true"));
+ }
+
+ public void test() {
+ assertQ(req("qt", URI, "q", "the f", SpellingParams.SPELLCHECK_COUNT, "4"),
+ "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='the f']/int[@name='numFound'][.='3']",
+ "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='the f']/arr[@name='suggestion']/str[1][.='the final phrase']",
+ "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='the f']/arr[@name='suggestion']/str[2][.='the fifth phrase']",
+ "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='the f']/arr[@name='suggestion']/str[3][.='the first phrase']"
+ );
+
+ assertQ(req("qt", URI, "q", "Testing +12", SpellingParams.SPELLCHECK_COUNT, "4"),
+ "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='testing 12']/int[@name='numFound'][.='1']",
+ "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='testing 12']/arr[@name='suggestion']/str[1][.='testing 1234']"
+ );
+ }
+}