You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2013/08/14 18:02:36 UTC

svn commit: r1513942 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/suggest/ lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/ lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/ lucene/test-framework/ lucene/t...

Author: mikemccand
Date: Wed Aug 14 16:02:35 2013
New Revision: 1513942

URL: http://svn.apache.org/r1513942
Log:
LUCENE-5165: add SuggestStopFilter

Added:
    lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/SuggestStopFilter.java
      - copied unchanged from r1513940, lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/SuggestStopFilter.java
    lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestSuggestStopFilter.java
      - copied unchanged from r1513940, lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestSuggestStopFilter.java
Removed:
    lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/ForkLastTokenFilter.java
    lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/StopKeywordFilter.java
Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/suggest/   (props changed)
    lucene/dev/branches/branch_4x/lucene/suggest/build.xml
    lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java
    lucene/dev/branches/branch_4x/lucene/test-framework/   (props changed)
    lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1513942&r1=1513941&r2=1513942&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Wed Aug 14 16:02:35 2013
@@ -40,6 +40,13 @@ New features
   FacetsAggregator.createOrdinalValueResolver. This gives better options for
   resolving an ordinal's value by FacetAggregators. (Shai Erera)
 
+* LUCENE-5165: Add SuggestStopFilter, to be used with analyzing
+  suggesters, so that a stop word at the very end of the lookup query,
+  and without any trailing token characters, will be preserved.  This
+  enables the query "a" to suggest "apple"; see
+  http://blog.mikemccandless.com/2013/08/suggeststopfilter-carefully-removes.html
+  for details.
+
 Bug Fixes
 
 * LUCENE-5116: IndexWriter.addIndexes(IndexReader...) should drop empty (or all

Modified: lucene/dev/branches/branch_4x/lucene/suggest/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/build.xml?rev=1513942&r1=1513941&r2=1513942&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/build.xml (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/build.xml Wed Aug 14 16:02:35 2013
@@ -34,6 +34,15 @@
     <path refid="base.classpath"/>
   </path>
 
+
+  <target name="javadocs" depends="javadocs-queries,compile-core">
+    <invoke-module-javadoc>
+      <links>
+        <link href="../analyzers-common"/>
+      </links>
+    </invoke-module-javadoc>
+  </target>
+
   <target name="compile-core" depends="jar-misc, jar-analyzers-common, common.compile-core" />
 
 </project>

Modified: lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java?rev=1513942&r1=1513941&r2=1513942&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java Wed Aug 14 16:02:35 2013
@@ -25,11 +25,8 @@ import java.util.Locale;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.search.BooleanClause;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.PrefixQuery;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.search.suggest.Lookup.LookupResult;
 import org.apache.lucene.search.suggest.TermFreqPayload;
 import org.apache.lucene.search.suggest.TermFreqPayloadArrayIterator;
@@ -294,64 +291,39 @@ public class AnalyzingInfixSuggesterTest
     suggester.close();
   }
 
-  public void testForkLastToken() throws Exception {
-    Analyzer a = new Analyzer() {
+  public void testSuggestStopFilter() throws Exception {
+    final CharArraySet stopWords = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "a");
+    Analyzer indexAnalyzer = new Analyzer() {
         @Override
         protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
           MockTokenizer tokens = new MockTokenizer(reader);
-          // ForkLastTokenFilter is a bit evil:
-          tokens.setEnableChecks(false);
           return new TokenStreamComponents(tokens,
-                                           new StopKeywordFilter(TEST_VERSION_CURRENT,
-                                                                 new ForkLastTokenFilter(tokens), StopKeywordFilter.makeStopSet(TEST_VERSION_CURRENT, "a")));
+                                           new StopFilter(TEST_VERSION_CURRENT, tokens, stopWords));
         }
       };
 
-    TermFreqPayload keys[] = new TermFreqPayload[] {
-      new TermFreqPayload("a bob for apples", 10, new BytesRef("foobaz")),
-    };
-
-    File tempDir = _TestUtil.getTempDir("AnalyzingInfixSuggesterTest");
-
-    AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, tempDir, a, a, 3) {
+    Analyzer queryAnalyzer = new Analyzer() {
         @Override
-        protected Query finishQuery(BooleanQuery in, boolean allTermsRequired) {
-          List<BooleanClause> clauses = in.clauses();
-          if (clauses.size() >= 2 && allTermsRequired) {
-            String t1 = getTerm(clauses.get(clauses.size()-2).getQuery());
-            String t2 = getTerm(clauses.get(clauses.size()-1).getQuery());
-            if (t1.equals(t2)) {
-              // The last 2 tokens came from
-              // ForkLastTokenFilter; we remove them and
-              // replace them with a MUST BooleanQuery that
-              // SHOULDs the two of them together:
-              BooleanQuery sub = new BooleanQuery();
-              BooleanClause other = clauses.get(clauses.size()-2);
-              sub.add(new BooleanClause(clauses.get(clauses.size()-2).getQuery(), BooleanClause.Occur.SHOULD));
-              sub.add(new BooleanClause(clauses.get(clauses.size()-1).getQuery(), BooleanClause.Occur.SHOULD));
-              clauses.subList(clauses.size()-2, clauses.size()).clear();
-              clauses.add(new BooleanClause(sub, BooleanClause.Occur.MUST));
-            }
-          }
-          return in;
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          MockTokenizer tokens = new MockTokenizer(reader);
+          return new TokenStreamComponents(tokens,
+                                           new SuggestStopFilter(tokens, stopWords));
         }
+      };
 
-        private String getTerm(Query query) {
-          if (query instanceof TermQuery) {
-            return ((TermQuery) query).getTerm().text();
-          } else if (query instanceof PrefixQuery) {
-            return ((PrefixQuery) query).getPrefix().text();
-          } else {
-            return null;
-          }
-        }
+    File tempDir = _TestUtil.getTempDir("AnalyzingInfixSuggesterTest");
 
+    AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(TEST_VERSION_CURRENT, tempDir, indexAnalyzer, queryAnalyzer, 3) {
         @Override
         protected Directory getDirectory(File path) {
           return newDirectory();
         }
       };
 
+    TermFreqPayload keys[] = new TermFreqPayload[] {
+      new TermFreqPayload("a bob for apples", 10, new BytesRef("foobaz")),
+    };
+
     suggester.build(new TermFreqPayloadArrayIterator(keys));
     List<LookupResult> results = suggester.lookup(_TestUtil.stringToCharSequence("a", random()), 10, true, true);
     assertEquals(1, results.size());

Modified: lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1513942&r1=1513941&r2=1513942&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Wed Aug 14 16:02:35 2013
@@ -111,7 +111,8 @@ public abstract class BaseTokenStreamTes
   //     arriving to pos Y have the same endOffset)
   //   - offsets only move forwards (startOffset >=
   //     lastStartOffset)
-  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset,
+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[],
+                                               int posLengths[], Integer finalOffset, boolean[] keywordAtts,
                                                boolean offsetsAreCorrect) throws IOException {
     assertNotNull(output);
     CheckClearAttributesAttribute checkClearAtt = ts.addAttribute(CheckClearAttributesAttribute.class);
@@ -145,6 +146,12 @@ public abstract class BaseTokenStreamTes
       assertTrue("has no PositionLengthAttribute", ts.hasAttribute(PositionLengthAttribute.class));
       posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
     }
+
+    KeywordAttribute keywordAtt = null;
+    if (keywordAtts != null) {
+      assertTrue("has no KeywordAttribute", ts.hasAttribute(KeywordAttribute.class));
+      keywordAtt = ts.getAttribute(KeywordAttribute.class);
+    }
     
     // Maps position to the start/end offset:
     final Map<Integer,Integer> posToStartOffset = new HashMap<Integer,Integer>();
@@ -161,22 +168,31 @@ public abstract class BaseTokenStreamTes
       if (typeAtt != null) typeAtt.setType("bogusType");
       if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
       if (posLengthAtt != null) posLengthAtt.setPositionLength(45987653);
+      if (keywordAtt != null) keywordAtt.setKeyword((i&1) == 0);
       
       checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before
       assertTrue("token "+i+" does not exist", ts.incrementToken());
       assertTrue("clearAttributes() was not called correctly in TokenStream chain", checkClearAtt.getAndResetClearCalled());
       
       assertEquals("term "+i, output[i], termAtt.toString());
-      if (startOffsets != null)
+      if (startOffsets != null) {
         assertEquals("startOffset "+i, startOffsets[i], offsetAtt.startOffset());
-      if (endOffsets != null)
+      }
+      if (endOffsets != null) {
         assertEquals("endOffset "+i, endOffsets[i], offsetAtt.endOffset());
-      if (types != null)
+      }
+      if (types != null) {
         assertEquals("type "+i, types[i], typeAtt.type());
-      if (posIncrements != null)
+      }
+      if (posIncrements != null) {
         assertEquals("posIncrement "+i, posIncrements[i], posIncrAtt.getPositionIncrement());
-      if (posLengths != null)
+      }
+      if (posLengths != null) {
         assertEquals("posLength "+i, posLengths[i], posLengthAtt.getPositionLength());
+      }
+      if (keywordAtts != null) {
+        assertEquals("keywordAtt " + i, keywordAtts[i], keywordAtt.isKeyword());
+      }
       
       // we can enforce some basic things about a few attributes even if the caller doesn't check:
       if (offsetAtt != null) {
@@ -239,7 +255,9 @@ public abstract class BaseTokenStreamTes
         assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
       }
     }
-    assertFalse("TokenStream has more tokens than expected (expected count=" + output.length + ")", ts.incrementToken());
+    if (ts.incrementToken()) {
+      fail("TokenStream has more tokens than expected (expected count=" + output.length + "); extra token=" + termAtt.toString());
+    }
     ts.end();
     if (finalOffset != null) {
       assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
@@ -250,6 +268,10 @@ public abstract class BaseTokenStreamTes
     ts.close();
   }
   
+  public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset, boolean offsetsAreCorrect) throws IOException {
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, null, offsetsAreCorrect);
+  }
+
   public static void assertTokenStreamContents(TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], int posLengths[], Integer finalOffset) throws IOException {
     assertTokenStreamContents(ts, output, startOffsets, endOffsets, types, posIncrements, posLengths, finalOffset, true);
   }