You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2013/10/02 17:30:29 UTC

svn commit: r1528521 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/core/ lucene/core/src/java/org/apache/lucene/util/ lucene/core/src/java/org/apache/lucene/util/fst/ lucene/suggest/ lucene/suggest/src/java/org/apache/lucene/search/suggest/ana...

Author: mikemccand
Date: Wed Oct  2 15:30:29 2013
New Revision: 1528521

URL: http://svn.apache.org/r1528521
Log:
LUCENE-5214: add FreeTextSuggester

Added:
    lucene/dev/branches/branch_4x/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java
      - copied unchanged from r1528517, lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java
    lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestFreeTextSuggester.java
      - copied, changed from r1528517, lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestFreeTextSuggester.java
Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/core/   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
    lucene/dev/branches/branch_4x/lucene/suggest/   (props changed)
    lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1528521&r1=1528520&r2=1528521&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Wed Oct  2 15:30:29 2013
@@ -33,6 +33,11 @@ New Features
   on best effort which was not user-friendly.
   (Uwe Schindler, Robert Muir)
 
+* LUCENE-5214: Add new FreeTextSuggester, to predict the next word
+  using a simple ngram language model.  This is useful for the "long
+  tail" suggestions, when a primary suggester fails to find a
+  suggestion.  (Mike McCandless)
+
 Bug Fixes
 
 * LUCENE-4998: Fixed a few places to pass IOContext.READONCE instead

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java?rev=1528521&r1=1528520&r2=1528521&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java Wed Oct  2 15:30:29 2013
@@ -383,7 +383,7 @@ public final class BytesRefHash {
     return ids[findHash(bytes, code)];
   }
   
-  private final int findHash(BytesRef bytes, int code) {
+  private int findHash(BytesRef bytes, int code) {
     assert bytesStart != null : "bytesStart is null - not initialized";
     // final position
     int hashPos = code & hashMask;
@@ -578,7 +578,7 @@ public final class BytesRefHash {
   }
 
   /** A simple {@link BytesStartArray} that tracks
-   *  memory allocation using a private {@link AtomicLong}
+   *  memory allocation using a private {@link Counter}
    *  instance.  */
   public static class DirectBytesStartArray extends BytesStartArray {
     // TODO: can't we just merge this w/

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java?rev=1528521&r1=1528520&r2=1528521&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/Util.java Wed Oct  2 15:30:29 2013
@@ -238,11 +238,16 @@ public final class Util {
     }    
   }
 
-  private static class FSTPath<T> {
+  /** Represents a path in TopNSearcher.
+   *
+   *  @lucene.experimental
+   */
+  public static class FSTPath<T> {
     public FST.Arc<T> arc;
     public T cost;
     public final IntsRef input;
 
+    /** Sole constructor */
     public FSTPath(T cost, FST.Arc<T> arc, IntsRef input) {
       this.arc = new FST.Arc<T>().copyFrom(arc);
       this.cost = cost;
@@ -300,7 +305,7 @@ public final class Util {
     }
 
     // If back plus this arc is competitive then add to queue:
-    private void addIfCompetitive(FSTPath<T> path) {
+    protected void addIfCompetitive(FSTPath<T> path) {
 
       assert queue != null;
 
@@ -399,6 +404,7 @@ public final class Util {
 
         if (queue == null) {
           // Ran out of paths
+          //System.out.println("  break queue=null");
           break;
         }
 
@@ -408,6 +414,7 @@ public final class Util {
 
         if (path == null) {
           // There were less than topN paths available:
+          //System.out.println("  break no more paths");
           break;
         }
 
@@ -478,6 +485,7 @@ public final class Util {
             //System.out.println("    done!: " + path);
             T finalOutput = fst.outputs.add(path.cost, path.arc.output);
             if (acceptResult(path.input, finalOutput)) {
+              //System.out.println("    add result: " + path);
               results.add(new MinResult<T>(path.input, finalOutput));
             } else {
               rejectCount++;
@@ -761,11 +769,12 @@ public final class Util {
    * Ensures an arc's label is indeed printable (dot uses US-ASCII). 
    */
   private static String printableLabel(int label) {
-    if (label >= 0x20 && label <= 0x7d) {
+    // Any ordinary ascii character, except for " or \, are
+    // printed as the character; else, as a hex string:
+    if (label >= 0x20 && label <= 0x7d && label != 0x22 && label != 0x5c) {  // " OR \
       return Character.toString((char) label);
-    } else {
-      return "0x" + Integer.toHexString(label);
     }
+    return "0x" + Integer.toHexString(label);
   }
 
   /** Just maps each UTF16 unit (char) to the ints in an

Copied: lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestFreeTextSuggester.java (from r1528517, lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestFreeTextSuggester.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestFreeTextSuggester.java?p2=lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestFreeTextSuggester.java&p1=lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestFreeTextSuggester.java&r1=1528517&r2=1528521&rev=1528521&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestFreeTextSuggester.java (original)
+++ lucene/dev/branches/branch_4x/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestFreeTextSuggester.java Wed Oct  2 15:30:29 2013
@@ -146,6 +146,11 @@ public class TestFreeTextSuggester exten
         }
 
         @Override
+        public Comparator<BytesRef> getComparator() {
+          return null;
+        }
+
+        @Override
         public BytesRef next() {
           Document doc;
           try {
@@ -324,6 +329,11 @@ public class TestFreeTextSuggester exten
         int upto;
 
         @Override
+        public Comparator<BytesRef> getComparator() {
+          return null;
+        }
+
+        @Override
         public BytesRef next() {
           if (upto == docs.length) {
             return null;

Modified: lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java?rev=1528521&r1=1528520&r2=1528521&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java (original)
+++ lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/util/_TestUtil.java Wed Oct  2 15:30:29 2013
@@ -282,7 +282,11 @@ public class _TestUtil {
   }
 
   public static String randomSimpleString(Random r, int maxLength) {
-    final int end = nextInt(r, 0, maxLength);
+    return randomSimpleString(r, 0, maxLength);
+  }
+  
+  public static String randomSimpleString(Random r, int minLength, int maxLength) {
+    final int end = nextInt(r, minLength, maxLength);
     if (end == 0) {
       // allow 0 length
       return "";