You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/01/16 00:42:03 UTC

svn commit: r1231796 - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/ lucene/src/java/org/apache/lucene/util/fst/ lucene/src/test/org/apache/lucene/index/ lucene/src/test/org/a...

Author: mikemccand
Date: Sun Jan 15 23:42:02 2012
New Revision: 1231796

URL: http://svn.apache.org/viewvc?rev=1231796&view=rev
Log:
LUCENE-3695: move some confusing FST sugar out

Modified:
    lucene/dev/branches/branch_3x/   (props changed)
    lucene/dev/branches/branch_3x/lucene/   (props changed)
    lucene/dev/branches/branch_3x/lucene/CHANGES.txt
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Builder.java
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Util.java
    lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriterReader.java
    lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java

Modified: lucene/dev/branches/branch_3x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/CHANGES.txt?rev=1231796&r1=1231795&r2=1231796&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/CHANGES.txt Sun Jan 15 23:42:02 2012
@@ -47,6 +47,9 @@ Changes in backwards compatibility polic
   has been removed and replaced with the experimental getFieldInfos
   API.  All IndexReader subclasses must implement getFieldInfos.
   (Mike McCandless)
+
+* LUCENE-3695: Move confusing add(X) methods out of FST.Builder into
+  FST.Util.  (Robert Muir, Mike McCandless)
   
 Security fixes
 

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java?rev=1231796&r1=1231795&r2=1231796&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java Sun Jan 15 23:42:02 2012
@@ -33,9 +33,11 @@ import org.apache.lucene.store.ByteArray
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefHash;
 import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.fst.ByteSequenceOutputs;
 import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.Util;
 
 /**
  * A map of synonyms, keys and values are phrases.
@@ -263,6 +265,8 @@ public class SynonymMap {
       CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]);
       Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());
       
+      final IntsRef scratchIntsRef = new IntsRef();
+
       //System.out.println("fmap.build");
       for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) {
         CharsRef input = sortedKeys[keyIdx];
@@ -307,7 +311,7 @@ public class SynonymMap {
         
         scratch.length = scratchOutput.getPosition() - scratch.offset;
         //System.out.println("  add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
-        builder.add(input, BytesRef.deepCopyOf(scratch));
+        builder.add(Util.toUTF32(input, scratchIntsRef), BytesRef.deepCopyOf(scratch));
       }
       
       FST<BytesRef> fst = builder.finish();

Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Builder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Builder.java?rev=1231796&r1=1231795&r2=1231796&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Builder.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Builder.java Sun Jan 15 23:42:02 2012
@@ -19,7 +19,6 @@ package org.apache.lucene.util.fst;
 
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.RamUsageEstimator;
-import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc
 
@@ -272,54 +271,6 @@ public class Builder<T> {
     }
   }
 
-  private final IntsRef scratchIntsRef = new IntsRef(10);
-
-  public void add(BytesRef input, T output) throws IOException {
-    assert fst.getInputType() == FST.INPUT_TYPE.BYTE1;
-    scratchIntsRef.grow(input.length);
-    for(int i=0;i<input.length;i++) {
-      scratchIntsRef.ints[i] = input.bytes[i+input.offset] & 0xFF;
-    }
-    scratchIntsRef.length = input.length;
-    add(scratchIntsRef, output);
-  }
-
-  /** Sugar: adds the UTF32 codepoints from char[] slice.  FST
-   *  must be FST.INPUT_TYPE.BYTE4! */
-  public void add(char[] s, int offset, int length, T output) throws IOException {
-    assert fst.getInputType() == FST.INPUT_TYPE.BYTE4;
-    int charIdx = offset;
-    int intIdx = 0;
-    final int charLimit = offset + length;
-    while(charIdx < charLimit) {
-      scratchIntsRef.grow(intIdx+1);
-      final int utf32 = Character.codePointAt(s, charIdx);
-      scratchIntsRef.ints[intIdx] = utf32;
-      charIdx += Character.charCount(utf32);
-      intIdx++;
-    }
-    scratchIntsRef.length = intIdx;
-    add(scratchIntsRef, output);
-  }
-
-  /** Sugar: adds the UTF32 codepoints from CharSequence.  FST
-   *  must be FST.INPUT_TYPE.BYTE4! */
-  public void add(CharSequence s, T output) throws IOException {
-    assert fst.getInputType() == FST.INPUT_TYPE.BYTE4;
-    int charIdx = 0;
-    int intIdx = 0;
-    final int charLimit = s.length();
-    while(charIdx < charLimit) {
-      scratchIntsRef.grow(intIdx+1);
-      final int utf32 = Character.codePointAt(s, charIdx);
-      scratchIntsRef.ints[intIdx] = utf32;
-      charIdx += Character.charCount(utf32);
-      intIdx++;
-    }
-    scratchIntsRef.length = intIdx;
-    add(scratchIntsRef, output);
-  }
-
   /** It's OK to add the same input twice in a row with
    *  different outputs, as long as outputs impls the merge
    *  method. */

Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Util.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Util.java?rev=1231796&r1=1231795&r2=1231796&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Util.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/Util.java Sun Jan 15 23:42:02 2012
@@ -31,10 +31,8 @@ public final class Util {
   }
 
   /** Looks up the output for this input, or null if the
-   *  input is not accepted. FST must be
-   *  INPUT_TYPE.BYTE4. */
+   *  input is not accepted. */
   public static<T> T get(FST<T> fst, IntsRef input) throws IOException {
-    assert fst.inputType == FST.INPUT_TYPE.BYTE4;
 
     // TODO: would be nice not to alloc this on every lookup
     final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
@@ -59,77 +57,7 @@ public final class Util {
     }
   }
 
-  /** Logically casts input to UTF32 ints then looks up the output
-   *  or null if the input is not accepted.  FST must be
-   *  INPUT_TYPE.BYTE4.  */
-  public static<T> T get(FST<T> fst, char[] input, int offset, int length) throws IOException {
-    assert fst.inputType == FST.INPUT_TYPE.BYTE4;
-
-    // TODO: would be nice not to alloc this on every lookup
-    final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
-
-    int charIdx = offset;
-    final int charLimit = offset + length;
-
-    // Accumulate output as we go
-    final T NO_OUTPUT = fst.outputs.getNoOutput();
-    T output = NO_OUTPUT;
-    while(charIdx < charLimit) {
-      final int utf32 = Character.codePointAt(input, charIdx);
-      charIdx += Character.charCount(utf32);
-
-      if (fst.findTargetArc(utf32, arc, arc) == null) {
-        return null;
-      } else if (arc.output != NO_OUTPUT) {
-        output = fst.outputs.add(output, arc.output);
-      }
-    }
-
-    if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
-      return null;
-    } else if (arc.output != NO_OUTPUT) {
-      return fst.outputs.add(output, arc.output);
-    } else {
-      return output;
-    }
-  }
-
-
-  /** Logically casts input to UTF32 ints then looks up the output
-   *  or null if the input is not accepted.  FST must be
-   *  INPUT_TYPE.BYTE4.  */
-  public static<T> T get(FST<T> fst, CharSequence input) throws IOException {
-    assert fst.inputType == FST.INPUT_TYPE.BYTE4;
-    
-    // TODO: would be nice not to alloc this on every lookup
-    final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
-
-    int charIdx = 0;
-    final int charLimit = input.length();
-
-    // Accumulate output as we go
-    final T NO_OUTPUT = fst.outputs.getNoOutput();
-    T output = NO_OUTPUT;
-
-    while(charIdx < charLimit) {
-      final int utf32 = Character.codePointAt(input, charIdx);
-      charIdx += Character.charCount(utf32);
-
-      if (fst.findTargetArc(utf32, arc, arc) == null) {
-        return null;
-      } else if (arc.output != NO_OUTPUT) {
-        output = fst.outputs.add(output, arc.output);
-      }
-    }
-
-    if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
-      return null;
-    } else if (arc.output != NO_OUTPUT) {
-      return fst.outputs.add(output, arc.output);
-    } else {
-      return output;
-    }
-  }
+  // TODO: maybe a CharsRef version for BYTE2
 
   /** Looks up the output for this input, or null if the
    *  input is not accepted */
@@ -381,4 +309,51 @@ public final class Util {
       return "0x" + Integer.toHexString(label);
     }
   }
+
+  /** Decodes the Unicode codepoints from the provided
+   *  CharSequence and places them in the provided scratch
+   *  IntsRef, which must not be null, returning it. */
+  public static IntsRef toUTF32(CharSequence s, IntsRef scratch) {
+    int charIdx = 0;
+    int intIdx = 0;
+    final int charLimit = s.length();
+    while(charIdx < charLimit) {
+      scratch.grow(intIdx+1);
+      final int utf32 = Character.codePointAt(s, charIdx);
+      scratch.ints[intIdx] = utf32;
+      charIdx += Character.charCount(utf32);
+      intIdx++;
+    }
+    scratch.length = intIdx;
+    return scratch;
+  }
+
+  /** Decodes the Unicode codepoints from the provided
+   *  char[] and places them in the provided scratch
+   *  IntsRef, which must not be null, returning it. */
+  public static IntsRef toUTF32(char[] s, int offset, int length, IntsRef scratch) {
+    int charIdx = offset;
+    int intIdx = 0;
+    final int charLimit = offset + length;
+    while(charIdx < charLimit) {
+      scratch.grow(intIdx+1);
+      final int utf32 = Character.codePointAt(s, charIdx);
+      scratch.ints[intIdx] = utf32;
+      charIdx += Character.charCount(utf32);
+      intIdx++;
+    }
+    scratch.length = intIdx;
+    return scratch;
+  }
+
+  /** Just takes unsigned byte values from the BytesRef and
+   *  converts into an IntsRef. */
+  public static IntsRef toIntsRef(BytesRef input, IntsRef scratch) {
+    scratch.grow(input.length);
+    for(int i=0;i<input.length;i++) {
+      scratch.ints[i] = input.bytes[i+input.offset] & 0xFF;
+    }
+    scratch.length = input.length;
+    return scratch;
+  }
 }

Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriterReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriterReader.java?rev=1231796&r1=1231795&r2=1231796&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriterReader.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/index/TestIndexWriterReader.java Sun Jan 15 23:42:02 2012
@@ -19,29 +19,30 @@ package org.apache.lucene.index;
 import java.io.IOException;
 import java.io.PrintStream;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.List;
 import java.util.Random;
 import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
 
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
 import org.apache.lucene.document.Field.Index;
 import org.apache.lucene.document.Field.Store;
-import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.document.Field;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.AlreadyClosedException;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.MockDirectoryWrapper;
-import org.apache.lucene.store.AlreadyClosedException;
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util._TestUtil;
 import org.apache.lucene.util.ThreadInterruptedException;
-import java.util.concurrent.atomic.AtomicInteger;
+import org.apache.lucene.util._TestUtil;
 
 public class TestIndexWriterReader extends LuceneTestCase {
   static PrintStream infoStream = VERBOSE ? System.out : null;
@@ -787,7 +788,8 @@ public class TestIndexWriterReader exten
 
     assertEquals(0, excs.size());
     r.close();
-    assertEquals(0, dir1.getOpenDeletedFiles().size());
+    final Collection<String> openDeletedFiles = dir1.getOpenDeletedFiles();
+    assertEquals("openDeleted=" + openDeletedFiles, 0, openDeletedFiles.size());
 
     writer.close();
 

Modified: lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java?rev=1231796&r1=1231795&r2=1231796&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java Sun Jan 15 23:42:02 2012
@@ -24,6 +24,7 @@ import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.OutputStreamWriter;
+import java.io.StringWriter;
 import java.io.Writer;
 import java.util.*;
 
@@ -1045,6 +1046,7 @@ public class TestFSTs extends LuceneTest
         System.out.println("FST stores docFreq");
       }
     }
+    final IntsRef scratchIntsRef = new IntsRef();
     TermEnum termEnum = r.terms(new Term("body", ""));
     if (VERBOSE) {
       System.out.println("TEST: got termEnum=" + termEnum);
@@ -1380,7 +1382,7 @@ public class TestFSTs extends LuceneTest
   public void testSingleString() throws Exception {
     final Outputs<Object> outputs = NoOutputs.getSingleton();
     final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, outputs);
-    b.add(new BytesRef("foobar"), outputs.getNoOutput());
+    b.add(Util.toIntsRef(new BytesRef("foobar"), new IntsRef()), outputs.getNoOutput());
     final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<Object>(b.finish());
     assertNull(fstEnum.seekFloor(new BytesRef("foo")));
     assertNull(fstEnum.seekCeil(new BytesRef("foobaz")));
@@ -1402,9 +1404,9 @@ public class TestFSTs extends LuceneTest
     final BytesRef b = new BytesRef("b");
     final BytesRef c = new BytesRef("c");
 
-    builder.add(a, outputs.get(17));
-    builder.add(b, outputs.get(42));
-    builder.add(c, outputs.get(13824324872317238L));
+    builder.add(Util.toIntsRef(a, new IntsRef()), outputs.get(17));
+    builder.add(Util.toIntsRef(b, new IntsRef()), outputs.get(42));
+    builder.add(Util.toIntsRef(c, new IntsRef()), outputs.get(13824324872317238L));
 
     final FST<Long> fst = builder.finish();
 
@@ -1628,13 +1630,14 @@ public class TestFSTs extends LuceneTest
 
         int line = 0;
         final BytesRef term = new BytesRef();
+        final IntsRef scratchIntsRef = new IntsRef();
         while (line < lines.length) {
           String w = lines[line++];
           if (w == null) {
             break;
           }
           term.copyChars(w);
-          b.add(term, nothing);
+          b.add(Util.toIntsRef(term, scratchIntsRef), nothing);
         }
         
         return b.finish();
@@ -1694,6 +1697,36 @@ public class TestFSTs extends LuceneTest
     s.verifyStateAndBelow(fst, arc, 1);
   }
 
+  public void testFinalOutputOnEndState() throws Exception {
+    final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
+
+    final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs);
+    builder.add(Util.toUTF32("stat", new IntsRef()), outputs.get(17));
+    builder.add(Util.toUTF32("station", new IntsRef()), outputs.get(10));
+    final FST<Long> fst = builder.finish();
+    //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
+    StringWriter w = new StringWriter();
+    Util.toDot(fst, w, false, false);
+    w.close();
+    //System.out.println(w.toString());
+    assertTrue(w.toString().indexOf("label=\"t/[7]\"") != -1);
+  }
+
+  public void testInternalFinalState() throws Exception {
+    final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
+
+    final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs);
+    builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRef()), outputs.getNoOutput());
+    builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRef()), outputs.getNoOutput());
+    final FST<Long> fst = builder.finish();
+    StringWriter w = new StringWriter();
+    //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot"));
+    Util.toDot(fst, w, false, false);
+    w.close();
+    //System.out.println(w.toString());
+    assertTrue(w.toString().indexOf("6 [shape=doublecircle") != -1);
+  }
+
   // Make sure raw FST can differentiate between final vs
   // non-final end nodes
   public void testNonFinalStopNodes() throws Exception {