You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2011/08/04 13:03:35 UTC

svn commit: r1153845 - in /lucene/dev/branches/blocktree_3030: ./ lucene/src/java/org/apache/lucene/index/codecs/ lucene/src/java/org/apache/lucene/index/codecs/pulsing/ lucene/src/java/org/apache/lucene/index/codecs/standard/ lucene/src/test/org/apache/lucene/index/

Author: mikemccand
Date: Thu Aug  4 11:03:34 2011
New Revision: 1153845

URL: http://svn.apache.org/viewvc?rev=1153845&view=rev
Log:
LUCENE-3030: fix more false exc in test sops; add missing & 0xff (caused Test2BTerms to fail); fix bug in IntersectEnum seekToStartTerm causing fail in TestFuzzyQuery2

Modified:
    lucene/dev/branches/blocktree_3030/TODO
    lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java
    lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java
    lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java
    lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java
    lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/Test2BTerms.java
    lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java

Modified: lucene/dev/branches/blocktree_3030/TODO
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/TODO?rev=1153845&r1=1153844&r2=1153845&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/TODO (original)
+++ lucene/dev/branches/blocktree_3030/TODO Thu Aug  4 11:03:34 2011
@@ -1,4 +1,3 @@
-
 perf tests:
   - GRRR -- indexing MUCH slower now?
     trunk:
@@ -34,9 +33,8 @@ automaton q should apply maxlength test 
 
 intersect should use suffix ref
 
-maybe blocks should NOT store sub-block pointers?  it's reudundant w/ the index...
-
 LATER:
+  - maybe blocks should NOT store sub-block pointers?  it's reudundant w/ the index...
   - hmm: maybe switch PKLookupTask to intersect!?  do we have fast string builder?
   - hmm -- fix DOT when there are multiple outputs!?  oh, maybe not -- it just works?
   - maybe we should provide a "terms dict rewriter" tool?  ie can rewrite terms dict w/ new settings after segment was already created

Modified: lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java?rev=1153845&r1=1153844&r2=1153845&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsReader.java Thu Aug  4 11:03:34 2011
@@ -624,7 +624,7 @@ public class BlockTreeTermsReader extend
 
         void load(BytesRef frameIndexData) throws IOException {
 
-          if (DEBUG) System.out.println("    load fp=" + fp + " fpOrig=" + fpOrig + " frameIndexData=" + frameIndexData + " trans=" + (transitions.length != 0 ? transitions[0] : "n/a"));
+          if (DEBUG) System.out.println("    load fp=" + fp + " fpOrig=" + fpOrig + " frameIndexData=" + frameIndexData + " trans=" + (transitions.length != 0 ? transitions[0] : "n/a" + " state=" + state));
 
           if (frameIndexData != null && transitions.length != 0) {
             // Floor frame
@@ -772,6 +772,9 @@ public class BlockTreeTermsReader extend
       private final BytesRef savedStartTerm;
 
       public IntersectEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
+        if (DEBUG) {
+          System.out.println("\nintEnum.init seg=" + segment);
+        }
         // nocommit can we use suffixRef?
         // nocommit in some cases we can do hard filter by
         // length!!  eg regexp ????????
@@ -805,9 +808,6 @@ public class BlockTreeTermsReader extend
         if (startTerm != null) {
           seekToStartTerm(startTerm);
         }
-        if (DEBUG) {
-          System.out.println("\nintEnum.init seg=" + segment);
-        }
       }
 
       @Override
@@ -846,6 +846,7 @@ public class BlockTreeTermsReader extend
         
         f.fp = f.fpOrig = currentFrame.lastSubFP;
         f.prefix = currentFrame.prefix + currentFrame.suffix;
+        if (DEBUG) System.out.println("    pushFrame state=" + state + " prefix=" + f.prefix);
         f.setState(state);
 
         // Walk the arc through the index -- we only
@@ -909,6 +910,15 @@ public class BlockTreeTermsReader extend
         }
       }
 
+      private int getState() {
+        int state = currentFrame.state;
+        for(int idx=0;idx<currentFrame.suffix;idx++) {
+          state = runAutomaton.step(state,  currentFrame.suffixBytes[currentFrame.startBytePos+idx] & 0xff);
+          assert state != -1;
+        }
+        return state;
+      }
+
       // NOTE: specialized to only doing the first-time
       // seek, but we could generalize it to allow
       // arbitrary seekExact/Ceil.  Note that this is a
@@ -923,15 +933,6 @@ public class BlockTreeTermsReader extend
         assert arc == currentFrame.arc;
 
         for(int idx=0;idx<=target.length;idx++) {
-          final int targetLabel = idx == target.length ? -1 : target.bytes[target.offset+idx] & 0xff;
-          final int nextState;
-          if (idx < target.length) {
-            nextState = runAutomaton.step(currentFrame.state, targetLabel);
-            assert nextState != -1;
-          } else {
-            nextState = -1;
-          }
-          if (DEBUG) System.out.println("  idx=" + idx + " label=" + (char) targetLabel + " f.ord=" + currentFrame.ord);
 
           boolean lastIsSubBlock = false;
 
@@ -953,8 +954,7 @@ public class BlockTreeTermsReader extend
 
             if (isSubBlock && target.startsWith(term)) {
               // Recurse
-              assert nextState != -1;
-              currentFrame = pushFrame(nextState);
+              currentFrame = pushFrame(getState());
               break;
             } else {
               final int cmp = term.compareTo(target);
@@ -989,10 +989,10 @@ public class BlockTreeTermsReader extend
                 term.length = currentFrame.prefix + currentFrame.suffix;
                 if (lastIsSubBlock) {
                   // Recurse
-                  currentFrame = pushFrame(nextState);
+                  currentFrame = pushFrame(getState());
                   break;
                 } else {
-                  if (DEBUG) System.out.println("  return term=" + brToString(term));
+                  if (DEBUG) System.out.println("  fallback return term=" + brToString(term) + " curFrame.nextEnt=" + currentFrame.nextEnt);
                   return;
                 }
               }
@@ -1053,9 +1053,6 @@ public class BlockTreeTermsReader extend
 
                 // sneaky!  forces a pop above
                 currentFrame.isLastInFloor = true;
-                //while (!currentFrame.isLastInFloor) {
-                //currentFrame.loadNextFloorBlock();
-                //}
                 currentFrame.nextEnt = currentFrame.entCount;
                 continue nextTerm;
               }
@@ -1072,7 +1069,10 @@ public class BlockTreeTermsReader extend
             state = runAutomaton.step(state,  currentFrame.suffixBytes[currentFrame.startBytePos+idx] & 0xff);
             if (state == -1) {
               // No match
+              //System.out.println("    no s=" + state);
               continue nextTerm;
+            } else {
+              //System.out.println("    c s=" + state);
             }
           }
 
@@ -1088,7 +1088,7 @@ public class BlockTreeTermsReader extend
             assert savedStartTerm == null || term.compareTo(savedStartTerm) > 0: "saveStartTerm=" + savedStartTerm.utf8ToString() + " term=" + term.utf8ToString();
             return term;
           } else {
-            //System.out.println("    no match");
+            //System.out.println("    no s=" + state);
           }
         }
       }

Modified: lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java?rev=1153845&r1=1153844&r2=1153845&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/BlockTreeTermsWriter.java Thu Aug  4 11:03:34 2011
@@ -461,12 +461,12 @@ public class BlockTreeTermsWriter extend
               assert numSubs == 0;
               label = -1;
             } else {
-              label = term.term.bytes[term.term.offset + prefixLength];
+              label = term.term.bytes[term.term.offset + prefixLength] & 0xff;
             }
           } else {
             PendingBlock block = (PendingBlock) ent;
             assert block.prefix.length > prefixLength;
-            label = block.prefix.bytes[block.prefix.offset + prefixLength];
+            label = block.prefix.bytes[block.prefix.offset + prefixLength] & 0xff;
           }
 
           if (label != lastLabel && (termCount + subCount) != 0) {
@@ -579,6 +579,8 @@ public class BlockTreeTermsWriter extend
             }
 
             if (curStart <= maxItemsInBlock) {
+              // nocommit -- should we do a better job
+              // segmenting here...?
               // remainder is small enough to fit into a
               // block.  NOTE that this may be too small (<
               // minItemsInBlock); need a true segmenter
@@ -605,16 +607,14 @@ public class BlockTreeTermsWriter extend
       return 1;
     }
 
-    String brPrefixToString(BytesRef b) {
-      // nocommit
-      return b.toString();
-      //return b.utf8ToString() + " " + b;
-    }
-
-    String brToString(BytesRef b) {
-      // nocommit
-      // return b.toString();
-      return b.utf8ToString() + " " + b;
+    // for debugging
+    private String toString(BytesRef b) {
+      final String s;
+      try {
+        return b.utf8ToString() + " " + b;
+      } catch (Throwable t) {
+        return b.toString();
+      }
     }
 
     // TODO: we could block-write the term suffix pointers;
@@ -624,27 +624,21 @@ public class BlockTreeTermsWriter extend
 
       assert length > 0;
 
-      final BytesRef prefix = new BytesRef(indexPrefixLength);
-      for(int m=0;m<indexPrefixLength;m++) {
-        prefix.bytes[m] = (byte) prevTerm.ints[m];
-      }
-      prefix.length = indexPrefixLength;
-
-      /*if (isFloor) {
-        System.out.println("  wb seg=" + segment + " prefix=" + prefix.utf8ToString() + " " + prefix + " field=" + fieldInfo.name + " prefix=" + prefixLength + " pending=" + pending.size() + " start=" + start + " length=" + length);
-      } else {
-        System.out.println("\nWB seg=" + segment + " prefix=" + prefix.utf8ToString() + " " + prefix + " field=" + fieldInfo.name + " prefix=" + prefixLength + " pending=" + pending.size() + " start=" + start + " length=" + length);
-        }*/
       assert pending.size() >= start: "pending.size()=" + pending.size() + " start=" + start + " length=" + length;
 
       final List<Object> slice = pending.subList(pending.size()-start, pending.size()-start + length);
 
       final long startFP = out.getFilePointer();
 
+      final BytesRef prefix = new BytesRef(indexPrefixLength);
+      for(int m=0;m<indexPrefixLength;m++) {
+        prefix.bytes[m] = (byte) prevTerm.ints[m];
+      }
+      prefix.length = indexPrefixLength;
       out.writeVInt((length<<1)|(isLastInFloor ? 1:0));
 
       if (DEBUG2 || DEBUG) {
-        System.out.println("  writeBlock " + (isFloor ? "(floor) " : "") + "seg=" + segment + " pending.size()=" + pending.size() + " prefixLength=" + prefixLength + " indexPrefix=" + prefix + " entCount=" + length + " startFP=" + startFP + " futureTermCount=" + futureTermCount + (isFloor ? (" floorLeadByte=" + Integer.toHexString(floorLeadByte&0xff)) : "") + " isLastInFloor=" + isLastInFloor);
+        System.out.println("  writeBlock " + (isFloor ? "(floor) " : "") + "seg=" + segment + " pending.size()=" + pending.size() + " prefixLength=" + prefixLength + " indexPrefix=" + toString(prefix) + " entCount=" + length + " startFP=" + startFP + " futureTermCount=" + futureTermCount + (isFloor ? (" floorLeadByte=" + Integer.toHexString(floorLeadByte&0xff)) : "") + " isLastInFloor=" + isLastInFloor);
       }
 
       // 1st pass: pack term suffix bytes into byte[] blob
@@ -699,7 +693,7 @@ public class BlockTreeTermsWriter extend
             BytesRef suffixBytes = new BytesRef(suffix);
             System.arraycopy(block.prefix.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
             suffixBytes.length = suffix;
-            System.out.println("    write sub-block suffix=" + brPrefixToString(suffixBytes) + " subFP=" + block.fp + " subCode=" + (startFP-block.fp) + " floor=" + block.isFloor);
+            System.out.println("    write sub-block suffix=" + toString(suffixBytes) + " subFP=" + block.fp + " subCode=" + (startFP-block.fp) + " floor=" + block.isFloor);
           }
 
           bytesWriter.writeVLong(startFP - block.fp);
@@ -771,7 +765,7 @@ public class BlockTreeTermsWriter extend
 
     @Override
     public PostingsConsumer startTerm(BytesRef text) throws IOException {
-      if (DEBUG) System.out.println("\nBTTW.startTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " seg=" + segment);
+      if (DEBUG) System.out.println("\nBTTW.startTerm term=" + fieldInfo.name + ":" + toString(text) + " seg=" + segment);
       postingsWriter.startTerm();
       /*
       if (fieldInfo.name.equals("id")) {
@@ -787,7 +781,7 @@ public class BlockTreeTermsWriter extend
     public void finishTerm(BytesRef text, TermStats stats) throws IOException {
 
       assert stats.docFreq > 0;
-      if (DEBUG) System.out.println("BTTW.finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " " + text + " seg=" + segment + " df=" + stats.docFreq);
+      if (DEBUG) System.out.println("BTTW.finishTerm term=" + fieldInfo.name + ":" + toString(text) + " seg=" + segment + " df=" + stats.docFreq);
 
       blockBuilder.add(text, noOutputs.getNoOutput());
       pending.add(new PendingTerm(new BytesRef(text), stats));

Modified: lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java?rev=1153845&r1=1153844&r2=1153845&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java Thu Aug  4 11:03:34 2011
@@ -66,7 +66,7 @@ public class PulsingCodec extends Codec 
 
   @Override
   public String toString() {
-    return name + "(freqCutoff=" + freqCutoff + ")";
+    return name + "(freqCutoff=" + freqCutoff + " minBlockSize=" + minBlockSize + " maxBlockSize=" + maxBlockSize + ")";
   }
 
   @Override

Modified: lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java?rev=1153845&r1=1153844&r2=1153845&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java Thu Aug  4 11:03:34 2011
@@ -124,6 +124,11 @@ public class StandardCodec extends Codec
   }
 
   @Override
+  public String toString() {
+    return name + "(minBlockSize=" + minBlockSize + " maxBlockSize=" + maxBlockSize + ")";
+  }
+
+  @Override
   public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
     return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS());
   }

Modified: lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/Test2BTerms.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/Test2BTerms.java?rev=1153845&r1=1153844&r2=1153845&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/Test2BTerms.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/Test2BTerms.java Thu Aug  4 11:03:34 2011
@@ -169,6 +169,7 @@ public class Test2BTerms extends LuceneT
                                       .setMergePolicy(newLogMergePolicy(false, 10))
                                       .setOpenMode(IndexWriterConfig.OpenMode.CREATE));
 
+      w.setInfoStream(VERBOSE ? System.out : null);
       MergePolicy mp = w.getConfig().getMergePolicy();
       if (mp instanceof LogByteSizeMergePolicy) {
         // 1 petabyte:

Modified: lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java?rev=1153845&r1=1153844&r2=1153845&view=diff
==============================================================================
--- lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java (original)
+++ lucene/dev/branches/blocktree_3030/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java Thu Aug  4 11:03:34 2011
@@ -175,12 +175,21 @@ public class TestTermsEnum extends Lucen
     terms.clear();
   }
 
+  private boolean accepts(CompiledAutomaton c, BytesRef b) {
+    int state = c.runAutomaton.getInitialState();
+    for(int idx=0;idx<b.length;idx++) {
+      assertTrue(state != -1);
+      state = c.runAutomaton.step(state, b.bytes[b.offset+idx] & 0xff);
+    }
+    return c.runAutomaton.isAccept(state);
+  }
+
   // Tests Terms.intersect
   public void testIntersectRandom() throws IOException {
 
     final Directory dir = newDirectory();
     final RandomIndexWriter w = new RandomIndexWriter(random, dir);
-
+    
     final int numTerms = atLeast(1000);
 
     final Set<String> terms = new HashSet<String>();
@@ -267,6 +276,7 @@ public class TestTermsEnum extends Lucen
         final BytesRef b = new BytesRef(s);
         acceptTermsArray[upto++] = b;
         acceptTermsSet.add(b);
+        assertTrue(accepts(c, b));
       }
       Arrays.sort(acceptTermsArray);