You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2010/06/24 15:35:41 UTC
svn commit: r957545 - in /lucene/dev/trunk/lucene: contrib/instantiated/src/java/org/apache/lucene/store/instantiated/ contrib/memory/src/java/org/apache/lucene/index/memory/ src/java/org/apache/lucene/index/ src/java/org/apache/lucene/index/codecs/int...

Author: mikemccand
Date: Thu Jun 24 13:35:40 2010
New Revision: 957545

URL: http://svn.apache.org/viewvc?rev=957545&view=rev
Log:
LUCENE-2426: change index term sort order to unicode code point order (used to be UTF16 order)

Added:
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/preflex/
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java   (with props)
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java   (with props)
Modified:
    lucene/dev/trunk/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java
    lucene/dev/trunk/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java
    lucene/dev/trunk/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/CheckIndex.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FieldInfos.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/IndexWriter.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentReadState.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsEnum.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/ArrayUtil.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRef.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/Transition.java
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/TestExternalCodecs.java
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/intblock/TestIntBlockCodec.java
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/TestNumericUtils.java

Modified: lucene/dev/trunk/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java (original)
+++ lucene/dev/trunk/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java Thu Jun 24 13:35:40 2010
@@ -426,7 +426,7 @@ public class InstantiatedIndexReader ext
 
           @Override
           public Comparator<BytesRef> getComparator() {
-            return BytesRef.getUTF8SortedAsUTF16Comparator();
+            return BytesRef.getUTF8SortedAsUnicodeComparator();
           }
         };
       }

Modified: lucene/dev/trunk/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java (original)
+++ lucene/dev/trunk/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java Thu Jun 24 13:35:40 2010
@@ -123,7 +123,7 @@ public class InstantiatedTermsEnum exten
 
   @Override
   public Comparator<BytesRef> getComparator() {
-    return BytesRef.getUTF8SortedAsUTF16Comparator();
+    return BytesRef.getUTF8SortedAsUnicodeComparator();
   }
 }
 

Modified: lucene/dev/trunk/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (original)
+++ lucene/dev/trunk/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java Thu Jun 24 13:35:40 2010
@@ -808,7 +808,7 @@ public class MemoryIndex implements Seri
 
               @Override
               public Comparator<BytesRef> getComparator() {
-                return BytesRef.getUTF8SortedAsUTF16Comparator();
+                return BytesRef.getUTF8SortedAsUnicodeComparator();
               }
 
               @Override
@@ -903,7 +903,7 @@ public class MemoryIndex implements Seri
 
       @Override
       public Comparator<BytesRef> getComparator() {
-        return BytesRef.getUTF8SortedAsUTF16Comparator();
+        return BytesRef.getUTF8SortedAsUnicodeComparator();
       }
     }
 

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/CheckIndex.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/CheckIndex.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/CheckIndex.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/CheckIndex.java Thu Jun 24 13:35:40 2010
@@ -32,7 +32,7 @@ import java.io.PrintStream;
 import java.io.IOException;
 import java.io.File;
 import java.util.Collection;
-
+import java.util.Comparator;
 import java.util.List;
 import java.util.ArrayList;
 import java.util.Map;
@@ -596,6 +596,10 @@ public class CheckIndex {
         boolean hasOrd = true;
         final long termCountStart = status.termCount;
 
+        BytesRef lastTerm = null;
+
+        Comparator<BytesRef> termComp = terms.getComparator();
+
         while(true) {
 
           final BytesRef term = terms.next();
@@ -603,6 +607,17 @@ public class CheckIndex {
             break;
           }
 
+          // make sure terms arrive in order according to
+          // the comparator
+          if (lastTerm == null) {
+            lastTerm = new BytesRef(term);
+          } else {
+            if (termComp.compare(lastTerm, term) >= 0) {
+              throw new RuntimeException("terms out of order: lastTerm=" + lastTerm + " term=" + term);
+            }
+            lastTerm.copy(term);
+          }
+
           final int docFreq = terms.docFreq();
           status.totFreq += docFreq;
 

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FieldInfos.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FieldInfos.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FieldInfos.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FieldInfos.java Thu Jun 24 13:35:40 2010
@@ -53,7 +53,7 @@ public final class FieldInfos {
   private final HashMap<String,FieldInfo> byName = new HashMap<String,FieldInfo>();
   private int format;
 
-  FieldInfos() { }
+  public FieldInfos() { }
 
   /**
    * Construct a FieldInfos object using the directory and the name of the file
@@ -62,7 +62,7 @@ public final class FieldInfos {
    * @param name The name of the file to open the IndexInput from in the Directory
    * @throws IOException
    */
-  FieldInfos(Directory d, String name) throws IOException {
+  public FieldInfos(Directory d, String name) throws IOException {
     IndexInput input = d.openInput(name);
     try {
       read(input, name);

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/IndexWriter.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/IndexWriter.java Thu Jun 24 13:35:40 2010
@@ -3964,7 +3964,7 @@ public class IndexWriter implements Clos
         // commit merged deletes
         SegmentReader reader = merge.readers[i] = readerPool.get(info, merge.mergeDocStores,
                                                                  MERGE_READ_BUFFER_SIZE,
-                                                                 -1);
+                                                                 -config.getReaderTermsIndexDivisor());
 
         // We clone the segment readers because other
         // deletes may come in while we're merging so we

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentReadState.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentReadState.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentReadState.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/SegmentReadState.java Thu Jun 24 13:35:40 2010
@@ -27,6 +27,12 @@ public class SegmentReadState {
   public final SegmentInfo segmentInfo;
   public final FieldInfos fieldInfos;
   public final int readBufferSize;
+
+  // NOTE: if this is < 0, that means "defer terms index
+  // load until needed".  But if the codec must load the
+  // terms index on init (preflex is the only one currently
+  // that must do so), then it should negate this value to
+  // get the app's terms divisor:
   public final int termsIndexDivisor;
 
   public SegmentReadState(Directory dir,

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java Thu Jun 24 13:35:40 2010
@@ -130,7 +130,7 @@ final class TermVectorsTermsWriterPerFie
 
     // TODO: we may want to make this sort in same order
     // as Codec's terms dict?
-    final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUTF16Comparator());
+    final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUnicodeComparator());
 
     tvf.writeVInt(numPostings);
     byte bits = 0x0;

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsEnum.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsEnum.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsEnum.java Thu Jun 24 13:35:40 2010
@@ -144,8 +144,7 @@ public abstract class TermsEnum {
 
     @Override
     public Comparator<BytesRef> getComparator() {
-      // return an unused dummy to prevent NPE
-      return BytesRef.getUTF8SortedAsUTF16Comparator();
+      return null;
     }
       
     @Override

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java Thu Jun 24 13:35:40 2010
@@ -67,7 +67,7 @@ public class IntBlockCodec extends Codec
 
     success = false;
     try {
-      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUTF16Comparator());
+      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
       success = true;
       return ret;
     } finally {
@@ -95,7 +95,7 @@ public class IntBlockCodec extends Codec
                                                        state.fieldInfos,
                                                        state.segmentInfo.name,
                                                        state.termsIndexDivisor,
-                                                       BytesRef.getUTF8SortedAsUTF16Comparator());
+                                                       BytesRef.getUTF8SortedAsUnicodeComparator());
       success = true;
     } finally {
       if (!success) {
@@ -111,7 +111,7 @@ public class IntBlockCodec extends Codec
                                                        state.segmentInfo.name,
                                                        postingsReader,
                                                        state.readBufferSize,
-                                                       BytesRef.getUTF8SortedAsUTF16Comparator(),
+                                                       BytesRef.getUTF8SortedAsUnicodeComparator(),
                                                        StandardCodec.TERMS_CACHE_SIZE);
       success = true;
       return ret;

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java Thu Jun 24 13:35:40 2010
@@ -39,11 +39,15 @@ import org.apache.lucene.store.Directory
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.ArrayUtil;
 
 /** Exposes flex API on a pre-flex index, as a codec. 
  * @lucene.experimental */
 public class PreFlexFields extends FieldsProducer {
 
+  private static final boolean DEBUG_SURROGATES = false;
+
   public TermInfosReader tis;
   public final TermInfosReader tisNoIndex;
 
@@ -60,6 +64,15 @@ public class PreFlexFields extends Field
     throws IOException {
 
     si = info;
+
+    // NOTE: we must always load terms index, even for
+    // "sequential" scan during merging, because what is
+    // sequential to merger may not be to TermInfosReader
+    // since we do the surrogates dance:
+    if (indexDivisor < 0) {
+      indexDivisor = -indexDivisor;
+    }
+
     TermInfosReader r = new TermInfosReader(dir, info.name, fieldInfos, readBufferSize, indexDivisor);    
     if (indexDivisor == -1) {
       tisNoIndex = r;
@@ -174,7 +187,6 @@ public class PreFlexFields extends Field
   private class PreFlexFieldsEnum extends FieldsEnum {
     final Iterator<FieldInfo> it;
     private final PreTermsEnum termsEnum;
-    private int count;
     FieldInfo current;
 
     public PreFlexFieldsEnum() throws IOException {
@@ -185,7 +197,6 @@ public class PreFlexFields extends Field
     @Override
     public String next() {
       if (it.hasNext()) {
-        count++;
         current = it.next();
         return current.name;
       } else {
@@ -195,7 +206,7 @@ public class PreFlexFields extends Field
 
     @Override
     public TermsEnum terms() throws IOException {
-      termsEnum.reset(current, count == 1);
+      termsEnum.reset(current);
       return termsEnum;
     }
   }
@@ -209,14 +220,15 @@ public class PreFlexFields extends Field
     @Override
     public TermsEnum iterator() throws IOException {    
       PreTermsEnum termsEnum = new PreTermsEnum();
-      termsEnum.reset(fieldInfo, false);
+      termsEnum.reset(fieldInfo);
       return termsEnum;
     }
 
     @Override
     public Comparator<BytesRef> getComparator() {
-      // Pre-flex indexes always sorted in UTF16 order
-      return BytesRef.getUTF8SortedAsUTF16Comparator();
+      // Pre-flex indexes always sorted in UTF16 order, but
+      // we remap on-the-fly to unicode order
+      return BytesRef.getUTF8SortedAsUnicodeComparator();
     }
   }
 
@@ -227,37 +239,229 @@ public class PreFlexFields extends Field
     private BytesRef current;
     private final BytesRef scratchBytesRef = new BytesRef();
 
-    void reset(FieldInfo fieldInfo, boolean isFirstField) throws IOException {
+    private int[] surrogateSeekPending = new int[1];
+    private boolean[] surrogateDidSeekBack = new boolean[1];
+    private int surrogateSeekUpto;
+    private char[] pendingPrefix;
+
+    private SegmentTermEnum seekTermEnum;
+    private Term protoTerm;
+    private int newSuffixStart;
+
+    void reset(FieldInfo fieldInfo) throws IOException {
       this.fieldInfo = fieldInfo;
+      protoTerm = new Term(fieldInfo.name);
       if (termEnum == null) {
-        // First time reset is called
-        if (isFirstField) {
-          termEnum = getTermsDict().terms();
-          skipNext = false;
-        } else {
-          termEnum = getTermsDict().terms(new Term(fieldInfo.name, ""));
-          skipNext = true;
-        }
+        termEnum = getTermsDict().terms(protoTerm);
+        seekTermEnum = getTermsDict().terms(protoTerm);
       } else {
-        final Term t = termEnum.term();
-        if (t != null && t.field() == fieldInfo.name) {
-          // No need to seek -- we have already advanced onto
-          // this field.  We must be @ first term because
-          // flex API will not advance this enum further, on
-          // seeing a different field.
-        } else {
-          assert t == null || !t.field().equals(fieldInfo.name);  // make sure field name is interned
-          final TermInfosReader tis = getTermsDict();
-          tis.seekEnum(termEnum, new Term(fieldInfo.name, ""));
+        getTermsDict().seekEnum(termEnum, protoTerm);
+      }
+      skipNext = true;
+      
+      surrogateSeekUpto = 0;
+      newSuffixStart = 0;
+
+      surrogatesDance();
+    }
+
+    private void surrogatesDance() throws IOException {
+      
+      // Tricky: prior to 4.0, Lucene index sorted terms in
+      // UTF16 order, but as of 4.0 we sort by Unicode code
+      // point order.  These orders differ because of the
+      // surrogates; so we have to fix up our enum, here, by
+      // carefully first seeking past the surrogates and
+      // then back again at the end.  The process is
+      // recursive, since any given term could have multiple
+      // new occurrences of surrogate pairs, so we use a
+      // stack to record the pending seek-backs.
+      if (DEBUG_SURROGATES) {
+        System.out.println("  dance start term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text())));
+      }
+
+      while(popPendingSeek());
+      while(pushNewSurrogate());
+    }
+
+    // only for debugging
+    private String getStack() {
+      if (surrogateSeekUpto == 0) {
+        return "null";
+      } else {
+        StringBuffer sb = new StringBuffer();
+        for(int i=0;i<surrogateSeekUpto;i++) {
+          if (i > 0) {
+            sb.append(' ');
+          }
+          sb.append(surrogateSeekPending[i]);
         }
-        skipNext = true;
+        sb.append(" pendingSeekText=" + new String(pendingPrefix, 0, surrogateSeekPending[surrogateSeekUpto-1]));
+        return sb.toString();
       }
     }
 
+    private boolean popPendingSeek() throws IOException {
+      if (DEBUG_SURROGATES) {
+        System.out.println("  check pop newSuffix=" + newSuffixStart + " stack=" + getStack());
+      }
+      // if a .next() has advanced beyond the
+      // after-surrogates range we had last seeked to, we
+      // must seek back to the start and resume .next from
+      // there.  this pops the pending seek off the stack.
+      final Term t = termEnum.term();
+      if (surrogateSeekUpto > 0) {
+        final int seekPrefix = surrogateSeekPending[surrogateSeekUpto-1];
+        if (DEBUG_SURROGATES) {
+          System.out.println("    seekPrefix=" + seekPrefix);
+        }
+        if (newSuffixStart < seekPrefix) {
+          assert pendingPrefix != null;
+          assert pendingPrefix.length > seekPrefix;
+          pendingPrefix[seekPrefix] = UnicodeUtil.UNI_SUR_HIGH_START;
+          Term t2 = protoTerm.createTerm(new String(pendingPrefix, 0, 1+seekPrefix));
+          if (DEBUG_SURROGATES) {
+            System.out.println("    do pop; seek back to " + UnicodeUtil.toHexString(t2.text()));
+          }
+          getTermsDict().seekEnum(termEnum, t2);
+          surrogateDidSeekBack[surrogateSeekUpto-1] = true;
+
+          // +2 because we don't want to re-check the
+          // surrogates we just seek'd back to
+          newSuffixStart = seekPrefix + 2;
+          return true;
+        } else if (newSuffixStart == seekPrefix && surrogateDidSeekBack[surrogateSeekUpto-1] && t != null && t.field() == fieldInfo.name && t.text().charAt(seekPrefix) > UnicodeUtil.UNI_SUR_LOW_END) {
+          assert pendingPrefix != null;
+          assert pendingPrefix.length > seekPrefix;
+          pendingPrefix[seekPrefix] = 0xffff;
+          Term t2 = protoTerm.createTerm(new String(pendingPrefix, 0, 1+seekPrefix));
+          if (DEBUG_SURROGATES) {
+            System.out.println("    finish pop; seek fwd to " + UnicodeUtil.toHexString(t2.text()));
+          }
+          getTermsDict().seekEnum(termEnum, t2);
+          if (DEBUG_SURROGATES) {
+            System.out.println("    found term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text())));
+          }
+          surrogateSeekUpto--;
+
+          if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) {
+            // force pop
+            newSuffixStart = -1;
+          } else {
+            newSuffixStart = termEnum.newSuffixStart;
+          }
+
+          return true;
+        }
+      }
+
+      return false;
+    }
+
+    private boolean pushNewSurrogate() throws IOException {
+      if (DEBUG_SURROGATES) {
+        System.out.println("  check push newSuffix=" + newSuffixStart + " stack=" + getStack());
+      }
+      final Term t = termEnum.term();
+      if (t == null || t.field() != fieldInfo.name) {
+        return false;
+      }
+      final String text = t.text();
+      final int textLen = text.length();
+
+      for(int i=Math.max(0,newSuffixStart);i<textLen;i++) {
+        final char ch = text.charAt(i);
+        if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END && (surrogateSeekUpto == 0 || i > surrogateSeekPending[surrogateSeekUpto-1])) {
+
+          if (DEBUG_SURROGATES) {
+            System.out.println("    found high surr 0x" + Integer.toHexString(ch) + " at pos=" + i);
+          }
+
+          // the next() that we just did read in a new
+          // suffix, containing a surrogate pair
+
+          // seek forward to see if there are any terms with
+          // this same prefix, but with characters after the
+          // surrogate range; if so, we must first iterate
+          // them, then seek back to the surrogates
+
+          char[] testPrefix = new char[i+1];
+          for(int j=0;j<i;j++) {
+            testPrefix[j] = text.charAt(j);
+          }
+          testPrefix[i] = 1+UnicodeUtil.UNI_SUR_LOW_END;
+
+          getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(new String(testPrefix)));
+
+          Term t2 = seekTermEnum.term();
+          boolean isPrefix;
+          if (t2 != null && t2.field() == fieldInfo.name) {
+            String seekText = t2.text();
+            isPrefix = true;
+            if (DEBUG_SURROGATES) {
+              System.out.println("      seek found " + UnicodeUtil.toHexString(seekText));
+            }
+            for(int j=0;j<i;j++) {
+              if (testPrefix[j] != seekText.charAt(j)) {
+                isPrefix = false;
+                break;
+              }
+            }
+            if (DEBUG_SURROGATES && !isPrefix) {
+              System.out.println("      no end terms");
+            }
+          } else {
+            if (DEBUG_SURROGATES) {
+              System.out.println("      no end terms");
+            }
+            isPrefix = false;
+          }
+
+          if (isPrefix) {
+            // we found a term, sharing the same prefix,
+            // with characters after the surrogates, so we
+            // must first enum those, and then return the
+            // surrogates afterwards.  push that pending
+            // seek on the surrogates stack now:
+            pendingPrefix = testPrefix;
+
+            getTermsDict().seekEnum(termEnum, t2);
+
+            if (surrogateSeekUpto == surrogateSeekPending.length) {
+              surrogateSeekPending = ArrayUtil.grow(surrogateSeekPending);
+            }
+            if (surrogateSeekUpto == surrogateDidSeekBack.length) {
+              surrogateDidSeekBack = ArrayUtil.grow(surrogateDidSeekBack);
+            }
+            surrogateSeekPending[surrogateSeekUpto] = i;
+            surrogateDidSeekBack[surrogateSeekUpto] = false;
+            surrogateSeekUpto++;
+
+            if (DEBUG_SURROGATES) {
+              System.out.println("      do push " + i + "; end term=" + UnicodeUtil.toHexString(t2.text()));
+            }
+
+            newSuffixStart = i+1;
+
+            return true;
+          } else {
+            // there are no terms after the surrogates, so
+            // we do nothing to the enum and just step
+            // through the surrogates like normal.  but we
+            // must keep iterating through the term, in case
+            // another surrogate pair appears later
+          }
+        }
+      }
+
+      return false;
+    }
+
     @Override
     public Comparator<BytesRef> getComparator() {
-      // Pre-flex indexes always sorted in UTF16 order
-      return BytesRef.getUTF8SortedAsUTF16Comparator();
+      // Pre-flex indexes always sorted in UTF16 order, but
+      // we remap on-the-fly to unicode order
+      return BytesRef.getUTF8SortedAsUnicodeComparator();
     }
 
     @Override
@@ -272,14 +476,24 @@ public class PreFlexFields extends Field
 
     @Override
     public SeekStatus seek(BytesRef term, boolean useCache) throws IOException {
+      if (DEBUG_SURROGATES) {
+        System.out.println("TE.seek() term=" + term.utf8ToString());
+      }
       skipNext = false;
       final TermInfosReader tis = getTermsDict();
-      final Term t0 = new Term(fieldInfo.name, term.utf8ToString());
+      final Term t0 = protoTerm.createTerm(term.utf8ToString());
+
+      assert termEnum != null;
+
       if (termEnum == null) {
         termEnum = tis.terms(t0);
       } else {
         tis.seekEnum(termEnum, t0);
       }
+
+      surrogateSeekUpto = 0;
+      surrogatesDance();
+
       final Term t = termEnum.term();
 
       final BytesRef tr;
@@ -304,6 +518,9 @@ public class PreFlexFields extends Field
 
     @Override
     public BytesRef next() throws IOException {
+      if (DEBUG_SURROGATES) {
+        System.out.println("TE.next() skipNext=" + skipNext);
+      }
       if (skipNext) {
         skipNext = false;
         if (termEnum.term() == null) {
@@ -313,19 +530,37 @@ public class PreFlexFields extends Field
           return current = scratchBytesRef;
         }
       }
-      if (termEnum.next()) {
+      if (termEnum.next() && termEnum.term().field() == fieldInfo.name) {
+        newSuffixStart = termEnum.newSuffixStart;
+        if (DEBUG_SURROGATES) {
+          System.out.println("  set newSuffixStart=" + newSuffixStart);
+        }
+        surrogatesDance();
         final Term t = termEnum.term();
-        if (t.field() == fieldInfo.name) {
+        if (t == null || t.field() != fieldInfo.name) {
+          assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
+          current = null;
+        } else {
           scratchBytesRef.copy(t.text());
           current = scratchBytesRef;
-          return current;
-        } else {
-          assert !t.field().equals(fieldInfo.name);  // make sure field name is interned
-          // Crossed into new field
-          return null;
         }
+        return current;
       } else {
-        return null;
+        if (DEBUG_SURROGATES) {
+          System.out.println("  force pop");
+        }
+        // force pop
+        newSuffixStart = -1;
+        surrogatesDance();
+        final Term t = termEnum.term();
+        if (t == null || t.field() != fieldInfo.name) {
+          assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
+          return null;
+        } else {
+          scratchBytesRef.copy(t.text());
+          current = scratchBytesRef;
+          return current;
+        }
       }
     }
 

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java Thu Jun 24 13:35:40 2010
@@ -53,6 +53,7 @@ public final class SegmentTermEnum imple
   long indexPointer = 0;
   int indexInterval;
   int skipInterval;
+  int newSuffixStart;
   int maxSkipLevels;
   private int formatM1SkipInterval;
 
@@ -136,6 +137,7 @@ public final class SegmentTermEnum imple
 
     prevBuffer.set(termBuffer);
     termBuffer.read(input, fieldInfos);
+    newSuffixStart = termBuffer.newSuffixStart;
 
     termInfo.docFreq = input.readVInt();	  // read doc freq
     termInfo.freqPointer += input.readVLong();	  // read freq pointer

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java Thu Jun 24 13:35:40 2010
@@ -19,7 +19,6 @@ package org.apache.lucene.index.codecs.p
 
 import java.io.IOException;
 import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.index.Term;
@@ -34,6 +33,8 @@ final class TermBuffer implements Clonea
   private UnicodeUtil.UTF16Result text = new UnicodeUtil.UTF16Result();
   private BytesRef bytes = new BytesRef(10);
 
+  int newSuffixStart;
+
   public final int compareTo(TermBuffer other) {
     if (field == other.field) 	  // fields are interned
       return compareChars(text.result, text.length, other.text.result, other.text.length);
@@ -60,23 +61,33 @@ final class TermBuffer implements Clonea
     int start = input.readVInt();
     int length = input.readVInt();
     int totalLength = start + length;
+    if (bytes.bytes.length < totalLength) {
+      bytes.grow(totalLength);
+    }
     if (dirty) {
       // Fully convert all bytes since bytes is dirty
       UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
-      if (bytes.bytes.length < totalLength)
-        bytes.bytes = new byte[totalLength];
       bytes.length = totalLength;
       input.readBytes(bytes.bytes, start, length);
       UnicodeUtil.UTF8toUTF16(bytes.bytes, 0, totalLength, text);
       dirty = false;
     } else {
       // Incrementally convert only the UTF8 bytes that are new:
-      if (bytes.bytes.length < totalLength)
-        bytes.bytes = ArrayUtil.grow(bytes.bytes, totalLength);
       bytes.length = totalLength;
       input.readBytes(bytes.bytes, start, length);
       UnicodeUtil.UTF8toUTF16(bytes.bytes, start, length, text);
     }
+
+    while(true) {
+      newSuffixStart = text.offsets[start];
+      if (newSuffixStart != -1) {
+        break;
+      }
+      if (--start == 0) {
+        newSuffixStart = 0;
+        break;
+      }
+    }
     this.field = fieldInfos.fieldName(input.readVInt());
   }
 
@@ -124,10 +135,11 @@ final class TermBuffer implements Clonea
     try {
       clone = (TermBuffer)super.clone();
     } catch (CloneNotSupportedException e) {}
-
     clone.dirty = true;
     clone.bytes = new BytesRef(10);
     clone.text = new UnicodeUtil.UTF16Result();
+    clone.text.offsets = new int[text.offsets.length];
+    System.arraycopy(text.offsets, 0, clone.text.offsets, 0, text.offsets.length);
     clone.text.copyText(text);
     return clone;
   }

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java Thu Jun 24 13:35:40 2010
@@ -80,7 +80,7 @@ public class PulsingCodec extends Codec 
     // Terms dict
     success = false;
     try {
-      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUTF16Comparator());
+      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
       success = true;
       return ret;
     } finally {
@@ -111,7 +111,7 @@ public class PulsingCodec extends Codec 
                                                        state.fieldInfos,
                                                        state.segmentInfo.name,
                                                        state.termsIndexDivisor,
-                                                       BytesRef.getUTF8SortedAsUTF16Comparator());
+                                                       BytesRef.getUTF8SortedAsUnicodeComparator());
       success = true;
     } finally {
       if (!success) {
@@ -126,7 +126,7 @@ public class PulsingCodec extends Codec 
                                                        state.dir, state.fieldInfos, state.segmentInfo.name,
                                                        pulsingReader,
                                                        state.readBufferSize,
-                                                       BytesRef.getUTF8SortedAsUTF16Comparator(),
+                                                       BytesRef.getUTF8SortedAsUnicodeComparator(),
                                                        StandardCodec.TERMS_CACHE_SIZE);
       success = true;
       return ret;

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java Thu Jun 24 13:35:40 2010
@@ -63,7 +63,7 @@ public class SepCodec extends Codec {
 
     success = false;
     try {
-      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUTF16Comparator());
+      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
       success = true;
       return ret;
     } finally {
@@ -95,7 +95,7 @@ public class SepCodec extends Codec {
                                                        state.fieldInfos,
                                                        state.segmentInfo.name,
                                                        state.termsIndexDivisor,
-                                                       BytesRef.getUTF8SortedAsUTF16Comparator());
+                                                       BytesRef.getUTF8SortedAsUnicodeComparator());
       success = true;
     } finally {
       if (!success) {
@@ -111,7 +111,7 @@ public class SepCodec extends Codec {
                                                        state.segmentInfo.name,
                                                        postingsReader,
                                                        state.readBufferSize,
-                                                       BytesRef.getUTF8SortedAsUTF16Comparator(),
+                                                       BytesRef.getUTF8SortedAsUnicodeComparator(),
                                                        StandardCodec.TERMS_CACHE_SIZE);
       success = true;
       return ret;

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java Thu Jun 24 13:35:40 2010
@@ -104,7 +104,7 @@ public class SimpleStandardTermsIndexRea
       indexInterval = in.readInt();
       this.indexDivisor = indexDivisor;
 
-      if (indexDivisor == -1) {
+      if (indexDivisor < 0) {
         totalIndexInterval = indexInterval;
       } else {
         // In case terms index gets loaded, later, on demand
@@ -131,7 +131,7 @@ public class SimpleStandardTermsIndexRea
       }
       success = true;
     } finally {
-      if (indexDivisor != -1) {
+      if (indexDivisor > 0) {
         in.close();
         this.in = null;
         if (success) {
@@ -173,7 +173,7 @@ public class SimpleStandardTermsIndexRea
       // We still create the indexReader when indexDivisor
       // is -1, so that StandardTermsDictReader can call
       // isIndexTerm for each field:
-      if (indexDivisor != -1) {
+      if (indexDivisor > 0) {
         coreIndex = new CoreFieldIndex(indexStart,
                                        termsStart,
                                        packedIndexStart,
@@ -218,7 +218,8 @@ public class SimpleStandardTermsIndexRea
 
     @Override
     public void getIndexOffset(long ord, TermsIndexResult result) throws IOException {
-      // You must call loadTermsIndex if you had specified -1 for indexDivisor
+      // You must call loadTermsIndex first if you passed
+      // indexDivisor < 0 to the constructor
       if (coreIndex == null) {
         throw new IllegalStateException("terms index was not loaded");
       }

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java Thu Jun 24 13:35:40 2010
@@ -58,7 +58,7 @@ public class StandardCodec extends Codec
 
     success = false;
     try {
-      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUTF16Comparator());
+      FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUnicodeComparator());
       success = true;
       return ret;
     } finally {
@@ -85,7 +85,7 @@ public class StandardCodec extends Codec
                                                        state.fieldInfos,
                                                        state.segmentInfo.name,
                                                        state.termsIndexDivisor,
-                                                       BytesRef.getUTF8SortedAsUTF16Comparator());
+                                                       BytesRef.getUTF8SortedAsUnicodeComparator());
       success = true;
     } finally {
       if (!success) {
@@ -101,7 +101,7 @@ public class StandardCodec extends Codec
                                                        state.segmentInfo.name,
                                                        postings,
                                                        state.readBufferSize,
-                                                       BytesRef.getUTF8SortedAsUTF16Comparator(),
+                                                       BytesRef.getUTF8SortedAsUnicodeComparator(),
                                                        TERMS_CACHE_SIZE);
       success = true;
       return ret;

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java Thu Jun 24 13:35:40 2010
@@ -103,7 +103,7 @@ public class AutomatonTermsEnum extends 
     // build a cache of sorted transitions for every state
     allTransitions = new Transition[runAutomaton.getSize()][];
     for (State state : this.automaton.getNumberedStates()) {
-      state.sortTransitions(Transition.CompareByMinMaxThenDestUTF8InUTF16Order);
+      state.sortTransitions(Transition.CompareByMinMaxThenDest);
       state.trimTransitionsArray();
       allTransitions[state.getNumber()] = state.transitionsArray;
     }
@@ -158,11 +158,7 @@ public class AutomatonTermsEnum extends 
     // seek to the next possible string;
     if (nextString()) {
       // reposition
-      
-      // FIXME: this is really bad to turn off
-      // but it cannot work correctly until terms are in utf8 order.
-      linear = false;
-      
+           
       if (linear)
         setLinear(infinitePosition);
       return seekBytesRef;
@@ -188,15 +184,15 @@ public class AutomatonTermsEnum extends 
     }
     for (int i = 0; i < allTransitions[state].length; i++) {
       Transition t = allTransitions[state][i];
-      if (compareToUTF16(t.getMin(), (seekBytesRef.bytes[position] & 0xff)) <= 0 && 
-          compareToUTF16((seekBytesRef.bytes[position] & 0xff), t.getMax()) <= 0) {
+      if (t.getMin() <= (seekBytesRef.bytes[position] & 0xff) && 
+          (seekBytesRef.bytes[position] & 0xff) <= t.getMax()) {
         maxInterval = t.getMax();
         break;
       }
     }
-    // 0xef terms don't get the optimization... not worth the trouble.
-    if (maxInterval != 0xef)
-      maxInterval = incrementUTF16(maxInterval);
+    // 0xff terms don't get the optimization... not worth the trouble.
+    if (maxInterval != 0xff)
+      maxInterval = incrementUTF8(maxInterval);
     int length = position + 1; /* position + maxTransition */
     if (linearUpperBound.bytes.length < length)
       linearUpperBound.bytes = new byte[length];
@@ -281,7 +277,7 @@ public class AutomatonTermsEnum extends 
       // if the next character is U+FFFF and is not part of the useful portion,
       // then by definition it puts us in a reject state, and therefore this
       // path is dead. there cannot be any higher transitions. backtrack.
-      c = incrementUTF16(c);
+      c = incrementUTF8(c);
       if (c == -1)
         return false;
     }
@@ -295,8 +291,8 @@ public class AutomatonTermsEnum extends 
     
     for (int i = 0; i < transitions.length; i++) {
       Transition transition = transitions[i];
-      if (compareToUTF16(transition.getMax(), c) >= 0) {
-        int nextChar = compareToUTF16(c, transition.getMin()) > 0 ? c : transition.getMin();
+      if (transition.getMax() >= c) {
+        int nextChar = Math.max(c, transition.getMin());
         // append either the next sequential char, or the minimum transition
         seekBytesRef.grow(seekBytesRef.length + 1);
         seekBytesRef.length++;
@@ -342,9 +338,9 @@ public class AutomatonTermsEnum extends 
   private boolean backtrack(int position) {
     while (position > 0) {
       int nextChar = seekBytesRef.bytes[position - 1] & 0xff;
-      // if a character is 0xef its a dead-end too,
-      // because there is no higher character in UTF-16 sort order.
-      nextChar = incrementUTF16(nextChar);
+      // if a byte is 0xff it's a dead-end too,
+      // because there is no higher byte in UTF-8 sort order.
+      nextChar = incrementUTF8(nextChar);
       if (nextChar != -1) {
         seekBytesRef.bytes[position - 1] = (byte) nextChar;
         seekBytesRef.length = position;
@@ -355,34 +351,11 @@ public class AutomatonTermsEnum extends 
     return false; /* all solutions exhausted */
   }
 
-  /* return the next utf8 byte in utf16 order, or -1 if exhausted */
-  private final int incrementUTF16(int utf8) {
+  /* return the next utf8 byte in utf8 order, or -1 if exhausted */
+  private final int incrementUTF8(int utf8) {
     switch(utf8) {
-      case 0xed: return 0xf0;
-      case 0xfd: return 0xee;
-      case 0xee: return 0xef;
-      case 0xef: return -1;
+      case 0xff: return -1;
       default: return utf8 + 1;
     }
   }
-  
-  int compareToUTF16(int aByte, int bByte) {
-    if (aByte != bByte) {
-      // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
-
-      // We know the terms are not equal, but, we may
-      // have to carefully fixup the bytes at the
-      // difference to match UTF16's sort order:
-      if (aByte >= 0xee && bByte >= 0xee) {
-        if ((aByte & 0xfe) == 0xee) {
-          aByte += 0x10;
-        }
-        if ((bByte&0xfe) == 0xee) {
-          bByte += 0x10;
-        }
-      }
-      return aByte - bByte;
-    }
-    return 0;
-  }
 }

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/ArrayUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/ArrayUtil.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/ArrayUtil.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/ArrayUtil.java Thu Jun 24 13:35:40 2010
@@ -327,6 +327,29 @@ public final class ArrayUtil {
       return array;
   }
 
+  public static boolean[] grow(boolean[] array, int minSize) {
+    if (array.length < minSize) {
+      boolean[] newArray = new boolean[oversize(minSize, 1)];
+      System.arraycopy(array, 0, newArray, 0, array.length);
+      return newArray;
+    } else
+      return array;
+  }
+
+  public static boolean[] grow(boolean[] array) {
+    return grow(array, 1 + array.length);
+  }
+
+  public static boolean[] shrink(boolean[] array, int targetSize) {
+    final int newSize = getShrinkSize(array.length, targetSize, 1);
+    if (newSize != array.length) {
+      boolean[] newArray = new boolean[newSize];
+      System.arraycopy(array, 0, newArray, 0, newSize);
+      return newArray;
+    } else
+      return array;
+  }
+
   public static char[] grow(char[] array, int minSize) {
     if (array.length < minSize) {
       char[] newArray = new char[oversize(minSize, RamUsageEstimator.NUM_BYTES_CHAR)];

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRef.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRef.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRef.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRef.java Thu Jun 24 13:35:40 2010
@@ -217,14 +217,7 @@ public final class BytesRef implements C
     bytes = ArrayUtil.grow(bytes, newLength);
   }
 
-  private final static Comparator<BytesRef> utf8SortedAsUTF16SortOrder = new UTF8SortedAsUTF16Comparator();
-
-  public static Comparator<BytesRef> getUTF8SortedAsUTF16Comparator() {
-    return utf8SortedAsUTF16SortOrder;
-  }
-
   /** Unsigned byte order comparison */
-  /*
   public int compareTo(BytesRef other) {
     if (this == other) return 0;
 
@@ -245,52 +238,18 @@ public final class BytesRef implements C
     // One is a prefix of the other, or, they are equal:
     return this.length - other.length;
   }
-  */
 
-  /** Lucene default index order. Currently the same as String.compareTo() (UTF16) but will change
-   * in the future to unsigned byte comparison. */
-  public int compareTo(BytesRef other) {
-    if (this == other) return 0;
+  private final static Comparator<BytesRef> utf8SortedAsUnicodeSortOrder = new UTF8SortedAsUnicodeComparator();
 
-    final byte[] aBytes = this.bytes;
-    int aUpto = this.offset;
-    final byte[] bBytes = other.bytes;
-    int bUpto = other.offset;
-
-    final int aStop = aUpto + Math.min(this.length, other.length);
-
-    while(aUpto < aStop) {
-      int aByte = aBytes[aUpto++] & 0xff;
-      int bByte = bBytes[bUpto++] & 0xff;
-      if (aByte != bByte) {
-
-        // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
-
-        // We know the terms are not equal, but, we may
-        // have to carefully fixup the bytes at the
-        // difference to match UTF16's sort order:
-        if (aByte >= 0xee && bByte >= 0xee) {
-          if ((aByte & 0xfe) == 0xee) {
-            aByte += 0x10;
-          }
-          if ((bByte&0xfe) == 0xee) {
-            bByte += 0x10;
-          }
-        }
-        return aByte - bByte;
-      }
-    }
-
-    // One is a prefix of the other, or, they are equal:
-    return this.length - other.length;
+  public static Comparator<BytesRef> getUTF8SortedAsUnicodeComparator() {
+    return utf8SortedAsUnicodeSortOrder;
   }
 
-  private static class UTF8SortedAsUTF16Comparator implements Comparator<BytesRef> {
+  private static class UTF8SortedAsUnicodeComparator implements Comparator<BytesRef> {
     // Only singleton
-    private UTF8SortedAsUTF16Comparator() {};
+    private UTF8SortedAsUnicodeComparator() {};
 
     public int compare(BytesRef a, BytesRef b) {
-
       final byte[] aBytes = a.bytes;
       int aUpto = a.offset;
       final byte[] bBytes = b.bytes;
@@ -307,32 +266,15 @@ public final class BytesRef implements C
         int aByte = aBytes[aUpto++] & 0xff;
         int bByte = bBytes[bUpto++] & 0xff;
 
-        if (aByte != bByte) {
-
-          // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
-
-          // We know the terms are not equal, but, we may
-          // have to carefully fixup the bytes at the
-          // difference to match UTF16's sort order:
-          if (aByte >= 0xee && bByte >= 0xee) {
-            if ((aByte & 0xfe) == 0xee) {
-              aByte += 0x10;
-            }
-            if ((bByte&0xfe) == 0xee) {
-              bByte += 0x10;
-            }
-          }
-          return aByte - bByte;
+        int diff = aByte - bByte;
+        if (diff != 0) {
+          return diff;
         }
       }
 
       // One is a prefix of the other, or, they are equal:
       return a.length - b.length;
-    }
-
-    public boolean equals(Object other) {
-      return this == other;
-    }
+    }    
   }
 
   public void writeExternal(ObjectOutput out)

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java Thu Jun 24 13:35:40 2010
@@ -358,7 +358,6 @@ final public class UnicodeUtil {
         out[outUpto++] = (char) ((chHalf & HALF_MASK) + UNI_SUR_LOW_START);
       }
     }
-
     offsets[upto] = outUpto;
     result.length = outUpto;
   }
@@ -483,7 +482,7 @@ final public class UnicodeUtil {
     }
   }
   */
-  public static final boolean validUTF16String(CharSequence s) {
+  public static boolean validUTF16String(CharSequence s) {
     final int size = s.length();
     for(int i=0;i<size;i++) {
       char ch = s.charAt(i);
@@ -507,7 +506,7 @@ final public class UnicodeUtil {
     return true;
   }
 
-  public static final boolean validUTF16String(char[] s, int size) {
+  public static boolean validUTF16String(char[] s, int size) {
     for(int i=0;i<size;i++) {
       char ch = s[i];
       if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
@@ -559,7 +558,7 @@ final public class UnicodeUtil {
   /** Returns the number of code points in this utf8
    *  sequence.  Behavior is undefined if the utf8 sequence
    *  is invalid.*/
-  public static final int codePointCount(BytesRef utf8) {
+  public static int codePointCount(BytesRef utf8) {
     int upto = utf8.offset;
     final int limit = utf8.offset + utf8.length;
     final byte[] bytes = utf8.bytes;
@@ -673,4 +672,33 @@ final public class UnicodeUtil {
       }
       return new String(chars, 0, w);
   }
+
+  // for debugging
+  public static String toHexString(String s) {
+    StringBuilder sb = new StringBuilder();
+    for(int i=0;i<s.length();i++) {
+      char ch = s.charAt(i);
+      if (i > 0) {
+        sb.append(' ');
+      }
+      if (ch < 128) {
+        sb.append(ch);
+      } else {
+        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
+          sb.append("H:");
+        } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
+          sb.append("L:");
+        } else if (ch > UNI_SUR_LOW_END) {
+          if (ch == 0xffff) {
+            sb.append("F:");
+          } else {
+            sb.append("E:");
+          }
+        }
+        
+        sb.append("0x" + Integer.toHexString(ch));
+      }
+    }
+    return sb.toString();
+  }
 }

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/Transition.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/Transition.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/Transition.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/automaton/Transition.java Thu Jun 24 13:35:40 2010
@@ -210,64 +210,4 @@ public class Transition implements Seria
   }
 
   public static final Comparator<Transition> CompareByMinMaxThenDest = new CompareByMinMaxThenDestSingle();
-
-  private static class UTF8InUTF16Order {
-    protected int compareCodePoint(int aByte, int bByte) {
-      if (aByte != bByte) {
-        // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
-
-        // We know the terms are not equal, but, we may
-        // have to carefully fixup the bytes at the
-        // difference to match UTF16's sort order:
-        if (aByte >= 0xee && bByte >= 0xee) {
-          if ((aByte & 0xfe) == 0xee) {
-            aByte += 0x10;
-          }
-          if ((bByte&0xfe) == 0xee) {
-            bByte += 0x10;
-          }
-        }
-        return aByte - bByte;
-      }
-      return 0;
-    }
-  }
-
-  private static final class CompareByDestThenMinMaxUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator<Transition> {
-    public int compare(Transition t1, Transition t2) {
-      if (t1.to != t2.to) {
-        if (t1.to == null) return -1;
-        else if (t2.to == null) return 1;
-        else if (t1.to.number < t2.to.number) return -1;
-        else if (t1.to.number > t2.to.number) return 1;
-      }
-      int minComp = compareCodePoint(t1.min, t2.min);
-      if (minComp != 0) return minComp;
-      int maxComp = compareCodePoint(t1.max, t2.max);
-      if (maxComp != 0) return maxComp;
-      return 0;
-    }
-  }
-
-  public static final Comparator<Transition> CompareByDestThenMinMaxUTF8InUTF16Order = new CompareByDestThenMinMaxUTF8InUTF16OrderSingle();
-
-  private static final class CompareByMinMaxThenDestUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator<Transition> {
-    public int compare(Transition t1, Transition t2) {
-      int minComp = compareCodePoint(t1.min, t2.min);
-      if (minComp != 0) return minComp;
-      int maxComp = compareCodePoint(t1.max, t2.max);
-      if (maxComp != 0) return maxComp;
-      if (t1.to != t2.to) {
-        if (t1.to == null) return -1;
-        else if (t2.to == null) return 1;
-        else if (t1.to.number < t2.to.number) return -1;
-        else if (t1.to.number > t2.to.number) return 1;
-      }
-      return 0;
-    }
-  }
-
-  public static final Comparator<Transition> CompareByMinMaxThenDestUTF8InUTF16Order = new CompareByMinMaxThenDestUTF8InUTF16OrderSingle();
-
-
 }

Modified: lucene/dev/trunk/lucene/src/test/org/apache/lucene/TestExternalCodecs.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/TestExternalCodecs.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/TestExternalCodecs.java (original)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/TestExternalCodecs.java Thu Jun 24 13:35:40 2010
@@ -179,7 +179,7 @@ public class TestExternalCodecs extends 
       
       @Override
       public Comparator<BytesRef> getComparator() {
-        return BytesRef.getUTF8SortedAsUTF16Comparator();
+        return BytesRef.getUTF8SortedAsUnicodeComparator();
       }
 
       @Override
@@ -263,7 +263,7 @@ public class TestExternalCodecs extends 
       
       @Override
       public Comparator<BytesRef> getComparator() {
-        return BytesRef.getUTF8SortedAsUTF16Comparator();
+        return BytesRef.getUTF8SortedAsUnicodeComparator();
       }
 
       @Override

Modified: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java Thu Jun 24 13:35:40 2010
@@ -4621,38 +4621,22 @@ public class TestIndexWriter extends Luc
   private void checkTermsOrder(IndexReader r, Set<String> allTerms, boolean isTop) throws IOException {
     TermsEnum terms = MultiFields.getFields(r).terms("f").iterator();
 
-    char[] last = new char[2];
-    int lastLength = 0;
+    BytesRef last = new BytesRef();
 
     Set<String> seenTerms = new HashSet<String>();
 
-    UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
     while(true) {
       final BytesRef term = terms.next();
       if (term == null) {
         break;
       }
-      UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
-      assertTrue(utf16.length <= 2);
 
-      // Make sure last term comes before current one, in
-      // UTF16 sort order
-      int i = 0;
-      for(i=0;i<lastLength && i<utf16.length;i++) {
-        assertTrue("UTF16 code unit " + termDesc(new String(utf16.result, 0, utf16.length)) + " incorrectly sorted after code unit " + termDesc(new String(last, 0, lastLength)), last[i] <= utf16.result[i]);
-        if (last[i] < utf16.result[i]) {
-          break;
-        }
-      }
-      // Terms should not have been identical
-      assertTrue(lastLength != utf16.length || i < lastLength);
+      assertTrue(last.compareTo(term) < 0);
+      last.copy(term);
 
-      final String s = new String(utf16.result, 0, utf16.length);
+      final String s = term.utf8ToString();
       assertTrue("term " + termDesc(s) + " was not added to index (count=" + allTerms.size() + ")", allTerms.contains(s));
       seenTerms.add(s);
-
-      System.arraycopy(utf16.result, 0, last, 0, utf16.length);
-      lastLength = utf16.length;
     }
 
     if (isTop) {

Modified: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/intblock/TestIntBlockCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/intblock/TestIntBlockCodec.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/intblock/TestIntBlockCodec.java (original)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/intblock/TestIntBlockCodec.java Thu Jun 24 13:35:40 2010
@@ -1,5 +1,22 @@
 package org.apache.lucene.index.codecs.intblock;
 
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.store.*;
 import org.apache.lucene.index.codecs.sep.*;
@@ -34,7 +51,7 @@ public class TestIntBlockCodec extends L
     out.close();
 
     IntIndexInput in = new SimpleIntBlockIndexInput(dir, "test", 128);
-    IntIndexInput.Reader r = in.reader();
+    in.reader();
     // read no ints
     in.close();
     dir.close();

Added: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java?rev=957545&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java (added)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java Thu Jun 24 13:35:40 2010
@@ -0,0 +1,227 @@
+package org.apache.lucene.index.codecs.preflex;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.IOException;
+import org.apache.lucene.store.*;
+import org.apache.lucene.index.*;
+import org.apache.lucene.util.*;
+
+
+/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
+  Directory.  A TermInfos can be written once, in order.  */
+
+final class TermInfosWriter {
+  /** The file format version, a negative number. */
+  public static final int FORMAT = -3;
+
+  // Changed strings to true utf8 with length-in-bytes not
+  // length-in-chars
+  public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4;
+
+  // NOTE: always change this if you switch to a new format!
+  public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
+
+  // Field metadata; used to translate between field numbers and field names.
+  private FieldInfos fieldInfos;
+  // File this writer appends to: ".tii" when isIndex is set, ".tis" otherwise.
+  private IndexOutput output;
+  // TermInfo of the most recently added term; pointers are written as deltas against it.
+  private TermInfo lastTi = new TermInfo();
+  // Number of entries added so far; written into the file header by close().
+  private long size;
+
+  // TODO: the default values for these two parameters should be settable from
+  // IndexWriter.  However, once that's done, folks will start setting them to
+  // ridiculous values and complaining that things don't work well, as with
+  // mergeFactor.  So, let's wait until a number of folks find that alternate
+  // values work better.  Note that both of these values are stored in the
+  // segment, so that it's safe to change these w/o rebuilding all indexes.
+
+  /** Expert: The fraction of terms in the "dictionary" which should be stored
+   * in RAM.  Smaller values use more memory, but make searching slightly
+   * faster, while larger values use less memory and make searching slightly
+   * slower.  Searching is typically not dominated by dictionary lookup, so
+   * tweaking this is rarely useful.*/
+  int indexInterval = 128;
+
+  /** Expert: The fraction of {@link TermDocs} entries stored in skip tables,
+   * used to accelerate {@link TermDocs#skipTo(int)}.  Larger values result in
+   * smaller indexes, greater acceleration, but fewer accelerable cases, while
+   * smaller values result in bigger indexes, less acceleration and more
+   * accelerable cases. More detailed experiments would be useful here. */
+  int skipInterval = 16;
+  
+  /** Expert: The maximum number of skip levels. Smaller values result in 
+   * slightly smaller indexes, but slower skipping in big posting lists.
+   */
+  int maxSkipLevels = 10;
+
+  // File pointer of the paired writer's output as of the previous index entry.
+  private long lastIndexPointer;
+  // True when this instance writes the ".tii" file rather than the ".tis" file.
+  private boolean isIndex;
+  // Bytes of the previously written term; writeTerm uses them to compute the shared prefix.
+  private byte[] lastTermBytes = new byte[10];
+  private int lastTermBytesLength = 0;
+  // Field number of the previously added term; -1 until the first add.
+  private int lastFieldNumber = -1;
+
+  // The paired writer: the index writer for the main writer, and vice versa.
+  private TermInfosWriter other;
+  // Scratch buffer reused by add(Term, TermInfo) for the UTF-8 form of the term text.
+  private BytesRef utf8Result = new BytesRef(10);
+
+  /** Creates the main (".tis") writer for a segment together with its paired
+   *  index (".tii") writer, and links the two through their {@code other}
+   *  fields.
+   *  @param interval value stored as {@code indexInterval} in both writers */
+  TermInfosWriter(Directory directory, String segment, FieldInfos fis,
+                  int interval)
+       throws IOException {
+    initialize(directory, segment, fis, interval, false);
+    other = new TermInfosWriter(directory, segment, fis, interval, true);
+    other.other = this;
+  }
+
+  /** Creates a single writer without linking it to a partner; the public
+   *  constructor uses this to build the index (".tii") half of the pair. */
+  private TermInfosWriter(Directory directory, String segment, FieldInfos fis,
+                          int interval, boolean isIndex) throws IOException {
+    initialize(directory, segment, fis, interval, isIndex);
+  }
+
+  /** Shared constructor logic: records the configuration, creates the output
+   *  file (".tii" for the index writer, ".tis" otherwise) and writes the
+   *  header.  The size slot in the header is filled in later by close(). */
+  private void initialize(Directory directory, String segment, FieldInfos fis,
+                          int interval, boolean isi) throws IOException {
+    fieldInfos = fis;
+    indexInterval = interval;
+    isIndex = isi;
+
+    final String extension = isi ? ".tii" : ".tis";
+    output = directory.createOutput(segment + extension);
+
+    // Header layout: format, size placeholder, then the three tuning values.
+    output.writeInt(FORMAT_CURRENT);
+    output.writeLong(0);
+    output.writeInt(indexInterval);
+    output.writeInt(skipInterval);
+    output.writeInt(maxSkipLevels);
+
+    // Runs only when assertions are enabled.
+    assert initUTF16Results();
+  }
+
+  /** Converts the term's text to UTF-8 in the reusable scratch buffer and
+   *  delegates to the byte-based add, using the field number that
+   *  fieldInfos reports for the term's field. */
+  void add(Term term, TermInfo ti) throws IOException {
+    final String text = term.text();
+    final int fieldNumber = fieldInfos.fieldNumber(term.field());
+    UnicodeUtil.UTF16toUTF8(text, 0, text.length(), utf8Result);
+    add(fieldNumber, utf8Result.bytes, utf8Result.length, ti);
+  }
+
+  // Currently used only by assert statements: scratch buffers that
+  // compareToLastTerm fills with the UTF-16 form of the previous and the
+  // current term.
+  UnicodeUtil.UTF16Result utf16Result1;
+  UnicodeUtil.UTF16Result utf16Result2;
+
+  // Currently used only by assert statements.  Allocates the two scratch
+  // buffers above; always returns true so that it can be called as
+  // "assert initUTF16Results()" and therefore only runs when assertions
+  // are enabled.
+  private boolean initUTF16Results() {
+    utf16Result1 = new UnicodeUtil.UTF16Result();
+    utf16Result2 = new UnicodeUtil.UTF16Result();
+    return true;
+  }
+
+  // Currently used only by assert statement.
+  // Compares the previously written term against the given one: first by
+  // field name, then by term text in UTF-16 code unit order.  Returns a
+  // negative value when the previous term sorts first, zero when they are
+  // equal, and a positive value otherwise.
+  private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) {
+
+    if (fieldNumber != lastFieldNumber) {
+      final String lastName = fieldInfos.fieldName(lastFieldNumber);
+      final String name = fieldInfos.fieldName(fieldNumber);
+      final int fieldCmp = lastName.compareTo(name);
+      // A field named "" (empty string) yields 0 here while nothing has
+      // been written yet, and that is fine.  It is not fine when two
+      // different field numbers map to the same name.
+      if (fieldCmp != 0 || lastFieldNumber != -1) {
+        return fieldCmp;
+      }
+    }
+
+    UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1);
+    UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2);
+
+    final char[] lastChars = utf16Result1.result;
+    final char[] chars = utf16Result2.result;
+    final int lastLen = utf16Result1.length;
+    final int curLen = utf16Result2.length;
+    final int common = Math.min(lastLen, curLen);
+
+    int pos = 0;
+    while (pos < common) {
+      final int diff = lastChars[pos] - chars[pos];
+      if (diff != 0) {
+        return diff;
+      }
+      pos++;
+    }
+
+    // One text is a prefix of the other: the shorter one sorts first.
+    return lastLen - curLen;
+  }
+
+  /** Adds a new <<fieldNumber, termBytes>, TermInfo> pair to the set.
+    The term must sort after the previously added one (field name first,
+    then term text in UTF-16 code unit order, see compareToLastTerm).
+    TermInfo pointers must not be smaller than those of the previous entry.
+    Both conditions are checked with assert statements only, so violations
+    go unnoticed when assertions are disabled.*/
+  void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti)
+    throws IOException {
+
+    // The second clause admits the entry that the main writer pushes into the
+    // index writer before its own first term: at that point both the new and
+    // the previous term are empty.
+    assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 ||
+      (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) :
+      "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" +
+        " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" +
+        " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8");
+
+    assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")";
+    assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")";
+
+    // Every indexInterval-th call on the main writer first records the
+    // PREVIOUS term in the index file.
+    if (!isIndex && size % indexInterval == 0)
+      other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi);                      // add an index term
+
+    writeTerm(fieldNumber, termBytes, termBytesLength);                        // write term
+
+    output.writeVInt(ti.docFreq);                       // write doc freq
+    output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
+    output.writeVLong(ti.proxPointer - lastTi.proxPointer);
+
+    // The skip offset is only stored for terms with at least skipInterval docs.
+    if (ti.docFreq >= skipInterval) {
+      output.writeVInt(ti.skipOffset);
+    }
+
+    // Index entries additionally store, as a delta, the current position in
+    // the paired (main) writer's file.
+    if (isIndex) {
+      output.writeVLong(other.output.getFilePointer() - lastIndexPointer);
+      lastIndexPointer = other.output.getFilePointer(); // write pointer
+    }
+
+    lastFieldNumber = fieldNumber;
+    lastTi.set(ti);
+    size++;
+  }
+
+  /** Writes one term prefix-compressed against the previously written term:
+   *  shared prefix length, suffix length, suffix bytes, then the field
+   *  number.  Afterwards the given term becomes the new "last term". */
+  private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
+       throws IOException {
+
+    // TODO: UTF16toUTF8 could tell us this prefix
+    // Count the leading bytes this term has in common with the last one:
+    final int maxShared = Math.min(termBytesLength, lastTermBytesLength);
+    int shared = 0;
+    while (shared < maxShared && termBytes[shared] == lastTermBytes[shared]) {
+      shared++;
+    }
+    final int suffixLength = termBytesLength - shared;
+
+    output.writeVInt(shared);                            // shared prefix length
+    output.writeVInt(suffixLength);                      // delta length
+    output.writeBytes(termBytes, shared, suffixLength);  // delta bytes
+    output.writeVInt(fieldNumber);                       // field num
+
+    // Remember this term.  Only the suffix needs copying because the first
+    // "shared" bytes of lastTermBytes already match.
+    if (termBytesLength > lastTermBytes.length) {
+      lastTermBytes = ArrayUtil.grow(lastTermBytes, termBytesLength);
+    }
+    System.arraycopy(termBytes, shared, lastTermBytes, shared, suffixLength);
+    lastTermBytesLength = termBytesLength;
+  }
+
+  /** Called to complete TermInfos creation.  Writes the final entry count
+   *  into the header slot reserved by initialize (directly after the 4-byte
+   *  format int) and closes the output.  When called on the main writer the
+   *  paired index writer is closed as well.  Both outputs are closed even if
+   *  writing the header or closing the first output fails, so that no file
+   *  handle is leaked.
+   *  @throws IOException if writing the header or closing an output fails */
+  void close() throws IOException {
+    try {
+      try {
+        output.seek(4);          // write size after format
+        output.writeLong(size);
+      } finally {
+        output.close();
+      }
+    } finally {
+      // Only the main writer owns the pair; the index writer must not close
+      // its partner back.
+      if (!isIndex)
+        other.close();
+    }
+  }
+
+}

Propchange: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java?rev=957545&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java (added)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java Thu Jun 24 13:35:40 2010
@@ -0,0 +1,212 @@
+package org.apache.lucene.index.codecs.preflex;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.store.*;
+import org.apache.lucene.index.*;
+import org.apache.lucene.index.codecs.*;
+import org.apache.lucene.util.*;
+
+import java.util.*;
+import java.io.IOException;
+
+public class TestSurrogates extends LuceneTestCase {
+
+  private static final boolean DEBUG = false;
+
+  // like Term, but uses BytesRef for text
+  private static class FieldAndText implements Comparable<FieldAndText> {
+    String field;
+    BytesRef text;
+
+    /** Captures the field name of the term and its text as a BytesRef. */
+    public FieldAndText(Term t) {
+      field = t.field();
+      text = new BytesRef(t.text());
+    }
+    
+    /** Orders by field name first and, within the same field, by the
+     *  BytesRef order of the text.  Field names are compared by value, not
+     *  by reference, so the result does not depend on the field strings
+     *  being the same String instance. */
+    public int compareTo(FieldAndText other) {
+      final int fieldCmp = field.compareTo(other.field);
+      if (fieldCmp != 0) {
+        return fieldCmp;
+      }
+      return text.compareTo(other.text);
+    }
+  }
+
+  // Builds a random string of fewer than 20 chars from a tiny alphabet
+  // ('a', U+E000 and the surrogate pair D800 DC00) to exacerbate the
+  // surrogate seeking required
+  private static String makeDifficultRandomUnicodeString(Random r) {
+    final int length = r.nextInt(20);
+    if (length == 0) {
+      // allow 0 length
+      return "";
+    }
+
+    final char[] chars = new char[length];
+    int pos = 0;
+    while (pos < length) {
+      final int choice = r.nextInt(5);
+      if (choice == 0 && pos < length - 1) {
+        // room for a full pair: hi surrogate followed by lo surrogate
+        chars[pos++] = (char) 0xd800;
+        chars[pos++] = (char) 0xdc00;
+      } else if (choice == 4) {
+        chars[pos++] = 0xe000;
+      } else {
+        // choices 1..3, and choice 0 when only one slot is left
+        chars[pos++] = 'a';
+      }
+    }
+
+    return new String(chars, 0, length);
+  }
+
+  /** Writes a segment's term dictionary with the test-local TermInfosWriter
+   *  and returns a SegmentInfo describing it.  Random terms are generated for
+   *  2 to 5 fields named "f0", "f1", ..., sorted with Term's natural order and
+   *  written in that order.  Every term that was written is also added to
+   *  {@code fieldTerms}, which is then re-sorted with
+   *  FieldAndText.compareTo, so that the caller can compare it against what
+   *  the reader enumerates.
+   *  NOTE: the returned SegmentInfo carries the number of written terms in
+   *  its doc count slot. */
+  private SegmentInfo makePreFlexSegment(Random r, String segName, Directory dir, FieldInfos fieldInfos, Codec codec, List<FieldAndText> fieldTerms) throws IOException {
+
+    final int numField = _TestUtil.nextInt(r, 2, 5);
+
+    List<Term> terms = new ArrayList<Term>();
+
+    // running counter appended to every term text, which keeps the texts distinct
+    int tc = 0;
+
+    for(int f=0;f<numField;f++) {
+      String field = "f" + f;
+      Term protoTerm = new Term(field);
+
+      fieldInfos.add(field, true, false, false, false, false, false, false);
+      final int numTerms = 10000*_TestUtil.getRandomMultiplier();
+      for(int i=0;i<numTerms;i++) {
+        String s;
+        // roughly one term in three comes from the surrogate-heavy alphabet
+        if (r.nextInt(3) == 1) {
+          s = makeDifficultRandomUnicodeString(r);
+        } else {
+          s = _TestUtil.randomUnicodeString(r);
+
+          // The surrogate dance uses 0xffff to seek-to-end
+          // of blocks.  Also, pre-4.0 indices are already
+          // guaranteed to not contain the char 0xffff since
+          // it's mapped during indexing:
+          s = s.replace((char) 0xffff, (char) 0xfffe);
+        }
+        terms.add(protoTerm.createTerm(s + "_" + (tc++)));
+      }
+    }
+
+    fieldInfos.write(dir, segName);
+
+    // sorts in UTF16 order, just like preflex:
+    Collections.sort(terms);
+
+    TermInfosWriter w = new TermInfosWriter(dir, segName, fieldInfos, 128);
+    // the same default-constructed TermInfo is written for every term
+    TermInfo ti = new TermInfo();
+    BytesRef utf8 = new BytesRef(10);
+    String lastText = null;
+    int uniqueTermCount = 0;
+    if (DEBUG) {
+      System.out.println("TEST: utf16 order:");
+    }
+    for(Term t : terms) {
+      FieldInfo fi = fieldInfos.fieldInfo(t.field());
+
+      String text = t.text();
+      // skip a term whose text equals the previous one
+      // NOTE(review): only the text is compared here, not the field
+      if (lastText != null && lastText.equals(text)) {
+        continue;
+      }
+      fieldTerms.add(new FieldAndText(t));
+      uniqueTermCount++;
+      lastText = text;
+      UnicodeUtil.UTF16toUTF8(text, 0, text.length(), utf8);
+
+      if (DEBUG) {
+        System.out.println("  " + toHexString(t));
+      }
+      w.add(fi.number, utf8.bytes, utf8.length, ti);
+    }
+    w.close();
+
+    // re-sort the expected terms with FieldAndText.compareTo (field, then
+    // BytesRef order of the text)
+    Collections.sort(fieldTerms);
+    if (DEBUG) {
+      System.out.println("\nTEST: codepoint order");
+      for(FieldAndText t: fieldTerms) {
+        System.out.println("  " + t.field + ":" + UnicodeUtil.toHexString(t.text.utf8ToString()));
+      }
+    }
+
+    // create empty .prx and .frq files for the segment
+    dir.createOutput(segName + ".prx").close();
+    dir.createOutput(segName + ".frq").close();
+
+    // !!hack alert!! stuffing uniqueTermCount in as docCount
+    return new SegmentInfo(segName, uniqueTermCount, dir, false, -1, null, false, true, codec);
+  }
+
+  /** Formats a term as "field:text", with the text rendered by
+   *  UnicodeUtil.toHexString; used for the DEBUG output. */
+  private String toHexString(Term t) {
+    final String hexText = UnicodeUtil.toHexString(t.text());
+    return t.field() + ":" + hexText;
+  }
+
+  /** Writes a segment with makePreFlexSegment, opens it through
+   *  PreFlexCodec and enumerates every field and term.  Checks that the
+   *  terms of each field come back in strictly increasing BytesRef order,
+   *  that field and text match the expected {@code fieldTerms} list entry by
+   *  entry, and that the total number of terms matches the number written.
+   *  The reader and the directory are both closed at the end. */
+  public void testSurrogatesOrder() throws Exception {
+    Directory dir = new MockRAMDirectory();
+
+    Codec codec = new PreFlexCodec();
+
+    Random r = newRandom();
+    FieldInfos fieldInfos = new FieldInfos();
+    List<FieldAndText> fieldTerms = new ArrayList<FieldAndText>();
+    SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms);
+
+    // hack alert!!  makePreFlexSegment stored the term count as docCount
+    int uniqueTermCount = si.docCount;
+
+    FieldsProducer fields = codec.fieldsProducer(new SegmentReadState(dir, si, fieldInfos, 1024, 1));
+    assertNotNull(fields);
+
+    if (DEBUG) {
+      System.out.println("\nTEST: now enum");
+    }
+    FieldsEnum fieldsEnum = fields.iterator();
+    String field;
+    UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
+
+    // index into fieldTerms; runs across all fields
+    int termCount = 0;
+    while((field = fieldsEnum.next()) != null) {
+      TermsEnum termsEnum = fieldsEnum.terms();
+      BytesRef text;
+      // previous term of the current field, reset for every field
+      BytesRef lastText = null;
+      while((text = termsEnum.next()) != null) {
+        UnicodeUtil.UTF8toUTF16(text.bytes, text.offset, text.length, utf16);
+        if (DEBUG) {
+          System.out.println("got term=" + field + ":" + UnicodeUtil.toHexString(new String(utf16.result, 0, utf16.length)));
+          System.out.println();
+        }
+        if (lastText == null) {
+          lastText = new BytesRef(text);
+        } else {
+          assertTrue(lastText.compareTo(text) < 0);
+          lastText.copy(text);
+        }
+        assertEquals(fieldTerms.get(termCount).field, field);
+        assertEquals(fieldTerms.get(termCount).text, text);
+        termCount++;
+      }
+      if (DEBUG) {
+        System.out.println("  no more terms for field=" + field);
+      }
+    }
+    assertEquals(uniqueTermCount, termCount);
+
+    fields.close();
+    // release the directory as well, so the test does not leak it
+    dir.close();
+  }
+}

Propchange: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/TestNumericUtils.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/TestNumericUtils.java?rev=957545&r1=957544&r2=957545&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/TestNumericUtils.java (original)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/TestNumericUtils.java Thu Jun 24 13:35:40 2010
@@ -30,7 +30,7 @@ public class TestNumericUtils extends Lu
       NumericUtils.longToPrefixCoded(l, 0, act);
       if (last!=null) {
         // test if smaller
-        assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUTF16Comparator().compare(last, act) < 0 );
+        assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUnicodeComparator().compare(last, act) < 0 );
         assertTrue("actual bigger than last (as String)", last.utf8ToString().compareTo(act.utf8ToString()) < 0 );
       }
       // test is back and forward conversion works
@@ -48,7 +48,7 @@ public class TestNumericUtils extends Lu
       NumericUtils.intToPrefixCoded(i, 0, act);
       if (last!=null) {
         // test if smaller
-        assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUTF16Comparator().compare(last, act) < 0 );
+        assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUnicodeComparator().compare(last, act) < 0 );
         assertTrue("actual bigger than last (as String)", last.utf8ToString().compareTo(act.utf8ToString()) < 0 );
       }
       // test is back and forward conversion works
@@ -84,7 +84,7 @@ public class TestNumericUtils extends Lu
     
     // check sort order (prefixVals should be ascending)
     for (int i=1; i<prefixVals.length; i++) {
-      assertTrue( "check sort order", BytesRef.getUTF8SortedAsUTF16Comparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 );
+      assertTrue( "check sort order", BytesRef.getUTF8SortedAsUnicodeComparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 );
     }
         
     // check the prefix encoding, lower precision should have the difference to original value equal to the lower removed bits
@@ -124,7 +124,7 @@ public class TestNumericUtils extends Lu
     
     // check sort order (prefixVals should be ascending)
     for (int i=1; i<prefixVals.length; i++) {
-      assertTrue( "check sort order", BytesRef.getUTF8SortedAsUTF16Comparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 );
+      assertTrue( "check sort order", BytesRef.getUTF8SortedAsUnicodeComparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 );
     }
     
     // check the prefix encoding, lower precision should have the difference to original value equal to the lower removed bits