Posted to commits@lucene.apache.org by si...@apache.org on 2011/03/28 12:50:48 UTC

svn commit: r1086181 [12/20] - in /lucene/dev/branches/docvalues: ./ dev-tools/eclipse/ dev-tools/idea/ dev-tools/idea/.idea/ dev-tools/idea/.idea/libraries/ dev-tools/idea/lucene/ dev-tools/idea/solr/ dev-tools/idea/solr/contrib/analysis-extras/ dev-t...

Modified: lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadNearQuery.java Mon Mar 28 10:50:28 2011
@@ -36,6 +36,8 @@ import org.apache.lucene.search.Explanat
 import org.apache.lucene.search.QueryUtils;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.SimilarityProvider;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.search.spans.SpanNearQuery;
@@ -50,7 +52,7 @@ public class TestPayloadNearQuery extend
   private IndexSearcher searcher;
   private IndexReader reader;
   private Directory directory;
-  private BoostingSimilarity similarity = new BoostingSimilarity();
+  private BoostingSimilarityProvider similarityProvider = new BoostingSimilarityProvider();
   private byte[] payload2 = new byte[]{2};
   private byte[] payload4 = new byte[]{4};
 
@@ -105,7 +107,7 @@ public class TestPayloadNearQuery extend
     directory = newDirectory();
     RandomIndexWriter writer = new RandomIndexWriter(random, directory, 
         newIndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer())
-        .setSimilarityProvider(similarity));
+        .setSimilarityProvider(similarityProvider));
     //writer.infoStream = System.out;
     for (int i = 0; i < 1000; i++) {
       Document doc = new Document();
@@ -118,7 +120,7 @@ public class TestPayloadNearQuery extend
     writer.close();
 
     searcher = newSearcher(reader);
-    searcher.setSimilarityProvider(similarity);
+    searcher.setSimilarityProvider(similarityProvider);
   }
 
   @Override
@@ -297,43 +299,57 @@ public class TestPayloadNearQuery extend
   }
 
   // must be static for weight serialization tests 
-  static class BoostingSimilarity extends DefaultSimilarity {
+  static class BoostingSimilarityProvider implements SimilarityProvider {
 
-    @Override public float scorePayload(int docId, int start, int end, byte[] payload, int offset, int length) {
-      //we know it is size 4 here, so ignore the offset/length
-      return payload[offset];
-    }
-    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    //Make everything else 1 so we see the effect of the payload
-    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    @Override public float computeNorm(FieldInvertState state) {
-      return state.getBoost();
-    }
-
-    @Override public float queryNorm(float sumOfSquaredWeights) {
+    public float queryNorm(float sumOfSquaredWeights) {
       return 1.0f;
     }
-
-    @Override public float sloppyFreq(int distance) {
+    
+    public float coord(int overlap, int maxOverlap) {
       return 1.0f;
     }
+    
+    public Similarity get(String field) {
+      return new DefaultSimilarity() {
+    
+        @Override 
+        public float scorePayload(int docId, int start, int end, byte[] payload, int offset, int length) {
+          //we know it is size 4 here, so ignore the offset/length
+          return payload[offset];
+        }
+    
+        //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+        //Make everything else 1 so we see the effect of the payload
+        //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+        @Override 
+        public float computeNorm(FieldInvertState state) {
+          return state.getBoost();
+        }
 
-    @Override public float coord(int overlap, int maxOverlap) {
-      return 1.0f;
-    }
-    @Override public float tf(float freq) {
-      return 1.0f;
-    }
-    // idf used for phrase queries
-    @Override public IDFExplanation idfExplain(Collection<Term> terms, IndexSearcher searcher) throws IOException {
-      return new IDFExplanation() {
-        @Override
-        public float getIdf() {
+        @Override 
+        public float sloppyFreq(int distance) {
+          return 1.0f;
+        }
+
+        @Override 
+        public float tf(float freq) {
           return 1.0f;
         }
-        @Override
-        public String explain() {
-          return "Inexplicable";
+    
+        // idf used for phrase queries
+        @Override 
+        public IDFExplanation idfExplain(Collection<Term> terms, IndexSearcher searcher) throws IOException {
+          return new IDFExplanation() {
+            @Override
+            public float getIdf() {
+              return 1.0f;
+            }
+        
+            @Override
+            public String explain() {
+              return "Inexplicable";
+            }
+          };
         }
       };
     }
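
A note on the API shape this hunk migrates to: query-wide factors (coord
and queryNorm) now live on a SimilarityProvider, while term- and
field-specific scoring stays on the Similarity returned from get(field).
A minimal self-contained sketch of that split, with an illustrative
class name:

    import org.apache.lucene.search.DefaultSimilarity;
    import org.apache.lucene.search.Similarity;
    import org.apache.lucene.search.SimilarityProvider;

    // Flat scoring: both query-wide factors pinned to 1, and one stock
    // Similarity handed out regardless of field name.
    class FlatSimilarityProvider implements SimilarityProvider {
      public float queryNorm(float sumOfSquaredWeights) { return 1.0f; }
      public float coord(int overlap, int maxOverlap) { return 1.0f; }
      public Similarity get(String field) {
        return new DefaultSimilarity(); // may differ per field in real use
      }
    }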

Modified: lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java Mon Mar 28 10:50:28 2011
@@ -18,8 +18,11 @@ package org.apache.lucene.search.payload
 
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.English;
+import org.apache.lucene.search.DefaultSimilarityProvider;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.QueryUtils;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.SimilarityProvider;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.CheckHits;
@@ -54,7 +57,7 @@ import java.io.IOException;
 public class TestPayloadTermQuery extends LuceneTestCase {
   private IndexSearcher searcher;
   private IndexReader reader;
-  private BoostingSimilarity similarity = new BoostingSimilarity();
+  private SimilarityProvider similarityProvider = new BoostingSimilarityProvider();
   private byte[] payloadField = new byte[]{1};
   private byte[] payloadMultiField1 = new byte[]{2};
   private byte[] payloadMultiField2 = new byte[]{4};
@@ -110,7 +113,7 @@ public class TestPayloadTermQuery extend
     directory = newDirectory();
     RandomIndexWriter writer = new RandomIndexWriter(random, directory, 
         newIndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer())
-                                                     .setSimilarityProvider(similarity).setMergePolicy(newInOrderLogMergePolicy()));
+                                                     .setSimilarityProvider(similarityProvider).setMergePolicy(newInOrderLogMergePolicy()));
     //writer.infoStream = System.out;
     for (int i = 0; i < 1000; i++) {
       Document doc = new Document();
@@ -125,7 +128,7 @@ public class TestPayloadTermQuery extend
     writer.close();
 
     searcher = newSearcher(reader);
-    searcher.setSimilarityProvider(similarity);
+    searcher.setSimilarityProvider(similarityProvider);
   }
 
   @Override
@@ -220,7 +223,12 @@ public class TestPayloadTermQuery extend
             new MaxPayloadFunction(), false);
 
     IndexSearcher theSearcher = new IndexSearcher(directory, true);
-    theSearcher.setSimilarityProvider(new FullSimilarity());
+    theSearcher.setSimilarityProvider(new DefaultSimilarityProvider() {
+      @Override
+      public Similarity get(String field) {
+        return new FullSimilarity();
+      }
+    });
     TopDocs hits = searcher.search(query, null, 100);
     assertTrue("hits is null and it shouldn't be", hits != null);
     assertTrue("hits Size: " + hits.totalHits + " is not: " + 100, hits.totalHits == 100);
@@ -283,46 +291,49 @@ public class TestPayloadTermQuery extend
   }
 
   // must be static for weight serialization tests 
-  static class BoostingSimilarity extends DefaultSimilarity {
-
-    // TODO: Remove warning after API has been finalized
-    @Override
-    public float scorePayload(int docId, int start, int end, byte[] payload, int offset, int length) {
-      //we know it is size 4 here, so ignore the offset/length
-      return payload[offset];
-    }
+  static class BoostingSimilarityProvider implements SimilarityProvider {
 
-    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    //Make everything else 1 so we see the effect of the payload
-    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    @Override
-    public float computeNorm(FieldInvertState state) {
-      return state.getBoost();
-    }
-
-    @Override
     public float queryNorm(float sumOfSquaredWeights) {
       return 1;
     }
-
-    @Override
-    public float sloppyFreq(int distance) {
-      return 1;
-    }
-
-    @Override
+    
     public float coord(int overlap, int maxOverlap) {
       return 1;
     }
 
-    @Override
-    public float idf(int docFreq, int numDocs) {
-      return 1;
-    }
+    public Similarity get(String field) {
+      return new DefaultSimilarity() {
+    
+        // TODO: Remove warning after API has been finalized
+        @Override
+        public float scorePayload(int docId, int start, int end, byte[] payload, int offset, int length) {
+          //we know it is size 4 here, so ignore the offset/length
+          return payload[offset];
+        }
 
-    @Override
-    public float tf(float freq) {
-      return freq == 0 ? 0 : 1;
+        //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+        //Make everything else 1 so we see the effect of the payload
+        //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+        @Override
+        public float computeNorm(FieldInvertState state) {
+          return state.getBoost();
+        }
+
+        @Override
+        public float sloppyFreq(int distance) {
+          return 1;
+        }
+
+        @Override
+        public float idf(int docFreq, int numDocs) {
+          return 1;
+        }
+
+        @Override
+        public float tf(float freq) {
+          return freq == 0 ? 0 : 1;
+        }
+      };
     }
   }
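
With tf, idf and the query-wide factors pinned to 1 and the norm reduced
to the field boost, a payload query's score collapses to whatever
scorePayload returns, which here is the raw payload byte. A hedged
sketch of the call pattern these tests exercise (field and term text are
illustrative):

    SpanQuery query = new PayloadTermQuery(new Term("multiField", "seventy"),
                                           new MaxPayloadFunction(), false);
    searcher.setSimilarityProvider(new BoostingSimilarityProvider());
    TopDocs hits = searcher.search(query, null, 100);
    // each hit then scores as the largest payload byte seen for the term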
 

Modified: lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java Mon Mar 28 10:50:28 2011
@@ -37,7 +37,7 @@ import org.apache.lucene.index.RandomInd
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Payload;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.DefaultSimilarityProvider;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.SimilarityProvider;
 import org.apache.lucene.search.TermQuery;
@@ -50,7 +50,7 @@ import org.apache.lucene.util.LuceneTest
 
 public class TestPayloadSpans extends LuceneTestCase {
   private IndexSearcher searcher;
-  private SimilarityProvider similarity = new DefaultSimilarity();
+  private SimilarityProvider similarity = new DefaultSimilarityProvider();
   protected IndexReader indexReader;
   private IndexReader closeIndexReader;
   private Directory directory;

Modified: lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/search/spans/TestSpans.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/search/spans/TestSpans.java?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/search/spans/TestSpans.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/search/spans/TestSpans.java Mon Mar 28 10:50:28 2011
@@ -17,11 +17,13 @@ package org.apache.lucene.search.spans;
  * limitations under the License.
  */
 
+import org.apache.lucene.search.DefaultSimilarityProvider;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.CheckHits;
 import org.apache.lucene.search.DefaultSimilarity;
 import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Similarity;
 import org.apache.lucene.search.SimilarityProvider;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.IndexSearcher;
@@ -410,10 +412,14 @@ public class TestSpans extends LuceneTes
     for (int i = 0; i < leaves.length; i++) {
       
      
-      final SimilarityProvider sim = new DefaultSimilarity() {
-        @Override
-        public float sloppyFreq(int distance) {
-          return 0.0f;
+      final SimilarityProvider sim = new DefaultSimilarityProvider() {
+        public Similarity get(String field) {
+          return new DefaultSimilarity() {
+            @Override
+            public float sloppyFreq(int distance) {
+              return 0.0f;
+            }
+          };
         }
       };
   

Modified: lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/util/TestCollectionUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/util/TestCollectionUtil.java?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/util/TestCollectionUtil.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/util/TestCollectionUtil.java Mon Mar 28 10:50:28 2011
@@ -20,6 +20,7 @@ package org.apache.lucene.util;
 import java.util.Arrays;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.LinkedList;
 import java.util.List;
 
 public class TestCollectionUtil extends LuceneTestCase {
@@ -89,9 +90,30 @@ public class TestCollectionUtil extends 
     }
   }
   
-  // should produce no exceptions
-  public void testEmptyArraySort() {
-    List<Integer> list = Collections.emptyList();
+  public void testEmptyListSort() {
+    // should produce no exceptions
+    List<Integer> list = Arrays.asList(new Integer[0]);
+    CollectionUtil.quickSort(list);
+    CollectionUtil.mergeSort(list);
+    CollectionUtil.insertionSort(list);
+    CollectionUtil.quickSort(list, Collections.reverseOrder());
+    CollectionUtil.mergeSort(list, Collections.reverseOrder());
+    CollectionUtil.insertionSort(list, Collections.reverseOrder());
+    
+    // check that empty non-random-access lists pass sorting without exceptions (as sorting is not needed)
+    list = new LinkedList<Integer>();
+    CollectionUtil.quickSort(list);
+    CollectionUtil.mergeSort(list);
+    CollectionUtil.insertionSort(list);
+    CollectionUtil.quickSort(list, Collections.reverseOrder());
+    CollectionUtil.mergeSort(list, Collections.reverseOrder());
+    CollectionUtil.insertionSort(list, Collections.reverseOrder());
+  }
+  
+  public void testOneElementListSort() {
+    // check that one-element non-random access lists pass sorting without ex (as sorting is not needed)
+    List<Integer> list = new LinkedList<Integer>();
+    list.add(1);
     CollectionUtil.quickSort(list);
     CollectionUtil.mergeSort(list);
     CollectionUtil.insertionSort(list);
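
These new tests pin down an edge case: a non-random-access list (such as
LinkedList) is expected to be rejected by the CollectionUtil sorts
unless no reordering is needed, so the empty and one-element cases
deserve explicit coverage. A hedged usage sketch of the API under test,
on an ordinary random-access list:

    List<Integer> l = new ArrayList<Integer>(Arrays.asList(3, 1, 2));
    CollectionUtil.quickSort(l);                              // [1, 2, 3]
    CollectionUtil.mergeSort(l, Collections.reverseOrder());  // [3, 2, 1]
    CollectionUtil.insertionSort(l);                          // [1, 2, 3] again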

Modified: lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java Mon Mar 28 10:50:28 2011
@@ -17,6 +17,10 @@ package org.apache.lucene.util;
  * limitations under the License.
  */
 
+/**
+ * @deprecated Remove when IndexableBinaryStringTools is removed.
+ */
+@Deprecated
 public class TestIndexableBinaryStringTools extends LuceneTestCase {
   private static final int NUM_RANDOM_TESTS = 2000 * RANDOM_MULTIPLIER;
   private static final int MAX_RANDOM_BINARY_LENGTH = 300 * RANDOM_MULTIPLIER;

Modified: lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/util/TestPriorityQueue.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/util/TestPriorityQueue.java?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/util/TestPriorityQueue.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/util/TestPriorityQueue.java Mon Mar 28 10:50:28 2011
@@ -23,8 +23,7 @@ public class TestPriorityQueue extends L
 
     private static class IntegerQueue extends PriorityQueue<Integer> {
         public IntegerQueue(int count) {
-            super();
-            initialize(count);
+            super(count);
         }
 
         @Override
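
(The hunk above is cut off mid-class.) After this change the complete
subclass would look roughly as follows, with lessThan as the usual
PriorityQueue extension point; the capacity now goes straight to the
superclass constructor instead of a separate initialize(count) call:

    private static class IntegerQueue extends PriorityQueue<Integer> {
        public IntegerQueue(int count) {
            super(count);  // sizes the heap up front
        }

        @Override
        protected boolean lessThan(Integer a, Integer b) {
            return (a < b);
        }
    }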

Modified: lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/util/TestSmallFloat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/util/TestSmallFloat.java?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/util/TestSmallFloat.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/util/TestSmallFloat.java Mon Mar 28 10:50:28 2011
@@ -28,8 +28,8 @@ public class TestSmallFloat extends Luce
     return Float.intBitsToFloat(bits);
   }
 
-  // original lucene floatToByte
-  static byte orig_floatToByte(float f) {
+  // original lucene floatToByte (since lucene 1.3)
+  static byte orig_floatToByte_v13(float f) {
     if (f < 0.0f)                                 // round negatives up to zero
       f = 0.0f;
 
@@ -53,6 +53,33 @@ public class TestSmallFloat extends Luce
     return (byte)((exponent << 3) | mantissa);    // pack into a byte
   }
 
+  // This is the original lucene floatToBytes (from v1.3)
+  // except with the underflow detection bug fixed for values like 5.8123817E-10f
+  static byte orig_floatToByte(float f) {
+    if (f < 0.0f)                                 // round negatives up to zero
+      f = 0.0f;
+
+    if (f == 0.0f)                                // zero is a special case
+      return 0;
+
+    int bits = Float.floatToIntBits(f);           // parse float into parts
+    int mantissa = (bits & 0xffffff) >> 21;
+    int exponent = (((bits >> 24) & 0x7f) - 63) + 15;
+
+    if (exponent > 31) {                          // overflow: use max value
+      exponent = 31;
+      mantissa = 7;
+    }
+
+    if (exponent < 0 || exponent == 0 && mantissa == 0) { // underflow: use min value
+      exponent = 0;
+      mantissa = 1;
+    }
+
+    return (byte)((exponent << 3) | mantissa);    // pack into a byte
+  }
+
+
   public void testByteToFloat() {
     for (int i=0; i<256; i++) {
       float f1 = orig_byteToFloat((byte)i);
@@ -68,6 +95,22 @@ public class TestSmallFloat extends Luce
   }
 
   public void testFloatToByte() {
+    assertEquals(0, orig_floatToByte_v13(5.8123817E-10f));       // verify the old bug (see LUCENE-2937)
+    assertEquals(1, orig_floatToByte(5.8123817E-10f));           // verify it's fixed in this test code
+    assertEquals(1, SmallFloat.floatToByte315(5.8123817E-10f));  // verify it's fixed
+
+    // test some constants
+    assertEquals(0, SmallFloat.floatToByte315(0));
+    assertEquals(1, SmallFloat.floatToByte315(Float.MIN_VALUE));             // underflow rounds up to smallest positive
+    assertEquals(255, SmallFloat.floatToByte315(Float.MAX_VALUE) & 0xff);    // overflow rounds down to largest positive
+    assertEquals(255, SmallFloat.floatToByte315(Float.POSITIVE_INFINITY) & 0xff);
+
+    // all negatives map to 0
+    assertEquals(0, SmallFloat.floatToByte315(-Float.MIN_VALUE));
+    assertEquals(0, SmallFloat.floatToByte315(-Float.MAX_VALUE));
+    assertEquals(0, SmallFloat.floatToByte315(Float.NEGATIVE_INFINITY));
+
+
     // up iterations for more exhaustive test after changing something
     int num = 100000 * RANDOM_MULTIPLIER;
     for (int i = 0; i < num; i++) {
@@ -95,8 +138,8 @@ public class TestSmallFloat extends Luce
       if (f==f) { // skip non-numbers
         byte b1 = orig_floatToByte(f);
         byte b2 = SmallFloat.floatToByte315(f);
-        if (b1!=b2) {
-          TestCase.fail("Failed floatToByte315 for float " + f);
+        if (b1!=b2 || b2==0 && f>0) {
+          fail("Failed floatToByte315 for float " + f + " source bits="+Integer.toHexString(i) + " float raw bits=" + Integer.toHexString(Float.floatToRawIntBits(i)));
         }
       }
       if (i==Integer.MAX_VALUE) break;
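
Why 5.8123817E-10f trips the old encoder: it is roughly 1.25 * 2^-31, and
pushing its raw bits through the decoding above yields exponent 0 with a
truncated mantissa of 0, i.e. a positive float that packs to the same
byte as true zero. A hedged walk-through using the same bit arithmetic:

    int bits = Float.floatToIntBits(5.8123817E-10f);   // exponent field is 96
    int mantissa = (bits & 0xffffff) >> 21;            // top 3 kept bits -> 0
    int exponent = (((bits >> 24) & 0x7f) - 63) + 15;  // (48 - 63) + 15 -> 0
    // The v13 check missed the (exponent == 0 && mantissa == 0) case, so this
    // value packed to (0 << 3) | 0 == 0, colliding with f == 0.0f. The fixed
    // check rounds it up to the smallest positive code, (0 << 3) | 1 == 1.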

Modified: lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java Mon Mar 28 10:50:28 2011
@@ -20,19 +20,12 @@ package org.apache.lucene.util.automaton
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
+import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
-import java.io.PrintStream;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-import java.util.Set;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.*;
 
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
@@ -54,6 +47,7 @@ import org.apache.lucene.util.LineFileDo
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util._TestUtil;
+import org.apache.lucene.util.automaton.fst.FST.Arc;
 
 public class TestFSTs extends LuceneTestCase {
 
@@ -445,9 +439,9 @@ public class TestFSTs extends LuceneTest
       }
 
       if (VERBOSE && pairs.size() <= 20 && fst != null) {
-        PrintStream ps = new PrintStream("out.dot");
-        Util.toDot(fst, ps);
-        ps.close();
+        Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
+        Util.toDot(fst, w, false, false);
+        w.close();
         System.out.println("SAVED out.dot");
       }
 
@@ -1095,7 +1089,7 @@ public class TestFSTs extends LuceneTest
 
     protected abstract T getOutput(IntsRef input, int ord) throws IOException;
 
-    public void run(int limit) throws IOException {
+    public void run(int limit, boolean verify) throws IOException {
       BufferedReader is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
       try {
         final IntsRef intsRef = new IntsRef(10);
@@ -1112,7 +1106,9 @@ public class TestFSTs extends LuceneTest
 
           ord++;
           if (ord % 500000 == 0) {
-            System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "...");
+            System.out.println(
+                String.format(Locale.ENGLISH, 
+                    "%6.2fs: %9d...", ((System.currentTimeMillis() - tStart) / 1000.0), ord));
           }
           if (ord >= limit) {
             break;
@@ -1126,11 +1122,14 @@ public class TestFSTs extends LuceneTest
           System.exit(0);
         }
 
+        if (dirOut == null)
+          return;
+
         System.out.println(ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes());
         if (fst.getNodeCount() < 100) {
-          PrintStream ps = new PrintStream("out.dot");
-          Util.toDot(fst, ps);
-          ps.close();
+          Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
+          Util.toDot(fst, w, false, false);
+          w.close();
           System.out.println("Wrote FST to out.dot");
         }
 
@@ -1141,6 +1140,10 @@ public class TestFSTs extends LuceneTest
 
         System.out.println("Saved FST to fst.bin.");
 
+        if (!verify) {
+          return;
+        }
+
         System.out.println("\nNow verify...");
 
         is.close();
@@ -1183,37 +1186,54 @@ public class TestFSTs extends LuceneTest
 
   // java -cp build/classes/test:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.automaton.fst.TestFSTs /x/tmp/allTerms3.txt out
   public static void main(String[] args) throws IOException {
-    final String wordsFileIn = args[0];
-    final String dirOut = args[1];
-    int idx = 2;
     int prune = 0;
     int limit = Integer.MAX_VALUE;
     int inputMode = 0;                             // utf8
     boolean storeOrds = false;
     boolean storeDocFreqs = false;
-    while(idx < args.length) {
+    boolean verify = true;
+    
+    String wordsFileIn = null;
+    String dirOut = null;
+
+    int idx = 0;
+    while (idx < args.length) {
       if (args[idx].equals("-prune")) {
-        prune = Integer.valueOf(args[1+idx]);
+        prune = Integer.valueOf(args[1 + idx]);
         idx++;
-      }
-      if (args[idx].equals("-limit")) {
-        limit = Integer.valueOf(args[1+idx]);
+      } else if (args[idx].equals("-limit")) {
+        limit = Integer.valueOf(args[1 + idx]);
         idx++;
-      }
-      if (args[idx].equals("-utf8")) {
+      } else if (args[idx].equals("-utf8")) {
         inputMode = 0;
-      }
-      if (args[idx].equals("-utf32")) {
+      } else if (args[idx].equals("-utf32")) {
         inputMode = 1;
-      }
-      if (args[idx].equals("-docFreq")) {
+      } else if (args[idx].equals("-docFreq")) {
         storeDocFreqs = true;
-      }
-      if (args[idx].equals("-ords")) {
+      } else if (args[idx].equals("-ords")) {
         storeOrds = true;
+      } else if (args[idx].equals("-noverify")) {
+        verify = false;
+      } else if (args[idx].startsWith("-")) {
+        System.err.println("Unrecognized option: " + args[idx]);
+        System.exit(-1);
+      } else {
+        if (wordsFileIn == null) {
+          wordsFileIn = args[idx];
+        } else if (dirOut == null) {
+          dirOut = args[idx];
+        } else {
+          System.err.println("Too many arguments, expected: input [output]");
+          System.exit(-1);
+        }
       }
       idx++;
     }
+    
+    if (wordsFileIn == null) {
+      System.err.println("No input file.");
+      System.exit(-1);
+    }
 
     // ord benefits from share, docFreqs don't:
 
@@ -1232,7 +1252,7 @@ public class TestFSTs extends LuceneTest
           return new PairOutputs.Pair<Long,Long>(o1.get(ord),
                                                  o2.get(_TestUtil.nextInt(rand, 1, 5000)));
         }
-      }.run(limit);
+      }.run(limit, verify);
     } else if (storeOrds) {
       // Store only ords
       final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
@@ -1241,7 +1261,7 @@ public class TestFSTs extends LuceneTest
         public Long getOutput(IntsRef input, int ord) {
           return outputs.get(ord);
         }
-      }.run(limit);
+      }.run(limit, verify);
     } else if (storeDocFreqs) {
       // Store only docFreq
       final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false);
@@ -1254,7 +1274,7 @@ public class TestFSTs extends LuceneTest
           }
           return outputs.get(_TestUtil.nextInt(rand, 1, 5000));
         }
-      }.run(limit);
+      }.run(limit, verify);
     } else {
       // Store nothing
       final NoOutputs outputs = NoOutputs.getSingleton();
@@ -1264,7 +1284,7 @@ public class TestFSTs extends LuceneTest
         public Object getOutput(IntsRef input, int ord) {
           return NO_OUTPUT;
         }
-      }.run(limit);
+      }.run(limit, verify);
     }
   }
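
The reworked option loop accepts the input path and the now-optional
output directory in any position relative to the flags, rejects unknown
flags, and adds -noverify to skip the verification pass; when the output
directory is omitted, the stats/out.dot/fst.bin output is skipped as
well. An illustrative invocation, in the style of the comment above main:

    // java -cp build/classes/test:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.automaton.fst.TestFSTs -prune 5 -limit 1000000 -utf8 -ords -noverify /x/tmp/allTerms3.txt out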
 
@@ -1320,4 +1340,85 @@ public class TestFSTs extends LuceneTest
     assertEquals(b, seekResult.input);
     assertEquals(42, (long) seekResult.output);
   }
+
+  /**
+   * Test state expansion (array format) on close-to-root states. Creates
+   * synthetic input that has one expanded state on each level.
+   * 
+   * @see "https://issues.apache.org/jira/browse/LUCENE-2933" 
+   */
+  public void testExpandedCloseToRoot() throws Exception {
+    class SyntheticData {
+      FST<Object> compile(String[] lines) throws IOException {
+        final NoOutputs outputs = NoOutputs.getSingleton();
+        final Object nothing = outputs.getNoOutput();
+        final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
+
+        int line = 0;
+        final BytesRef term = new BytesRef();
+        while (line < lines.length) {
+          String w = lines[line++];
+          if (w == null) {
+            break;
+          }
+          term.copy(w);
+          b.add(term, nothing);
+        }
+        
+        return b.finish();
+      }
+      
+      void generate(ArrayList<String> out, StringBuilder b, char from, char to,
+          int depth) {
+        if (depth == 0 || from == to) {
+          String seq = b.toString() + "_" + out.size() + "_end";
+          out.add(seq);
+        } else {
+          for (char c = from; c <= to; c++) {
+            b.append(c);
+            generate(out, b, from, c == to ? to : from, depth - 1);
+            b.deleteCharAt(b.length() - 1);
+          }
+        }
+      }
+
+      public int verifyStateAndBelow(FST<Object> fst, Arc<Object> arc, int depth) 
+        throws IOException {
+        if (fst.targetHasArcs(arc)) {
+          int childCount = 0;
+          for (arc = fst.readFirstTargetArc(arc, arc);; 
+               arc = fst.readNextArc(arc), childCount++)
+          {
+            boolean expanded = fst.isExpandedTarget(arc);
+            int children = verifyStateAndBelow(fst, new FST.Arc<Object>().copyFrom(arc), depth + 1);
+
+            assertEquals(
+                expanded,
+                (depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE && 
+                    children >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) ||
+                 children >= FST.FIXED_ARRAY_NUM_ARCS_DEEP);
+            if (arc.isLast()) break;
+          }
+
+          return childCount;
+        }
+        return 0;
+      }
+    }
+
+    // Sanity check.
+    assertTrue(FST.FIXED_ARRAY_NUM_ARCS_SHALLOW < FST.FIXED_ARRAY_NUM_ARCS_DEEP);
+    assertTrue(FST.FIXED_ARRAY_SHALLOW_DISTANCE >= 0);
+
+    SyntheticData s = new SyntheticData();
+
+    ArrayList<String> out = new ArrayList<String>();
+    StringBuilder b = new StringBuilder();
+    s.generate(out, b, 'a', 'i', 10);
+    String[] input = out.toArray(new String[out.size()]);
+    Arrays.sort(input);
+    FST<Object> fst = s.compile(input);
+    FST.Arc<Object> arc = fst.getFirstArc(new FST.Arc<Object>());
+    s.verifyStateAndBelow(fst, arc, 1);
+  }
 }

Modified: lucene/dev/branches/docvalues/modules/analysis/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/CHANGES.txt?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/CHANGES.txt (original)
+++ lucene/dev/branches/docvalues/modules/analysis/CHANGES.txt Mon Mar 28 10:50:28 2011
@@ -25,6 +25,10 @@ API Changes
  * LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles
    can be generated. (Chris Harris via Steven Rowe)
    
+ * LUCENE-2514, LUCENE-2551: JDK and ICU CollationKeyAnalyzers were changed to
+   use pure byte keys when Version >= 4.0. This cuts sort key size approximately
+   in half. (Robert Muir)
+   
 New Features
    
  * LUCENE-2413: Consolidated Solr analysis components into common. 

Modified: lucene/dev/branches/docvalues/modules/analysis/NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/NOTICE.txt?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/NOTICE.txt (original)
+++ lucene/dev/branches/docvalues/modules/analysis/NOTICE.txt Mon Mar 28 10:50:28 2011
@@ -1,5 +1,5 @@
 Apache Lucene
-Copyright 2006 The Apache Software Foundation
+Copyright 2011 The Apache Software Foundation
 
 This product includes software developed by
 The Apache Software Foundation (http://www.apache.org/).

Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java Mon Mar 28 10:50:28 2011
@@ -29,8 +29,8 @@ import org.apache.lucene.util.AttributeS
  * Emits the entire input as a single token.
  */
 public final class KeywordTokenizer extends Tokenizer {
-  
-  private static final int DEFAULT_BUFFER_SIZE = 256;
+  /** Default read buffer size */ 
+  public static final int DEFAULT_BUFFER_SIZE = 256;
 
   private boolean done = false;
   private int finalOffset;

Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java Mon Mar 28 10:50:28 2011
@@ -18,14 +18,13 @@ package org.apache.lucene.collation;
  */
 
 
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.apache.lucene.util.IndexableBinaryStringTools; // javadoc @link
+import org.apache.lucene.util.Version;
 
 import java.text.Collator;
 import java.io.Reader;
-import java.io.IOException;
 
 /**
  * <p>
@@ -33,8 +32,8 @@ import java.io.IOException;
  * </p>
  * <p>
  *   Converts the token into its {@link java.text.CollationKey}, and then
- *   encodes the CollationKey with 
- *   {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow 
+ *   encodes the CollationKey either directly or with 
+ *   {@link IndexableBinaryStringTools} (see <a href="#version">below</a>), to allow 
  *   it to be stored as an index term.
  * </p>
  * <p>
@@ -75,39 +74,49 @@ import java.io.IOException;
  *   CollationKeyAnalyzer to generate index terms, do not use
  *   ICUCollationKeyAnalyzer on the query side, or vice versa.
  * </p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating CollationKeyAnalyzer:
+ * <ul>
+ *   <li> As of 4.0, Collation Keys are directly encoded as bytes. Previous
+ *   versions will encode the bytes with {@link IndexableBinaryStringTools}.
+ * </ul>
  */
-public final class CollationKeyAnalyzer extends Analyzer {
-  private Collator collator;
-
-  public CollationKeyAnalyzer(Collator collator) {
+public final class CollationKeyAnalyzer extends ReusableAnalyzerBase {
+  private final Collator collator;
+  private final CollationAttributeFactory factory;
+  private final Version matchVersion;
+  
+  /**
+   * Create a new CollationKeyAnalyzer, using the specified collator.
+   * 
+   * @param matchVersion See <a href="#version">above</a>
+   * @param collator CollationKey generator
+   */
+  public CollationKeyAnalyzer(Version matchVersion, Collator collator) {
+    this.matchVersion = matchVersion;
     this.collator = collator;
-  }
-
-  @Override
-  public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new KeywordTokenizer(reader);
-    result = new CollationKeyFilter(result, collator);
-    return result;
+    this.factory = new CollationAttributeFactory(collator);
   }
   
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
+  /**
+   * @deprecated Use {@link CollationKeyAnalyzer#CollationKeyAnalyzer(Version, Collator)}
+   *   and specify a version instead. This ctor will be removed in Lucene 5.0
+   */
+  @Deprecated
+  public CollationKeyAnalyzer(Collator collator) {
+    this(Version.LUCENE_31, collator);
   }
-  
+
   @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader) 
-    throws IOException {
-    
-    SavedStreams streams = (SavedStreams)getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new KeywordTokenizer(reader);
-      streams.result = new CollationKeyFilter(streams.source, collator);
-      setPreviousTokenStream(streams);
+  protected TokenStreamComponents createComponents(String fieldName,
+      Reader reader) {
+    if (matchVersion.onOrAfter(Version.LUCENE_40)) {
+      KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
+      return new TokenStreamComponents(tokenizer, tokenizer);
     } else {
-      streams.source.reset(reader);
+      KeywordTokenizer tokenizer = new KeywordTokenizer(reader);
+      return new TokenStreamComponents(tokenizer, new CollationKeyFilter(tokenizer, collator));
     }
-    return streams.result;
   }
 }

Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java Mon Mar 28 10:50:28 2011
@@ -71,7 +71,10 @@ import java.text.Collator;
  *   CollationKeyFilter to generate index terms, do not use
  *   ICUCollationKeyFilter on the query side, or vice versa.
  * </p>
+ * @deprecated Use {@link CollationAttributeFactory} instead, which encodes
+ *  terms directly as bytes. This filter will be removed in Lucene 5.0
  */
+@Deprecated
 public final class CollationKeyFilter extends TokenFilter {
   private final Collator collator;
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
@@ -82,7 +85,9 @@ public final class CollationKeyFilter ex
    */
   public CollationKeyFilter(TokenStream input, Collator collator) {
     super(input);
-    this.collator = collator;
+    // clone in case the JRE doesn't properly sync,
+    // or to reduce contention in case it does
+    this.collator = (Collator) collator.clone();
   }
 
   @Override

Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/collation/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/collation/package.html?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/collation/package.html (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/java/org/apache/lucene/collation/package.html Mon Mar 28 10:50:28 2011
@@ -52,13 +52,12 @@
 <h2>Example Usages</h2>
 
 <h3>Farsi Range Queries</h3>
-<code><pre>
+<pre class="prettyprint">
   // "fa" Locale is not supported by Sun JDK 1.4 or 1.5
   Collator collator = Collator.getInstance(new Locale("ar"));
-  CollationKeyAnalyzer analyzer = new CollationKeyAnalyzer(collator);
+  CollationKeyAnalyzer analyzer = new CollationKeyAnalyzer(Version.LUCENE_40, collator);
   RAMDirectory ramDir = new RAMDirectory();
-  IndexWriter writer = new IndexWriter
-    (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+  IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
   Document doc = new Document();
   doc.add(new Field("content", "\u0633\u0627\u0628", 
                     Field.Store.YES, Field.Index.ANALYZED));
@@ -66,12 +65,9 @@
   writer.close();
   IndexSearcher is = new IndexSearcher(ramDir, true);
 
-  // The AnalyzingQueryParser in Lucene's contrib allows terms in range queries
-  // to be passed through an analyzer - Lucene's standard QueryParser does not
-  // allow this.
-  AnalyzingQueryParser aqp = new AnalyzingQueryParser("content", analyzer);
-  aqp.setLowercaseExpandedTerms(false);
-  
+  QueryParser aqp = new QueryParser(Version.LUCENE_40, "content", analyzer);
+  aqp.setAnalyzeRangeTerms(true);
+    
   // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
   // orders the U+0698 character before the U+0633 character, so the single
   // indexed Term above should NOT be returned by a ConstantScoreRangeQuery
@@ -80,15 +76,14 @@
   ScoreDoc[] result
     = is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
   assertEquals("The index Term should not be included.", 0, result.length);
-</pre></code>
+</pre>
 
 <h3>Danish Sorting</h3>
-<code><pre>
+<pre class="prettyprint">
   Analyzer analyzer 
-    = new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
+    = new CollationKeyAnalyzer(Version.LUCENE_40, Collator.getInstance(new Locale("da", "dk")));
   RAMDirectory indexStore = new RAMDirectory();
-  IndexWriter writer = new IndexWriter 
-    (indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+  IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(Version.LUCENE_40, analyzer));
   String[] tracer = new String[] { "A", "B", "C", "D", "E" };
   String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
   String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
@@ -99,7 +94,7 @@
     writer.addDocument(doc);
   }
   writer.close();
-  Searcher searcher = new IndexSearcher(indexStore, true);
+  IndexSearcher searcher = new IndexSearcher(indexStore, true);
   Sort sort = new Sort();
   sort.setSort(new SortField("contents", SortField.STRING));
   Query query = new MatchAllDocsQuery();
@@ -108,26 +103,25 @@
     Document doc = searcher.doc(result[i].doc);
     assertEquals(sortedTracerOrder[i], doc.getValues("tracer")[0]);
   }
-</pre></code>
+</pre>
 
 <h3>Turkish Case Normalization</h3>
-<code><pre>
+<pre class="prettyprint">
   Collator collator = Collator.getInstance(new Locale("tr", "TR"));
   collator.setStrength(Collator.PRIMARY);
-  Analyzer analyzer = new CollationKeyAnalyzer(collator);
+  Analyzer analyzer = new CollationKeyAnalyzer(Version.LUCENE_40, collator);
   RAMDirectory ramDir = new RAMDirectory();
-  IndexWriter writer = new IndexWriter
-    (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+  IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
   Document doc = new Document();
   doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
   writer.addDocument(doc);
   writer.close();
   IndexSearcher is = new IndexSearcher(ramDir, true);
-  QueryParser parser = new QueryParser("contents", analyzer);
+  QueryParser parser = new QueryParser(Version.LUCENE_40, "contents", analyzer);
   Query query = parser.parse("d\u0131gy");   // U+0131: dotless i
   ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
   assertEquals("The index Term should be included.", 1, result.length);
-</pre></code>
+</pre>
 
 <h2>Caveats and Comparisons</h2>
 <p>

Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java Mon Mar 28 10:50:28 2011
@@ -21,6 +21,8 @@ package org.apache.lucene.collation;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
@@ -36,11 +38,15 @@ import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.Document;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IndexableBinaryStringTools;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
 
 import java.io.StringReader;
 import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
 
 public abstract class CollationTestBase extends LuceneTestCase {
 
@@ -56,7 +62,9 @@ public abstract class CollationTestBase 
    * @param keyBits the result from 
    *  collator.getCollationKey(original).toByteArray()
    * @return The encoded collation key for the original String
+   * @deprecated only for testing deprecated filters
    */
+  @Deprecated
   protected String encodeCollationKey(byte[] keyBits) {
     // Ensure that the backing char[] array is large enough to hold the encoded
     // Binary String
@@ -65,10 +73,10 @@ public abstract class CollationTestBase 
     IndexableBinaryStringTools.encode(keyBits, 0, keyBits.length, encodedBegArray, 0, encodedLength);
     return new String(encodedBegArray);
   }
-    
-  public void testFarsiRangeFilterCollating(Analyzer analyzer, String firstBeg, 
-                                            String firstEnd, String secondBeg,
-                                            String secondEnd) throws Exception {
+  
+  public void testFarsiRangeFilterCollating(Analyzer analyzer, BytesRef firstBeg, 
+                                            BytesRef firstEnd, BytesRef secondBeg,
+                                            BytesRef secondEnd) throws Exception {
     RAMDirectory ramDir = new RAMDirectory();
     IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(
         TEST_VERSION_CURRENT, analyzer));
@@ -98,9 +106,9 @@ public abstract class CollationTestBase 
     searcher.close();
   }
  
-  public void testFarsiRangeQueryCollating(Analyzer analyzer, String firstBeg, 
-                                            String firstEnd, String secondBeg,
-                                            String secondEnd) throws Exception {
+  public void testFarsiRangeQueryCollating(Analyzer analyzer, BytesRef firstBeg, 
+                                            BytesRef firstEnd, BytesRef secondBeg,
+                                            BytesRef secondEnd) throws Exception {
     RAMDirectory ramDir = new RAMDirectory();
     IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(
         TEST_VERSION_CURRENT, analyzer));
@@ -126,8 +134,8 @@ public abstract class CollationTestBase 
     searcher.close();
   }
 
-  public void testFarsiTermRangeQuery(Analyzer analyzer, String firstBeg,
-      String firstEnd, String secondBeg, String secondEnd) throws Exception {
+  public void testFarsiTermRangeQuery(Analyzer analyzer, BytesRef firstBeg,
+      BytesRef firstEnd, BytesRef secondBeg, BytesRef secondEnd) throws Exception {
 
     RAMDirectory farsiIndex = new RAMDirectory();
     IndexWriter writer = new IndexWriter(farsiIndex, new IndexWriterConfig(
@@ -249,4 +257,77 @@ public abstract class CollationTestBase 
     }
     assertEquals(expectedResult, buff.toString());
   }
+  
+  private String randomString() {
+    // ideally we could do this!
+    // return _TestUtil.randomUnicodeString(random);
+    //
+    // http://bugs.icu-project.org/trac/ticket/8060
+    // http://bugs.icu-project.org/trac/ticket/7732
+    // ...
+    // 
+    // as a workaround, just test the BMP for now (and avoid 0xFFFF etc)
+    int length = _TestUtil.nextInt(random, 0, 10);
+    char chars[] = new char[length];
+    for (int i = 0; i < length; i++) {
+      if (random.nextBoolean()) {
+        chars[i] = (char) _TestUtil.nextInt(random, 0, 0xD7FF);
+      } else {
+        chars[i] = (char) _TestUtil.nextInt(random, 0xE000, 0xFFFD);
+      }
+    }
+    return new String(chars, 0, length);
+  }
+
+  public void assertThreadSafe(final Analyzer analyzer) throws Exception {
+    int numTestPoints = 100;
+    int numThreads = _TestUtil.nextInt(random, 3, 5);
+    final HashMap<String,BytesRef> map = new HashMap<String,BytesRef>();
+    
+    // create a map<String,SortKey> up front.
+    // then with multiple threads, generate sort keys for all the keys in the map
+    // and ensure they are the same as the ones we produced in serial fashion.
+
+    for (int i = 0; i < numTestPoints; i++) {
+      String term = randomString();
+      TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
+      TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
+      BytesRef bytes = termAtt.getBytesRef();
+      ts.reset();
+      assertTrue(ts.incrementToken());
+      termAtt.fillBytesRef();
+      // ensure we make a copy of the actual bytes too
+      map.put(term, new BytesRef(bytes));
+    }
+    
+    Thread threads[] = new Thread[numThreads];
+    for (int i = 0; i < numThreads; i++) {
+      threads[i] = new Thread() {
+        @Override
+        public void run() {
+          try {
+            for (Map.Entry<String,BytesRef> mapping : map.entrySet()) {
+              String term = mapping.getKey();
+              BytesRef expected = mapping.getValue();
+              TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
+              TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
+              BytesRef bytes = termAtt.getBytesRef();
+              ts.reset();
+              assertTrue(ts.incrementToken());
+              termAtt.fillBytesRef();
+              assertEquals(expected, bytes);
+            }
+          } catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+        }
+      };
+    }
+    for (int i = 0; i < numThreads; i++) {
+      threads[i].start();
+    }
+    for (int i = 0; i < numThreads; i++) {
+      threads[i].join();
+    }
+  }
 }

Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java Mon Mar 28 10:50:28 2011
@@ -19,6 +19,8 @@ package org.apache.lucene.collation;
 
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.util.BytesRef;
 
 import java.text.Collator;
 import java.util.Locale;
@@ -34,17 +36,19 @@ public class TestCollationKeyAnalyzer ex
   // RuleBasedCollator.  However, the Arabic Locale seems to order the Farsi
   // characters properly.
   private Collator collator = Collator.getInstance(new Locale("ar"));
-  private Analyzer analyzer = new CollationKeyAnalyzer(collator);
+  private Analyzer analyzer = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, collator);
 
-  private String firstRangeBeginning = encodeCollationKey
-    (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
-  private String firstRangeEnd = encodeCollationKey
-    (collator.getCollationKey(firstRangeEndOriginal).toByteArray());
-  private String secondRangeBeginning = encodeCollationKey
-    (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
-  private String secondRangeEnd = encodeCollationKey
-    (collator.getCollationKey(secondRangeEndOriginal).toByteArray());
+  private BytesRef firstRangeBeginning = new BytesRef(collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
+  private BytesRef firstRangeEnd = new BytesRef(collator.getCollationKey(firstRangeEndOriginal).toByteArray());
+  private BytesRef secondRangeBeginning = new BytesRef(collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
+  private BytesRef secondRangeEnd = new BytesRef(collator.getCollationKey(secondRangeEndOriginal).toByteArray());
   
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    assumeFalse("preflex format only supports UTF-8 encoded bytes", "PreFlex".equals(CodecProvider.getDefault().getDefaultFieldCodec()));
+  }
+
   public void testFarsiRangeFilterCollating() throws Exception {
     testFarsiRangeFilterCollating
       (analyzer, firstRangeBeginning, firstRangeEnd, 
@@ -65,13 +69,13 @@ public class TestCollationKeyAnalyzer ex
   
   public void testCollationKeySort() throws Exception {
     Analyzer usAnalyzer 
-      = new CollationKeyAnalyzer(Collator.getInstance(Locale.US));
+      = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.US));
     Analyzer franceAnalyzer 
-      = new CollationKeyAnalyzer(Collator.getInstance(Locale.FRANCE));
+      = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.FRANCE));
     Analyzer swedenAnalyzer 
-      = new CollationKeyAnalyzer(Collator.getInstance(new Locale("sv", "se")));
+      = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("sv", "se")));
     Analyzer denmarkAnalyzer 
-      = new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
+      = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("da", "dk")));
     
     // The ICU Collator and Sun java.text.Collator implementations differ in their
     // orderings - "BFJDH" is the ordering for java.text.Collator for Locale.US.
@@ -79,4 +83,14 @@ public class TestCollationKeyAnalyzer ex
     (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, 
      oStrokeFirst ? "BFJHD" : "BFJDH", "EACGI", "BJDFH", "BJDHF");
   }
+  
+  public void testThreadSafe() throws Exception {
+    int iters = 20 * RANDOM_MULTIPLIER;
+    for (int i = 0; i < iters; i++) {
+      Locale locale = randomLocale(random);
+      Collator collator = Collator.getInstance(locale);
+      collator.setStrength(Collator.PRIMARY);
+      assertThreadSafe(new CollationKeyAnalyzer(TEST_VERSION_CURRENT, collator));
+    }
+  }
 }

Modified: lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java Mon Mar 28 10:50:28 2011
@@ -21,12 +21,16 @@ package org.apache.lucene.collation;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.util.BytesRef;
 
 import java.text.Collator;
 import java.util.Locale;
 import java.io.Reader;
 
-
+/**
+ * @deprecated remove when CollationKeyFilter is removed.
+ */
+@Deprecated
 public class TestCollationKeyFilter extends CollationTestBase {
   // the sort order of Ø versus U depends on the version of the rules being used
  // for the inherited root locale: Ø's order isn't specified in Locale.US since
@@ -39,14 +43,14 @@ public class TestCollationKeyFilter exte
   private Collator collator = Collator.getInstance(new Locale("ar"));
   private Analyzer analyzer = new TestAnalyzer(collator);
 
-  private String firstRangeBeginning = encodeCollationKey
-    (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
-  private String firstRangeEnd = encodeCollationKey
-    (collator.getCollationKey(firstRangeEndOriginal).toByteArray());
-  private String secondRangeBeginning = encodeCollationKey
-    (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
-  private String secondRangeEnd = encodeCollationKey
-    (collator.getCollationKey(secondRangeEndOriginal).toByteArray());
+  private BytesRef firstRangeBeginning = new BytesRef(encodeCollationKey
+    (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray()));
+  private BytesRef firstRangeEnd = new BytesRef(encodeCollationKey
+    (collator.getCollationKey(firstRangeEndOriginal).toByteArray()));
+  private BytesRef secondRangeBeginning = new BytesRef(encodeCollationKey
+    (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray()));
+  private BytesRef secondRangeEnd = new BytesRef(encodeCollationKey
+    (collator.getCollationKey(secondRangeEndOriginal).toByteArray()));
 
   
   public final class TestAnalyzer extends Analyzer {

Modified: lucene/dev/branches/docvalues/modules/analysis/icu/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/icu/build.xml?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/icu/build.xml (original)
+++ lucene/dev/branches/docvalues/modules/analysis/icu/build.xml Mon Mar 28 10:50:28 2011
@@ -132,4 +132,9 @@ are part of the ICU4C package. See http:
       <classpath refid="classpath"/>
     </compile>
   </target>
+  
+  <target name="dist-maven" depends="contrib-build.dist-maven">
+    <m2-deploy-with-pom-template pom.xml="lib/lucene-icu4j-pom.xml.template"
+                                 jar.file="lib/icu4j-4_6.jar" />
+  </target>
 </project>

Modified: lucene/dev/branches/docvalues/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java Mon Mar 28 10:50:28 2011
@@ -19,24 +19,21 @@ package org.apache.lucene.collation;
 
 
 import com.ibm.icu.text.Collator;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.Tokenizer;
-
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 import org.apache.lucene.collation.CollationKeyAnalyzer; // javadocs
+import org.apache.lucene.util.IndexableBinaryStringTools; // javadocs
+import org.apache.lucene.util.Version;
 
 import java.io.Reader;
-import java.io.IOException;
-
 
 /**
  * <p>
  *   Filters {@link KeywordTokenizer} with {@link ICUCollationKeyFilter}.
  * <p>
  *   Converts the token into its {@link com.ibm.icu.text.CollationKey}, and
- *   then encodes the CollationKey with 
- *   {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow it to
+ *   then encodes the CollationKey either directly or with 
+ *   {@link IndexableBinaryStringTools} (see <a href="#version">below</a>), to allow it to
  *   be stored as an index term.
  * </p>
  * <p>
@@ -70,39 +67,48 @@ import java.io.IOException;
  *   generation timing and key length comparisons between ICU4J and
  *   java.text.Collator over several languages.
  * </p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating ICUCollationKeyAnalyzer:
+ * <ul>
+ *   <li> As of 4.0, Collation Keys are directly encoded as bytes. Previous
+ *   versions will encode the bytes with {@link IndexableBinaryStringTools}.
+ * </ul>
  */
-public final class ICUCollationKeyAnalyzer extends Analyzer {
-  private Collator collator;
-
-  public ICUCollationKeyAnalyzer(Collator collator) {
+public final class ICUCollationKeyAnalyzer extends ReusableAnalyzerBase {
+  private final Collator collator;
+  private final ICUCollationAttributeFactory factory;
+  private final Version matchVersion;
+
+  /**
+   * Create a new ICUCollationKeyAnalyzer, using the specified collator.
+   * 
+   * @param matchVersion See <a href="#version">above</a>
+   * @param collator CollationKey generator
+   */
+  public ICUCollationKeyAnalyzer(Version matchVersion, Collator collator) {
+    this.matchVersion = matchVersion;
     this.collator = collator;
+    this.factory = new ICUCollationAttributeFactory(collator);
   }
 
-  @Override
-  public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new KeywordTokenizer(reader);
-    result = new ICUCollationKeyFilter(result, collator);
-    return result;
-  }
-  
-  private class SavedStreams {
-    Tokenizer source;
-    TokenStream result;
+  /**
+   * @deprecated Use {@link ICUCollationKeyAnalyzer#ICUCollationKeyAnalyzer(Version, Collator)}
+   *   and specify a version instead. This ctor will be removed in Lucene 5.0
+   */
+  @Deprecated
+  public ICUCollationKeyAnalyzer(Collator collator) {
+    this(Version.LUCENE_31, collator);
   }
-  
+
   @Override
-  public TokenStream reusableTokenStream(String fieldName, Reader reader) 
-    throws IOException {
-    
-    SavedStreams streams = (SavedStreams)getPreviousTokenStream();
-    if (streams == null) {
-      streams = new SavedStreams();
-      streams.source = new KeywordTokenizer(reader);
-      streams.result = new ICUCollationKeyFilter(streams.source, collator);
-      setPreviousTokenStream(streams);
+  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+    if (matchVersion.onOrAfter(Version.LUCENE_40)) {
+      KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
+      return new TokenStreamComponents(tokenizer, tokenizer);
     } else {
-      streams.source.reset(reader);
+      KeywordTokenizer tokenizer = new KeywordTokenizer(reader);
+      return new TokenStreamComponents(tokenizer, new ICUCollationKeyFilter(tokenizer, collator));
     }
-    return streams.result;
   }
 }
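
The version switch in createComponents above means the same analyzer class produces differently encoded terms depending on the requested compatibility. A minimal usage sketch, assuming only the constructors shown in this diff (the locale choice is illustrative):

  import java.util.Locale;
  import com.ibm.icu.text.Collator;
  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.util.Version;

  Collator collator = Collator.getInstance(new Locale("fa"));
  // 4.0+: collation keys are indexed directly as raw bytes via the attribute factory
  Analyzer current = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
  // pre-4.0 behavior: keys go through ICUCollationKeyFilter and are
  // encoded with IndexableBinaryStringTools
  Analyzer legacy = new ICUCollationKeyAnalyzer(Version.LUCENE_31, collator);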

Modified: lucene/dev/branches/docvalues/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java Mon Mar 28 10:50:28 2011
@@ -68,7 +68,10 @@ import java.io.IOException;
  *   generation timing and key length comparisons between ICU4J and
  *   java.text.Collator over several languages.
  * </p>
+ *  @deprecated Use {@link ICUCollationAttributeFactory} instead, which encodes
+ *  terms directly as bytes. This filter will be removed in Lucene 5.0
  */
+@Deprecated
 public final class ICUCollationKeyFilter extends TokenFilter {
   private Collator collator = null;
   private RawCollationKey reusableKey = new RawCollationKey();
@@ -81,7 +84,12 @@ public final class ICUCollationKeyFilter
    */
   public ICUCollationKeyFilter(TokenStream input, Collator collator) {
     super(input);
-    this.collator = collator;
+    // clone the collator: see http://userguide.icu-project.org/collation/architecture
+    try {
+      this.collator = (Collator) collator.clone();
+    } catch (CloneNotSupportedException e) {
+      throw new RuntimeException(e);
+    }
   }
 
   @Override
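
The clone added above matters because ICU collators carry mutable internal state, so sharing one instance across token streams is unsafe (see the linked architecture page). A minimal sketch of the same defensive pattern for any consumer handed a shared Collator (names are illustrative):

  import com.ibm.icu.text.Collator;

  Collator shared = Collator.getInstance(new java.util.Locale("fa"));
  Collator mine;
  try {
    // each consumer works on its own clone instead of the shared instance
    mine = (Collator) shared.clone();
  } catch (CloneNotSupportedException e) {
    throw new RuntimeException(e);
  }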

Modified: lucene/dev/branches/docvalues/modules/analysis/icu/src/java/overview.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/icu/src/java/overview.html?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/icu/src/java/overview.html (original)
+++ lucene/dev/branches/docvalues/modules/analysis/icu/src/java/overview.html Mon Mar 28 10:50:28 2011
@@ -66,12 +66,12 @@ algorithm.
 </ul>
 <h2>Example Usages</h2>
 <h3>Tokenizing multilanguage text</h3>
-<code><pre>
+<pre class="prettyprint">
   /**
    * This tokenizer will work well in general for most languages.
    */
   Tokenizer tokenizer = new ICUTokenizer(reader);
-</pre></code>
+</pre>
 <hr/>
 <h1><a name="collation">Collation</a></h1>
 <p>
@@ -111,12 +111,11 @@ algorithm.
 <h2>Example Usages</h2>
 
 <h3>Farsi Range Queries</h3>
-<code><pre>
-  Collator collator = Collator.getInstance(new Locale("ar"));
-  ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(collator);
+<pre class="prettyprint">
+  Collator collator = Collator.getInstance(new ULocale("ar"));
+  ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
   RAMDirectory ramDir = new RAMDirectory();
-  IndexWriter writer = new IndexWriter
-    (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+  IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
   Document doc = new Document();
   doc.add(new Field("content", "\u0633\u0627\u0628", 
                     Field.Store.YES, Field.Index.ANALYZED));
@@ -124,12 +123,9 @@ algorithm.
   writer.close();
   IndexSearcher is = new IndexSearcher(ramDir, true);
 
-  // The AnalyzingQueryParser in Lucene's contrib allows terms in range queries
-  // to be passed through an analyzer - Lucene's standard QueryParser does not
-  // allow this.
-  AnalyzingQueryParser aqp = new AnalyzingQueryParser("content", analyzer);
-  aqp.setLowercaseExpandedTerms(false);
-  
+  QueryParser aqp = new QueryParser(Version.LUCENE_40, "content", analyzer);
+  aqp.setAnalyzeRangeTerms(true);
+    
   // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
   // orders the U+0698 character before the U+0633 character, so the single
   // indexed Term above should NOT be returned by a ConstantScoreRangeQuery
@@ -138,15 +134,14 @@ algorithm.
   ScoreDoc[] result
     = is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
   assertEquals("The index Term should not be included.", 0, result.length);
-</pre></code>
+</pre>
 
 <h3>Danish Sorting</h3>
-<code><pre>
+<pre class="prettyprint">
   Analyzer analyzer 
-    = new ICUCollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
+    = new ICUCollationKeyAnalyzer(Version.LUCENE_40, Collator.getInstance(new ULocale("da", "dk")));
   RAMDirectory indexStore = new RAMDirectory();
-  IndexWriter writer = new IndexWriter 
-    (indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+  IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(Version.LUCENE_40, analyzer));
   String[] tracer = new String[] { "A", "B", "C", "D", "E" };
   String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
   String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
@@ -157,7 +152,7 @@ algorithm.
     writer.addDocument(doc);
   }
   writer.close();
-  Searcher searcher = new IndexSearcher(indexStore, true);
+  IndexSearcher searcher = new IndexSearcher(indexStore, true);
   Sort sort = new Sort();
   sort.setSort(new SortField("contents", SortField.STRING));
   Query query = new MatchAllDocsQuery();
@@ -166,26 +161,25 @@ algorithm.
     Document doc = searcher.doc(result[i].doc);
     assertEquals(sortedTracerOrder[i], doc.getValues("tracer")[0]);
   }
-</pre></code>
+</pre>
 
 <h3>Turkish Case Normalization</h3>
-<code><pre>
-  Collator collator = Collator.getInstance(new Locale("tr", "TR"));
+<pre class="prettyprint">
+  Collator collator = Collator.getInstance(new ULocale("tr", "TR"));
   collator.setStrength(Collator.PRIMARY);
-  Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);
+  Analyzer analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
   RAMDirectory ramDir = new RAMDirectory();
-  IndexWriter writer = new IndexWriter
-    (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+  IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
   Document doc = new Document();
   doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
   writer.addDocument(doc);
   writer.close();
   IndexSearcher is = new IndexSearcher(ramDir, true);
-  QueryParser parser = new QueryParser("contents", analyzer);
+  QueryParser parser = new QueryParser(Version.LUCENE_40, "contents", analyzer);
   Query query = parser.parse("d\u0131gy");   // U+0131: dotless i
   ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
   assertEquals("The index Term should be included.", 1, result.length);
-</pre></code>
+</pre>
 
 <h2>Caveats and Comparisons</h2>
 <p>
@@ -245,7 +239,7 @@ algorithm.
 </ul>
 <h2>Example Usages</h2>
 <h3>Normalizing text to NFC</h3>
-<code><pre>
+<pre class="prettyprint">
   /**
    * Normalizer2 objects are unmodifiable and immutable.
    */
@@ -254,7 +248,7 @@ algorithm.
    * This filter will normalize to NFC.
    */
   TokenStream tokenstream = new ICUNormalizer2Filter(tokenizer, normalizer);
-</pre></code>
+</pre>
 <hr/>
 <h1><a name="casefolding">Case Folding</a></h1>
 <p>
@@ -284,12 +278,12 @@ this integration. To perform case-foldin
 </ul>
 <h2>Example Usages</h2>
 <h3>Lowercasing text</h3>
-<code><pre>
+<pre class="prettyprint">
   /**
    * This filter will case-fold and normalize to NFKC.
    */
   TokenStream tokenstream = new ICUNormalizer2Filter(tokenizer);
-</pre></code>
+</pre>
 <hr/>
 <h1><a name="searchfolding">Search Term Folding</a></h1>
 <p>
@@ -311,13 +305,13 @@ many character foldings recursively.
 </ul>
 <h2>Example Usages</h2>
 <h3>Removing accents</h3>
-<code><pre>
+<pre class="prettyprint">
   /**
    * This filter will case-fold, remove accents and other distinctions, and
    * normalize to NFKC.
    */
   TokenStream tokenstream = new ICUFoldingFilter(tokenizer);
-</pre></code>
+</pre>
 <hr/>
 <h1><a name="transform">Text Transformation</a></h1>
 <p>
@@ -341,19 +335,19 @@ and 
 </ul>
 <h2>Example Usages</h2>
 <h3>Convert Traditional to Simplified</h3>
-<code><pre>
+<pre class="prettyprint">
   /**
    * This filter will map Traditional Chinese to Simplified Chinese
    */
   TokenStream tokenstream = new ICUTransformFilter(tokenizer, Transliterator.getInstance("Traditional-Simplified"));
-</pre></code>
+</pre>
 <h3>Transliterate Serbian Cyrillic to Serbian Latin</h3>
-  <code><pre>
+<pre class="prettyprint">
   /**
    * This filter will map Serbian Cyrillic to Serbian Latin according to BGN rules
    */
   TokenStream tokenstream = new ICUTransformFilter(tokenizer, Transliterator.getInstance("Serbian-Latin/BGN"));
-</pre></code>
+</pre>
 <hr/>
 <h1><a name="backcompat">Backwards Compatibility</a></h1>
 <p>
@@ -365,7 +359,7 @@ a specific Unicode Version by using a {@
 </p>
 <h2>Example Usages</h2>
 <h3>Restricting normalization to Unicode 5.0</h3>
-<code><pre>
+<pre class="prettyprint">
   /**
    * This filter will do NFC normalization, but will ignore any characters that
    * did not exist as of Unicode 5.0. Because of the normalization stability policy
@@ -377,6 +371,6 @@ a specific Unicode Version by using a {@
     set.freeze(); 
     FilteredNormalizer2 unicode50 = new FilteredNormalizer2(normalizer, set);
     TokenStream tokenstream = new ICUNormalizer2Filter(tokenizer, unicode50);
-</pre></code>
+</pre>
 </body>
 </html>

Modified: lucene/dev/branches/docvalues/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java Mon Mar 28 10:50:28 2011
@@ -20,6 +20,8 @@ package org.apache.lucene.collation;
 
 import com.ibm.icu.text.Collator;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.util.BytesRef;
 
 import java.util.Locale;
 
@@ -27,17 +29,23 @@ import java.util.Locale;
 public class TestICUCollationKeyAnalyzer extends CollationTestBase {
 
   private Collator collator = Collator.getInstance(new Locale("fa"));
-  private Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);
+  private Analyzer analyzer = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, collator);
 
-  private String firstRangeBeginning = encodeCollationKey
+  private BytesRef firstRangeBeginning = new BytesRef
     (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
-  private String firstRangeEnd = encodeCollationKey
+  private BytesRef firstRangeEnd = new BytesRef
     (collator.getCollationKey(firstRangeEndOriginal).toByteArray());
-  private String secondRangeBeginning = encodeCollationKey
+  private BytesRef secondRangeBeginning = new BytesRef
     (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
-  private String secondRangeEnd = encodeCollationKey
+  private BytesRef secondRangeEnd = new BytesRef
     (collator.getCollationKey(secondRangeEndOriginal).toByteArray());
-  
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    assumeFalse("preflex format only supports UTF-8 encoded bytes", "PreFlex".equals(CodecProvider.getDefault().getDefaultFieldCodec()));
+  }
+
   public void testFarsiRangeFilterCollating() throws Exception {
     testFarsiRangeFilterCollating(analyzer, firstRangeBeginning, firstRangeEnd, 
                                   secondRangeBeginning, secondRangeEnd);
@@ -62,13 +70,13 @@ public class TestICUCollationKeyAnalyzer
   //  
   public void testCollationKeySort() throws Exception {
     Analyzer usAnalyzer = new ICUCollationKeyAnalyzer
-      (Collator.getInstance(Locale.US));
+      (TEST_VERSION_CURRENT, Collator.getInstance(Locale.US));
     Analyzer franceAnalyzer = new ICUCollationKeyAnalyzer
-      (Collator.getInstance(Locale.FRANCE));
+      (TEST_VERSION_CURRENT, Collator.getInstance(Locale.FRANCE));
     Analyzer swedenAnalyzer = new ICUCollationKeyAnalyzer
-      (Collator.getInstance(new Locale("sv", "se")));
+      (TEST_VERSION_CURRENT, Collator.getInstance(new Locale("sv", "se")));
     Analyzer denmarkAnalyzer = new ICUCollationKeyAnalyzer
-      (Collator.getInstance(new Locale("da", "dk")));
+      (TEST_VERSION_CURRENT, Collator.getInstance(new Locale("da", "dk")));
 
     // The ICU Collator and java.text.Collator implementations differ in their
     // orderings - "BFJHD" is the ordering for the ICU Collator for Locale.US.
@@ -76,4 +84,14 @@ public class TestICUCollationKeyAnalyzer
     (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer, 
      "BFJHD", "ECAGI", "BJDFH", "BJDHF");
   }
+  
+  public void testThreadSafe() throws Exception {
+    int iters = 20 * RANDOM_MULTIPLIER;
+    for (int i = 0; i < iters; i++) {
+      Locale locale = randomLocale(random);
+      Collator collator = Collator.getInstance(locale);
+      collator.setStrength(Collator.IDENTICAL);
+      assertThreadSafe(new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, collator));
+    }
+  }
 }

Modified: lucene/dev/branches/docvalues/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java (original)
+++ lucene/dev/branches/docvalues/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java Mon Mar 28 10:50:28 2011
@@ -22,24 +22,26 @@ import com.ibm.icu.text.Collator;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.util.BytesRef;
 
 import java.io.Reader;
 import java.util.Locale;
 
-
+/** @deprecated remove this when ICUCollationKeyFilter is removed */
+@Deprecated
 public class TestICUCollationKeyFilter extends CollationTestBase {
 
   private Collator collator = Collator.getInstance(new Locale("fa"));
   private Analyzer analyzer = new TestAnalyzer(collator);
 
-  private String firstRangeBeginning = encodeCollationKey
-    (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
-  private String firstRangeEnd = encodeCollationKey
-    (collator.getCollationKey(firstRangeEndOriginal).toByteArray());
-  private String secondRangeBeginning = encodeCollationKey
-    (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
-  private String secondRangeEnd = encodeCollationKey
-    (collator.getCollationKey(secondRangeEndOriginal).toByteArray());
+  private BytesRef firstRangeBeginning = new BytesRef(encodeCollationKey
+    (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray()));
+  private BytesRef firstRangeEnd = new BytesRef(encodeCollationKey
+    (collator.getCollationKey(firstRangeEndOriginal).toByteArray()));
+  private BytesRef secondRangeBeginning = new BytesRef(encodeCollationKey
+    (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray()));
+  private BytesRef secondRangeEnd = new BytesRef(encodeCollationKey
+    (collator.getCollationKey(secondRangeEndOriginal).toByteArray()));
 
   
   public final class TestAnalyzer extends Analyzer {

Modified: lucene/dev/branches/docvalues/modules/benchmark/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/benchmark/CHANGES.txt?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/benchmark/CHANGES.txt (original)
+++ lucene/dev/branches/docvalues/modules/benchmark/CHANGES.txt Mon Mar 28 10:50:28 2011
@@ -2,12 +2,54 @@ Lucene Benchmark Contrib Change Log
 
 The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
 
+03/24/2011
+  LUCENE-2977: WriteLineDocTask now automatically detects whether to write
+  gzip, bzip2, or plain text according to the output file extension.
+  The bzip.compression property of WriteLineDocTask was removed. (Doron Cohen)
+  
+03/23/2011
+  LUCENE-2980: Benchmark's ContentSource no longer requires lowercase file suffixes
+  for detecting the file type (gzip/bzip2/text). As part of this fix, an issue with
+  gzip input streams that remained open was worked around (see COMPRESS-127).
+  (Doron Cohen)
+
+03/22/2011
+  LUCENE-2978: Upgrade benchmark's commons-compress from 1.0 to 1.1, as
+  the move of gzip decompression in LUCENE-1540 from Java's GZIPInputStream
+  to commons-compress 1.0 made it 15 times slower. No such slowdown is
+  observed with 1.1. (Doron Cohen)
+  
+03/21/2011
+  LUCENE-2958: WriteLineDocTask improvements: optionally emit line docs even for empty
+  docs, and allow choosing which fields are added to the line file. For this, a header
+  line was added to the line file; that header is examined by LineDocSource. Old line
+  files that have no header line are handled as before, using the default header.
+  (Doron Cohen, Shai Erera, Mike McCandless)
+  
+03/21/2011
+  LUCENE-2964: Allow benchmark tasks from alternative packages,
+  specified through a new property "alt.tasks.packages".
+  (Doron Cohen, Shai Erera)
+  
+03/20/2011
+  LUCENE-2963: Easier way to run a benchmark, by calling Benchmark.exec(alg-file).
+  (Doron Cohen)
+  
+03/10/2011
+  LUCENE-2961: Removed lib/xml-apis.jar, since JVM 1.5+ already contains the
+  JAXP 1.3 interface classes it provides.
+
 02/05/2011
   LUCENE-1540: Improvements to contrib.benchmark for TREC collections. 
   ContentSource can now process plain text files, gzip files, and bzip2 files.
   TREC doc parsing now handles the TREC gov2 collection and TREC disks 4&5-CR 
   collection (both used by many TREC tasks). (Shai Erera, Doron Cohen)
-  
+
+01/31/2011
+  LUCENE-1591: Roll back to xerces-2.9.1-patched-XERCESJ-1257.jar to work around
+  XERCESJ-1257, which we hit on the current Wikipedia XML export
+  (ENWIKI-20110115-pages-articles.xml) with xerces-2.10.0.jar. (Mike McCandless)
+
 01/26/2011
   LUCENE-929: ExtractReuters first extracts to a tmp dir and then renames. That 
   way, if a previous extract attempt failed, "ant extract-reuters" will still 
@@ -33,7 +75,7 @@ The Benchmark contrib package contains c
 
 4/27/2010: WriteLineDocTask now supports multi-threading. Also, 
   StringBufferReader was renamed to StringBuilderReader and works on 
-  StringBuilder now. In addition, LongToEnglishCountentSource starts from 0
+  StringBuilder now. In addition, LongToEnglishContentSource starts from 0
   (instead of Long.MIN_VAL+10) and wraps around to MIN_VAL (if you ever hit 
   Long.MAX_VAL). (Shai Erera)
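
To make the LUCENE-2977 entry above concrete, the output codec is chosen purely from the output file's extension. A minimal sketch of that kind of dispatch (a hypothetical helper, not the actual WriteLineDocTask code; the exact extensions are assumptions):

  static String codecForFile(String fileName) {
    String name = fileName.toLowerCase(java.util.Locale.ENGLISH);
    if (name.endsWith(".gz") || name.endsWith(".gzip")) {
      return "gzip";       // wrap the output in a gzip stream
    } else if (name.endsWith(".bz2") || name.endsWith(".bzip2")) {
      return "bzip2";      // wrap the output in a bzip2 stream
    } else {
      return "plain";      // write plain text
    }
  }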
 

Modified: lucene/dev/branches/docvalues/modules/benchmark/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/benchmark/build.xml?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/benchmark/build.xml (original)
+++ lucene/dev/branches/docvalues/modules/benchmark/build.xml Mon Mar 28 10:50:28 2011
@@ -265,5 +265,8 @@
 		     />
     </target>
     
-    <target name="dist-maven" depends="jar-core,javadocs,contrib-build.dist-maven"/>
+    <target name="dist-maven" depends="jar-core,javadocs,contrib-build.dist-maven">
+      <m2-deploy-with-pom-template pom.xml="lib/lucene-xercesImpl-pom.xml.template"
+                                   jar.file="lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar" />
+    </target>
 </project>

Modified: lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java?rev=1086181&r1=1086180&r2=1086181&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java (original)
+++ lucene/dev/branches/docvalues/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java Mon Mar 28 10:50:28 2011
@@ -64,6 +64,9 @@ public class Benchmark {
     }
   }
   
+  /**
+   * Execute this benchmark.
+   */
   public synchronized void  execute() throws Exception {
     if (executed) {
       throw new IllegalStateException("Benchmark was already executed");
@@ -78,6 +81,14 @@ public class Benchmark {
    * @param args benchmark config and algorithm files
    */
   public static void main(String[] args) {
+    exec(args);
+  }
+
+  /**
+   * Utility: execute a benchmark from the command line.
+   * @param args a single argument is expected: the algorithm file
+   */
+  public static void exec(String[] args) {
     // verify command line args
     if (args.length < 1) {
       System.err.println("Usage: java Benchmark <algorithm file>");
@@ -115,7 +126,6 @@ public class Benchmark {
     System.out.println("####################");
     System.out.println("###  D O N E !!! ###");
     System.out.println("####################");
-
   }
 
   /**
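
With exec in place, embedding a benchmark run in other code is a one-liner; the command-line main simply delegates to it. A minimal usage sketch (the .alg path is illustrative):

  // run an algorithm file programmatically, exactly as `java Benchmark <file>` would
  Benchmark.exec(new String[] { "conf/micro-standard.alg" });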