Posted to commits@lucene.apache.org by si...@apache.org on 2011/03/30 11:17:42 UTC
svn commit: r1086876 [11/18] - in /lucene/dev/branches/realtime_search: ./
dev-tools/eclipse/ dev-tools/idea/ dev-tools/idea/.idea/libraries/
dev-tools/idea/lucene/ dev-tools/idea/solr/
dev-tools/idea/solr/contrib/analysis-extras/ dev-tools/idea/solr/c...
Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java Wed Mar 30 09:17:25 2011
@@ -18,8 +18,11 @@ package org.apache.lucene.search.payload
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.English;
+import org.apache.lucene.search.DefaultSimilarityProvider;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.QueryUtils;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.CheckHits;
@@ -54,7 +57,7 @@ import java.io.IOException;
public class TestPayloadTermQuery extends LuceneTestCase {
private IndexSearcher searcher;
private IndexReader reader;
- private BoostingSimilarity similarity = new BoostingSimilarity();
+ private SimilarityProvider similarityProvider = new BoostingSimilarityProvider();
private byte[] payloadField = new byte[]{1};
private byte[] payloadMultiField1 = new byte[]{2};
private byte[] payloadMultiField2 = new byte[]{4};
@@ -110,7 +113,7 @@ public class TestPayloadTermQuery extend
directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random, directory,
newIndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer())
- .setSimilarityProvider(similarity).setMergePolicy(newInOrderLogMergePolicy()));
+ .setSimilarityProvider(similarityProvider).setMergePolicy(newInOrderLogMergePolicy()));
//writer.infoStream = System.out;
for (int i = 0; i < 1000; i++) {
Document doc = new Document();
@@ -125,7 +128,7 @@ public class TestPayloadTermQuery extend
writer.close();
searcher = newSearcher(reader);
- searcher.setSimilarityProvider(similarity);
+ searcher.setSimilarityProvider(similarityProvider);
}
@Override
@@ -220,7 +223,12 @@ public class TestPayloadTermQuery extend
new MaxPayloadFunction(), false);
IndexSearcher theSearcher = new IndexSearcher(directory, true);
- theSearcher.setSimilarityProvider(new FullSimilarity());
+ theSearcher.setSimilarityProvider(new DefaultSimilarityProvider() {
+ @Override
+ public Similarity get(String field) {
+ return new FullSimilarity();
+ }
+ });
TopDocs hits = searcher.search(query, null, 100);
assertTrue("hits is null and it shouldn't be", hits != null);
assertTrue("hits Size: " + hits.totalHits + " is not: " + 100, hits.totalHits == 100);
@@ -283,46 +291,49 @@ public class TestPayloadTermQuery extend
}
// must be static for weight serialization tests
- static class BoostingSimilarity extends DefaultSimilarity {
-
- // TODO: Remove warning after API has been finalized
- @Override
- public float scorePayload(int docId, int start, int end, byte[] payload, int offset, int length) {
- //we know it is size 4 here, so ignore the offset/length
- return payload[offset];
- }
+ static class BoostingSimilarityProvider implements SimilarityProvider {
- //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- //Make everything else 1 so we see the effect of the payload
- //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- @Override
- public float computeNorm(FieldInvertState state) {
- return state.getBoost();
- }
-
- @Override
public float queryNorm(float sumOfSquaredWeights) {
return 1;
}
-
- @Override
- public float sloppyFreq(int distance) {
- return 1;
- }
-
- @Override
+
public float coord(int overlap, int maxOverlap) {
return 1;
}
- @Override
- public float idf(int docFreq, int numDocs) {
- return 1;
- }
+ public Similarity get(String field) {
+ return new DefaultSimilarity() {
+
+ // TODO: Remove warning after API has been finalized
+ @Override
+ public float scorePayload(int docId, int start, int end, byte[] payload, int offset, int length) {
+ //we know it is size 4 here, so ignore the offset/length
+ return payload[offset];
+ }
- @Override
- public float tf(float freq) {
- return freq == 0 ? 0 : 1;
+ //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ //Make everything else 1 so we see the effect of the payload
+ //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ @Override
+ public float computeNorm(FieldInvertState state) {
+ return state.getBoost();
+ }
+
+ @Override
+ public float sloppyFreq(int distance) {
+ return 1;
+ }
+
+ @Override
+ public float idf(int docFreq, int numDocs) {
+ return 1;
+ }
+
+ @Override
+ public float tf(float freq) {
+ return freq == 0 ? 0 : 1;
+ }
+ };
}
}
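For context on the refactoring above: the per-field Similarity now comes from a SimilarityProvider, so the test overrides get(String field) instead of subclassing DefaultSimilarity directly. A minimal sketch of that pattern, assuming only the trunk classes that appear in this diff (DefaultSimilarityProvider, DefaultSimilarity, IndexSearcher.setSimilarityProvider):

    import org.apache.lucene.search.DefaultSimilarity;
    import org.apache.lucene.search.DefaultSimilarityProvider;
    import org.apache.lucene.search.Similarity;

    // Sketch: a provider whose per-field Similarity flattens tf, mirroring
    // BoostingSimilarityProvider above so that only payload scores vary.
    class FlatTfSimilarityProvider extends DefaultSimilarityProvider {
      @Override
      public Similarity get(String field) {
        return new DefaultSimilarity() {
          @Override
          public float tf(float freq) {
            return freq == 0 ? 0 : 1; // neutralize tf so the payload score dominates
          }
        };
      }
    }
    // usage: searcher.setSimilarityProvider(new FlatTfSimilarityProvider());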
Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java Wed Mar 30 09:17:25 2011
@@ -37,7 +37,7 @@ import org.apache.lucene.index.RandomInd
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.Term;
-import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.DefaultSimilarityProvider;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.search.TermQuery;
@@ -50,7 +50,7 @@ import org.apache.lucene.util.LuceneTest
public class TestPayloadSpans extends LuceneTestCase {
private IndexSearcher searcher;
- private SimilarityProvider similarity = new DefaultSimilarity();
+ private SimilarityProvider similarity = new DefaultSimilarityProvider();
protected IndexReader indexReader;
private IndexReader closeIndexReader;
private Directory directory;
Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/spans/TestSpans.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/spans/TestSpans.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/spans/TestSpans.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/spans/TestSpans.java Wed Mar 30 09:17:25 2011
@@ -17,11 +17,13 @@ package org.apache.lucene.search.spans;
* limitations under the License.
*/
+import org.apache.lucene.search.DefaultSimilarityProvider;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.CheckHits;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.IndexSearcher;
@@ -410,10 +412,14 @@ public class TestSpans extends LuceneTes
for (int i = 0; i < leaves.length; i++) {
- final SimilarityProvider sim = new DefaultSimilarity() {
- @Override
- public float sloppyFreq(int distance) {
- return 0.0f;
+ final SimilarityProvider sim = new DefaultSimilarityProvider() {
+ public Similarity get(String field) {
+ return new DefaultSimilarity() {
+ @Override
+ public float sloppyFreq(int distance) {
+ return 0.0f;
+ }
+ };
}
};
Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestCollectionUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestCollectionUtil.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestCollectionUtil.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestCollectionUtil.java Wed Mar 30 09:17:25 2011
@@ -20,6 +20,7 @@ package org.apache.lucene.util;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.LinkedList;
import java.util.List;
public class TestCollectionUtil extends LuceneTestCase {
@@ -89,9 +90,30 @@ public class TestCollectionUtil extends
}
}
- // should produce no exceptions
- public void testEmptyArraySort() {
- List<Integer> list = Collections.emptyList();
+ public void testEmptyListSort() {
+ // should produce no exceptions
+ List<Integer> list = Arrays.asList(new Integer[0]);
+ CollectionUtil.quickSort(list);
+ CollectionUtil.mergeSort(list);
+ CollectionUtil.insertionSort(list);
+ CollectionUtil.quickSort(list, Collections.reverseOrder());
+ CollectionUtil.mergeSort(list, Collections.reverseOrder());
+ CollectionUtil.insertionSort(list, Collections.reverseOrder());
+
+ // check that empty non-random access lists pass sorting without ex (as sorting is not needed)
+ list = new LinkedList<Integer>();
+ CollectionUtil.quickSort(list);
+ CollectionUtil.mergeSort(list);
+ CollectionUtil.insertionSort(list);
+ CollectionUtil.quickSort(list, Collections.reverseOrder());
+ CollectionUtil.mergeSort(list, Collections.reverseOrder());
+ CollectionUtil.insertionSort(list, Collections.reverseOrder());
+ }
+
+ public void testOneElementListSort() {
+ // check that one-element non-random access lists pass sorting without ex (as sorting is not needed)
+ List<Integer> list = new LinkedList<Integer>();
+ list.add(1);
CollectionUtil.quickSort(list);
CollectionUtil.mergeSort(list);
CollectionUtil.insertionSort(list);
Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java Wed Mar 30 09:17:25 2011
@@ -17,6 +17,10 @@ package org.apache.lucene.util;
* limitations under the License.
*/
+/**
+ * @deprecated Remove when IndexableBinaryStringTools is removed.
+ */
+@Deprecated
public class TestIndexableBinaryStringTools extends LuceneTestCase {
private static final int NUM_RANDOM_TESTS = 2000 * RANDOM_MULTIPLIER;
private static final int MAX_RANDOM_BINARY_LENGTH = 300 * RANDOM_MULTIPLIER;
Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestPriorityQueue.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestPriorityQueue.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestPriorityQueue.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestPriorityQueue.java Wed Mar 30 09:17:25 2011
@@ -23,8 +23,7 @@ public class TestPriorityQueue extends L
private static class IntegerQueue extends PriorityQueue<Integer> {
public IntegerQueue(int count) {
- super();
- initialize(count);
+ super(count);
}
@Override
Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java Wed Mar 30 09:17:25 2011
@@ -25,16 +25,7 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-import java.util.Set;
+import java.util.*;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
@@ -1098,7 +1089,7 @@ public class TestFSTs extends LuceneTest
protected abstract T getOutput(IntsRef input, int ord) throws IOException;
- public void run(int limit) throws IOException {
+ public void run(int limit, boolean verify) throws IOException {
BufferedReader is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
try {
final IntsRef intsRef = new IntsRef(10);
@@ -1115,7 +1106,9 @@ public class TestFSTs extends LuceneTest
ord++;
if (ord % 500000 == 0) {
- System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "...");
+ System.out.println(
+ String.format(Locale.ENGLISH,
+ "%6.2fs: %9d...", ((System.currentTimeMillis() - tStart) / 1000.0), ord));
}
if (ord >= limit) {
break;
@@ -1129,6 +1122,9 @@ public class TestFSTs extends LuceneTest
System.exit(0);
}
+ if (dirOut == null)
+ return;
+
System.out.println(ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes());
if (fst.getNodeCount() < 100) {
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
@@ -1144,6 +1140,10 @@ public class TestFSTs extends LuceneTest
System.out.println("Saved FST to fst.bin.");
+ if (!verify) {
+ return;
+ }
+
System.out.println("\nNow verify...");
is.close();
@@ -1186,37 +1186,54 @@ public class TestFSTs extends LuceneTest
// java -cp build/classes/test:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.automaton.fst.TestFSTs /x/tmp/allTerms3.txt out
public static void main(String[] args) throws IOException {
- final String wordsFileIn = args[0];
- final String dirOut = args[1];
- int idx = 2;
int prune = 0;
int limit = Integer.MAX_VALUE;
int inputMode = 0; // utf8
boolean storeOrds = false;
boolean storeDocFreqs = false;
- while(idx < args.length) {
+ boolean verify = true;
+
+ String wordsFileIn = null;
+ String dirOut = null;
+
+ int idx = 0;
+ while (idx < args.length) {
if (args[idx].equals("-prune")) {
- prune = Integer.valueOf(args[1+idx]);
+ prune = Integer.valueOf(args[1 + idx]);
idx++;
- }
- if (args[idx].equals("-limit")) {
- limit = Integer.valueOf(args[1+idx]);
+ } else if (args[idx].equals("-limit")) {
+ limit = Integer.valueOf(args[1 + idx]);
idx++;
- }
- if (args[idx].equals("-utf8")) {
+ } else if (args[idx].equals("-utf8")) {
inputMode = 0;
- }
- if (args[idx].equals("-utf32")) {
+ } else if (args[idx].equals("-utf32")) {
inputMode = 1;
- }
- if (args[idx].equals("-docFreq")) {
+ } else if (args[idx].equals("-docFreq")) {
storeDocFreqs = true;
- }
- if (args[idx].equals("-ords")) {
+ } else if (args[idx].equals("-ords")) {
storeOrds = true;
+ } else if (args[idx].equals("-noverify")) {
+ verify = false;
+ } else if (args[idx].startsWith("-")) {
+ System.err.println("Unrecognized option: " + args[idx]);
+ System.exit(-1);
+ } else {
+ if (wordsFileIn == null) {
+ wordsFileIn = args[idx];
+ } else if (dirOut == null) {
+ dirOut = args[idx];
+ } else {
+ System.err.println("Too many arguments, expected: input [output]");
+ System.exit(-1);
+ }
}
idx++;
}
+
+ if (wordsFileIn == null) {
+ System.err.println("No input file.");
+ System.exit(-1);
+ }
// ord benefits from share, docFreqs don't:
@@ -1235,7 +1252,7 @@ public class TestFSTs extends LuceneTest
return new PairOutputs.Pair<Long,Long>(o1.get(ord),
o2.get(_TestUtil.nextInt(rand, 1, 5000)));
}
- }.run(limit);
+ }.run(limit, verify);
} else if (storeOrds) {
// Store only ords
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
@@ -1244,7 +1261,7 @@ public class TestFSTs extends LuceneTest
public Long getOutput(IntsRef input, int ord) {
return outputs.get(ord);
}
- }.run(limit);
+ }.run(limit, verify);
} else if (storeDocFreqs) {
// Store only docFreq
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false);
@@ -1257,7 +1274,7 @@ public class TestFSTs extends LuceneTest
}
return outputs.get(_TestUtil.nextInt(rand, 1, 5000));
}
- }.run(limit);
+ }.run(limit, verify);
} else {
// Store nothing
final NoOutputs outputs = NoOutputs.getSingleton();
@@ -1267,7 +1284,7 @@ public class TestFSTs extends LuceneTest
public Object getOutput(IntsRef input, int ord) {
return NO_OUTPUT;
}
- }.run(limit);
+ }.run(limit, verify);
}
}
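For reference, main() above now takes the words file and optional output directory as positional arguments, and the new -noverify flag skips the verification pass that run(limit, verify) would otherwise perform. An invocation in the spirit of the comment near the top of main() might then look like this (classpath and paths are the illustrative ones from that comment):

    java -cp build/classes/test:build/classes/java:lib/junit-4.7.jar \
      org.apache.lucene.util.automaton.fst.TestFSTs -ords -limit 1000000 -noverify /x/tmp/allTerms3.txt out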
Modified: lucene/dev/branches/realtime_search/modules/analysis/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/CHANGES.txt?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/CHANGES.txt (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/CHANGES.txt Wed Mar 30 09:17:25 2011
@@ -25,6 +25,10 @@ API Changes
* LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles
can be generated. (Chris Harris via Steven Rowe)
+ * LUCENE-2514, LUCENE-2551: JDK and ICU CollationKeyAnalyzers were changed to
+ use pure byte keys when Version >= 4.0. This cuts sort key size approximately
+ in half. (Robert Muir)
+
New Features
* LUCENE-2413: Consolidated Solr analysis components into common.
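The LUCENE-2514/LUCENE-2551 entry above refers to the Version-gated constructors added later in this commit. A minimal sketch of the two modes, assuming the CollationKeyAnalyzer constructors shown in the diff below (the locale is just an example):

    import java.text.Collator;
    import java.util.Locale;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.collation.CollationKeyAnalyzer;
    import org.apache.lucene.util.Version;

    public class CollationKeyModes {
      public static void main(String[] args) {
        Collator collator = Collator.getInstance(new Locale("da", "dk"));
        // Version >= 4.0: collation keys are indexed directly as bytes.
        Analyzer byteKeys = new CollationKeyAnalyzer(Version.LUCENE_40, collator);
        // Pre-4.0 behaviour: keys are re-encoded with IndexableBinaryStringTools,
        // producing index terms roughly twice as large.
        Analyzer encodedKeys = new CollationKeyAnalyzer(Version.LUCENE_31, collator);
      }
    }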
Modified: lucene/dev/branches/realtime_search/modules/analysis/NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/NOTICE.txt?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/NOTICE.txt (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/NOTICE.txt Wed Mar 30 09:17:25 2011
@@ -1,5 +1,5 @@
Apache Lucene
-Copyright 2006 The Apache Software Foundation
+Copyright 2011 The Apache Software Foundation
This product includes software developed by
The Apache Software Foundation (http://www.apache.org/).
Modified: lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java Wed Mar 30 09:17:25 2011
@@ -29,8 +29,8 @@ import org.apache.lucene.util.AttributeS
* Emits the entire input as a single token.
*/
public final class KeywordTokenizer extends Tokenizer {
-
- private static final int DEFAULT_BUFFER_SIZE = 256;
+ /** Default read buffer size */
+ public static final int DEFAULT_BUFFER_SIZE = 256;
private boolean done = false;
private int finalOffset;
Modified: lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java Wed Mar 30 09:17:25 2011
@@ -18,14 +18,13 @@ package org.apache.lucene.collation;
*/
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.apache.lucene.util.IndexableBinaryStringTools; // javadoc @link
+import org.apache.lucene.util.Version;
import java.text.Collator;
import java.io.Reader;
-import java.io.IOException;
/**
* <p>
@@ -33,8 +32,8 @@ import java.io.IOException;
* </p>
* <p>
* Converts the token into its {@link java.text.CollationKey}, and then
- * encodes the CollationKey with
- * {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow
+ * encodes the CollationKey either directly or with
+ * {@link IndexableBinaryStringTools} (see <a href="#version">below</a>), to allow
* it to be stored as an index term.
* </p>
* <p>
@@ -75,39 +74,49 @@ import java.io.IOException;
* CollationKeyAnalyzer to generate index terms, do not use
* ICUCollationKeyAnalyzer on the query side, or vice versa.
* </p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating CollationKeyAnalyzer:
+ * <ul>
+ * <li> As of 4.0, Collation Keys are directly encoded as bytes. Previous
+ * versions will encode the bytes with {@link IndexableBinaryStringTools}.
+ * </ul>
*/
-public final class CollationKeyAnalyzer extends Analyzer {
- private Collator collator;
-
- public CollationKeyAnalyzer(Collator collator) {
+public final class CollationKeyAnalyzer extends ReusableAnalyzerBase {
+ private final Collator collator;
+ private final CollationAttributeFactory factory;
+ private final Version matchVersion;
+
+ /**
+ * Create a new CollationKeyAnalyzer, using the specified collator.
+ *
+ * @param matchVersion See <a href="#version">above</a>
+ * @param collator CollationKey generator
+ */
+ public CollationKeyAnalyzer(Version matchVersion, Collator collator) {
+ this.matchVersion = matchVersion;
this.collator = collator;
- }
-
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new KeywordTokenizer(reader);
- result = new CollationKeyFilter(result, collator);
- return result;
+ this.factory = new CollationAttributeFactory(collator);
}
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
+ /**
+ * @deprecated Use {@link CollationKeyAnalyzer#CollationKeyAnalyzer(Version, Collator)}
+ * and specify a version instead. This ctor will be removed in Lucene 5.0
+ */
+ @Deprecated
+ public CollationKeyAnalyzer(Collator collator) {
+ this(Version.LUCENE_31, collator);
}
-
+
@Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
-
- SavedStreams streams = (SavedStreams)getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new KeywordTokenizer(reader);
- streams.result = new CollationKeyFilter(streams.source, collator);
- setPreviousTokenStream(streams);
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ if (matchVersion.onOrAfter(Version.LUCENE_40)) {
+ KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
+ return new TokenStreamComponents(tokenizer, tokenizer);
} else {
- streams.source.reset(reader);
+ KeywordTokenizer tokenizer = new KeywordTokenizer(reader);
+ return new TokenStreamComponents(tokenizer, new CollationKeyFilter(tokenizer, collator));
}
- return streams.result;
}
}
Modified: lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java Wed Mar 30 09:17:25 2011
@@ -71,7 +71,10 @@ import java.text.Collator;
* CollationKeyFilter to generate index terms, do not use
* ICUCollationKeyFilter on the query side, or vice versa.
* </p>
+ * @deprecated Use {@link CollationAttributeFactory} instead, which encodes
+ * terms directly as bytes. This filter will be removed in Lucene 5.0
*/
+@Deprecated
public final class CollationKeyFilter extends TokenFilter {
private final Collator collator;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
@@ -82,7 +85,9 @@ public final class CollationKeyFilter ex
*/
public CollationKeyFilter(TokenStream input, Collator collator) {
super(input);
- this.collator = collator;
+ // clone in case JRE doesnt properly sync,
+ // or to reduce contention in case they do
+ this.collator = (Collator) collator.clone();
}
@Override
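The clone above follows the reasoning in the new comment: the JRE may not synchronize java.text.Collator internally, and even if it does, cloning avoids contention on a shared instance. A minimal self-contained sketch of the same defensive copy in isolation (locale and input string are placeholders):

    import java.text.Collator;
    import java.util.Locale;

    public class PerConsumerCollator {
      public static void main(String[] args) {
        Collator shared = Collator.getInstance(new Locale("da", "dk"));
        // Defensive copy: each consumer works on its own Collator instance,
        // so concurrent getCollationKey() calls never touch shared state.
        Collator mine = (Collator) shared.clone();
        byte[] key = mine.getCollationKey("H\u00C5T").toByteArray();
        System.out.println(key.length + " byte collation key");
      }
    }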
Modified: lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/package.html?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/package.html (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/package.html Wed Mar 30 09:17:25 2011
@@ -52,13 +52,12 @@
<h2>Example Usages</h2>
<h3>Farsi Range Queries</h3>
-<code><pre>
+<pre class="prettyprint">
// "fa" Locale is not supported by Sun JDK 1.4 or 1.5
Collator collator = Collator.getInstance(new Locale("ar"));
- CollationKeyAnalyzer analyzer = new CollationKeyAnalyzer(collator);
+ CollationKeyAnalyzer analyzer = new CollationKeyAnalyzer(Version.LUCENE_40, collator);
RAMDirectory ramDir = new RAMDirectory();
- IndexWriter writer = new IndexWriter
- (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+ IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
Document doc = new Document();
doc.add(new Field("content", "\u0633\u0627\u0628",
Field.Store.YES, Field.Index.ANALYZED));
@@ -66,12 +65,9 @@
writer.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
- // The AnalyzingQueryParser in Lucene's contrib allows terms in range queries
- // to be passed through an analyzer - Lucene's standard QueryParser does not
- // allow this.
- AnalyzingQueryParser aqp = new AnalyzingQueryParser("content", analyzer);
- aqp.setLowercaseExpandedTerms(false);
-
+ QueryParser aqp = new QueryParser(Version.LUCENE_40, "content", analyzer);
+ aqp.setAnalyzeRangeTerms(true);
+
// Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
// orders the U+0698 character before the U+0633 character, so the single
// indexed Term above should NOT be returned by a ConstantScoreRangeQuery
@@ -80,15 +76,14 @@
ScoreDoc[] result
= is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
assertEquals("The index Term should not be included.", 0, result.length);
-</pre></code>
+</pre>
<h3>Danish Sorting</h3>
-<code><pre>
+<pre class="prettyprint">
Analyzer analyzer
- = new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
+ = new CollationKeyAnalyzer(Version.LUCENE_40, Collator.getInstance(new Locale("da", "dk")));
RAMDirectory indexStore = new RAMDirectory();
- IndexWriter writer = new IndexWriter
- (indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+ IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(Version.LUCENE_40, analyzer));
String[] tracer = new String[] { "A", "B", "C", "D", "E" };
String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
@@ -99,7 +94,7 @@
writer.addDocument(doc);
}
writer.close();
- Searcher searcher = new IndexSearcher(indexStore, true);
+ IndexSearcher searcher = new IndexSearcher(indexStore, true);
Sort sort = new Sort();
sort.setSort(new SortField("contents", SortField.STRING));
Query query = new MatchAllDocsQuery();
@@ -108,26 +103,25 @@
Document doc = searcher.doc(result[i].doc);
assertEquals(sortedTracerOrder[i], doc.getValues("tracer")[0]);
}
-</pre></code>
+</pre>
<h3>Turkish Case Normalization</h3>
-<code><pre>
+<pre class="prettyprint">
Collator collator = Collator.getInstance(new Locale("tr", "TR"));
collator.setStrength(Collator.PRIMARY);
- Analyzer analyzer = new CollationKeyAnalyzer(collator);
+ Analyzer analyzer = new CollationKeyAnalyzer(Version.LUCENE_40, collator);
RAMDirectory ramDir = new RAMDirectory();
- IndexWriter writer = new IndexWriter
- (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+ IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
Document doc = new Document();
doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
writer.addDocument(doc);
writer.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
- QueryParser parser = new QueryParser("contents", analyzer);
+ QueryParser parser = new QueryParser(Version.LUCENE_40, "contents", analyzer);
Query query = parser.parse("d\u0131gy"); // U+0131: dotless i
ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
assertEquals("The index Term should be included.", 1, result.length);
-</pre></code>
+</pre>
<h2>Caveats and Comparisons</h2>
<p>
Modified: lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java Wed Mar 30 09:17:25 2011
@@ -21,6 +21,8 @@ package org.apache.lucene.collation;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
@@ -36,11 +38,15 @@ import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Document;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IndexableBinaryStringTools;
import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
import java.io.StringReader;
import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
public abstract class CollationTestBase extends LuceneTestCase {
@@ -56,7 +62,9 @@ public abstract class CollationTestBase
* @param keyBits the result from
* collator.getCollationKey(original).toByteArray()
* @return The encoded collation key for the original String
+ * @deprecated only for testing deprecated filters
*/
+ @Deprecated
protected String encodeCollationKey(byte[] keyBits) {
// Ensure that the backing char[] array is large enough to hold the encoded
// Binary String
@@ -65,10 +73,10 @@ public abstract class CollationTestBase
IndexableBinaryStringTools.encode(keyBits, 0, keyBits.length, encodedBegArray, 0, encodedLength);
return new String(encodedBegArray);
}
-
- public void testFarsiRangeFilterCollating(Analyzer analyzer, String firstBeg,
- String firstEnd, String secondBeg,
- String secondEnd) throws Exception {
+
+ public void testFarsiRangeFilterCollating(Analyzer analyzer, BytesRef firstBeg,
+ BytesRef firstEnd, BytesRef secondBeg,
+ BytesRef secondEnd) throws Exception {
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(
TEST_VERSION_CURRENT, analyzer));
@@ -98,9 +106,9 @@ public abstract class CollationTestBase
searcher.close();
}
- public void testFarsiRangeQueryCollating(Analyzer analyzer, String firstBeg,
- String firstEnd, String secondBeg,
- String secondEnd) throws Exception {
+ public void testFarsiRangeQueryCollating(Analyzer analyzer, BytesRef firstBeg,
+ BytesRef firstEnd, BytesRef secondBeg,
+ BytesRef secondEnd) throws Exception {
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(
TEST_VERSION_CURRENT, analyzer));
@@ -126,8 +134,8 @@ public abstract class CollationTestBase
searcher.close();
}
- public void testFarsiTermRangeQuery(Analyzer analyzer, String firstBeg,
- String firstEnd, String secondBeg, String secondEnd) throws Exception {
+ public void testFarsiTermRangeQuery(Analyzer analyzer, BytesRef firstBeg,
+ BytesRef firstEnd, BytesRef secondBeg, BytesRef secondEnd) throws Exception {
RAMDirectory farsiIndex = new RAMDirectory();
IndexWriter writer = new IndexWriter(farsiIndex, new IndexWriterConfig(
@@ -249,4 +257,77 @@ public abstract class CollationTestBase
}
assertEquals(expectedResult, buff.toString());
}
+
+ private String randomString() {
+ // ideally we could do this!
+ // return _TestUtil.randomUnicodeString(random);
+ //
+ // http://bugs.icu-project.org/trac/ticket/8060
+ // http://bugs.icu-project.org/trac/ticket/7732
+ // ...
+ //
+ // as a workaround, just test the BMP for now (and avoid 0xFFFF etc)
+ int length = _TestUtil.nextInt(random, 0, 10);
+ char chars[] = new char[length];
+ for (int i = 0; i < length; i++) {
+ if (random.nextBoolean()) {
+ chars[i] = (char) _TestUtil.nextInt(random, 0, 0xD7FF);
+ } else {
+ chars[i] = (char) _TestUtil.nextInt(random, 0xE000, 0xFFFD);
+ }
+ }
+ return new String(chars, 0, length);
+ }
+
+ public void assertThreadSafe(final Analyzer analyzer) throws Exception {
+ int numTestPoints = 100;
+ int numThreads = _TestUtil.nextInt(random, 3, 5);
+ final HashMap<String,BytesRef> map = new HashMap<String,BytesRef>();
+
+ // create a map<String,SortKey> up front.
+ // then with multiple threads, generate sort keys for all the keys in the map
+ // and ensure they are the same as the ones we produced in serial fashion.
+
+ for (int i = 0; i < numTestPoints; i++) {
+ String term = randomString();
+ TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
+ TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
+ BytesRef bytes = termAtt.getBytesRef();
+ ts.reset();
+ assertTrue(ts.incrementToken());
+ termAtt.fillBytesRef();
+ // ensure we make a copy of the actual bytes too
+ map.put(term, new BytesRef(bytes));
+ }
+
+ Thread threads[] = new Thread[numThreads];
+ for (int i = 0; i < numThreads; i++) {
+ threads[i] = new Thread() {
+ @Override
+ public void run() {
+ try {
+ for (Map.Entry<String,BytesRef> mapping : map.entrySet()) {
+ String term = mapping.getKey();
+ BytesRef expected = mapping.getValue();
+ TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
+ TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
+ BytesRef bytes = termAtt.getBytesRef();
+ ts.reset();
+ assertTrue(ts.incrementToken());
+ termAtt.fillBytesRef();
+ assertEquals(expected, bytes);
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ };
+ }
+ for (int i = 0; i < numThreads; i++) {
+ threads[i].start();
+ }
+ for (int i = 0; i < numThreads; i++) {
+ threads[i].join();
+ }
+ }
}
Modified: lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java Wed Mar 30 09:17:25 2011
@@ -19,6 +19,8 @@ package org.apache.lucene.collation;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.util.BytesRef;
import java.text.Collator;
import java.util.Locale;
@@ -34,17 +36,19 @@ public class TestCollationKeyAnalyzer ex
// RuleBasedCollator. However, the Arabic Locale seems to order the Farsi
// characters properly.
private Collator collator = Collator.getInstance(new Locale("ar"));
- private Analyzer analyzer = new CollationKeyAnalyzer(collator);
+ private Analyzer analyzer = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, collator);
- private String firstRangeBeginning = encodeCollationKey
- (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
- private String firstRangeEnd = encodeCollationKey
- (collator.getCollationKey(firstRangeEndOriginal).toByteArray());
- private String secondRangeBeginning = encodeCollationKey
- (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
- private String secondRangeEnd = encodeCollationKey
- (collator.getCollationKey(secondRangeEndOriginal).toByteArray());
+ private BytesRef firstRangeBeginning = new BytesRef(collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
+ private BytesRef firstRangeEnd = new BytesRef(collator.getCollationKey(firstRangeEndOriginal).toByteArray());
+ private BytesRef secondRangeBeginning = new BytesRef(collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
+ private BytesRef secondRangeEnd = new BytesRef(collator.getCollationKey(secondRangeEndOriginal).toByteArray());
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ assumeFalse("preflex format only supports UTF-8 encoded bytes", "PreFlex".equals(CodecProvider.getDefault().getDefaultFieldCodec()));
+ }
+
public void testFarsiRangeFilterCollating() throws Exception {
testFarsiRangeFilterCollating
(analyzer, firstRangeBeginning, firstRangeEnd,
@@ -65,13 +69,13 @@ public class TestCollationKeyAnalyzer ex
public void testCollationKeySort() throws Exception {
Analyzer usAnalyzer
- = new CollationKeyAnalyzer(Collator.getInstance(Locale.US));
+ = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.US));
Analyzer franceAnalyzer
- = new CollationKeyAnalyzer(Collator.getInstance(Locale.FRANCE));
+ = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.FRANCE));
Analyzer swedenAnalyzer
- = new CollationKeyAnalyzer(Collator.getInstance(new Locale("sv", "se")));
+ = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("sv", "se")));
Analyzer denmarkAnalyzer
- = new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
+ = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("da", "dk")));
// The ICU Collator and Sun java.text.Collator implementations differ in their
// orderings - "BFJDH" is the ordering for java.text.Collator for Locale.US.
@@ -79,4 +83,14 @@ public class TestCollationKeyAnalyzer ex
(usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer,
oStrokeFirst ? "BFJHD" : "BFJDH", "EACGI", "BJDFH", "BJDHF");
}
+
+ public void testThreadSafe() throws Exception {
+ int iters = 20 * RANDOM_MULTIPLIER;
+ for (int i = 0; i < iters; i++) {
+ Locale locale = randomLocale(random);
+ Collator collator = Collator.getInstance(locale);
+ collator.setStrength(Collator.PRIMARY);
+ assertThreadSafe(new CollationKeyAnalyzer(TEST_VERSION_CURRENT, collator));
+ }
+ }
}
Modified: lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java Wed Mar 30 09:17:25 2011
@@ -21,12 +21,16 @@ package org.apache.lucene.collation;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.util.BytesRef;
import java.text.Collator;
import java.util.Locale;
import java.io.Reader;
-
+/**
+ * @deprecated remove when CollationKeyFilter is removed.
+ */
+@Deprecated
public class TestCollationKeyFilter extends CollationTestBase {
// the sort order of à versus U depends on the version of the rules being used
// for the inherited root locale: Ã's order isnt specified in Locale.US since
@@ -39,14 +43,14 @@ public class TestCollationKeyFilter exte
private Collator collator = Collator.getInstance(new Locale("ar"));
private Analyzer analyzer = new TestAnalyzer(collator);
- private String firstRangeBeginning = encodeCollationKey
- (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
- private String firstRangeEnd = encodeCollationKey
- (collator.getCollationKey(firstRangeEndOriginal).toByteArray());
- private String secondRangeBeginning = encodeCollationKey
- (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
- private String secondRangeEnd = encodeCollationKey
- (collator.getCollationKey(secondRangeEndOriginal).toByteArray());
+ private BytesRef firstRangeBeginning = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray()));
+ private BytesRef firstRangeEnd = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(firstRangeEndOriginal).toByteArray()));
+ private BytesRef secondRangeBeginning = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray()));
+ private BytesRef secondRangeEnd = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(secondRangeEndOriginal).toByteArray()));
public final class TestAnalyzer extends Analyzer {
Modified: lucene/dev/branches/realtime_search/modules/analysis/icu/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/icu/build.xml?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/icu/build.xml (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/icu/build.xml Wed Mar 30 09:17:25 2011
@@ -132,4 +132,9 @@ are part of the ICU4C package. See http:
<classpath refid="classpath"/>
</compile>
</target>
+
+ <target name="dist-maven" depends="contrib-build.dist-maven">
+ <m2-deploy-with-pom-template pom.xml="lib/lucene-icu4j-pom.xml.template"
+ jar.file="lib/icu4j-4_6.jar" />
+ </target>
</project>
Modified: lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java Wed Mar 30 09:17:25 2011
@@ -19,24 +19,21 @@ package org.apache.lucene.collation;
import com.ibm.icu.text.Collator;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.Tokenizer;
-
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.apache.lucene.collation.CollationKeyAnalyzer; // javadocs
+import org.apache.lucene.util.IndexableBinaryStringTools; // javadocs
+import org.apache.lucene.util.Version;
import java.io.Reader;
-import java.io.IOException;
-
/**
* <p>
* Filters {@link KeywordTokenizer} with {@link ICUCollationKeyFilter}.
* <p>
* Converts the token into its {@link com.ibm.icu.text.CollationKey}, and
- * then encodes the CollationKey with
- * {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow it to
+ * then encodes the CollationKey either directly or with
+ * {@link IndexableBinaryStringTools} (see <a href="#version">below</a>), to allow it to
* be stored as an index term.
* </p>
* <p>
@@ -70,39 +67,48 @@ import java.io.IOException;
* generation timing and key length comparisons between ICU4J and
* java.text.Collator over several languages.
* </p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating ICUCollationKeyAnalyzer:
+ * <ul>
+ * <li> As of 4.0, Collation Keys are directly encoded as bytes. Previous
+ * versions will encode the bytes with {@link IndexableBinaryStringTools}.
+ * </ul>
*/
-public final class ICUCollationKeyAnalyzer extends Analyzer {
- private Collator collator;
-
- public ICUCollationKeyAnalyzer(Collator collator) {
+public final class ICUCollationKeyAnalyzer extends ReusableAnalyzerBase {
+ private final Collator collator;
+ private final ICUCollationAttributeFactory factory;
+ private final Version matchVersion;
+
+ /**
+ * Create a new ICUCollationKeyAnalyzer, using the specified collator.
+ *
+ * @param matchVersion See <a href="#version">above</a>
+ * @param collator CollationKey generator
+ */
+ public ICUCollationKeyAnalyzer(Version matchVersion, Collator collator) {
+ this.matchVersion = matchVersion;
this.collator = collator;
+ this.factory = new ICUCollationAttributeFactory(collator);
}
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new KeywordTokenizer(reader);
- result = new ICUCollationKeyFilter(result, collator);
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
+ /**
+ * @deprecated Use {@link ICUCollationKeyAnalyzer#ICUCollationKeyAnalyzer(Version, Collator)}
+ * and specify a version instead. This ctor will be removed in Lucene 5.0
+ */
+ @Deprecated
+ public ICUCollationKeyAnalyzer(Collator collator) {
+ this(Version.LUCENE_31, collator);
}
-
+
@Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
-
- SavedStreams streams = (SavedStreams)getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new KeywordTokenizer(reader);
- streams.result = new ICUCollationKeyFilter(streams.source, collator);
- setPreviousTokenStream(streams);
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ if (matchVersion.onOrAfter(Version.LUCENE_40)) {
+ KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
+ return new TokenStreamComponents(tokenizer, tokenizer);
} else {
- streams.source.reset(reader);
+ KeywordTokenizer tokenizer = new KeywordTokenizer(reader);
+ return new TokenStreamComponents(tokenizer, new ICUCollationKeyFilter(tokenizer, collator));
}
- return streams.result;
}
}
Modified: lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java Wed Mar 30 09:17:25 2011
@@ -68,7 +68,10 @@ import java.io.IOException;
* generation timing and key length comparisons between ICU4J and
* java.text.Collator over several languages.
* </p>
+ * @deprecated Use {@link ICUCollationAttributeFactory} instead, which encodes
+ * terms directly as bytes. This filter will be removed in Lucene 5.0
*/
+@Deprecated
public final class ICUCollationKeyFilter extends TokenFilter {
private Collator collator = null;
private RawCollationKey reusableKey = new RawCollationKey();
@@ -81,7 +84,12 @@ public final class ICUCollationKeyFilter
*/
public ICUCollationKeyFilter(TokenStream input, Collator collator) {
super(input);
- this.collator = collator;
+ // clone the collator: see http://userguide.icu-project.org/collation/architecture
+ try {
+ this.collator = (Collator) collator.clone();
+ } catch (CloneNotSupportedException e) {
+ throw new RuntimeException(e);
+ }
}
@Override
Modified: lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/overview.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/overview.html?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/overview.html (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/overview.html Wed Mar 30 09:17:25 2011
@@ -66,12 +66,12 @@ algorithm.
</ul>
<h2>Example Usages</h2>
<h3>Tokenizing multilanguage text</h3>
-<code><pre>
+<pre class="prettyprint">
/**
* This tokenizer will work well in general for most languages.
*/
Tokenizer tokenizer = new ICUTokenizer(reader);
-</pre></code>
+</pre>
<hr/>
<h1><a name="collation">Collation</a></h1>
<p>
@@ -111,12 +111,11 @@ algorithm.
<h2>Example Usages</h2>
<h3>Farsi Range Queries</h3>
-<code><pre>
- Collator collator = Collator.getInstance(new Locale("ar"));
- ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(collator);
+<pre class="prettyprint">
+ Collator collator = Collator.getInstance(new ULocale("ar"));
+ ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
RAMDirectory ramDir = new RAMDirectory();
- IndexWriter writer = new IndexWriter
- (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+ IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
Document doc = new Document();
doc.add(new Field("content", "\u0633\u0627\u0628",
Field.Store.YES, Field.Index.ANALYZED));
@@ -124,12 +123,9 @@ algorithm.
writer.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
- // The AnalyzingQueryParser in Lucene's contrib allows terms in range queries
- // to be passed through an analyzer - Lucene's standard QueryParser does not
- // allow this.
- AnalyzingQueryParser aqp = new AnalyzingQueryParser("content", analyzer);
- aqp.setLowercaseExpandedTerms(false);
-
+ QueryParser aqp = new QueryParser(Version.LUCENE_40, "content", analyzer);
+ aqp.setAnalyzeRangeTerms(true);
+
// Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
// orders the U+0698 character before the U+0633 character, so the single
// indexed Term above should NOT be returned by a ConstantScoreRangeQuery
@@ -138,15 +134,14 @@ algorithm.
ScoreDoc[] result
= is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
assertEquals("The index Term should not be included.", 0, result.length);
-</pre></code>
+</pre>
<h3>Danish Sorting</h3>
-<code><pre>
+<pre class="prettyprint">
Analyzer analyzer
- = new ICUCollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
+ = new ICUCollationKeyAnalyzer(Version.LUCENE_40, Collator.getInstance(new ULocale("da", "dk")));
RAMDirectory indexStore = new RAMDirectory();
- IndexWriter writer = new IndexWriter
- (indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+ IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(Version.LUCENE_40, analyzer));
String[] tracer = new String[] { "A", "B", "C", "D", "E" };
String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
@@ -157,7 +152,7 @@ algorithm.
writer.addDocument(doc);
}
writer.close();
- Searcher searcher = new IndexSearcher(indexStore, true);
+ IndexSearcher searcher = new IndexSearcher(indexStore, true);
Sort sort = new Sort();
sort.setSort(new SortField("contents", SortField.STRING));
Query query = new MatchAllDocsQuery();
@@ -166,26 +161,25 @@ algorithm.
Document doc = searcher.doc(result[i].doc);
assertEquals(sortedTracerOrder[i], doc.getValues("tracer")[0]);
}
-</pre></code>
+</pre>
<h3>Turkish Case Normalization</h3>
-<code><pre>
- Collator collator = Collator.getInstance(new Locale("tr", "TR"));
+<pre class="prettyprint">
+ Collator collator = Collator.getInstance(new ULocale("tr", "TR"));
collator.setStrength(Collator.PRIMARY);
- Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);
+ Analyzer analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
RAMDirectory ramDir = new RAMDirectory();
- IndexWriter writer = new IndexWriter
- (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+ IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
Document doc = new Document();
doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
writer.addDocument(doc);
writer.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
- QueryParser parser = new QueryParser("contents", analyzer);
+ QueryParser parser = new QueryParser(Version.LUCENE_40, "contents", analyzer);
Query query = parser.parse("d\u0131gy"); // U+0131: dotless i
ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
assertEquals("The index Term should be included.", 1, result.length);
-</pre></code>
+</pre>
<h2>Caveats and Comparisons</h2>
<p>
@@ -245,7 +239,7 @@ algorithm.
</ul>
<h2>Example Usages</h2>
<h3>Normalizing text to NFC</h3>
-<code><pre>
+<pre class="prettyprint">
/**
* Normalizer2 objects are unmodifiable and immutable.
*/
@@ -254,7 +248,7 @@ algorithm.
* This filter will normalize to NFC.
*/
TokenStream tokenstream = new ICUNormalizer2Filter(tokenizer, normalizer);
-</pre></code>
+</pre>
<hr/>
<h1><a name="casefolding">Case Folding</a></h1>
<p>
@@ -284,12 +278,12 @@ this integration. To perform case-foldin
</ul>
<h2>Example Usages</h2>
<h3>Lowercasing text</h3>
-<code><pre>
+<pre class="prettyprint">
/**
* This filter will case-fold and normalize to NFKC.
*/
TokenStream tokenstream = new ICUNormalizer2Filter(tokenizer);
-</pre></code>
+</pre>
<hr/>
<h1><a name="searchfolding">Search Term Folding</a></h1>
<p>
@@ -311,13 +305,13 @@ many character foldings recursively.
</ul>
<h2>Example Usages</h2>
<h3>Removing accents</h3>
-<code><pre>
+<pre class="prettyprint">
/**
* This filter will case-fold, remove accents and other distinctions, and
* normalize to NFKC.
*/
TokenStream tokenstream = new ICUFoldingFilter(tokenizer);
-</pre></code>
+</pre>
<hr/>
<h1><a name="transform">Text Transformation</a></h1>
<p>
@@ -341,19 +335,19 @@ and
</ul>
<h2>Example Usages</h2>
<h3>Convert Traditional to Simplified</h3>
-<code><pre>
+<pre class="prettyprint">
/**
* This filter will map Traditional Chinese to Simplified Chinese
*/
TokenStream tokenstream = new ICUTransformFilter(tokenizer, Transliterator.getInstance("Traditional-Simplified"));
-</pre></code>
+</pre>
<h3>Transliterate Serbian Cyrillic to Serbian Latin</h3>
- <code><pre>
+<pre class="prettyprint">
/**
* This filter will map Serbian Cyrillic to Serbian Latin according to BGN rules
*/
TokenStream tokenstream = new ICUTransformFilter(tokenizer, Transliterator.getInstance("Serbian-Latin/BGN"));
-</pre></code>
+</pre>
<hr/>
<h1><a name="backcompat">Backwards Compatibility</a></h1>
<p>
@@ -365,7 +359,7 @@ a specific Unicode Version by using a {@
</p>
<h2>Example Usages</h2>
<h3>Restricting normalization to Unicode 5.0</h3>
-<code><pre>
+<pre class="prettyprint">
/**
* This filter will do NFC normalization, but will ignore any characters that
* did not exist as of Unicode 5.0. Because of the normalization stability policy
@@ -377,6 +371,6 @@ a specific Unicode Version by using a {@
set.freeze();
FilteredNormalizer2 unicode50 = new FilteredNormalizer2(normalizer, set);
TokenStream tokenstream = new ICUNormalizer2Filter(tokenizer, unicode50);
-</pre></code>
+</pre>
</body>
</html>
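The documentation hunks above all follow the same migration pattern: the examples move from the 3.x constructors to the Version-aware 4.0 ones. A condensed before/after sketch (variable names are chosen here only for illustration; the field and searcher setup is unchanged):

    // 3.x-style example, as removed above
    Analyzer oldAnalyzer = new ICUCollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
    IndexWriter oldWriter = new IndexWriter(indexStore, oldAnalyzer, true, IndexWriter.MaxFieldLength.LIMITED);

    // 4.0-style example, as added above
    Analyzer newAnalyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40,
        Collator.getInstance(new ULocale("da", "dk")));
    IndexWriter newWriter = new IndexWriter(indexStore, new IndexWriterConfig(Version.LUCENE_40, newAnalyzer));
    QueryParser parser = new QueryParser(Version.LUCENE_40, "contents", newAnalyzer);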
Modified: lucene/dev/branches/realtime_search/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java Wed Mar 30 09:17:25 2011
@@ -20,6 +20,8 @@ package org.apache.lucene.collation;
import com.ibm.icu.text.Collator;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.util.BytesRef;
import java.util.Locale;
@@ -27,17 +29,23 @@ import java.util.Locale;
public class TestICUCollationKeyAnalyzer extends CollationTestBase {
private Collator collator = Collator.getInstance(new Locale("fa"));
- private Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);
+ private Analyzer analyzer = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, collator);
- private String firstRangeBeginning = encodeCollationKey
+ private BytesRef firstRangeBeginning = new BytesRef
(collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
- private String firstRangeEnd = encodeCollationKey
+ private BytesRef firstRangeEnd = new BytesRef
(collator.getCollationKey(firstRangeEndOriginal).toByteArray());
- private String secondRangeBeginning = encodeCollationKey
+ private BytesRef secondRangeBeginning = new BytesRef
(collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
- private String secondRangeEnd = encodeCollationKey
+ private BytesRef secondRangeEnd = new BytesRef
(collator.getCollationKey(secondRangeEndOriginal).toByteArray());
-
+
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ assumeFalse("preflex format only supports UTF-8 encoded bytes", "PreFlex".equals(CodecProvider.getDefault().getDefaultFieldCodec()));
+ }
+
public void testFarsiRangeFilterCollating() throws Exception {
testFarsiRangeFilterCollating(analyzer, firstRangeBeginning, firstRangeEnd,
secondRangeBeginning, secondRangeEnd);
@@ -62,13 +70,13 @@ public class TestICUCollationKeyAnalyzer
//
public void testCollationKeySort() throws Exception {
Analyzer usAnalyzer = new ICUCollationKeyAnalyzer
- (Collator.getInstance(Locale.US));
+ (TEST_VERSION_CURRENT, Collator.getInstance(Locale.US));
Analyzer franceAnalyzer = new ICUCollationKeyAnalyzer
- (Collator.getInstance(Locale.FRANCE));
+ (TEST_VERSION_CURRENT, Collator.getInstance(Locale.FRANCE));
Analyzer swedenAnalyzer = new ICUCollationKeyAnalyzer
- (Collator.getInstance(new Locale("sv", "se")));
+ (TEST_VERSION_CURRENT, Collator.getInstance(new Locale("sv", "se")));
Analyzer denmarkAnalyzer = new ICUCollationKeyAnalyzer
- (Collator.getInstance(new Locale("da", "dk")));
+ (TEST_VERSION_CURRENT, Collator.getInstance(new Locale("da", "dk")));
// The ICU Collator and java.text.Collator implementations differ in their
// orderings - "BFJHD" is the ordering for the ICU Collator for Locale.US.
@@ -76,4 +84,14 @@ public class TestICUCollationKeyAnalyzer
(usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer,
"BFJHD", "ECAGI", "BJDFH", "BJDHF");
}
+
+ public void testThreadSafe() throws Exception {
+ int iters = 20 * RANDOM_MULTIPLIER;
+ for (int i = 0; i < iters; i++) {
+ Locale locale = randomLocale(random);
+ Collator collator = Collator.getInstance(locale);
+ collator.setStrength(Collator.IDENTICAL);
+ assertThreadSafe(new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, collator));
+ }
+ }
}
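The range bounds in this test change from index-encoded Strings to raw collation-key bytes wrapped in BytesRef. A minimal sketch of the pattern the hunk adopts (the literal text is a placeholder for illustration):

    Collator collator = Collator.getInstance(new Locale("fa"));
    BytesRef bound = new BytesRef(collator.getCollationKey("some term").toByteArray());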
Modified: lucene/dev/branches/realtime_search/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java Wed Mar 30 09:17:25 2011
@@ -22,24 +22,26 @@ import com.ibm.icu.text.Collator;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.util.BytesRef;
import java.io.Reader;
import java.util.Locale;
-
+/** @deprecated remove this when ICUCollationKeyFilter is removed */
+@Deprecated
public class TestICUCollationKeyFilter extends CollationTestBase {
private Collator collator = Collator.getInstance(new Locale("fa"));
private Analyzer analyzer = new TestAnalyzer(collator);
- private String firstRangeBeginning = encodeCollationKey
- (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
- private String firstRangeEnd = encodeCollationKey
- (collator.getCollationKey(firstRangeEndOriginal).toByteArray());
- private String secondRangeBeginning = encodeCollationKey
- (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
- private String secondRangeEnd = encodeCollationKey
- (collator.getCollationKey(secondRangeEndOriginal).toByteArray());
+ private BytesRef firstRangeBeginning = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray()));
+ private BytesRef firstRangeEnd = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(firstRangeEndOriginal).toByteArray()));
+ private BytesRef secondRangeBeginning = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray()));
+ private BytesRef secondRangeEnd = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(secondRangeEndOriginal).toByteArray()));
public final class TestAnalyzer extends Analyzer {
Modified: lucene/dev/branches/realtime_search/modules/benchmark/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/benchmark/CHANGES.txt?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/benchmark/CHANGES.txt (original)
+++ lucene/dev/branches/realtime_search/modules/benchmark/CHANGES.txt Wed Mar 30 09:17:25 2011
@@ -2,12 +2,54 @@ Lucene Benchmark Contrib Change Log
The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
+03/24/2011
+ LUCENE-2977: WriteLineDocTask now automatically detects how to write -
+ GZip, BZip2, or plain text - according to the output file's extension.
+ The bzip.compression property of WriteLineDocTask was removed. (Doron Cohen)
+
+03/23/2011
+ LUCENE-2980: Benchmark's ContentSource no longer requires lower-case file suffixes
+ for detecting the file type (gzip/bzip2/text). As part of this fix, an issue with
+ gzip input streams remaining open was worked around (see COMPRESS-127).
+ (Doron Cohen)
+
+03/22/2011
+ LUCENE-2978: Upgrade benchmark's commons-compress from 1.0 to 1.1 as
+ the move of gzip decompression in LUCENE-1540 from Java's GZipInputStream
+ to commons-compress 1.0 made it 15 times slower. In 1.1 no such slow-down
+ is observed. (Doron Cohen)
+
+03/21/2011
+ LUCENE-2958: WriteLineDocTask improvements - allow emitting line docs even for empty
+ docs, and be flexible about which fields are added to the line file. For this, a header
+ line was added to the line file; that header is examined by LineDocSource. Old line
+ files which have no header line are handled as before, imposing the default header.
+ (Doron Cohen, Shai Erera, Mike McCandless)
+
+03/21/2011
+ LUCENE-2964: Allow benchmark tasks from alternative packages,
+ specified through a new property "alt.tasks.packages".
+ (Doron Cohen, Shai Erera)
+
+03/20/2011
+ LUCENE-2963: Easier way to run a benchmark, by calling Benchmark.exec(alg-file).
+ (Doron Cohen)
+
+03/10/2011
+ LUCENE-2961: Removed lib/xml-apis.jar, since JVM 1.5+ already contains the
+ JAXP 1.3 interface classes it provides.
+
02/05/2011
LUCENE-1540: Improvements to contrib.benchmark for TREC collections.
ContentSource can now process plain text files, gzip files, and bzip2 files.
TREC doc parsing now handles the TREC gov2 collection and TREC disks 4&5-CR
collection (both used by many TREC tasks). (Shai Erera, Doron Cohen)
-
+
+01/31/2011
+ LUCENE-1591: Roll back to xerces-2.9.1-patched-XERCESJ-1257.jar to work around
+ XERCESJ-1257, which we hit on current Wikipedia XML export
+ (ENWIKI-20110115-pages-articles.xml) with xerces-2.10.0.jar. (Mike McCandless)
+
01/26/2011
LUCENE-929: ExtractReuters first extracts to a tmp dir and then renames. That
way, if a previous extract attempt failed, "ant extract-reuters" will still
@@ -33,7 +75,7 @@ The Benchmark contrib package contains c
4/27/2010: WriteLineDocTask now supports multi-threading. Also,
StringBufferReader was renamed to StringBuilderReader and works on
- StringBuilder now. In addition, LongToEnglishCountentSource starts from 0
+ StringBuilder now. In addition, LongToEnglishContentSource starts from 0
(instead of Long.MIN_VAL+10) and wraps around to MIN_VAL (if you ever hit
Long.MAX_VAL). (Shai Erera)
Modified: lucene/dev/branches/realtime_search/modules/benchmark/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/benchmark/build.xml?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/benchmark/build.xml (original)
+++ lucene/dev/branches/realtime_search/modules/benchmark/build.xml Wed Mar 30 09:17:25 2011
@@ -265,5 +265,8 @@
/>
</target>
- <target name="dist-maven" depends="jar-core,javadocs,contrib-build.dist-maven"/>
+ <target name="dist-maven" depends="jar-core,javadocs,contrib-build.dist-maven">
+ <m2-deploy-with-pom-template pom.xml="lib/lucene-xercesImpl-pom.xml.template"
+ jar.file="lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar" />
+ </target>
</project>
Modified: lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java (original)
+++ lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java Wed Mar 30 09:17:25 2011
@@ -64,6 +64,9 @@ public class Benchmark {
}
}
+ /**
+ * Execute this benchmark
+ */
public synchronized void execute() throws Exception {
if (executed) {
throw new IllegalStateException("Benchmark was already executed");
@@ -78,6 +81,14 @@ public class Benchmark {
* @param args benchmark config and algorithm files
*/
public static void main(String[] args) {
+ exec(args);
+ }
+
+ /**
+ * Utility: execute benchmark from command line
+ * @param args single argument is expected: algorithm-file
+ */
+ public static void exec(String[] args) {
// verify command line args
if (args.length < 1) {
System.err.println("Usage: java Benchmark <algorithm file>");
@@ -115,7 +126,6 @@ public class Benchmark {
System.out.println("####################");
System.out.println("### D O N E !!! ###");
System.out.println("####################");
-
}
/**
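The new static exec(String[]) added above lets a benchmark be driven programmatically as well as from the command line; a minimal usage sketch (the .alg path is a hypothetical example):

    // runs the algorithm file exactly as "java Benchmark <algorithm file>" would
    Benchmark.exec(new String[] { "conf/my-benchmark.alg" });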
Modified: lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java (original)
+++ lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java Wed Mar 30 09:17:25 2011
@@ -17,18 +17,11 @@ package org.apache.lucene.benchmark.byTa
* limitations under the License.
*/
-import java.io.BufferedInputStream;
import java.io.File;
-import java.io.FileInputStream;
import java.io.IOException;
-import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Map;
-import org.apache.commons.compress.compressors.CompressorException;
-import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.benchmark.byTask.utils.Config;
/**
@@ -55,19 +48,6 @@ import org.apache.lucene.benchmark.byTas
*/
public abstract class ContentSource {
- private static final int BZIP = 0;
- private static final int GZIP = 1;
- private static final int OTHER = 2;
- private static final Map<String,Integer> extensionToType = new HashMap<String,Integer>();
- static {
- extensionToType.put(".bz2", Integer.valueOf(BZIP));
- extensionToType.put(".bzip", Integer.valueOf(BZIP));
- extensionToType.put(".gz", Integer.valueOf(GZIP));
- extensionToType.put(".gzip", Integer.valueOf(GZIP));
- }
-
- protected static final int BUFFER_SIZE = 1 << 16; // 64K
-
private long bytesCount;
private long totalBytesCount;
private int docsCount;
@@ -79,8 +59,6 @@ public abstract class ContentSource {
protected boolean verbose;
protected String encoding;
- private CompressorStreamFactory csFactory = new CompressorStreamFactory();
-
/** update count of bytes generated by this source */
protected final synchronized void addBytes(long numBytes) {
bytesCount += numBytes;
@@ -115,51 +93,7 @@ public abstract class ContentSource {
}
}
- /**
- * Returns an {@link InputStream} over the requested file. This method
- * attempts to identify the appropriate {@link InputStream} instance to return
- * based on the file name (e.g., if it ends with .bz2 or .bzip, return a
- * 'bzip' {@link InputStream}).
- */
- protected InputStream getInputStream(File file) throws IOException {
- // First, create a FileInputStream, as this will be required by all types.
- // Wrap with BufferedInputStream for better performance
- InputStream is = new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE);
-
- String fileName = file.getName();
- int idx = fileName.lastIndexOf('.');
- int type = OTHER;
- if (idx != -1) {
- Integer typeInt = extensionToType.get(fileName.substring(idx));
- if (typeInt != null) {
- type = typeInt.intValue();
- }
- }
-
- try {
- switch (type) {
- case BZIP:
- // According to BZip2CompressorInputStream's code, it reads the first
- // two file header chars ('B' and 'Z'). It is important to wrap the
- // underlying input stream with a buffered one since
- // Bzip2CompressorInputStream uses the read() method exclusively.
- is = csFactory.createCompressorInputStream("bzip2", is);
- break;
- case GZIP:
- is = csFactory.createCompressorInputStream("gz", is);
- break;
- default: // Do nothing, stay with FileInputStream
- }
- } catch (CompressorException e) {
- IOException ioe = new IOException(e.getMessage());
- ioe.initCause(e);
- throw ioe;
- }
-
- return is;
- }
-
- /**
+ /**
* Returns true whether it's time to log a message (depending on verbose and
* the number of documents generated).
*/
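The extension-sniffing logic removed here is centralized behind StreamUtils.inputStream(file), which EnwikiContentSource switches to below. StreamUtils itself is not part of this hunk; the sketch below only illustrates the behaviour being centralized - suffix-based selection of a plain, gzip, or bzip2 stream, case-insensitively per LUCENE-2980 above - and is not the actual StreamUtils source (exception handling omitted):

    // Illustrative only - mirrors the removed getInputStream(File), not the real StreamUtils.
    InputStream in = new BufferedInputStream(new FileInputStream(file), 1 << 16);
    String name = file.getName().toLowerCase(Locale.ENGLISH);
    if (name.endsWith(".bz2") || name.endsWith(".bzip")) {
      in = new CompressorStreamFactory().createCompressorInputStream("bzip2", in);
    } else if (name.endsWith(".gz") || name.endsWith(".gzip")) {
      in = new CompressorStreamFactory().createCompressorInputStream("gz", in);
    } // otherwise plain text: use the buffered FileInputStream as-is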
Modified: lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java (original)
+++ lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java Wed Mar 30 09:17:25 2011
@@ -24,6 +24,7 @@ import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
import org.apache.lucene.util.ThreadInterruptedException;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
@@ -189,7 +190,7 @@ public class EnwikiContentSource extends
return;
} else if (localFileIS == is) {
// If file is not already re-opened then re-open it now
- is = getInputStream(file);
+ is = StreamUtils.inputStream(file);
}
}
}
@@ -290,7 +291,7 @@ public class EnwikiContentSource extends
@Override
public void resetInputs() throws IOException {
super.resetInputs();
- is = getInputStream(file);
+ is = StreamUtils.inputStream(file);
}
@Override