You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2016/02/27 16:26:08 UTC

lucene-solr git commit: BytesRefHash.sort always sorts in Unicode order

Repository: lucene-solr
Updated Branches:
  refs/heads/master 70440bbbd -> 126ac9a5f


BytesRefHash.sort always sorts in Unicode order


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/126ac9a5
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/126ac9a5
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/126ac9a5

Branch: refs/heads/master
Commit: 126ac9a5fe00fbbc6870ef25ae3fc6af6cd7c557
Parents: 70440bb
Author: Mike McCandless <mi...@apache.org>
Authored: Sat Feb 27 10:26:20 2016 -0500
Committer: Mike McCandless <mi...@apache.org>
Committed: Sat Feb 27 10:26:20 2016 -0500

----------------------------------------------------------------------
 .../miscellaneous/StemmerOverrideFilter.java    |  2 +-
 .../lucene/index/SortedDocValuesWriter.java     |  2 +-
 .../lucene/index/SortedSetDocValuesWriter.java  |  2 +-
 .../apache/lucene/index/TermsHashPerField.java  |  2 +-
 .../apache/lucene/search/ScoringRewrite.java    |  2 +-
 .../org/apache/lucene/util/BytesRefHash.java    |  6 ++--
 .../apache/lucene/util/TestBytesRefHash.java    | 37 ++++++++++++++++----
 .../search/join/TermsIncludingScoreQuery.java   |  2 +-
 .../apache/lucene/search/join/TermsQuery.java   |  2 +-
 .../apache/lucene/index/memory/MemoryIndex.java |  2 +-
 .../index/BaseDocValuesFormatTestCase.java      |  2 +-
 11 files changed, 42 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/126ac9a5/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java
index e78137e..32423e9 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilter.java
@@ -205,7 +205,7 @@ public final class StemmerOverrideFilter extends TokenFilter {
       ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
       org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<>(
           FST.INPUT_TYPE.BYTE4, outputs);
-      final int[] sort = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+      final int[] sort = hash.sort();
       IntsRefBuilder intsSpare = new IntsRefBuilder();
       final int size = hash.size();
       BytesRef spare = new BytesRef();

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/126ac9a5/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
index 2d8557b..6517218 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValuesWriter.java
@@ -112,7 +112,7 @@ class SortedDocValuesWriter extends DocValuesWriter {
     final int valueCount = hash.size();
     final PackedLongValues ords = pending.build();
 
-    final int[] sortedValues = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+    final int[] sortedValues = hash.sort();
     final int[] ordMap = new int[valueCount];
 
     for(int ord=0;ord<valueCount;ord++) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/126ac9a5/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
index e98fc82..3f3beb3 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValuesWriter.java
@@ -152,7 +152,7 @@ class SortedSetDocValuesWriter extends DocValuesWriter {
     final PackedLongValues ords = pending.build();
     final PackedLongValues ordCounts = pendingCounts.build();
 
-    final int[] sortedValues = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+    final int[] sortedValues = hash.sort();
     final int[] ordMap = new int[valueCount];
 
     for(int ord=0;ord<valueCount;ord++) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/126ac9a5/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java b/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
index 3275a4e..3496589 100644
--- a/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
+++ b/lucene/core/src/java/org/apache/lucene/index/TermsHashPerField.java
@@ -93,7 +93,7 @@ abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
   /** Collapse the hash table and sort in-place; also sets
    * this.sortedTermIDs to the results */
   public int[] sortPostings() {
-    sortedTermIDs = bytesHash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+    sortedTermIDs = bytesHash.sort();
     return sortedTermIDs;
   }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/126ac9a5/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java b/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java
index e0917eb..3a62e15 100644
--- a/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java
+++ b/lucene/core/src/java/org/apache/lucene/search/ScoringRewrite.java
@@ -109,7 +109,7 @@ public abstract class ScoringRewrite<B> extends TermCollectingRewrite<B> {
     
     final int size = col.terms.size();
     if (size > 0) {
-      final int sort[] = col.terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+      final int sort[] = col.terms.sort();
       final float[] boost = col.array.boost;
       final TermContext[] termStates = col.array.termState;
       for (int i = 0; i < size; i++) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/126ac9a5/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java b/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java
index 25b74a6..82cce03 100644
--- a/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java
+++ b/lucene/core/src/java/org/apache/lucene/util/BytesRefHash.java
@@ -156,11 +156,9 @@ public final class BytesRefHash {
    * Note: This is a destructive operation. {@link #clear()} must be called in
    * order to reuse this {@link BytesRefHash} instance.
    * </p>
-   * 
-   * @param comp
-   *          the {@link Comparator} used for sorting
    */
-  public int[] sort(final Comparator<BytesRef> comp) {
+  public int[] sort() {
+    final Comparator<BytesRef> comp = BytesRef.getUTF8SortedAsUnicodeComparator();
     final int[] compact = compact();
     new IntroSorter() {
       @Override

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/126ac9a5/lucene/core/src/test/org/apache/lucene/util/TestBytesRefHash.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestBytesRefHash.java b/lucene/core/src/test/org/apache/lucene/util/TestBytesRefHash.java
index e44b283..50d921b 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestBytesRefHash.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestBytesRefHash.java
@@ -17,14 +17,16 @@
 package org.apache.lucene.util;
 
 
+import java.util.Arrays;
 import java.util.BitSet;
+import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Map.Entry;
 import java.util.Map;
 import java.util.Set;
 import java.util.SortedSet;
 import java.util.TreeSet;
-import java.util.Map.Entry;
 
 import org.apache.lucene.util.BytesRefHash.MaxBytesLengthExceededException;
 import org.junit.Before;
@@ -166,16 +168,41 @@ public class TestBytesRefHash extends LuceneTestCase {
     }
   }
 
+  private static int[] codePoints(String input) {
+    int length = Character.codePointCount(input, 0, input.length());
+    int word[] = new int[length];
+    for (int i = 0, j = 0, cp = 0; i < input.length(); i += Character.charCount(cp)) {
+      word[j++] = cp = input.codePointAt(i);
+    }
+    return word;
+  }
+
   /**
    * Test method for
-   * {@link org.apache.lucene.util.BytesRefHash#sort(java.util.Comparator)}.
+   * {@link org.apache.lucene.util.BytesRefHash#sort()}.
    */
   @Test
   public void testSort() {
     BytesRefBuilder ref = new BytesRefBuilder();
     int num = atLeast(2);
     for (int j = 0; j < num; j++) {
-      SortedSet<String> strings = new TreeSet<>();
+
+      // Sorts by unicode code point order (is there a simple way, e.g. a Collator?)
+      SortedSet<String> strings = new TreeSet<>(new Comparator<String>() {
+          @Override
+          public int compare(String a, String b) {
+            int[] aCodePoints = codePoints(a);
+            int[] bCodePoints = codePoints(b);
+            for(int i=0;i<Math.min(aCodePoints.length, bCodePoints.length);i++) {
+              if (aCodePoints[i] < bCodePoints[i]) {
+                return -1;
+              } else if (aCodePoints[i] > bCodePoints[i]) {
+                return 1;
+              }
+            }
+            return aCodePoints.length - bCodePoints.length;
+          }
+        });
       for (int i = 0; i < 797; i++) {
         String str;
         do {
@@ -185,9 +212,7 @@ public class TestBytesRefHash extends LuceneTestCase {
         hash.add(ref.get());
         strings.add(str);
       }
-      // We use the UTF-16 comparator here, because we need to be able to
-      // compare to native String.compareTo() [UTF-16]:
-      int[] sort = hash.sort(BytesRef.getUTF8SortedAsUTF16Comparator());
+      int[] sort = hash.sort();
       assertTrue(strings.size() < sort.length);
       int i = 0;
       BytesRef scratch = new BytesRef();

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/126ac9a5/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java
----------------------------------------------------------------------
diff --git a/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java
index 7c03103..65ab1f0 100644
--- a/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java
+++ b/lucene/join/src/java/org/apache/lucene/search/join/TermsIncludingScoreQuery.java
@@ -55,7 +55,7 @@ class TermsIncludingScoreQuery extends Query {
     this.terms = terms;
     this.scores = scores;
     this.originalQuery = originalQuery;
-    this.ords = terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+    this.ords = terms.sort();
     this.unwrittenOriginalQuery = originalQuery;
   }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/126ac9a5/lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java
----------------------------------------------------------------------
diff --git a/lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java b/lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java
index eabc72a..11b201d 100644
--- a/lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java
+++ b/lucene/join/src/java/org/apache/lucene/search/join/TermsQuery.java
@@ -48,7 +48,7 @@ class TermsQuery extends MultiTermQuery {
     super(field);
     this.fromQuery = fromQuery;
     this.terms = terms;
-    ords = terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+    ords = terms.sort();
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/126ac9a5/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
----------------------------------------------------------------------
diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
index 29e60ba..c666724 100644
--- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
+++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
@@ -758,7 +758,7 @@ public class MemoryIndex {
      */
     public void sortTerms() {
       if (sortedTerms == null) {
-        sortedTerms = terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+        sortedTerms = terms.sort();
       }
     }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/126ac9a5/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
index 79cfa0f..5a8a99f 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
@@ -1144,7 +1144,7 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
     w.commit();
     IndexReader reader = w.getReader();
     SortedDocValues docValues = MultiDocValues.getSortedValues(reader, "field");
-    int[] sort = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+    int[] sort = hash.sort();
     BytesRef expected = new BytesRef();
     assertEquals(hash.size(), docValues.getValueCount());
     for (int i = 0; i < hash.size(); i++) {