You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by yo...@apache.org on 2011/03/15 22:35:35 UTC
svn commit: r1081952 [10/17] - in /lucene/dev/branches/bulkpostings: ./
dev-tools/ dev-tools/eclipse/ dev-tools/idea/ dev-tools/idea/.idea/
dev-tools/idea/lucene/ dev-tools/idea/lucene/contrib/ant/
dev-tools/idea/lucene/contrib/demo/ dev-tools/idea/luc...
Modified: lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestNumericRangeQuery64.java Tue Mar 15 21:35:17 2011
@@ -361,12 +361,10 @@ public class TestNumericRangeQuery64 ext
final BytesRef lowerBytes = new BytesRef(NumericUtils.BUF_SIZE_LONG), upperBytes = new BytesRef(NumericUtils.BUF_SIZE_LONG);
NumericUtils.longToPrefixCoded(lower, 0, lowerBytes);
NumericUtils.longToPrefixCoded(upper, 0, upperBytes);
- // TODO: when new TermRange ctors with BytesRef available, use them and do not convert to string!
- final String lowerString = lowerBytes.utf8ToString(), upperString = upperBytes.utf8ToString();
// test inclusive range
NumericRangeQuery<Long> tq=NumericRangeQuery.newLongRange(field, precisionStep, lower, upper, true, true);
- TermRangeQuery cq=new TermRangeQuery(field, lowerString, upperString, true, true);
+ TermRangeQuery cq=new TermRangeQuery(field, lowerBytes, upperBytes, true, true);
TopDocs tTopDocs = searcher.search(tq, 1);
TopDocs cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
@@ -374,7 +372,7 @@ public class TestNumericRangeQuery64 ext
termCountC += cq.getTotalNumberOfTerms();
// test exclusive range
tq=NumericRangeQuery.newLongRange(field, precisionStep, lower, upper, false, false);
- cq=new TermRangeQuery(field, lowerString, upperString, false, false);
+ cq=new TermRangeQuery(field, lowerBytes, upperBytes, false, false);
tTopDocs = searcher.search(tq, 1);
cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
@@ -382,7 +380,7 @@ public class TestNumericRangeQuery64 ext
termCountC += cq.getTotalNumberOfTerms();
// test left exclusive range
tq=NumericRangeQuery.newLongRange(field, precisionStep, lower, upper, false, true);
- cq=new TermRangeQuery(field, lowerString, upperString, false, true);
+ cq=new TermRangeQuery(field, lowerBytes, upperBytes, false, true);
tTopDocs = searcher.search(tq, 1);
cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
@@ -390,7 +388,7 @@ public class TestNumericRangeQuery64 ext
termCountC += cq.getTotalNumberOfTerms();
// test right exclusive range
tq=NumericRangeQuery.newLongRange(field, precisionStep, lower, upper, true, false);
- cq=new TermRangeQuery(field, lowerString, upperString, true, false);
+ cq=new TermRangeQuery(field, lowerBytes, upperBytes, true, false);
tTopDocs = searcher.search(tq, 1);
cTopDocs = searcher.search(cq, 1);
assertEquals("Returned count for NumericRangeQuery and TermRangeQuery must be equal", cTopDocs.totalHits, tTopDocs.totalHits );
Modified: lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestSetNorm.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestSetNorm.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestSetNorm.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestSetNorm.java Tue Mar 15 21:35:17 2011
@@ -51,7 +51,7 @@ public class TestSetNorm extends LuceneT
// reset the boost of each instance of this document
IndexReader reader = IndexReader.open(store, false);
- Similarity similarity = new DefaultSimilarity().get("field");
+ Similarity similarity = new DefaultSimilarity();
reader.setNorm(0, "field", similarity.encodeNormValue(1.0f));
reader.setNorm(1, "field", similarity.encodeNormValue(2.0f));
reader.setNorm(2, "field", similarity.encodeNormValue(4.0f));
Modified: lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestSimpleExplanations.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestSimpleExplanations.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestSimpleExplanations.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestSimpleExplanations.java Tue Mar 15 21:35:17 2011
@@ -289,4 +289,62 @@ public class TestSimpleExplanations exte
qtest(q, new int[] { 0,3 });
}
+
+ /* BQ of TQ: using alt so some fields have zero boost and some don't */
+
+ public void testMultiFieldBQ1() throws Exception {
+ qtest("+w1 +alt:w2", new int[] { 0,1,2,3 });
+ }
+ public void testMultiFieldBQ2() throws Exception {
+ qtest("+yy +alt:w3", new int[] { 2,3 });
+ }
+ public void testMultiFieldBQ3() throws Exception {
+ qtest("yy +alt:w3", new int[] { 0,1,2,3 });
+ }
+ public void testMultiFieldBQ4() throws Exception {
+ qtest("w1 (-xx alt:w2)", new int[] { 0,1,2,3 });
+ }
+ public void testMultiFieldBQ5() throws Exception {
+ qtest("w1 (+alt:qq alt:w2)", new int[] { 0,1,2,3 });
+ }
+ public void testMultiFieldBQ6() throws Exception {
+ qtest("w1 -(-alt:qq alt:w5)", new int[] { 1,2,3 });
+ }
+ public void testMultiFieldBQ7() throws Exception {
+ qtest("+w1 +(alt:qq (alt:xx -alt:w2) (+alt:w3 +alt:w4))", new int[] { 0 });
+ }
+ public void testMultiFieldBQ8() throws Exception {
+ qtest("+alt:w1 (qq (alt:xx -w2) (+alt:w3 +w4))", new int[] { 0,1,2,3 });
+ }
+ public void testMultiFieldBQ9() throws Exception {
+ qtest("+w1 (alt:qq (-xx w2) -(+alt:w3 +w4))", new int[] { 0,1,2,3 });
+ }
+ public void testMultiFieldBQ10() throws Exception {
+ qtest("+w1 +(alt:qq (-xx alt:w2) -(+alt:w3 +w4))", new int[] { 1 });
+ }
+
+ /* BQ of PQ: using alt so some fields have zero boost and some don't */
+
+ public void testMultiFieldBQofPQ1() throws Exception {
+ qtest("\"w1 w2\" alt:\"w1 w2\"", new int[] { 0 });
+ }
+ public void testMultiFieldBQofPQ2() throws Exception {
+ qtest("\"w1 w3\" alt:\"w1 w3\"", new int[] { 1,3 });
+ }
+ public void testMultiFieldBQofPQ3() throws Exception {
+ qtest("\"w1 w2\"~1 alt:\"w1 w2\"~1", new int[] { 0,1,2 });
+ }
+ public void testMultiFieldBQofPQ4() throws Exception {
+ qtest("\"w2 w3\"~1 alt:\"w2 w3\"~1", new int[] { 0,1,2,3 });
+ }
+ public void testMultiFieldBQofPQ5() throws Exception {
+ qtest("\"w3 w2\"~1 alt:\"w3 w2\"~1", new int[] { 1,3 });
+ }
+ public void testMultiFieldBQofPQ6() throws Exception {
+ qtest("\"w3 w2\"~2 alt:\"w3 w2\"~2", new int[] { 0,1,3 });
+ }
+ public void testMultiFieldBQofPQ7() throws Exception {
+ qtest("\"w3 w2\"~3 alt:\"w3 w2\"~3", new int[] { 0,1,2,3 });
+ }
+
}
Modified: lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestSort.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestSort.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestSort.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestSort.java Tue Mar 15 21:35:17 2011
@@ -18,12 +18,8 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
-import java.text.Collator;
import java.util.ArrayList;
import java.util.BitSet;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Locale;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
@@ -110,11 +106,6 @@ public class TestSort extends LuceneTest
{ "d", "m", null, null, null, null, null, null, null, null, null, null}
};
- // the sort order of Ø versus U depends on the version of the rules being used
- // for the inherited root locale: Ø's order isnt specified in Locale.US since
- // its not used in english.
- private boolean oStrokeFirst = Collator.getInstance(new Locale("")).compare("Ø", "U") < 0;
-
// create an index of all the documents, or just the x, or just the y documents
private IndexSearcher getIndex (boolean even, boolean odd)
throws IOException {
@@ -504,13 +495,15 @@ public class TestSort extends LuceneTest
bottomValue = slotValues[bottom];
}
+ private static final FieldCache.IntParser testIntParser = new FieldCache.IntParser() {
+ public final int parseInt(final BytesRef term) {
+ return (term.bytes[term.offset]-'A') * 123456;
+ }
+ };
+
@Override
public FieldComparator setNextReader(AtomicReaderContext context) throws IOException {
- docValues = FieldCache.DEFAULT.getInts(context.reader, "parser", new FieldCache.IntParser() {
- public final int parseInt(final BytesRef term) {
- return (term.bytes[term.offset]-'A') * 123456;
- }
- });
+ docValues = FieldCache.DEFAULT.getInts(context.reader, "parser", testIntParser);
return this;
}
@@ -564,12 +557,6 @@ public class TestSort extends LuceneTest
sort.setSort (new SortField ("string", SortField.STRING, true) );
assertMatches (full, queryF, sort, "IJZ");
- sort.setSort (new SortField ("i18n", Locale.ENGLISH));
- assertMatches (full, queryF, sort, "ZJI");
-
- sort.setSort (new SortField ("i18n", Locale.ENGLISH, true));
- assertMatches (full, queryF, sort, "IJZ");
-
sort.setSort (new SortField ("int", SortField.INT) );
assertMatches (full, queryF, sort, "IZJ");
@@ -630,36 +617,6 @@ public class TestSort extends LuceneTest
assertMatches (full, queryX, sort, "GICEA");
}
- // test using a Locale for sorting strings
- public void testLocaleSort() throws Exception {
- sort.setSort (new SortField ("string", Locale.US) );
- assertMatches (full, queryX, sort, "AIGEC");
- assertMatches (full, queryY, sort, "DJHFB");
-
- sort.setSort (new SortField ("string", Locale.US, true) );
- assertMatches (full, queryX, sort, "CEGIA");
- assertMatches (full, queryY, sort, "BFHJD");
- }
-
- // test using various international locales with accented characters
- // (which sort differently depending on locale)
- public void testInternationalSort() throws Exception {
- sort.setSort (new SortField ("i18n", Locale.US));
- assertMatches (full, queryY, sort, oStrokeFirst ? "BFJHD" : "BFJDH");
-
- sort.setSort (new SortField ("i18n", new Locale("sv", "se")));
- assertMatches (full, queryY, sort, "BJDFH");
-
- sort.setSort (new SortField ("i18n", new Locale("da", "dk")));
- assertMatches (full, queryY, sort, "BJDHF");
-
- sort.setSort (new SortField ("i18n", Locale.US));
- assertMatches (full, queryX, sort, "ECAGI");
-
- sort.setSort (new SortField ("i18n", Locale.FRANCE));
- assertMatches (full, queryX, sort, "EACGI");
- }
-
// test a variety of sorts using a parallel multisearcher
public void testParallelMultiSort() throws Exception {
ExecutorService exec = Executors.newFixedThreadPool(_TestUtil.nextInt(random, 2, 8));
@@ -976,19 +933,6 @@ public class TestSort extends LuceneTest
assertSaneFieldCaches(getName() + " various");
// next we'll check Locale based (String[]) for 'string', so purge first
FieldCache.DEFAULT.purgeAllCaches();
-
- sort.setSort(new SortField ("string", Locale.US) );
- assertMatches(multi, queryA, sort, "DJAIHGFEBC");
-
- sort.setSort(new SortField ("string", Locale.US, true) );
- assertMatches(multi, queryA, sort, "CBEFGHIAJD");
-
- sort.setSort(new SortField ("string", Locale.UK) );
- assertMatches(multi, queryA, sort, "DJAIHGFEBC");
-
- assertSaneFieldCaches(getName() + " Locale.US + Locale.UK");
- FieldCache.DEFAULT.purgeAllCaches();
-
}
private void assertMatches(IndexSearcher searcher, Query query, Sort sort, String expectedResult) throws IOException {
@@ -1014,37 +958,6 @@ public class TestSort extends LuceneTest
assertEquals (msg, expectedResult, buff.toString());
}
- private HashMap<String,Float> getScores (ScoreDoc[] hits, IndexSearcher searcher)
- throws IOException {
- HashMap<String,Float> scoreMap = new HashMap<String,Float>();
- int n = hits.length;
- for (int i=0; i<n; ++i) {
- Document doc = searcher.doc(hits[i].doc);
- String[] v = doc.getValues("tracer");
- assertEquals (v.length, 1);
- scoreMap.put (v[0], Float.valueOf(hits[i].score));
- }
- return scoreMap;
- }
-
- // make sure all the values in the maps match
- private <K, V> void assertSameValues (HashMap<K,V> m1, HashMap<K,V> m2) {
- int n = m1.size();
- int m = m2.size();
- assertEquals (n, m);
- Iterator<K> iter = m1.keySet().iterator();
- while (iter.hasNext()) {
- K key = iter.next();
- V o1 = m1.get(key);
- V o2 = m2.get(key);
- if (o1 instanceof Float) {
- assertEquals(((Float)o1).floatValue(), ((Float)o2).floatValue(), 1e-6);
- } else {
- assertEquals (m1.get(key), m2.get(key));
- }
- }
- }
-
public void testEmptyStringVsNullStringSort() throws Exception {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
Modified: lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestTermRangeFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestTermRangeFilter.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestTermRangeFilter.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestTermRangeFilter.java Tue Mar 15 21:35:17 2011
@@ -18,15 +18,9 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
-import java.text.Collator;
-import java.util.Locale;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.store.Directory;
import org.junit.Test;
/**
@@ -61,83 +55,83 @@ public class TestTermRangeFilter extends
// test id, bounded on both ends
- result = search.search(q, new TermRangeFilter("id", minIP, maxIP, T, T),
+ result = search.search(q, TermRangeFilter.newStringRange("id", minIP, maxIP, T, T),
numDocs).scoreDocs;
assertEquals("find all", numDocs, result.length);
- result = search.search(q, new TermRangeFilter("id", minIP, maxIP, T, F),
+ result = search.search(q, TermRangeFilter.newStringRange("id", minIP, maxIP, T, F),
numDocs).scoreDocs;
assertEquals("all but last", numDocs - 1, result.length);
- result = search.search(q, new TermRangeFilter("id", minIP, maxIP, F, T),
+ result = search.search(q, TermRangeFilter.newStringRange("id", minIP, maxIP, F, T),
numDocs).scoreDocs;
assertEquals("all but first", numDocs - 1, result.length);
- result = search.search(q, new TermRangeFilter("id", minIP, maxIP, F, F),
+ result = search.search(q, TermRangeFilter.newStringRange("id", minIP, maxIP, F, F),
numDocs).scoreDocs;
assertEquals("all but ends", numDocs - 2, result.length);
- result = search.search(q, new TermRangeFilter("id", medIP, maxIP, T, T),
+ result = search.search(q, TermRangeFilter.newStringRange("id", medIP, maxIP, T, T),
numDocs).scoreDocs;
assertEquals("med and up", 1 + maxId - medId, result.length);
- result = search.search(q, new TermRangeFilter("id", minIP, medIP, T, T),
+ result = search.search(q, TermRangeFilter.newStringRange("id", minIP, medIP, T, T),
numDocs).scoreDocs;
assertEquals("up to med", 1 + medId - minId, result.length);
// unbounded id
- result = search.search(q, new TermRangeFilter("id", minIP, null, T, F),
+ result = search.search(q, TermRangeFilter.newStringRange("id", minIP, null, T, F),
numDocs).scoreDocs;
assertEquals("min and up", numDocs, result.length);
- result = search.search(q, new TermRangeFilter("id", null, maxIP, F, T),
+ result = search.search(q, TermRangeFilter.newStringRange("id", null, maxIP, F, T),
numDocs).scoreDocs;
assertEquals("max and down", numDocs, result.length);
- result = search.search(q, new TermRangeFilter("id", minIP, null, F, F),
+ result = search.search(q, TermRangeFilter.newStringRange("id", minIP, null, F, F),
numDocs).scoreDocs;
assertEquals("not min, but up", numDocs - 1, result.length);
- result = search.search(q, new TermRangeFilter("id", null, maxIP, F, F),
+ result = search.search(q, TermRangeFilter.newStringRange("id", null, maxIP, F, F),
numDocs).scoreDocs;
assertEquals("not max, but down", numDocs - 1, result.length);
- result = search.search(q, new TermRangeFilter("id", medIP, maxIP, T, F),
+ result = search.search(q, TermRangeFilter.newStringRange("id", medIP, maxIP, T, F),
numDocs).scoreDocs;
assertEquals("med and up, not max", maxId - medId, result.length);
- result = search.search(q, new TermRangeFilter("id", minIP, medIP, F, T),
+ result = search.search(q, TermRangeFilter.newStringRange("id", minIP, medIP, F, T),
numDocs).scoreDocs;
assertEquals("not min, up to med", medId - minId, result.length);
// very small sets
- result = search.search(q, new TermRangeFilter("id", minIP, minIP, F, F),
+ result = search.search(q, TermRangeFilter.newStringRange("id", minIP, minIP, F, F),
numDocs).scoreDocs;
assertEquals("min,min,F,F", 0, result.length);
- result = search.search(q, new TermRangeFilter("id", medIP, medIP, F, F),
+ result = search.search(q, TermRangeFilter.newStringRange("id", medIP, medIP, F, F),
numDocs).scoreDocs;
assertEquals("med,med,F,F", 0, result.length);
- result = search.search(q, new TermRangeFilter("id", maxIP, maxIP, F, F),
+ result = search.search(q, TermRangeFilter.newStringRange("id", maxIP, maxIP, F, F),
numDocs).scoreDocs;
assertEquals("max,max,F,F", 0, result.length);
- result = search.search(q, new TermRangeFilter("id", minIP, minIP, T, T),
+ result = search.search(q, TermRangeFilter.newStringRange("id", minIP, minIP, T, T),
numDocs).scoreDocs;
assertEquals("min,min,T,T", 1, result.length);
- result = search.search(q, new TermRangeFilter("id", null, minIP, F, T),
+ result = search.search(q, TermRangeFilter.newStringRange("id", null, minIP, F, T),
numDocs).scoreDocs;
assertEquals("nul,min,F,T", 1, result.length);
- result = search.search(q, new TermRangeFilter("id", maxIP, maxIP, T, T),
+ result = search.search(q, TermRangeFilter.newStringRange("id", maxIP, maxIP, T, T),
numDocs).scoreDocs;
assertEquals("max,max,T,T", 1, result.length);
- result = search.search(q, new TermRangeFilter("id", maxIP, null, T, F),
+ result = search.search(q, TermRangeFilter.newStringRange("id", maxIP, null, T, F),
numDocs).scoreDocs;
assertEquals("max,nul,T,T", 1, result.length);
- result = search.search(q, new TermRangeFilter("id", medIP, medIP, T, T),
+ result = search.search(q, TermRangeFilter.newStringRange("id", medIP, medIP, T, T),
numDocs).scoreDocs;
assertEquals("med,med,T,T", 1, result.length);
@@ -145,110 +139,6 @@ public class TestTermRangeFilter extends
}
@Test
- public void testRangeFilterIdCollating() throws IOException {
-
- IndexReader reader = signedIndexReader;
- IndexSearcher search = newSearcher(reader);
-
- Collator c = Collator.getInstance(Locale.ENGLISH);
-
- int medId = ((maxId - minId) / 2);
-
- String minIP = pad(minId);
- String maxIP = pad(maxId);
- String medIP = pad(medId);
-
- int numDocs = reader.numDocs();
-
- assertEquals("num of docs", numDocs, 1 + maxId - minId);
-
- Query q = new TermQuery(new Term("body", "body"));
-
- // test id, bounded on both ends
- int numHits = search.search(q, new TermRangeFilter("id", minIP, maxIP, T,
- T, c), 1000).totalHits;
- assertEquals("find all", numDocs, numHits);
-
- numHits = search.search(q,
- new TermRangeFilter("id", minIP, maxIP, T, F, c), 1000).totalHits;
- assertEquals("all but last", numDocs - 1, numHits);
-
- numHits = search.search(q,
- new TermRangeFilter("id", minIP, maxIP, F, T, c), 1000).totalHits;
- assertEquals("all but first", numDocs - 1, numHits);
-
- numHits = search.search(q,
- new TermRangeFilter("id", minIP, maxIP, F, F, c), 1000).totalHits;
- assertEquals("all but ends", numDocs - 2, numHits);
-
- numHits = search.search(q,
- new TermRangeFilter("id", medIP, maxIP, T, T, c), 1000).totalHits;
- assertEquals("med and up", 1 + maxId - medId, numHits);
-
- numHits = search.search(q,
- new TermRangeFilter("id", minIP, medIP, T, T, c), 1000).totalHits;
- assertEquals("up to med", 1 + medId - minId, numHits);
-
- // unbounded id
-
- numHits = search.search(q, new TermRangeFilter("id", minIP, null, T, F, c),
- 1000).totalHits;
- assertEquals("min and up", numDocs, numHits);
-
- numHits = search.search(q, new TermRangeFilter("id", null, maxIP, F, T, c),
- 1000).totalHits;
- assertEquals("max and down", numDocs, numHits);
-
- numHits = search.search(q, new TermRangeFilter("id", minIP, null, F, F, c),
- 1000).totalHits;
- assertEquals("not min, but up", numDocs - 1, numHits);
-
- numHits = search.search(q, new TermRangeFilter("id", null, maxIP, F, F, c),
- 1000).totalHits;
- assertEquals("not max, but down", numDocs - 1, numHits);
-
- numHits = search.search(q,
- new TermRangeFilter("id", medIP, maxIP, T, F, c), 1000).totalHits;
- assertEquals("med and up, not max", maxId - medId, numHits);
-
- numHits = search.search(q,
- new TermRangeFilter("id", minIP, medIP, F, T, c), 1000).totalHits;
- assertEquals("not min, up to med", medId - minId, numHits);
-
- // very small sets
-
- numHits = search.search(q,
- new TermRangeFilter("id", minIP, minIP, F, F, c), 1000).totalHits;
- assertEquals("min,min,F,F", 0, numHits);
- numHits = search.search(q,
- new TermRangeFilter("id", medIP, medIP, F, F, c), 1000).totalHits;
- assertEquals("med,med,F,F", 0, numHits);
- numHits = search.search(q,
- new TermRangeFilter("id", maxIP, maxIP, F, F, c), 1000).totalHits;
- assertEquals("max,max,F,F", 0, numHits);
-
- numHits = search.search(q,
- new TermRangeFilter("id", minIP, minIP, T, T, c), 1000).totalHits;
- assertEquals("min,min,T,T", 1, numHits);
- numHits = search.search(q, new TermRangeFilter("id", null, minIP, F, T, c),
- 1000).totalHits;
- assertEquals("nul,min,F,T", 1, numHits);
-
- numHits = search.search(q,
- new TermRangeFilter("id", maxIP, maxIP, T, T, c), 1000).totalHits;
- assertEquals("max,max,T,T", 1, numHits);
- numHits = search.search(q, new TermRangeFilter("id", maxIP, null, T, F, c),
- 1000).totalHits;
- assertEquals("max,nul,T,T", 1, numHits);
-
- numHits = search.search(q,
- new TermRangeFilter("id", medIP, medIP, T, T, c), 1000).totalHits;
- assertEquals("med,med,T,T", 1, numHits);
-
- search.close();
- }
-
- @Test
public void testRangeFilterRand() throws IOException {
IndexReader reader = signedIndexReader;
@@ -266,223 +156,63 @@ public class TestTermRangeFilter extends
// test extremes, bounded on both ends
- result = search.search(q, new TermRangeFilter("rand", minRP, maxRP, T, T),
+ result = search.search(q, TermRangeFilter.newStringRange("rand", minRP, maxRP, T, T),
numDocs).scoreDocs;
assertEquals("find all", numDocs, result.length);
- result = search.search(q, new TermRangeFilter("rand", minRP, maxRP, T, F),
+ result = search.search(q, TermRangeFilter.newStringRange("rand", minRP, maxRP, T, F),
numDocs).scoreDocs;
assertEquals("all but biggest", numDocs - 1, result.length);
- result = search.search(q, new TermRangeFilter("rand", minRP, maxRP, F, T),
+ result = search.search(q, TermRangeFilter.newStringRange("rand", minRP, maxRP, F, T),
numDocs).scoreDocs;
assertEquals("all but smallest", numDocs - 1, result.length);
- result = search.search(q, new TermRangeFilter("rand", minRP, maxRP, F, F),
+ result = search.search(q, TermRangeFilter.newStringRange("rand", minRP, maxRP, F, F),
numDocs).scoreDocs;
assertEquals("all but extremes", numDocs - 2, result.length);
// unbounded
- result = search.search(q, new TermRangeFilter("rand", minRP, null, T, F),
+ result = search.search(q, TermRangeFilter.newStringRange("rand", minRP, null, T, F),
numDocs).scoreDocs;
assertEquals("smallest and up", numDocs, result.length);
- result = search.search(q, new TermRangeFilter("rand", null, maxRP, F, T),
+ result = search.search(q, TermRangeFilter.newStringRange("rand", null, maxRP, F, T),
numDocs).scoreDocs;
assertEquals("biggest and down", numDocs, result.length);
- result = search.search(q, new TermRangeFilter("rand", minRP, null, F, F),
+ result = search.search(q, TermRangeFilter.newStringRange("rand", minRP, null, F, F),
numDocs).scoreDocs;
assertEquals("not smallest, but up", numDocs - 1, result.length);
- result = search.search(q, new TermRangeFilter("rand", null, maxRP, F, F),
+ result = search.search(q, TermRangeFilter.newStringRange("rand", null, maxRP, F, F),
numDocs).scoreDocs;
assertEquals("not biggest, but down", numDocs - 1, result.length);
// very small sets
- result = search.search(q, new TermRangeFilter("rand", minRP, minRP, F, F),
+ result = search.search(q, TermRangeFilter.newStringRange("rand", minRP, minRP, F, F),
numDocs).scoreDocs;
assertEquals("min,min,F,F", 0, result.length);
- result = search.search(q, new TermRangeFilter("rand", maxRP, maxRP, F, F),
+ result = search.search(q, TermRangeFilter.newStringRange("rand", maxRP, maxRP, F, F),
numDocs).scoreDocs;
assertEquals("max,max,F,F", 0, result.length);
- result = search.search(q, new TermRangeFilter("rand", minRP, minRP, T, T),
+ result = search.search(q, TermRangeFilter.newStringRange("rand", minRP, minRP, T, T),
numDocs).scoreDocs;
assertEquals("min,min,T,T", 1, result.length);
- result = search.search(q, new TermRangeFilter("rand", null, minRP, F, T),
+ result = search.search(q, TermRangeFilter.newStringRange("rand", null, minRP, F, T),
numDocs).scoreDocs;
assertEquals("nul,min,F,T", 1, result.length);
- result = search.search(q, new TermRangeFilter("rand", maxRP, maxRP, T, T),
+ result = search.search(q, TermRangeFilter.newStringRange("rand", maxRP, maxRP, T, T),
numDocs).scoreDocs;
assertEquals("max,max,T,T", 1, result.length);
- result = search.search(q, new TermRangeFilter("rand", maxRP, null, T, F),
+ result = search.search(q, TermRangeFilter.newStringRange("rand", maxRP, null, T, F),
numDocs).scoreDocs;
assertEquals("max,nul,T,T", 1, result.length);
search.close();
}
-
- @Test
- public void testRangeFilterRandCollating() throws IOException {
-
- // using the unsigned index because collation seems to ignore hyphens
- IndexReader reader = unsignedIndexReader;
- IndexSearcher search = newSearcher(reader);
-
- Collator c = Collator.getInstance(Locale.ENGLISH);
-
- String minRP = pad(unsignedIndexDir.minR);
- String maxRP = pad(unsignedIndexDir.maxR);
-
- int numDocs = reader.numDocs();
-
- assertEquals("num of docs", numDocs, 1 + maxId - minId);
-
- Query q = new TermQuery(new Term("body", "body"));
-
- // test extremes, bounded on both ends
-
- int numHits = search.search(q, new TermRangeFilter("rand", minRP, maxRP, T,
- T, c), 1000).totalHits;
- assertEquals("find all", numDocs, numHits);
-
- numHits = search.search(q, new TermRangeFilter("rand", minRP, maxRP, T, F,
- c), 1000).totalHits;
- assertEquals("all but biggest", numDocs - 1, numHits);
-
- numHits = search.search(q, new TermRangeFilter("rand", minRP, maxRP, F, T,
- c), 1000).totalHits;
- assertEquals("all but smallest", numDocs - 1, numHits);
-
- numHits = search.search(q, new TermRangeFilter("rand", minRP, maxRP, F, F,
- c), 1000).totalHits;
- assertEquals("all but extremes", numDocs - 2, numHits);
-
- // unbounded
-
- numHits = search.search(q,
- new TermRangeFilter("rand", minRP, null, T, F, c), 1000).totalHits;
- assertEquals("smallest and up", numDocs, numHits);
-
- numHits = search.search(q,
- new TermRangeFilter("rand", null, maxRP, F, T, c), 1000).totalHits;
- assertEquals("biggest and down", numDocs, numHits);
-
- numHits = search.search(q,
- new TermRangeFilter("rand", minRP, null, F, F, c), 1000).totalHits;
- assertEquals("not smallest, but up", numDocs - 1, numHits);
-
- numHits = search.search(q,
- new TermRangeFilter("rand", null, maxRP, F, F, c), 1000).totalHits;
- assertEquals("not biggest, but down", numDocs - 1, numHits);
-
- // very small sets
-
- numHits = search.search(q, new TermRangeFilter("rand", minRP, minRP, F, F,
- c), 1000).totalHits;
- assertEquals("min,min,F,F", 0, numHits);
- numHits = search.search(q, new TermRangeFilter("rand", maxRP, maxRP, F, F,
- c), 1000).totalHits;
- assertEquals("max,max,F,F", 0, numHits);
-
- numHits = search.search(q, new TermRangeFilter("rand", minRP, minRP, T, T,
- c), 1000).totalHits;
- assertEquals("min,min,T,T", 1, numHits);
- numHits = search.search(q,
- new TermRangeFilter("rand", null, minRP, F, T, c), 1000).totalHits;
- assertEquals("nul,min,F,T", 1, numHits);
-
- numHits = search.search(q, new TermRangeFilter("rand", maxRP, maxRP, T, T,
- c), 1000).totalHits;
- assertEquals("max,max,T,T", 1, numHits);
- numHits = search.search(q,
- new TermRangeFilter("rand", maxRP, null, T, F, c), 1000).totalHits;
- assertEquals("max,nul,T,T", 1, numHits);
-
- search.close();
- }
-
- @Test
- public void testFarsi() throws Exception {
-
- /* build an index */
- Directory farsiIndex = newDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(random, farsiIndex);
- Document doc = new Document();
- doc.add(newField("content", "\u0633\u0627\u0628", Field.Store.YES,
- Field.Index.NOT_ANALYZED));
- doc
- .add(newField("body", "body", Field.Store.YES,
- Field.Index.NOT_ANALYZED));
- writer.addDocument(doc);
-
- IndexReader reader = writer.getReader();
- writer.close();
-
- IndexSearcher search = newSearcher(reader);
- Query q = new TermQuery(new Term("body", "body"));
-
- // Neither Java 1.4.2 nor 1.5.0 has Farsi Locale collation available in
- // RuleBasedCollator. However, the Arabic Locale seems to order the Farsi
- // characters properly.
- Collator collator = Collator.getInstance(new Locale("ar"));
-
- // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
- // orders the U+0698 character before the U+0633 character, so the single
- // index Term below should NOT be returned by a TermRangeFilter with a Farsi
- // Collator (or an Arabic one for the case when Farsi is not supported).
- int numHits = search.search(q, new TermRangeFilter("content", "\u062F",
- "\u0698", T, T, collator), 1000).totalHits;
- assertEquals("The index Term should not be included.", 0, numHits);
-
- numHits = search.search(q, new TermRangeFilter("content", "\u0633",
- "\u0638", T, T, collator), 1000).totalHits;
- assertEquals("The index Term should be included.", 1, numHits);
- search.close();
- reader.close();
- farsiIndex.close();
- }
-
- @Test
- public void testDanish() throws Exception {
-
- /* build an index */
- Directory danishIndex = newDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(random, danishIndex);
- // Danish collation orders the words below in the given order
- // (example taken from TestSort.testInternationalSort() ).
- String[] words = {"H\u00D8T", "H\u00C5T", "MAND"};
- for (int docnum = 0; docnum < words.length; ++docnum) {
- Document doc = new Document();
- doc.add(newField("content", words[docnum], Field.Store.YES,
- Field.Index.NOT_ANALYZED));
- doc.add(newField("body", "body", Field.Store.YES,
- Field.Index.NOT_ANALYZED));
- writer.addDocument(doc);
- }
- IndexReader reader = writer.getReader();
- writer.close();
-
- IndexSearcher search = newSearcher(reader);
- Query q = new TermQuery(new Term("body", "body"));
-
- Collator collator = Collator.getInstance(new Locale("da", "dk"));
-
- // Unicode order would not include "H\u00C5T" in [ "H\u00D8T", "MAND" ],
- // but Danish collation does.
- int numHits = search.search(q, new TermRangeFilter("content", "H\u00D8T",
- "MAND", F, F, collator), 1000).totalHits;
- assertEquals("The index Term should be included.", 1, numHits);
-
- numHits = search.search(q, new TermRangeFilter("content", "H\u00C5T",
- "MAND", F, F, collator), 1000).totalHits;
- assertEquals("The index Term should not be included.", 0, numHits);
- search.close();
- reader.close();
- danishIndex.close();
- }
}
Modified: lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestTermRangeQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestTermRangeQuery.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestTermRangeQuery.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/search/TestTermRangeQuery.java Tue Mar 15 21:35:17 2011
@@ -53,7 +53,7 @@ public class TestTermRangeQuery extends
}
public void testExclusive() throws Exception {
- Query query = new TermRangeQuery("content", "A", "C", false, false);
+ Query query = TermRangeQuery.newStringRange("content", "A", "C", false, false);
initializeIndex(new String[] {"A", "B", "C", "D"});
IndexSearcher searcher = new IndexSearcher(dir, true);
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
@@ -74,7 +74,7 @@ public class TestTermRangeQuery extends
}
public void testInclusive() throws Exception {
- Query query = new TermRangeQuery("content", "A", "C", true, true);
+ Query query = TermRangeQuery.newStringRange("content", "A", "C", true, true);
initializeIndex(new String[]{"A", "B", "C", "D"});
IndexSearcher searcher = new IndexSearcher(dir, true);
@@ -105,11 +105,11 @@ public class TestTermRangeQuery extends
query = new TermRangeQuery("content", null, null, false, false);
assertFalse(query.getTermsEnum(terms) instanceof TermRangeTermsEnum);
assertEquals(4, searcher.search(query, null, 1000).scoreDocs.length);
- query = new TermRangeQuery("content", "", null, true, false);
+ query = TermRangeQuery.newStringRange("content", "", null, true, false);
assertFalse(query.getTermsEnum(terms) instanceof TermRangeTermsEnum);
assertEquals(4, searcher.search(query, null, 1000).scoreDocs.length);
// and now another one
- query = new TermRangeQuery("content", "B", null, true, false);
+ query = TermRangeQuery.newStringRange("content", "B", null, true, false);
assertTrue(query.getTermsEnum(terms) instanceof TermRangeTermsEnum);
assertEquals(3, searcher.search(query, null, 1000).scoreDocs.length);
searcher.close();
@@ -121,7 +121,7 @@ public class TestTermRangeQuery extends
initializeIndex(new String[]{"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K"});
IndexSearcher searcher = new IndexSearcher(dir, true);
- TermRangeQuery query = new TermRangeQuery("content", "B", "J", true, true);
+ TermRangeQuery query = TermRangeQuery.newStringRange("content", "B", "J", true, true);
checkBooleanTerms(searcher, query, "B", "C", "D", "E", "F", "G", "H", "I", "J");
final int savedClauseCount = BooleanQuery.getMaxClauseCount();
@@ -150,10 +150,10 @@ public class TestTermRangeQuery extends
}
public void testEqualsHashcode() {
- Query query = new TermRangeQuery("content", "A", "C", true, true);
+ Query query = TermRangeQuery.newStringRange("content", "A", "C", true, true);
query.setBoost(1.0f);
- Query other = new TermRangeQuery("content", "A", "C", true, true);
+ Query other = TermRangeQuery.newStringRange("content", "A", "C", true, true);
other.setBoost(1.0f);
assertEquals("query equals itself is true", query, query);
@@ -163,120 +163,32 @@ public class TestTermRangeQuery extends
other.setBoost(2.0f);
assertFalse("Different boost queries are not equal", query.equals(other));
- other = new TermRangeQuery("notcontent", "A", "C", true, true);
+ other = TermRangeQuery.newStringRange("notcontent", "A", "C", true, true);
assertFalse("Different fields are not equal", query.equals(other));
- other = new TermRangeQuery("content", "X", "C", true, true);
+ other = TermRangeQuery.newStringRange("content", "X", "C", true, true);
assertFalse("Different lower terms are not equal", query.equals(other));
- other = new TermRangeQuery("content", "A", "Z", true, true);
+ other = TermRangeQuery.newStringRange("content", "A", "Z", true, true);
assertFalse("Different upper terms are not equal", query.equals(other));
- query = new TermRangeQuery("content", null, "C", true, true);
- other = new TermRangeQuery("content", null, "C", true, true);
+ query = TermRangeQuery.newStringRange("content", null, "C", true, true);
+ other = TermRangeQuery.newStringRange("content", null, "C", true, true);
assertEquals("equivalent queries with null lowerterms are equal()", query, other);
assertEquals("hashcode must return same value when equals is true", query.hashCode(), other.hashCode());
- query = new TermRangeQuery("content", "C", null, true, true);
- other = new TermRangeQuery("content", "C", null, true, true);
+ query = TermRangeQuery.newStringRange("content", "C", null, true, true);
+ other = TermRangeQuery.newStringRange("content", "C", null, true, true);
assertEquals("equivalent queries with null upperterms are equal()", query, other);
assertEquals("hashcode returns same value", query.hashCode(), other.hashCode());
- query = new TermRangeQuery("content", null, "C", true, true);
- other = new TermRangeQuery("content", "C", null, true, true);
+ query = TermRangeQuery.newStringRange("content", null, "C", true, true);
+ other = TermRangeQuery.newStringRange("content", "C", null, true, true);
assertFalse("queries with different upper and lower terms are not equal", query.equals(other));
- query = new TermRangeQuery("content", "A", "C", false, false);
- other = new TermRangeQuery("content", "A", "C", true, true);
+ query = TermRangeQuery.newStringRange("content", "A", "C", false, false);
+ other = TermRangeQuery.newStringRange("content", "A", "C", true, true);
assertFalse("queries with different inclusive are not equal", query.equals(other));
-
- query = new TermRangeQuery("content", "A", "C", false, false);
- other = new TermRangeQuery("content", "A", "C", false, false, Collator.getInstance());
- assertFalse("a query with a collator is not equal to one without", query.equals(other));
- }
-
- public void testExclusiveCollating() throws Exception {
- Query query = new TermRangeQuery("content", "A", "C", false, false, Collator.getInstance(Locale.ENGLISH));
- initializeIndex(new String[] {"A", "B", "C", "D"});
- IndexSearcher searcher = new IndexSearcher(dir, true);
- ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals("A,B,C,D, only B in range", 1, hits.length);
- searcher.close();
-
- initializeIndex(new String[] {"A", "B", "D"});
- searcher = new IndexSearcher(dir, true);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals("A,B,D, only B in range", 1, hits.length);
- searcher.close();
-
- addDoc("C");
- searcher = new IndexSearcher(dir, true);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals("C added, still only B in range", 1, hits.length);
- searcher.close();
- }
-
- public void testInclusiveCollating() throws Exception {
- Query query = new TermRangeQuery("content", "A", "C",true, true, Collator.getInstance(Locale.ENGLISH));
-
- initializeIndex(new String[]{"A", "B", "C", "D"});
- IndexSearcher searcher = new IndexSearcher(dir, true);
- ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals("A,B,C,D - A,B,C in range", 3, hits.length);
- searcher.close();
-
- initializeIndex(new String[]{"A", "B", "D"});
- searcher = new IndexSearcher(dir, true);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals("A,B,D - A and B in range", 2, hits.length);
- searcher.close();
-
- addDoc("C");
- searcher = new IndexSearcher(dir, true);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals("C added - A, B, C in range", 3, hits.length);
- searcher.close();
- }
-
- public void testFarsi() throws Exception {
- // Neither Java 1.4.2 nor 1.5.0 has Farsi Locale collation available in
- // RuleBasedCollator. However, the Arabic Locale seems to order the Farsi
- // characters properly.
- Collator collator = Collator.getInstance(new Locale("ar"));
- Query query = new TermRangeQuery("content", "\u062F", "\u0698", true, true, collator);
- // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
- // orders the U+0698 character before the U+0633 character, so the single
- // index Term below should NOT be returned by a TermRangeQuery with a Farsi
- // Collator (or an Arabic one for the case when Farsi is not supported).
- initializeIndex(new String[]{ "\u0633\u0627\u0628"});
- IndexSearcher searcher = new IndexSearcher(dir, true);
- ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals("The index Term should not be included.", 0, hits.length);
-
- query = new TermRangeQuery("content", "\u0633", "\u0638",true, true, collator);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals("The index Term should be included.", 1, hits.length);
- searcher.close();
- }
-
- public void testDanish() throws Exception {
- Collator collator = Collator.getInstance(new Locale("da", "dk"));
- // Danish collation orders the words below in the given order (example taken
- // from TestSort.testInternationalSort() ).
- String[] words = { "H\u00D8T", "H\u00C5T", "MAND" };
- Query query = new TermRangeQuery("content", "H\u00D8T", "MAND", false, false, collator);
-
- // Unicode order would not include "H\u00C5T" in [ "H\u00D8T", "MAND" ],
- // but Danish collation does.
- initializeIndex(words);
- IndexSearcher searcher = new IndexSearcher(dir, true);
- ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals("The index Term should be included.", 1, hits.length);
-
- query = new TermRangeQuery("content", "H\u00C5T", "MAND", false, false, collator);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals("The index Term should not be included.", 0, hits.length);
- searcher.close();
}
private static class SingleCharAnalyzer extends Analyzer {
@@ -363,7 +275,7 @@ public class TestTermRangeQuery extends
public void testExclusiveLowerNull() throws Exception {
Analyzer analyzer = new SingleCharAnalyzer();
//http://issues.apache.org/jira/browse/LUCENE-38
- Query query = new TermRangeQuery("content", null, "C",
+ Query query = TermRangeQuery.newStringRange("content", null, "C",
false, false);
initializeIndex(new String[] {"A", "B", "", "C", "D"}, analyzer);
IndexSearcher searcher = new IndexSearcher(dir, true);
@@ -396,7 +308,7 @@ public class TestTermRangeQuery extends
public void testInclusiveLowerNull() throws Exception {
//http://issues.apache.org/jira/browse/LUCENE-38
Analyzer analyzer = new SingleCharAnalyzer();
- Query query = new TermRangeQuery("content", null, "C", true, true);
+ Query query = TermRangeQuery.newStringRange("content", null, "C", true, true);
initializeIndex(new String[]{"A", "B", "","C", "D"}, analyzer);
IndexSearcher searcher = new IndexSearcher(dir, true);
int numHits = searcher.search(query, null, 1000).totalHits;
Modified: lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java Tue Mar 15 21:35:17 2011
@@ -17,6 +17,10 @@ package org.apache.lucene.util;
* limitations under the License.
*/
+/**
+ * @deprecated Remove when IndexableBinaryStringTools is removed.
+ */
+@Deprecated
public class TestIndexableBinaryStringTools extends LuceneTestCase {
private static final int NUM_RANDOM_TESTS = 2000 * RANDOM_MULTIPLIER;
private static final int MAX_RANDOM_BINARY_LENGTH = 300 * RANDOM_MULTIPLIER;
Modified: lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/util/TestPriorityQueue.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/util/TestPriorityQueue.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/util/TestPriorityQueue.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/util/TestPriorityQueue.java Tue Mar 15 21:35:17 2011
@@ -23,8 +23,7 @@ public class TestPriorityQueue extends L
private static class IntegerQueue extends PriorityQueue<Integer> {
public IntegerQueue(int count) {
- super();
- initialize(count);
+ super(count);
}
@Override
Modified: lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/util/TestSmallFloat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/util/TestSmallFloat.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/util/TestSmallFloat.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/util/TestSmallFloat.java Tue Mar 15 21:35:17 2011
@@ -28,8 +28,8 @@ public class TestSmallFloat extends Luce
return Float.intBitsToFloat(bits);
}
- // original lucene floatToByte
- static byte orig_floatToByte(float f) {
+ // original lucene floatToByte (since lucene 1.3)
+ static byte orig_floatToByte_v13(float f) {
if (f < 0.0f) // round negatives up to zero
f = 0.0f;
@@ -53,6 +53,33 @@ public class TestSmallFloat extends Luce
return (byte)((exponent << 3) | mantissa); // pack into a byte
}
+ // This is the original lucene floatToByte (from v1.3)
+ // except with the underflow detection bug fixed for values like 5.8123817E-10f
+ static byte orig_floatToByte(float f) {
+ if (f < 0.0f) // round negatives up to zero
+ f = 0.0f;
+
+ if (f == 0.0f) // zero is a special case
+ return 0;
+
+ int bits = Float.floatToIntBits(f); // parse float into parts
+ int mantissa = (bits & 0xffffff) >> 21;
+ int exponent = (((bits >> 24) & 0x7f) - 63) + 15;
+
+ if (exponent > 31) { // overflow: use max value
+ exponent = 31;
+ mantissa = 7;
+ }
+
+ if (exponent < 0 || exponent == 0 && mantissa == 0) { // underflow: use min value
+ exponent = 0;
+ mantissa = 1;
+ }
+
+ return (byte)((exponent << 3) | mantissa); // pack into a byte
+ }
+
+
public void testByteToFloat() {
for (int i=0; i<256; i++) {
float f1 = orig_byteToFloat((byte)i);
@@ -68,6 +95,22 @@ public class TestSmallFloat extends Luce
}
public void testFloatToByte() {
+ assertEquals(0, orig_floatToByte_v13(5.8123817E-10f)); // verify the old bug (see LUCENE-2937)
+ assertEquals(1, orig_floatToByte(5.8123817E-10f)); // verify it's fixed in this test code
+ assertEquals(1, SmallFloat.floatToByte315(5.8123817E-10f)); // verify it's fixed
+
+ // test some constants
+ assertEquals(0, SmallFloat.floatToByte315(0));
+ assertEquals(1, SmallFloat.floatToByte315(Float.MIN_VALUE)); // underflow rounds up to smallest positive
+ assertEquals(255, SmallFloat.floatToByte315(Float.MAX_VALUE) & 0xff); // overflow rounds down to largest positive
+ assertEquals(255, SmallFloat.floatToByte315(Float.POSITIVE_INFINITY) & 0xff);
+
+ // all negatives map to 0
+ assertEquals(0, SmallFloat.floatToByte315(-Float.MIN_VALUE));
+ assertEquals(0, SmallFloat.floatToByte315(-Float.MAX_VALUE));
+ assertEquals(0, SmallFloat.floatToByte315(Float.NEGATIVE_INFINITY));
+
+
// up iterations for more exhaustive test after changing something
int num = 100000 * RANDOM_MULTIPLIER;
for (int i = 0; i < num; i++) {
@@ -95,8 +138,8 @@ public class TestSmallFloat extends Luce
if (f==f) { // skip non-numbers
byte b1 = orig_floatToByte(f);
byte b2 = SmallFloat.floatToByte315(f);
- if (b1!=b2) {
- TestCase.fail("Failed floatToByte315 for float " + f);
+ if (b1!=b2 || b2==0 && f>0) {
+ fail("Failed floatToByte315 for float " + f + " source bits="+Integer.toHexString(i) + " float raw bits=" + Integer.toHexString(Float.floatToRawIntBits(i)));
}
}
if (i==Integer.MAX_VALUE) break;
Modified: lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java (original)
+++ lucene/dev/branches/bulkpostings/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java Tue Mar 15 21:35:17 2011
@@ -20,19 +20,12 @@ package org.apache.lucene.util.automaton
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
+import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
-import java.io.PrintStream;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-import java.util.Set;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.*;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
@@ -54,6 +47,7 @@ import org.apache.lucene.util.LineFileDo
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;
+import org.apache.lucene.util.automaton.fst.FST.Arc;
public class TestFSTs extends LuceneTestCase {
@@ -445,9 +439,9 @@ public class TestFSTs extends LuceneTest
}
if (VERBOSE && pairs.size() <= 20 && fst != null) {
- PrintStream ps = new PrintStream("out.dot");
- Util.toDot(fst, ps);
- ps.close();
+ Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
+ Util.toDot(fst, w, false, false);
+ w.close();
System.out.println("SAVED out.dot");
}
@@ -1095,7 +1089,7 @@ public class TestFSTs extends LuceneTest
protected abstract T getOutput(IntsRef input, int ord) throws IOException;
- public void run(int limit) throws IOException {
+ public void run(int limit, boolean verify) throws IOException {
BufferedReader is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
try {
final IntsRef intsRef = new IntsRef(10);
@@ -1112,7 +1106,9 @@ public class TestFSTs extends LuceneTest
ord++;
if (ord % 500000 == 0) {
- System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "...");
+ System.out.println(
+ String.format(Locale.ENGLISH,
+ "%6.2fs: %9d...", ((System.currentTimeMillis() - tStart) / 1000.0), ord));
}
if (ord >= limit) {
break;
@@ -1128,9 +1124,9 @@ public class TestFSTs extends LuceneTest
System.out.println(ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes());
if (fst.getNodeCount() < 100) {
- PrintStream ps = new PrintStream("out.dot");
- Util.toDot(fst, ps);
- ps.close();
+ Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
+ Util.toDot(fst, w, false, false);
+ w.close();
System.out.println("Wrote FST to out.dot");
}
@@ -1141,6 +1137,10 @@ public class TestFSTs extends LuceneTest
System.out.println("Saved FST to fst.bin.");
+ if (!verify) {
+ System.exit(0);
+ }
+
System.out.println("\nNow verify...");
is.close();
@@ -1191,6 +1191,7 @@ public class TestFSTs extends LuceneTest
int inputMode = 0; // utf8
boolean storeOrds = false;
boolean storeDocFreqs = false;
+ boolean verify = true;
while(idx < args.length) {
if (args[idx].equals("-prune")) {
prune = Integer.valueOf(args[1+idx]);
@@ -1212,6 +1213,9 @@ public class TestFSTs extends LuceneTest
if (args[idx].equals("-ords")) {
storeOrds = true;
}
+ if (args[idx].equals("-noverify")) {
+ verify = false;
+ }
idx++;
}
@@ -1232,7 +1236,7 @@ public class TestFSTs extends LuceneTest
return new PairOutputs.Pair<Long,Long>(o1.get(ord),
o2.get(_TestUtil.nextInt(rand, 1, 5000)));
}
- }.run(limit);
+ }.run(limit, verify);
} else if (storeOrds) {
// Store only ords
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
@@ -1241,7 +1245,7 @@ public class TestFSTs extends LuceneTest
public Long getOutput(IntsRef input, int ord) {
return outputs.get(ord);
}
- }.run(limit);
+ }.run(limit, verify);
} else if (storeDocFreqs) {
// Store only docFreq
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false);
@@ -1254,7 +1258,7 @@ public class TestFSTs extends LuceneTest
}
return outputs.get(_TestUtil.nextInt(rand, 1, 5000));
}
- }.run(limit);
+ }.run(limit, verify);
} else {
// Store nothing
final NoOutputs outputs = NoOutputs.getSingleton();
@@ -1264,7 +1268,7 @@ public class TestFSTs extends LuceneTest
public Object getOutput(IntsRef input, int ord) {
return NO_OUTPUT;
}
- }.run(limit);
+ }.run(limit, verify);
}
}
@@ -1320,4 +1324,85 @@ public class TestFSTs extends LuceneTest
assertEquals(b, seekResult.input);
assertEquals(42, (long) seekResult.output);
}
+
+ /**
+ * Test state expansion (array format) on close-to-root states. Creates
+ * synthetic input that has one expanded state on each level.
+ *
+ * @see "https://issues.apache.org/jira/browse/LUCENE-2933"
+ */
+ public void testExpandedCloseToRoot() throws Exception {
+ class SyntheticData {
+ FST<Object> compile(String[] lines) throws IOException {
+ final NoOutputs outputs = NoOutputs.getSingleton();
+ final Object nothing = outputs.getNoOutput();
+ final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);
+
+ int line = 0;
+ final BytesRef term = new BytesRef();
+ while (line < lines.length) {
+ String w = lines[line++];
+ if (w == null) {
+ break;
+ }
+ term.copy(w);
+ b.add(term, nothing);
+ }
+
+ return b.finish();
+ }
+
+ void generate(ArrayList<String> out, StringBuilder b, char from, char to,
+ int depth) {
+ if (depth == 0 || from == to) {
+ String seq = b.toString() + "_" + out.size() + "_end";
+ out.add(seq);
+ } else {
+ for (char c = from; c <= to; c++) {
+ b.append(c);
+ generate(out, b, from, c == to ? to : from, depth - 1);
+ b.deleteCharAt(b.length() - 1);
+ }
+ }
+ }
+
+ public int verifyStateAndBelow(FST<Object> fst, Arc<Object> arc, int depth)
+ throws IOException {
+ if (fst.targetHasArcs(arc)) {
+ int childCount = 0;
+ for (arc = fst.readFirstTargetArc(arc, arc);;
+ arc = fst.readNextArc(arc), childCount++)
+ {
+ boolean expanded = fst.isExpandedTarget(arc);
+ int children = verifyStateAndBelow(fst, new FST.Arc<Object>().copyFrom(arc), depth + 1);
+
+ assertEquals(
+ expanded,
+ (depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE &&
+ children >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) ||
+ children >= FST.FIXED_ARRAY_NUM_ARCS_DEEP);
+ if (arc.isLast()) break;
+ }
+
+ return childCount;
+ }
+ return 0;
+ }
+ }
+
+ // Sanity check.
+ assertTrue(FST.FIXED_ARRAY_NUM_ARCS_SHALLOW < FST.FIXED_ARRAY_NUM_ARCS_DEEP);
+ assertTrue(FST.FIXED_ARRAY_SHALLOW_DISTANCE >= 0);
+
+ SyntheticData s = new SyntheticData();
+
+ ArrayList<String> out = new ArrayList<String>();
+ StringBuilder b = new StringBuilder();
+ s.generate(out, b, 'a', 'i', 10);
+ String[] input = out.toArray(new String[out.size()]);
+ Arrays.sort(input);
+ FST<Object> fst = s.compile(input);
+ FST.Arc<Object> arc = fst.getFirstArc(new FST.Arc<Object>());
+ s.verifyStateAndBelow(fst, arc, 1);
+ }
}
Modified: lucene/dev/branches/bulkpostings/modules/analysis/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/modules/analysis/CHANGES.txt?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/modules/analysis/CHANGES.txt (original)
+++ lucene/dev/branches/bulkpostings/modules/analysis/CHANGES.txt Tue Mar 15 21:35:17 2011
@@ -25,6 +25,10 @@ API Changes
* LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles
can be generated. (Chris Harris via Steven Rowe)
+ * LUCENE-2514, LUCENE-2551: JDK and ICU CollationKeyAnalyzers were changed to
+ use pure byte keys when Version >= 4.0. This cuts sort key size approximately
+ in half. (Robert Muir)
+
New Features
* LUCENE-2413: Consolidated Solr analysis components into common.
Modified: lucene/dev/branches/bulkpostings/modules/analysis/NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/modules/analysis/NOTICE.txt?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/modules/analysis/NOTICE.txt (original)
+++ lucene/dev/branches/bulkpostings/modules/analysis/NOTICE.txt Tue Mar 15 21:35:17 2011
@@ -1,5 +1,5 @@
Apache Lucene
-Copyright 2006 The Apache Software Foundation
+Copyright 2011 The Apache Software Foundation
This product includes software developed by
The Apache Software Foundation (http://www.apache.org/).
Modified: lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java (original)
+++ lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java Tue Mar 15 21:35:17 2011
@@ -29,8 +29,8 @@ import org.apache.lucene.util.AttributeS
* Emits the entire input as a single token.
*/
public final class KeywordTokenizer extends Tokenizer {
-
- private static final int DEFAULT_BUFFER_SIZE = 256;
+ /** Default read buffer size */
+ public static final int DEFAULT_BUFFER_SIZE = 256;
private boolean done = false;
private int finalOffset;
Modified: lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java (original)
+++ lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java Tue Mar 15 21:35:17 2011
@@ -18,14 +18,13 @@ package org.apache.lucene.collation;
*/
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.apache.lucene.util.IndexableBinaryStringTools; // javadoc @link
+import org.apache.lucene.util.Version;
import java.text.Collator;
import java.io.Reader;
-import java.io.IOException;
/**
* <p>
@@ -33,8 +32,8 @@ import java.io.IOException;
* </p>
* <p>
* Converts the token into its {@link java.text.CollationKey}, and then
- * encodes the CollationKey with
- * {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow
+ * encodes the CollationKey either directly or with
+ * {@link IndexableBinaryStringTools} (see <a href="#version">below</a>), to allow
* it to be stored as an index term.
* </p>
* <p>
@@ -75,39 +74,49 @@ import java.io.IOException;
* CollationKeyAnalyzer to generate index terms, do not use
* ICUCollationKeyAnalyzer on the query side, or vice versa.
* </p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating CollationKeyAnalyzer:
+ * <ul>
+ * <li> As of 4.0, Collation Keys are directly encoded as bytes. Previous
+ * versions will encode the bytes with {@link IndexableBinaryStringTools}.
+ * </ul>
*/
-public final class CollationKeyAnalyzer extends Analyzer {
- private Collator collator;
-
- public CollationKeyAnalyzer(Collator collator) {
+public final class CollationKeyAnalyzer extends ReusableAnalyzerBase {
+ private final Collator collator;
+ private final CollationAttributeFactory factory;
+ private final Version matchVersion;
+
+ /**
+ * Create a new CollationKeyAnalyzer, using the specified collator.
+ *
+ * @param matchVersion See <a href="#version">above</a>
+ * @param collator CollationKey generator
+ */
+ public CollationKeyAnalyzer(Version matchVersion, Collator collator) {
+ this.matchVersion = matchVersion;
this.collator = collator;
- }
-
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new KeywordTokenizer(reader);
- result = new CollationKeyFilter(result, collator);
- return result;
+ this.factory = new CollationAttributeFactory(collator);
}
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
+ /**
+ * @deprecated Use {@link CollationKeyAnalyzer#CollationKeyAnalyzer(Version, Collator)}
+ * and specify a version instead. This ctor will be removed in Lucene 5.0
+ */
+ @Deprecated
+ public CollationKeyAnalyzer(Collator collator) {
+ this(Version.LUCENE_31, collator);
}
-
+
@Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
-
- SavedStreams streams = (SavedStreams)getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new KeywordTokenizer(reader);
- streams.result = new CollationKeyFilter(streams.source, collator);
- setPreviousTokenStream(streams);
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ if (matchVersion.onOrAfter(Version.LUCENE_40)) {
+ KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
+ return new TokenStreamComponents(tokenizer, tokenizer);
} else {
- streams.source.reset(reader);
+ KeywordTokenizer tokenizer = new KeywordTokenizer(reader);
+ return new TokenStreamComponents(tokenizer, new CollationKeyFilter(tokenizer, collator));
}
- return streams.result;
}
}
Modified: lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java (original)
+++ lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java Tue Mar 15 21:35:17 2011
@@ -71,7 +71,10 @@ import java.text.Collator;
* CollationKeyFilter to generate index terms, do not use
* ICUCollationKeyFilter on the query side, or vice versa.
* </p>
+ * @deprecated Use {@link CollationAttributeFactory} instead, which encodes
+ * terms directly as bytes. This filter will be removed in Lucene 5.0
*/
+@Deprecated
public final class CollationKeyFilter extends TokenFilter {
private final Collator collator;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
@@ -82,7 +85,9 @@ public final class CollationKeyFilter ex
*/
public CollationKeyFilter(TokenStream input, Collator collator) {
super(input);
- this.collator = collator;
+ // clone in case JRE doesn't properly sync,
+ // or to reduce contention in case they do
+ this.collator = (Collator) collator.clone();
}
@Override
Modified: lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/collation/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/collation/package.html?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/collation/package.html (original)
+++ lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/collation/package.html Tue Mar 15 21:35:17 2011
@@ -52,13 +52,12 @@
<h2>Example Usages</h2>
<h3>Farsi Range Queries</h3>
-<code><pre>
+<pre class="prettyprint">
// "fa" Locale is not supported by Sun JDK 1.4 or 1.5
Collator collator = Collator.getInstance(new Locale("ar"));
- CollationKeyAnalyzer analyzer = new CollationKeyAnalyzer(collator);
+ CollationKeyAnalyzer analyzer = new CollationKeyAnalyzer(Version.LUCENE_40, collator);
RAMDirectory ramDir = new RAMDirectory();
- IndexWriter writer = new IndexWriter
- (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+ IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
Document doc = new Document();
doc.add(new Field("content", "\u0633\u0627\u0628",
Field.Store.YES, Field.Index.ANALYZED));
@@ -66,12 +65,9 @@
writer.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
- // The AnalyzingQueryParser in Lucene's contrib allows terms in range queries
- // to be passed through an analyzer - Lucene's standard QueryParser does not
- // allow this.
- AnalyzingQueryParser aqp = new AnalyzingQueryParser("content", analyzer);
- aqp.setLowercaseExpandedTerms(false);
-
+ QueryParser aqp = new QueryParser(Version.LUCENE_40, "content", analyzer);
+ aqp.setAnalyzeRangeTerms(true);
+
// Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
// orders the U+0698 character before the U+0633 character, so the single
// indexed Term above should NOT be returned by a ConstantScoreRangeQuery
@@ -80,15 +76,14 @@
ScoreDoc[] result
= is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
assertEquals("The index Term should not be included.", 0, result.length);
-</pre></code>
+</pre>
<h3>Danish Sorting</h3>
-<code><pre>
+<pre class="prettyprint">
Analyzer analyzer
- = new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
+ = new CollationKeyAnalyzer(Version.LUCENE_40, Collator.getInstance(new Locale("da", "dk")));
RAMDirectory indexStore = new RAMDirectory();
- IndexWriter writer = new IndexWriter
- (indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+ IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(Version.LUCENE_40, analyzer));
String[] tracer = new String[] { "A", "B", "C", "D", "E" };
String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
@@ -99,7 +94,7 @@
writer.addDocument(doc);
}
writer.close();
- Searcher searcher = new IndexSearcher(indexStore, true);
+ IndexSearcher searcher = new IndexSearcher(indexStore, true);
Sort sort = new Sort();
sort.setSort(new SortField("contents", SortField.STRING));
Query query = new MatchAllDocsQuery();
@@ -108,26 +103,25 @@
Document doc = searcher.doc(result[i].doc);
assertEquals(sortedTracerOrder[i], doc.getValues("tracer")[0]);
}
-</pre></code>
+</pre>
<h3>Turkish Case Normalization</h3>
-<code><pre>
+<pre class="prettyprint">
Collator collator = Collator.getInstance(new Locale("tr", "TR"));
collator.setStrength(Collator.PRIMARY);
- Analyzer analyzer = new CollationKeyAnalyzer(collator);
+ Analyzer analyzer = new CollationKeyAnalyzer(Version.LUCENE_40, collator);
RAMDirectory ramDir = new RAMDirectory();
- IndexWriter writer = new IndexWriter
- (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+ IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
Document doc = new Document();
doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
writer.addDocument(doc);
writer.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
- QueryParser parser = new QueryParser("contents", analyzer);
+ QueryParser parser = new QueryParser(Version.LUCENE_40, "contents", analyzer);
Query query = parser.parse("d\u0131gy"); // U+0131: dotless i
ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
assertEquals("The index Term should be included.", 1, result.length);
-</pre></code>
+</pre>
<h2>Caveats and Comparisons</h2>
<p>
Modified: lucene/dev/branches/bulkpostings/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java (original)
+++ lucene/dev/branches/bulkpostings/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java Tue Mar 15 21:35:17 2011
@@ -21,6 +21,8 @@ package org.apache.lucene.collation;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
@@ -36,11 +38,15 @@ import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Document;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IndexableBinaryStringTools;
import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
import java.io.StringReader;
import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
public abstract class CollationTestBase extends LuceneTestCase {
@@ -56,7 +62,9 @@ public abstract class CollationTestBase
* @param keyBits the result from
* collator.getCollationKey(original).toByteArray()
* @return The encoded collation key for the original String
+ * @deprecated only for testing deprecated filters
*/
+ @Deprecated
protected String encodeCollationKey(byte[] keyBits) {
// Ensure that the backing char[] array is large enough to hold the encoded
// Binary String
@@ -65,10 +73,10 @@ public abstract class CollationTestBase
IndexableBinaryStringTools.encode(keyBits, 0, keyBits.length, encodedBegArray, 0, encodedLength);
return new String(encodedBegArray);
}
-
- public void testFarsiRangeFilterCollating(Analyzer analyzer, String firstBeg,
- String firstEnd, String secondBeg,
- String secondEnd) throws Exception {
+
+ public void testFarsiRangeFilterCollating(Analyzer analyzer, BytesRef firstBeg,
+ BytesRef firstEnd, BytesRef secondBeg,
+ BytesRef secondEnd) throws Exception {
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(
TEST_VERSION_CURRENT, analyzer));
@@ -98,9 +106,9 @@ public abstract class CollationTestBase
searcher.close();
}
- public void testFarsiRangeQueryCollating(Analyzer analyzer, String firstBeg,
- String firstEnd, String secondBeg,
- String secondEnd) throws Exception {
+ public void testFarsiRangeQueryCollating(Analyzer analyzer, BytesRef firstBeg,
+ BytesRef firstEnd, BytesRef secondBeg,
+ BytesRef secondEnd) throws Exception {
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(
TEST_VERSION_CURRENT, analyzer));
@@ -126,8 +134,8 @@ public abstract class CollationTestBase
searcher.close();
}
- public void testFarsiTermRangeQuery(Analyzer analyzer, String firstBeg,
- String firstEnd, String secondBeg, String secondEnd) throws Exception {
+ public void testFarsiTermRangeQuery(Analyzer analyzer, BytesRef firstBeg,
+ BytesRef firstEnd, BytesRef secondBeg, BytesRef secondEnd) throws Exception {
RAMDirectory farsiIndex = new RAMDirectory();
IndexWriter writer = new IndexWriter(farsiIndex, new IndexWriterConfig(
@@ -249,4 +257,77 @@ public abstract class CollationTestBase
}
assertEquals(expectedResult, buff.toString());
}
+
+ private String randomString() {
+ // ideally we could do this!
+ // return _TestUtil.randomUnicodeString(random);
+ //
+ // http://bugs.icu-project.org/trac/ticket/8060
+ // http://bugs.icu-project.org/trac/ticket/7732
+ // ...
+ //
+ // as a workaround, just test the BMP for now (and avoid 0xFFFF etc)
+ int length = _TestUtil.nextInt(random, 0, 10);
+ char chars[] = new char[length];
+ for (int i = 0; i < length; i++) {
+ if (random.nextBoolean()) {
+ chars[i] = (char) _TestUtil.nextInt(random, 0, 0xD7FF);
+ } else {
+ chars[i] = (char) _TestUtil.nextInt(random, 0xE000, 0xFFFD);
+ }
+ }
+ return new String(chars, 0, length);
+ }
+
+ public void assertThreadSafe(final Analyzer analyzer) throws Exception {
+ int numTestPoints = 100;
+ int numThreads = _TestUtil.nextInt(random, 3, 5);
+ final HashMap<String,BytesRef> map = new HashMap<String,BytesRef>();
+ BytesRef spare = new BytesRef();
+
+ // create a map<String,SortKey> up front.
+ // then with multiple threads, generate sort keys for all the keys in the map
+ // and ensure they are the same as the ones we produced in serial fashion.
+
+ for (int i = 0; i < numTestPoints; i++) {
+ String term = randomString();
+ TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
+ TermToBytesRefAttribute bytes = ts.addAttribute(TermToBytesRefAttribute.class);
+ ts.reset();
+ assertTrue(ts.incrementToken());
+ bytes.toBytesRef(spare);
+ // ensure we make a copy of the actual bytes too
+ map.put(term, new BytesRef(spare));
+ }
+
+ Thread threads[] = new Thread[numThreads];
+ for (int i = 0; i < numThreads; i++) {
+ threads[i] = new Thread() {
+ @Override
+ public void run() {
+ try {
+ BytesRef spare = new BytesRef();
+ for (Map.Entry<String,BytesRef> mapping : map.entrySet()) {
+ String term = mapping.getKey();
+ BytesRef expected = mapping.getValue();
+ TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
+ TermToBytesRefAttribute bytes = ts.addAttribute(TermToBytesRefAttribute.class);
+ ts.reset();
+ assertTrue(ts.incrementToken());
+ bytes.toBytesRef(spare);
+ assertEquals(expected, spare);
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ };
+ }
+ for (int i = 0; i < numThreads; i++) {
+ threads[i].start();
+ }
+ for (int i = 0; i < numThreads; i++) {
+ threads[i].join();
+ }
+ }
}
Modified: lucene/dev/branches/bulkpostings/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java (original)
+++ lucene/dev/branches/bulkpostings/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java Tue Mar 15 21:35:17 2011
@@ -19,6 +19,8 @@ package org.apache.lucene.collation;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.util.BytesRef;
import java.text.Collator;
import java.util.Locale;
@@ -34,17 +36,19 @@ public class TestCollationKeyAnalyzer ex
// RuleBasedCollator. However, the Arabic Locale seems to order the Farsi
// characters properly.
private Collator collator = Collator.getInstance(new Locale("ar"));
- private Analyzer analyzer = new CollationKeyAnalyzer(collator);
+ private Analyzer analyzer = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, collator);
- private String firstRangeBeginning = encodeCollationKey
- (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
- private String firstRangeEnd = encodeCollationKey
- (collator.getCollationKey(firstRangeEndOriginal).toByteArray());
- private String secondRangeBeginning = encodeCollationKey
- (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
- private String secondRangeEnd = encodeCollationKey
- (collator.getCollationKey(secondRangeEndOriginal).toByteArray());
+ private BytesRef firstRangeBeginning = new BytesRef(collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
+ private BytesRef firstRangeEnd = new BytesRef(collator.getCollationKey(firstRangeEndOriginal).toByteArray());
+ private BytesRef secondRangeBeginning = new BytesRef(collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
+ private BytesRef secondRangeEnd = new BytesRef(collator.getCollationKey(secondRangeEndOriginal).toByteArray());
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ assumeFalse("preflex format only supports UTF-8 encoded bytes", "PreFlex".equals(CodecProvider.getDefault().getDefaultFieldCodec()));
+ }
+
public void testFarsiRangeFilterCollating() throws Exception {
testFarsiRangeFilterCollating
(analyzer, firstRangeBeginning, firstRangeEnd,
@@ -65,13 +69,13 @@ public class TestCollationKeyAnalyzer ex
public void testCollationKeySort() throws Exception {
Analyzer usAnalyzer
- = new CollationKeyAnalyzer(Collator.getInstance(Locale.US));
+ = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.US));
Analyzer franceAnalyzer
- = new CollationKeyAnalyzer(Collator.getInstance(Locale.FRANCE));
+ = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.FRANCE));
Analyzer swedenAnalyzer
- = new CollationKeyAnalyzer(Collator.getInstance(new Locale("sv", "se")));
+ = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("sv", "se")));
Analyzer denmarkAnalyzer
- = new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
+ = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("da", "dk")));
// The ICU Collator and Sun java.text.Collator implementations differ in their
// orderings - "BFJDH" is the ordering for java.text.Collator for Locale.US.
@@ -79,4 +83,14 @@ public class TestCollationKeyAnalyzer ex
(usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer,
oStrokeFirst ? "BFJHD" : "BFJDH", "EACGI", "BJDFH", "BJDHF");
}
+
+ public void testThreadSafe() throws Exception {
+ int iters = 20 * RANDOM_MULTIPLIER;
+ for (int i = 0; i < iters; i++) {
+ Locale locale = randomLocale(random);
+ Collator collator = Collator.getInstance(locale);
+ collator.setStrength(Collator.PRIMARY);
+ assertThreadSafe(new CollationKeyAnalyzer(TEST_VERSION_CURRENT, collator));
+ }
+ }
}
Modified: lucene/dev/branches/bulkpostings/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java (original)
+++ lucene/dev/branches/bulkpostings/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java Tue Mar 15 21:35:17 2011
@@ -21,12 +21,16 @@ package org.apache.lucene.collation;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.util.BytesRef;
import java.text.Collator;
import java.util.Locale;
import java.io.Reader;
-
+/**
+ * @deprecated remove when CollationKeyFilter is removed.
+ */
+@Deprecated
public class TestCollationKeyFilter extends CollationTestBase {
// the sort order of Ø versus U depends on the version of the rules being used
// for the inherited root locale: Ø's order isnt specified in Locale.US since
@@ -39,14 +43,14 @@ public class TestCollationKeyFilter exte
private Collator collator = Collator.getInstance(new Locale("ar"));
private Analyzer analyzer = new TestAnalyzer(collator);
- private String firstRangeBeginning = encodeCollationKey
- (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
- private String firstRangeEnd = encodeCollationKey
- (collator.getCollationKey(firstRangeEndOriginal).toByteArray());
- private String secondRangeBeginning = encodeCollationKey
- (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
- private String secondRangeEnd = encodeCollationKey
- (collator.getCollationKey(secondRangeEndOriginal).toByteArray());
+ private BytesRef firstRangeBeginning = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray()));
+ private BytesRef firstRangeEnd = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(firstRangeEndOriginal).toByteArray()));
+ private BytesRef secondRangeBeginning = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray()));
+ private BytesRef secondRangeEnd = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(secondRangeEndOriginal).toByteArray()));
public final class TestAnalyzer extends Analyzer {
Modified: lucene/dev/branches/bulkpostings/modules/analysis/icu/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/modules/analysis/icu/build.xml?rev=1081952&r1=1081951&r2=1081952&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/modules/analysis/icu/build.xml (original)
+++ lucene/dev/branches/bulkpostings/modules/analysis/icu/build.xml Tue Mar 15 21:35:17 2011
@@ -132,4 +132,9 @@ are part of the ICU4C package. See http:
<classpath refid="classpath"/>
</compile>
</target>
+
+ <target name="dist-maven" depends="contrib-build.dist-maven">
+ <m2-deploy-with-pom-template pom.xml="lib/lucene-icu4j-pom.xml.template"
+ jar.file="lib/icu4j-4_6.jar" />
+ </target>
</project>