Posted to commits@lucene.apache.org by si...@apache.org on 2011/03/30 11:17:42 UTC
svn commit: r1086876 [11/18] - in /lucene/dev/branches/realtime_search: ./
dev-tools/eclipse/ dev-tools/idea/ dev-tools/idea/.idea/libraries/
dev-tools/idea/lucene/ dev-tools/idea/solr/
dev-tools/idea/solr/contrib/analysis-extras/ dev-tools/idea/solr/c...
Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/payloads/TestPayloadTermQuery.java Wed Mar 30 09:17:25 2011
@@ -18,8 +18,11 @@ package org.apache.lucene.search.payload
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.English;
+import org.apache.lucene.search.DefaultSimilarityProvider;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.QueryUtils;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.CheckHits;
@@ -54,7 +57,7 @@ import java.io.IOException;
public class TestPayloadTermQuery extends LuceneTestCase {
private IndexSearcher searcher;
private IndexReader reader;
- private BoostingSimilarity similarity = new BoostingSimilarity();
+ private SimilarityProvider similarityProvider = new BoostingSimilarityProvider();
private byte[] payloadField = new byte[]{1};
private byte[] payloadMultiField1 = new byte[]{2};
private byte[] payloadMultiField2 = new byte[]{4};
@@ -110,7 +113,7 @@ public class TestPayloadTermQuery extend
directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random, directory,
newIndexWriterConfig(TEST_VERSION_CURRENT, new PayloadAnalyzer())
- .setSimilarityProvider(similarity).setMergePolicy(newInOrderLogMergePolicy()));
+ .setSimilarityProvider(similarityProvider).setMergePolicy(newInOrderLogMergePolicy()));
//writer.infoStream = System.out;
for (int i = 0; i < 1000; i++) {
Document doc = new Document();
@@ -125,7 +128,7 @@ public class TestPayloadTermQuery extend
writer.close();
searcher = newSearcher(reader);
- searcher.setSimilarityProvider(similarity);
+ searcher.setSimilarityProvider(similarityProvider);
}
@Override
@@ -220,7 +223,12 @@ public class TestPayloadTermQuery extend
new MaxPayloadFunction(), false);
IndexSearcher theSearcher = new IndexSearcher(directory, true);
- theSearcher.setSimilarityProvider(new FullSimilarity());
+ theSearcher.setSimilarityProvider(new DefaultSimilarityProvider() {
+ @Override
+ public Similarity get(String field) {
+ return new FullSimilarity();
+ }
+ });
TopDocs hits = searcher.search(query, null, 100);
assertTrue("hits is null and it shouldn't be", hits != null);
assertTrue("hits Size: " + hits.totalHits + " is not: " + 100, hits.totalHits == 100);
@@ -283,46 +291,49 @@ public class TestPayloadTermQuery extend
}
// must be static for weight serialization tests
- static class BoostingSimilarity extends DefaultSimilarity {
-
- // TODO: Remove warning after API has been finalized
- @Override
- public float scorePayload(int docId, int start, int end, byte[] payload, int offset, int length) {
- //we know it is size 4 here, so ignore the offset/length
- return payload[offset];
- }
+ static class BoostingSimilarityProvider implements SimilarityProvider {
- //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- //Make everything else 1 so we see the effect of the payload
- //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
- @Override
- public float computeNorm(FieldInvertState state) {
- return state.getBoost();
- }
-
- @Override
public float queryNorm(float sumOfSquaredWeights) {
return 1;
}
-
- @Override
- public float sloppyFreq(int distance) {
- return 1;
- }
-
- @Override
+
public float coord(int overlap, int maxOverlap) {
return 1;
}
- @Override
- public float idf(int docFreq, int numDocs) {
- return 1;
- }
+ public Similarity get(String field) {
+ return new DefaultSimilarity() {
+
+ // TODO: Remove warning after API has been finalized
+ @Override
+ public float scorePayload(int docId, int start, int end, byte[] payload, int offset, int length) {
+ //we know it is size 4 here, so ignore the offset/length
+ return payload[offset];
+ }
- @Override
- public float tf(float freq) {
- return freq == 0 ? 0 : 1;
+ //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ //Make everything else 1 so we see the effect of the payload
+ //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ @Override
+ public float computeNorm(FieldInvertState state) {
+ return state.getBoost();
+ }
+
+ @Override
+ public float sloppyFreq(int distance) {
+ return 1;
+ }
+
+ @Override
+ public float idf(int docFreq, int numDocs) {
+ return 1;
+ }
+
+ @Override
+ public float tf(float freq) {
+ return freq == 0 ? 0 : 1;
+ }
+ };
}
}
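For context on the refactoring above: the per-field Similarity now comes from a SimilarityProvider, so the test overrides get(String field) instead of subclassing DefaultSimilarity directly. A minimal sketch of that pattern, assuming only the trunk classes that appear in this diff (DefaultSimilarityProvider, DefaultSimilarity, IndexSearcher.setSimilarityProvider):

    import org.apache.lucene.search.DefaultSimilarity;
    import org.apache.lucene.search.DefaultSimilarityProvider;
    import org.apache.lucene.search.Similarity;

    // Sketch: a provider whose per-field Similarity flattens tf, mirroring
    // BoostingSimilarityProvider above so that only payload scores vary.
    class FlatTfSimilarityProvider extends DefaultSimilarityProvider {
      @Override
      public Similarity get(String field) {
        return new DefaultSimilarity() {
          @Override
          public float tf(float freq) {
            return freq == 0 ? 0 : 1; // neutralize tf so the payload score dominates
          }
        };
      }
    }
    // usage: searcher.setSimilarityProvider(new FlatTfSimilarityProvider());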
Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/spans/TestPayloadSpans.java Wed Mar 30 09:17:25 2011
@@ -37,7 +37,7 @@ import org.apache.lucene.index.RandomInd
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.Term;
-import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.DefaultSimilarityProvider;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.search.TermQuery;
@@ -50,7 +50,7 @@ import org.apache.lucene.util.LuceneTest
public class TestPayloadSpans extends LuceneTestCase {
private IndexSearcher searcher;
- private SimilarityProvider similarity = new DefaultSimilarity();
+ private SimilarityProvider similarity = new DefaultSimilarityProvider();
protected IndexReader indexReader;
private IndexReader closeIndexReader;
private Directory directory;
Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/spans/TestSpans.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/spans/TestSpans.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/spans/TestSpans.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/search/spans/TestSpans.java Wed Mar 30 09:17:25 2011
@@ -17,11 +17,13 @@ package org.apache.lucene.search.spans;
* limitations under the License.
*/
+import org.apache.lucene.search.DefaultSimilarityProvider;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.CheckHits;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.IndexSearcher;
@@ -410,10 +412,14 @@ public class TestSpans extends LuceneTes
for (int i = 0; i < leaves.length; i++) {
- final SimilarityProvider sim = new DefaultSimilarity() {
- @Override
- public float sloppyFreq(int distance) {
- return 0.0f;
+ final SimilarityProvider sim = new DefaultSimilarityProvider() {
+ public Similarity get(String field) {
+ return new DefaultSimilarity() {
+ @Override
+ public float sloppyFreq(int distance) {
+ return 0.0f;
+ }
+ };
}
};
Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestCollectionUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestCollectionUtil.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestCollectionUtil.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestCollectionUtil.java Wed Mar 30 09:17:25 2011
@@ -20,6 +20,7 @@ package org.apache.lucene.util;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.LinkedList;
import java.util.List;
public class TestCollectionUtil extends LuceneTestCase {
@@ -89,9 +90,30 @@ public class TestCollectionUtil extends
}
}
- // should produce no exceptions
- public void testEmptyArraySort() {
- List<Integer> list = Collections.emptyList();
+ public void testEmptyListSort() {
+ // should produce no exceptions
+ List<Integer> list = Arrays.asList(new Integer[0]);
+ CollectionUtil.quickSort(list);
+ CollectionUtil.mergeSort(list);
+ CollectionUtil.insertionSort(list);
+ CollectionUtil.quickSort(list, Collections.reverseOrder());
+ CollectionUtil.mergeSort(list, Collections.reverseOrder());
+ CollectionUtil.insertionSort(list, Collections.reverseOrder());
+
+ // check that empty non-random access lists pass sorting without ex (as sorting is not needed)
+ list = new LinkedList<Integer>();
+ CollectionUtil.quickSort(list);
+ CollectionUtil.mergeSort(list);
+ CollectionUtil.insertionSort(list);
+ CollectionUtil.quickSort(list, Collections.reverseOrder());
+ CollectionUtil.mergeSort(list, Collections.reverseOrder());
+ CollectionUtil.insertionSort(list, Collections.reverseOrder());
+ }
+
+ public void testOneElementListSort() {
+ // check that one-element non-random access lists pass sorting without ex (as sorting is not needed)
+ List<Integer> list = new LinkedList<Integer>();
+ list.add(1);
CollectionUtil.quickSort(list);
CollectionUtil.mergeSort(list);
CollectionUtil.insertionSort(list);
Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestIndexableBinaryStringTools.java Wed Mar 30 09:17:25 2011
@@ -17,6 +17,10 @@ package org.apache.lucene.util;
* limitations under the License.
*/
+/**
+ * @deprecated Remove when IndexableBinaryStringTools is removed.
+ */
+@Deprecated
public class TestIndexableBinaryStringTools extends LuceneTestCase {
private static final int NUM_RANDOM_TESTS = 2000 * RANDOM_MULTIPLIER;
private static final int MAX_RANDOM_BINARY_LENGTH = 300 * RANDOM_MULTIPLIER;
Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestPriorityQueue.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestPriorityQueue.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestPriorityQueue.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/TestPriorityQueue.java Wed Mar 30 09:17:25 2011
@@ -23,8 +23,7 @@ public class TestPriorityQueue extends L
private static class IntegerQueue extends PriorityQueue<Integer> {
public IntegerQueue(int count) {
- super();
- initialize(count);
+ super(count);
}
@Override
Modified: lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java Wed Mar 30 09:17:25 2011
@@ -25,16 +25,7 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-import java.util.Set;
+import java.util.*;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
@@ -1098,7 +1089,7 @@ public class TestFSTs extends LuceneTest
protected abstract T getOutput(IntsRef input, int ord) throws IOException;
- public void run(int limit) throws IOException {
+ public void run(int limit, boolean verify) throws IOException {
BufferedReader is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
try {
final IntsRef intsRef = new IntsRef(10);
@@ -1115,7 +1106,9 @@ public class TestFSTs extends LuceneTest
ord++;
if (ord % 500000 == 0) {
- System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "...");
+ System.out.println(
+ String.format(Locale.ENGLISH,
+ "%6.2fs: %9d...", ((System.currentTimeMillis() - tStart) / 1000.0), ord));
}
if (ord >= limit) {
break;
@@ -1129,6 +1122,9 @@ public class TestFSTs extends LuceneTest
System.exit(0);
}
+ if (dirOut == null)
+ return;
+
System.out.println(ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes());
if (fst.getNodeCount() < 100) {
Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
@@ -1144,6 +1140,10 @@ public class TestFSTs extends LuceneTest
System.out.println("Saved FST to fst.bin.");
+ if (!verify) {
+ return;
+ }
+
System.out.println("\nNow verify...");
is.close();
@@ -1186,37 +1186,54 @@ public class TestFSTs extends LuceneTest
// java -cp build/classes/test:build/classes/java:lib/junit-4.7.jar org.apache.lucene.util.automaton.fst.TestFSTs /x/tmp/allTerms3.txt out
public static void main(String[] args) throws IOException {
- final String wordsFileIn = args[0];
- final String dirOut = args[1];
- int idx = 2;
int prune = 0;
int limit = Integer.MAX_VALUE;
int inputMode = 0; // utf8
boolean storeOrds = false;
boolean storeDocFreqs = false;
- while(idx < args.length) {
+ boolean verify = true;
+
+ String wordsFileIn = null;
+ String dirOut = null;
+
+ int idx = 0;
+ while (idx < args.length) {
if (args[idx].equals("-prune")) {
- prune = Integer.valueOf(args[1+idx]);
+ prune = Integer.valueOf(args[1 + idx]);
idx++;
- }
- if (args[idx].equals("-limit")) {
- limit = Integer.valueOf(args[1+idx]);
+ } else if (args[idx].equals("-limit")) {
+ limit = Integer.valueOf(args[1 + idx]);
idx++;
- }
- if (args[idx].equals("-utf8")) {
+ } else if (args[idx].equals("-utf8")) {
inputMode = 0;
- }
- if (args[idx].equals("-utf32")) {
+ } else if (args[idx].equals("-utf32")) {
inputMode = 1;
- }
- if (args[idx].equals("-docFreq")) {
+ } else if (args[idx].equals("-docFreq")) {
storeDocFreqs = true;
- }
- if (args[idx].equals("-ords")) {
+ } else if (args[idx].equals("-ords")) {
storeOrds = true;
+ } else if (args[idx].equals("-noverify")) {
+ verify = false;
+ } else if (args[idx].startsWith("-")) {
+ System.err.println("Unrecognized option: " + args[idx]);
+ System.exit(-1);
+ } else {
+ if (wordsFileIn == null) {
+ wordsFileIn = args[idx];
+ } else if (dirOut == null) {
+ dirOut = args[idx];
+ } else {
+ System.err.println("Too many arguments, expected: input [output]");
+ System.exit(-1);
+ }
}
idx++;
}
+
+ if (wordsFileIn == null) {
+ System.err.println("No input file.");
+ System.exit(-1);
+ }
// ord benefits from share, docFreqs don't:
@@ -1235,7 +1252,7 @@ public class TestFSTs extends LuceneTest
return new PairOutputs.Pair<Long,Long>(o1.get(ord),
o2.get(_TestUtil.nextInt(rand, 1, 5000)));
}
- }.run(limit);
+ }.run(limit, verify);
} else if (storeOrds) {
// Store only ords
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
@@ -1244,7 +1261,7 @@ public class TestFSTs extends LuceneTest
public Long getOutput(IntsRef input, int ord) {
return outputs.get(ord);
}
- }.run(limit);
+ }.run(limit, verify);
} else if (storeDocFreqs) {
// Store only docFreq
final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false);
@@ -1257,7 +1274,7 @@ public class TestFSTs extends LuceneTest
}
return outputs.get(_TestUtil.nextInt(rand, 1, 5000));
}
- }.run(limit);
+ }.run(limit, verify);
} else {
// Store nothing
final NoOutputs outputs = NoOutputs.getSingleton();
@@ -1267,7 +1284,7 @@ public class TestFSTs extends LuceneTest
public Object getOutput(IntsRef input, int ord) {
return NO_OUTPUT;
}
- }.run(limit);
+ }.run(limit, verify);
}
}
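For reference, main() above now takes the words file and optional output directory as positional arguments, and the new -noverify flag skips the verification pass that run(limit, verify) would otherwise perform. An invocation in the spirit of the comment near the top of main() might then look like this (classpath and paths are the illustrative ones from that comment):

    java -cp build/classes/test:build/classes/java:lib/junit-4.7.jar \
      org.apache.lucene.util.automaton.fst.TestFSTs -ords -limit 1000000 -noverify /x/tmp/allTerms3.txt out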
Modified: lucene/dev/branches/realtime_search/modules/analysis/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/CHANGES.txt?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/CHANGES.txt (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/CHANGES.txt Wed Mar 30 09:17:25 2011
@@ -25,6 +25,10 @@ API Changes
* LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles
can be generated. (Chris Harris via Steven Rowe)
+ * LUCENE-2514, LUCENE-2551: JDK and ICU CollationKeyAnalyzers were changed to
+ use pure byte keys when Version >= 4.0. This cuts sort key size approximately
+ in half. (Robert Muir)
+
New Features
* LUCENE-2413: Consolidated Solr analysis components into common.
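The LUCENE-2514/LUCENE-2551 entry above refers to the Version-gated constructors added later in this commit. A minimal sketch of the two modes, assuming the CollationKeyAnalyzer constructors shown in the diff below (the locale is just an example):

    import java.text.Collator;
    import java.util.Locale;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.collation.CollationKeyAnalyzer;
    import org.apache.lucene.util.Version;

    public class CollationKeyModes {
      public static void main(String[] args) {
        Collator collator = Collator.getInstance(new Locale("da", "dk"));
        // Version >= 4.0: collation keys are indexed directly as bytes.
        Analyzer byteKeys = new CollationKeyAnalyzer(Version.LUCENE_40, collator);
        // Pre-4.0 behaviour: keys are re-encoded with IndexableBinaryStringTools,
        // producing index terms roughly twice as large.
        Analyzer encodedKeys = new CollationKeyAnalyzer(Version.LUCENE_31, collator);
      }
    }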
Modified: lucene/dev/branches/realtime_search/modules/analysis/NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/NOTICE.txt?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/NOTICE.txt (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/NOTICE.txt Wed Mar 30 09:17:25 2011
@@ -1,5 +1,5 @@
Apache Lucene
-Copyright 2006 The Apache Software Foundation
+Copyright 2011 The Apache Software Foundation
This product includes software developed by
The Apache Software Foundation (http://www.apache.org/).
Modified: lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java Wed Mar 30 09:17:25 2011
@@ -29,8 +29,8 @@ import org.apache.lucene.util.AttributeS
* Emits the entire input as a single token.
*/
public final class KeywordTokenizer extends Tokenizer {
-
- private static final int DEFAULT_BUFFER_SIZE = 256;
+ /** Default read buffer size */
+ public static final int DEFAULT_BUFFER_SIZE = 256;
private boolean done = false;
private int finalOffset;
Modified: lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java Wed Mar 30 09:17:25 2011
@@ -18,14 +18,13 @@ package org.apache.lucene.collation;
*/
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
+import org.apache.lucene.util.IndexableBinaryStringTools; // javadoc @link
+import org.apache.lucene.util.Version;
import java.text.Collator;
import java.io.Reader;
-import java.io.IOException;
/**
* <p>
@@ -33,8 +32,8 @@ import java.io.IOException;
* </p>
* <p>
* Converts the token into its {@link java.text.CollationKey}, and then
- * encodes the CollationKey with
- * {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow
+ * encodes the CollationKey either directly or with
+ * {@link IndexableBinaryStringTools} (see <a href="#version">below</a>), to allow
* it to be stored as an index term.
* </p>
* <p>
@@ -75,39 +74,49 @@ import java.io.IOException;
* CollationKeyAnalyzer to generate index terms, do not use
* ICUCollationKeyAnalyzer on the query side, or vice versa.
* </p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating CollationKeyAnalyzer:
+ * <ul>
+ * <li> As of 4.0, Collation Keys are directly encoded as bytes. Previous
+ * versions will encode the bytes with {@link IndexableBinaryStringTools}.
+ * </ul>
*/
-public final class CollationKeyAnalyzer extends Analyzer {
- private Collator collator;
-
- public CollationKeyAnalyzer(Collator collator) {
+public final class CollationKeyAnalyzer extends ReusableAnalyzerBase {
+ private final Collator collator;
+ private final CollationAttributeFactory factory;
+ private final Version matchVersion;
+
+ /**
+ * Create a new CollationKeyAnalyzer, using the specified collator.
+ *
+ * @param matchVersion See <a href="#version">above</a>
+ * @param collator CollationKey generator
+ */
+ public CollationKeyAnalyzer(Version matchVersion, Collator collator) {
+ this.matchVersion = matchVersion;
this.collator = collator;
- }
-
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new KeywordTokenizer(reader);
- result = new CollationKeyFilter(result, collator);
- return result;
+ this.factory = new CollationAttributeFactory(collator);
}
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
+ /**
+ * @deprecated Use {@link CollationKeyAnalyzer#CollationKeyAnalyzer(Version, Collator)}
+ * and specify a version instead. This ctor will be removed in Lucene 5.0
+ */
+ @Deprecated
+ public CollationKeyAnalyzer(Collator collator) {
+ this(Version.LUCENE_31, collator);
}
-
+
@Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
-
- SavedStreams streams = (SavedStreams)getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new KeywordTokenizer(reader);
- streams.result = new CollationKeyFilter(streams.source, collator);
- setPreviousTokenStream(streams);
+ protected TokenStreamComponents createComponents(String fieldName,
+ Reader reader) {
+ if (matchVersion.onOrAfter(Version.LUCENE_40)) {
+ KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
+ return new TokenStreamComponents(tokenizer, tokenizer);
} else {
- streams.source.reset(reader);
+ KeywordTokenizer tokenizer = new KeywordTokenizer(reader);
+ return new TokenStreamComponents(tokenizer, new CollationKeyFilter(tokenizer, collator));
}
- return streams.result;
}
}
Modified: lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/CollationKeyFilter.java Wed Mar 30 09:17:25 2011
@@ -71,7 +71,10 @@ import java.text.Collator;
* CollationKeyFilter to generate index terms, do not use
* ICUCollationKeyFilter on the query side, or vice versa.
* </p>
+ * @deprecated Use {@link CollationAttributeFactory} instead, which encodes
+ * terms directly as bytes. This filter will be removed in Lucene 5.0
*/
+@Deprecated
public final class CollationKeyFilter extends TokenFilter {
private final Collator collator;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
@@ -82,7 +85,9 @@ public final class CollationKeyFilter ex
*/
public CollationKeyFilter(TokenStream input, Collator collator) {
super(input);
- this.collator = collator;
+ // clone in case JRE doesnt properly sync,
+ // or to reduce contention in case they do
+ this.collator = (Collator) collator.clone();
}
@Override
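The clone above follows the reasoning in the new comment: the JRE may not synchronize java.text.Collator internally, and even if it does, cloning avoids contention on a shared instance. A minimal self-contained sketch of the same defensive copy in isolation (locale and input string are placeholders):

    import java.text.Collator;
    import java.util.Locale;

    public class PerConsumerCollator {
      public static void main(String[] args) {
        Collator shared = Collator.getInstance(new Locale("da", "dk"));
        // Defensive copy: each consumer works on its own Collator instance,
        // so concurrent getCollationKey() calls never touch shared state.
        Collator mine = (Collator) shared.clone();
        byte[] key = mine.getCollationKey("H\u00C5T").toByteArray();
        System.out.println(key.length + " byte collation key");
      }
    }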
Modified: lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/package.html?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/package.html (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/common/src/java/org/apache/lucene/collation/package.html Wed Mar 30 09:17:25 2011
@@ -52,13 +52,12 @@
<h2>Example Usages</h2>
<h3>Farsi Range Queries</h3>
-<code><pre>
+<pre class="prettyprint">
// "fa" Locale is not supported by Sun JDK 1.4 or 1.5
Collator collator = Collator.getInstance(new Locale("ar"));
- CollationKeyAnalyzer analyzer = new CollationKeyAnalyzer(collator);
+ CollationKeyAnalyzer analyzer = new CollationKeyAnalyzer(Version.LUCENE_40, collator);
RAMDirectory ramDir = new RAMDirectory();
- IndexWriter writer = new IndexWriter
- (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+ IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
Document doc = new Document();
doc.add(new Field("content", "\u0633\u0627\u0628",
Field.Store.YES, Field.Index.ANALYZED));
@@ -66,12 +65,9 @@
writer.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
- // The AnalyzingQueryParser in Lucene's contrib allows terms in range queries
- // to be passed through an analyzer - Lucene's standard QueryParser does not
- // allow this.
- AnalyzingQueryParser aqp = new AnalyzingQueryParser("content", analyzer);
- aqp.setLowercaseExpandedTerms(false);
-
+ QueryParser aqp = new QueryParser(Version.LUCENE_40, "content", analyzer);
+ aqp.setAnalyzeRangeTerms(true);
+
// Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
// orders the U+0698 character before the U+0633 character, so the single
// indexed Term above should NOT be returned by a ConstantScoreRangeQuery
@@ -80,15 +76,14 @@
ScoreDoc[] result
= is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
assertEquals("The index Term should not be included.", 0, result.length);
-</pre></code>
+</pre>
<h3>Danish Sorting</h3>
-<code><pre>
+<pre class="prettyprint">
Analyzer analyzer
- = new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
+ = new CollationKeyAnalyzer(Version.LUCENE_40, Collator.getInstance(new Locale("da", "dk")));
RAMDirectory indexStore = new RAMDirectory();
- IndexWriter writer = new IndexWriter
- (indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+ IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(Version.LUCENE_40, analyzer));
String[] tracer = new String[] { "A", "B", "C", "D", "E" };
String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
@@ -99,7 +94,7 @@
writer.addDocument(doc);
}
writer.close();
- Searcher searcher = new IndexSearcher(indexStore, true);
+ IndexSearcher searcher = new IndexSearcher(indexStore, true);
Sort sort = new Sort();
sort.setSort(new SortField("contents", SortField.STRING));
Query query = new MatchAllDocsQuery();
@@ -108,26 +103,25 @@
Document doc = searcher.doc(result[i].doc);
assertEquals(sortedTracerOrder[i], doc.getValues("tracer")[0]);
}
-</pre></code>
+</pre>
<h3>Turkish Case Normalization</h3>
-<code><pre>
+<pre class="prettyprint">
Collator collator = Collator.getInstance(new Locale("tr", "TR"));
collator.setStrength(Collator.PRIMARY);
- Analyzer analyzer = new CollationKeyAnalyzer(collator);
+ Analyzer analyzer = new CollationKeyAnalyzer(Version.LUCENE_40, collator);
RAMDirectory ramDir = new RAMDirectory();
- IndexWriter writer = new IndexWriter
- (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+ IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
Document doc = new Document();
doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
writer.addDocument(doc);
writer.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
- QueryParser parser = new QueryParser("contents", analyzer);
+ QueryParser parser = new QueryParser(Version.LUCENE_40, "contents", analyzer);
Query query = parser.parse("d\u0131gy"); // U+0131: dotless i
ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
assertEquals("The index Term should be included.", 1, result.length);
-</pre></code>
+</pre>
<h2>Caveats and Comparisons</h2>
<p>
Modified: lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/CollationTestBase.java Wed Mar 30 09:17:25 2011
@@ -21,6 +21,8 @@ package org.apache.lucene.collation;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
@@ -36,11 +38,15 @@ import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Document;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IndexableBinaryStringTools;
import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
import java.io.StringReader;
import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
public abstract class CollationTestBase extends LuceneTestCase {
@@ -56,7 +62,9 @@ public abstract class CollationTestBase
* @param keyBits the result from
* collator.getCollationKey(original).toByteArray()
* @return The encoded collation key for the original String
+ * @deprecated only for testing deprecated filters
*/
+ @Deprecated
protected String encodeCollationKey(byte[] keyBits) {
// Ensure that the backing char[] array is large enough to hold the encoded
// Binary String
@@ -65,10 +73,10 @@ public abstract class CollationTestBase
IndexableBinaryStringTools.encode(keyBits, 0, keyBits.length, encodedBegArray, 0, encodedLength);
return new String(encodedBegArray);
}
-
- public void testFarsiRangeFilterCollating(Analyzer analyzer, String firstBeg,
- String firstEnd, String secondBeg,
- String secondEnd) throws Exception {
+
+ public void testFarsiRangeFilterCollating(Analyzer analyzer, BytesRef firstBeg,
+ BytesRef firstEnd, BytesRef secondBeg,
+ BytesRef secondEnd) throws Exception {
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(
TEST_VERSION_CURRENT, analyzer));
@@ -98,9 +106,9 @@ public abstract class CollationTestBase
searcher.close();
}
- public void testFarsiRangeQueryCollating(Analyzer analyzer, String firstBeg,
- String firstEnd, String secondBeg,
- String secondEnd) throws Exception {
+ public void testFarsiRangeQueryCollating(Analyzer analyzer, BytesRef firstBeg,
+ BytesRef firstEnd, BytesRef secondBeg,
+ BytesRef secondEnd) throws Exception {
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(
TEST_VERSION_CURRENT, analyzer));
@@ -126,8 +134,8 @@ public abstract class CollationTestBase
searcher.close();
}
- public void testFarsiTermRangeQuery(Analyzer analyzer, String firstBeg,
- String firstEnd, String secondBeg, String secondEnd) throws Exception {
+ public void testFarsiTermRangeQuery(Analyzer analyzer, BytesRef firstBeg,
+ BytesRef firstEnd, BytesRef secondBeg, BytesRef secondEnd) throws Exception {
RAMDirectory farsiIndex = new RAMDirectory();
IndexWriter writer = new IndexWriter(farsiIndex, new IndexWriterConfig(
@@ -249,4 +257,77 @@ public abstract class CollationTestBase
}
assertEquals(expectedResult, buff.toString());
}
+
+ private String randomString() {
+ // ideally we could do this!
+ // return _TestUtil.randomUnicodeString(random);
+ //
+ // http://bugs.icu-project.org/trac/ticket/8060
+ // http://bugs.icu-project.org/trac/ticket/7732
+ // ...
+ //
+ // as a workaround, just test the BMP for now (and avoid 0xFFFF etc)
+ int length = _TestUtil.nextInt(random, 0, 10);
+ char chars[] = new char[length];
+ for (int i = 0; i < length; i++) {
+ if (random.nextBoolean()) {
+ chars[i] = (char) _TestUtil.nextInt(random, 0, 0xD7FF);
+ } else {
+ chars[i] = (char) _TestUtil.nextInt(random, 0xE000, 0xFFFD);
+ }
+ }
+ return new String(chars, 0, length);
+ }
+
+ public void assertThreadSafe(final Analyzer analyzer) throws Exception {
+ int numTestPoints = 100;
+ int numThreads = _TestUtil.nextInt(random, 3, 5);
+ final HashMap<String,BytesRef> map = new HashMap<String,BytesRef>();
+
+ // create a map<String,SortKey> up front.
+ // then with multiple threads, generate sort keys for all the keys in the map
+ // and ensure they are the same as the ones we produced in serial fashion.
+
+ for (int i = 0; i < numTestPoints; i++) {
+ String term = randomString();
+ TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
+ TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
+ BytesRef bytes = termAtt.getBytesRef();
+ ts.reset();
+ assertTrue(ts.incrementToken());
+ termAtt.fillBytesRef();
+ // ensure we make a copy of the actual bytes too
+ map.put(term, new BytesRef(bytes));
+ }
+
+ Thread threads[] = new Thread[numThreads];
+ for (int i = 0; i < numThreads; i++) {
+ threads[i] = new Thread() {
+ @Override
+ public void run() {
+ try {
+ for (Map.Entry<String,BytesRef> mapping : map.entrySet()) {
+ String term = mapping.getKey();
+ BytesRef expected = mapping.getValue();
+ TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
+ TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
+ BytesRef bytes = termAtt.getBytesRef();
+ ts.reset();
+ assertTrue(ts.incrementToken());
+ termAtt.fillBytesRef();
+ assertEquals(expected, bytes);
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ };
+ }
+ for (int i = 0; i < numThreads; i++) {
+ threads[i].start();
+ }
+ for (int i = 0; i < numThreads; i++) {
+ threads[i].join();
+ }
+ }
}
Modified: lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyAnalyzer.java Wed Mar 30 09:17:25 2011
@@ -19,6 +19,8 @@ package org.apache.lucene.collation;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.util.BytesRef;
import java.text.Collator;
import java.util.Locale;
@@ -34,17 +36,19 @@ public class TestCollationKeyAnalyzer ex
// RuleBasedCollator. However, the Arabic Locale seems to order the Farsi
// characters properly.
private Collator collator = Collator.getInstance(new Locale("ar"));
- private Analyzer analyzer = new CollationKeyAnalyzer(collator);
+ private Analyzer analyzer = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, collator);
- private String firstRangeBeginning = encodeCollationKey
- (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
- private String firstRangeEnd = encodeCollationKey
- (collator.getCollationKey(firstRangeEndOriginal).toByteArray());
- private String secondRangeBeginning = encodeCollationKey
- (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
- private String secondRangeEnd = encodeCollationKey
- (collator.getCollationKey(secondRangeEndOriginal).toByteArray());
+ private BytesRef firstRangeBeginning = new BytesRef(collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
+ private BytesRef firstRangeEnd = new BytesRef(collator.getCollationKey(firstRangeEndOriginal).toByteArray());
+ private BytesRef secondRangeBeginning = new BytesRef(collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
+ private BytesRef secondRangeEnd = new BytesRef(collator.getCollationKey(secondRangeEndOriginal).toByteArray());
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ assumeFalse("preflex format only supports UTF-8 encoded bytes", "PreFlex".equals(CodecProvider.getDefault().getDefaultFieldCodec()));
+ }
+
public void testFarsiRangeFilterCollating() throws Exception {
testFarsiRangeFilterCollating
(analyzer, firstRangeBeginning, firstRangeEnd,
@@ -65,13 +69,13 @@ public class TestCollationKeyAnalyzer ex
public void testCollationKeySort() throws Exception {
Analyzer usAnalyzer
- = new CollationKeyAnalyzer(Collator.getInstance(Locale.US));
+ = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.US));
Analyzer franceAnalyzer
- = new CollationKeyAnalyzer(Collator.getInstance(Locale.FRANCE));
+ = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(Locale.FRANCE));
Analyzer swedenAnalyzer
- = new CollationKeyAnalyzer(Collator.getInstance(new Locale("sv", "se")));
+ = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("sv", "se")));
Analyzer denmarkAnalyzer
- = new CollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
+ = new CollationKeyAnalyzer(TEST_VERSION_CURRENT, Collator.getInstance(new Locale("da", "dk")));
// The ICU Collator and Sun java.text.Collator implementations differ in their
// orderings - "BFJDH" is the ordering for java.text.Collator for Locale.US.
@@ -79,4 +83,14 @@ public class TestCollationKeyAnalyzer ex
(usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer,
oStrokeFirst ? "BFJHD" : "BFJDH", "EACGI", "BJDFH", "BJDHF");
}
+
+ public void testThreadSafe() throws Exception {
+ int iters = 20 * RANDOM_MULTIPLIER;
+ for (int i = 0; i < iters; i++) {
+ Locale locale = randomLocale(random);
+ Collator collator = Collator.getInstance(locale);
+ collator.setStrength(Collator.PRIMARY);
+ assertThreadSafe(new CollationKeyAnalyzer(TEST_VERSION_CURRENT, collator));
+ }
+ }
}
Modified: lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/common/src/test/org/apache/lucene/collation/TestCollationKeyFilter.java Wed Mar 30 09:17:25 2011
@@ -21,12 +21,16 @@ package org.apache.lucene.collation;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.util.BytesRef;
import java.text.Collator;
import java.util.Locale;
import java.io.Reader;
-
+/**
+ * @deprecated remove when CollationKeyFilter is removed.
+ */
+@Deprecated
public class TestCollationKeyFilter extends CollationTestBase {
// the sort order of à versus U depends on the version of the rules being used
// for the inherited root locale: Ã's order isnt specified in Locale.US since
@@ -39,14 +43,14 @@ public class TestCollationKeyFilter exte
private Collator collator = Collator.getInstance(new Locale("ar"));
private Analyzer analyzer = new TestAnalyzer(collator);
- private String firstRangeBeginning = encodeCollationKey
- (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
- private String firstRangeEnd = encodeCollationKey
- (collator.getCollationKey(firstRangeEndOriginal).toByteArray());
- private String secondRangeBeginning = encodeCollationKey
- (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
- private String secondRangeEnd = encodeCollationKey
- (collator.getCollationKey(secondRangeEndOriginal).toByteArray());
+ private BytesRef firstRangeBeginning = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray()));
+ private BytesRef firstRangeEnd = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(firstRangeEndOriginal).toByteArray()));
+ private BytesRef secondRangeBeginning = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray()));
+ private BytesRef secondRangeEnd = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(secondRangeEndOriginal).toByteArray()));
public final class TestAnalyzer extends Analyzer {
Modified: lucene/dev/branches/realtime_search/modules/analysis/icu/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/icu/build.xml?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/icu/build.xml (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/icu/build.xml Wed Mar 30 09:17:25 2011
@@ -132,4 +132,9 @@ are part of the ICU4C package. See http:
<classpath refid="classpath"/>
</compile>
</target>
+
+ <target name="dist-maven" depends="contrib-build.dist-maven">
+ <m2-deploy-with-pom-template pom.xml="lib/lucene-icu4j-pom.xml.template"
+ jar.file="lib/icu4j-4_6.jar" />
+ </target>
</project>
Modified: lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyAnalyzer.java Wed Mar 30 09:17:25 2011
@@ -19,24 +19,21 @@ package org.apache.lucene.collation;
import com.ibm.icu.text.Collator;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.Tokenizer;
-
+import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.apache.lucene.collation.CollationKeyAnalyzer; // javadocs
+import org.apache.lucene.util.IndexableBinaryStringTools; // javadocs
+import org.apache.lucene.util.Version;
import java.io.Reader;
-import java.io.IOException;
-
/**
* <p>
* Filters {@link KeywordTokenizer} with {@link ICUCollationKeyFilter}.
* <p>
* Converts the token into its {@link com.ibm.icu.text.CollationKey}, and
- * then encodes the CollationKey with
- * {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow it to
+ * then encodes the CollationKey either directly or with
+ * {@link IndexableBinaryStringTools} (see <a href="#version">below</a>), to allow it to
* be stored as an index term.
* </p>
* <p>
@@ -70,39 +67,48 @@ import java.io.IOException;
* generation timing and key length comparisons between ICU4J and
* java.text.Collator over several languages.
* </p>
+ * <a name="version"/>
+ * <p>You must specify the required {@link Version}
+ * compatibility when creating ICUCollationKeyAnalyzer:
+ * <ul>
+ * <li> As of 4.0, Collation Keys are directly encoded as bytes. Previous
+ * versions will encode the bytes with {@link IndexableBinaryStringTools}.
+ * </ul>
*/
-public final class ICUCollationKeyAnalyzer extends Analyzer {
- private Collator collator;
-
- public ICUCollationKeyAnalyzer(Collator collator) {
+public final class ICUCollationKeyAnalyzer extends ReusableAnalyzerBase {
+ private final Collator collator;
+ private final ICUCollationAttributeFactory factory;
+ private final Version matchVersion;
+
+ /**
+ * Create a new ICUCollationKeyAnalyzer, using the specified collator.
+ *
+ * @param matchVersion See <a href="#version">above</a>
+ * @param collator CollationKey generator
+ */
+ public ICUCollationKeyAnalyzer(Version matchVersion, Collator collator) {
+ this.matchVersion = matchVersion;
this.collator = collator;
+ this.factory = new ICUCollationAttributeFactory(collator);
}
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream result = new KeywordTokenizer(reader);
- result = new ICUCollationKeyFilter(result, collator);
- return result;
- }
-
- private class SavedStreams {
- Tokenizer source;
- TokenStream result;
+ /**
+ * @deprecated Use {@link ICUCollationKeyAnalyzer#ICUCollationKeyAnalyzer(Version, Collator)}
+ * and specify a version instead. This ctor will be removed in Lucene 5.0
+ */
+ @Deprecated
+ public ICUCollationKeyAnalyzer(Collator collator) {
+ this(Version.LUCENE_31, collator);
}
-
+
@Override
- public TokenStream reusableTokenStream(String fieldName, Reader reader)
- throws IOException {
-
- SavedStreams streams = (SavedStreams)getPreviousTokenStream();
- if (streams == null) {
- streams = new SavedStreams();
- streams.source = new KeywordTokenizer(reader);
- streams.result = new ICUCollationKeyFilter(streams.source, collator);
- setPreviousTokenStream(streams);
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ if (matchVersion.onOrAfter(Version.LUCENE_40)) {
+ KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
+ return new TokenStreamComponents(tokenizer, tokenizer);
} else {
- streams.source.reset(reader);
+ KeywordTokenizer tokenizer = new KeywordTokenizer(reader);
+ return new TokenStreamComponents(tokenizer, new ICUCollationKeyFilter(tokenizer, collator));
}
- return streams.result;
}
}
Modified: lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilter.java Wed Mar 30 09:17:25 2011
@@ -68,7 +68,10 @@ import java.io.IOException;
* generation timing and key length comparisons between ICU4J and
* java.text.Collator over several languages.
* </p>
+ * @deprecated Use {@link ICUCollationAttributeFactory} instead, which encodes
+ * terms directly as bytes. This filter will be removed in Lucene 5.0
*/
+@Deprecated
public final class ICUCollationKeyFilter extends TokenFilter {
private Collator collator = null;
private RawCollationKey reusableKey = new RawCollationKey();
@@ -81,7 +84,12 @@ public final class ICUCollationKeyFilter
*/
public ICUCollationKeyFilter(TokenStream input, Collator collator) {
super(input);
- this.collator = collator;
+ // clone the collator: see http://userguide.icu-project.org/collation/architecture
+ try {
+ this.collator = (Collator) collator.clone();
+ } catch (CloneNotSupportedException e) {
+ throw new RuntimeException(e);
+ }
}
@Override
Modified: lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/overview.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/overview.html?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/overview.html (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/icu/src/java/overview.html Wed Mar 30 09:17:25 2011
@@ -66,12 +66,12 @@ algorithm.
</ul>
<h2>Example Usages</h2>
<h3>Tokenizing multilanguage text</h3>
-<code><pre>
+<pre class="prettyprint">
/**
* This tokenizer will work well in general for most languages.
*/
Tokenizer tokenizer = new ICUTokenizer(reader);
-</pre></code>
+</pre>
<hr/>
<h1><a name="collation">Collation</a></h1>
<p>
@@ -111,12 +111,11 @@ algorithm.
<h2>Example Usages</h2>
<h3>Farsi Range Queries</h3>
-<code><pre>
- Collator collator = Collator.getInstance(new Locale("ar"));
- ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(collator);
+<pre class="prettyprint">
+ Collator collator = Collator.getInstance(new ULocale("ar"));
+ ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
RAMDirectory ramDir = new RAMDirectory();
- IndexWriter writer = new IndexWriter
- (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+ IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
Document doc = new Document();
doc.add(new Field("content", "\u0633\u0627\u0628",
Field.Store.YES, Field.Index.ANALYZED));
@@ -124,12 +123,9 @@ algorithm.
writer.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
- // The AnalyzingQueryParser in Lucene's contrib allows terms in range queries
- // to be passed through an analyzer - Lucene's standard QueryParser does not
- // allow this.
- AnalyzingQueryParser aqp = new AnalyzingQueryParser("content", analyzer);
- aqp.setLowercaseExpandedTerms(false);
-
+ QueryParser aqp = new QueryParser(Version.LUCENE_40, "content", analyzer);
+ aqp.setAnalyzeRangeTerms(true);
+
// Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
// orders the U+0698 character before the U+0633 character, so the single
// indexed Term above should NOT be returned by a ConstantScoreRangeQuery
@@ -138,15 +134,14 @@ algorithm.
ScoreDoc[] result
= is.search(aqp.parse("[ \u062F TO \u0698 ]"), null, 1000).scoreDocs;
assertEquals("The index Term should not be included.", 0, result.length);
-</pre></code>
+</pre>
<h3>Danish Sorting</h3>
-<code><pre>
+<pre class="prettyprint">
Analyzer analyzer
- = new ICUCollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
+ = new ICUCollationKeyAnalyzer(Version.LUCENE_40, Collator.getInstance(new ULocale("da", "dk")));
RAMDirectory indexStore = new RAMDirectory();
- IndexWriter writer = new IndexWriter
- (indexStore, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+ IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(Version.LUCENE_40, analyzer));
String[] tracer = new String[] { "A", "B", "C", "D", "E" };
String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
@@ -157,7 +152,7 @@ algorithm.
writer.addDocument(doc);
}
writer.close();
- Searcher searcher = new IndexSearcher(indexStore, true);
+ IndexSearcher searcher = new IndexSearcher(indexStore, true);
Sort sort = new Sort();
sort.setSort(new SortField("contents", SortField.STRING));
Query query = new MatchAllDocsQuery();
@@ -166,26 +161,25 @@ algorithm.
Document doc = searcher.doc(result[i].doc);
assertEquals(sortedTracerOrder[i], doc.getValues("tracer")[0]);
}
-</pre></code>
+</pre>
<h3>Turkish Case Normalization</h3>
-<code><pre>
- Collator collator = Collator.getInstance(new Locale("tr", "TR"));
+<pre class="prettyprint">
+ Collator collator = Collator.getInstance(new ULocale("tr", "TR"));
collator.setStrength(Collator.PRIMARY);
- Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);
+ Analyzer analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
RAMDirectory ramDir = new RAMDirectory();
- IndexWriter writer = new IndexWriter
- (ramDir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+ IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
Document doc = new Document();
doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
writer.addDocument(doc);
writer.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
- QueryParser parser = new QueryParser("contents", analyzer);
+ QueryParser parser = new QueryParser(Version.LUCENE_40, "contents", analyzer);
Query query = parser.parse("d\u0131gy"); // U+0131: dotless i
ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
assertEquals("The index Term should be included.", 1, result.length);
-</pre></code>
+</pre>
<h2>Caveats and Comparisons</h2>
<p>
@@ -245,7 +239,7 @@ algorithm.
</ul>
<h2>Example Usages</h2>
<h3>Normalizing text to NFC</h3>
-<code><pre>
+<pre class="prettyprint">
/**
* Normalizer2 objects are unmodifiable and immutable.
*/
@@ -254,7 +248,7 @@ algorithm.
* This filter will normalize to NFC.
*/
TokenStream tokenstream = new ICUNormalizer2Filter(tokenizer, normalizer);
-</pre></code>
+</pre>
<hr/>
<h1><a name="casefolding">Case Folding</a></h1>
<p>
@@ -284,12 +278,12 @@ this integration. To perform case-foldin
</ul>
<h2>Example Usages</h2>
<h3>Lowercasing text</h3>
-<code><pre>
+<pre class="prettyprint">
/**
* This filter will case-fold and normalize to NFKC.
*/
TokenStream tokenstream = new ICUNormalizer2Filter(tokenizer);
-</pre></code>
+</pre>
<hr/>
<h1><a name="searchfolding">Search Term Folding</a></h1>
<p>
@@ -311,13 +305,13 @@ many character foldings recursively.
</ul>
<h2>Example Usages</h2>
<h3>Removing accents</h3>
-<code><pre>
+<pre class="prettyprint">
/**
* This filter will case-fold, remove accents and other distinctions, and
* normalize to NFKC.
*/
TokenStream tokenstream = new ICUFoldingFilter(tokenizer);
-</pre></code>
+</pre>
<hr/>
<h1><a name="transform">Text Transformation</a></h1>
<p>
@@ -341,19 +335,19 @@ and
</ul>
<h2>Example Usages</h2>
<h3>Convert Traditional to Simplified</h3>
-<code><pre>
+<pre class="prettyprint">
/**
* This filter will map Traditional Chinese to Simplified Chinese
*/
TokenStream tokenstream = new ICUTransformFilter(tokenizer, Transliterator.getInstance("Traditional-Simplified"));
-</pre></code>
+</pre>
<h3>Transliterate Serbian Cyrillic to Serbian Latin</h3>
- <code><pre>
+<pre class="prettyprint">
/**
* This filter will map Serbian Cyrillic to Serbian Latin according to BGN rules
*/
TokenStream tokenstream = new ICUTransformFilter(tokenizer, Transliterator.getInstance("Serbian-Latin/BGN"));
-</pre></code>
+</pre>
<hr/>
<h1><a name="backcompat">Backwards Compatibility</a></h1>
<p>
@@ -365,7 +359,7 @@ a specific Unicode Version by using a {@
</p>
<h2>Example Usages</h2>
<h3>Restricting normalization to Unicode 5.0</h3>
-<code><pre>
+<pre class="prettyprint">
/**
* This filter will do NFC normalization, but will ignore any characters that
* did not exist as of Unicode 5.0. Because of the normalization stability policy
@@ -377,6 +371,6 @@ a specific Unicode Version by using a {@
set.freeze();
FilteredNormalizer2 unicode50 = new FilteredNormalizer2(normalizer, set);
TokenStream tokenstream = new ICUNormalizer2Filter(tokenizer, unicode50);
-</pre></code>
+</pre>
</body>
</html>
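The documentation hunks above all follow the same migration pattern: the examples move from the 3.x constructors to the Version-aware 4.0 ones. A condensed before/after sketch (variable names are chosen here only for illustration; the field and searcher setup is unchanged):

    // 3.x-style example, as removed above
    Analyzer oldAnalyzer = new ICUCollationKeyAnalyzer(Collator.getInstance(new Locale("da", "dk")));
    IndexWriter oldWriter = new IndexWriter(indexStore, oldAnalyzer, true, IndexWriter.MaxFieldLength.LIMITED);

    // 4.0-style example, as added above
    Analyzer newAnalyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40,
        Collator.getInstance(new ULocale("da", "dk")));
    IndexWriter newWriter = new IndexWriter(indexStore, new IndexWriterConfig(Version.LUCENE_40, newAnalyzer));
    QueryParser parser = new QueryParser(Version.LUCENE_40, "contents", newAnalyzer);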
Modified: lucene/dev/branches/realtime_search/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyAnalyzer.java Wed Mar 30 09:17:25 2011
@@ -20,6 +20,8 @@ package org.apache.lucene.collation;
import com.ibm.icu.text.Collator;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.util.BytesRef;
import java.util.Locale;
@@ -27,17 +29,23 @@ import java.util.Locale;
public class TestICUCollationKeyAnalyzer extends CollationTestBase {
private Collator collator = Collator.getInstance(new Locale("fa"));
- private Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);
+ private Analyzer analyzer = new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, collator);
- private String firstRangeBeginning = encodeCollationKey
+ private BytesRef firstRangeBeginning = new BytesRef
(collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
- private String firstRangeEnd = encodeCollationKey
+ private BytesRef firstRangeEnd = new BytesRef
(collator.getCollationKey(firstRangeEndOriginal).toByteArray());
- private String secondRangeBeginning = encodeCollationKey
+ private BytesRef secondRangeBeginning = new BytesRef
(collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
- private String secondRangeEnd = encodeCollationKey
+ private BytesRef secondRangeEnd = new BytesRef
(collator.getCollationKey(secondRangeEndOriginal).toByteArray());
-
+
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ assumeFalse("preflex format only supports UTF-8 encoded bytes", "PreFlex".equals(CodecProvider.getDefault().getDefaultFieldCodec()));
+ }
+
public void testFarsiRangeFilterCollating() throws Exception {
testFarsiRangeFilterCollating(analyzer, firstRangeBeginning, firstRangeEnd,
secondRangeBeginning, secondRangeEnd);
@@ -62,13 +70,13 @@ public class TestICUCollationKeyAnalyzer
//
public void testCollationKeySort() throws Exception {
Analyzer usAnalyzer = new ICUCollationKeyAnalyzer
- (Collator.getInstance(Locale.US));
+ (TEST_VERSION_CURRENT, Collator.getInstance(Locale.US));
Analyzer franceAnalyzer = new ICUCollationKeyAnalyzer
- (Collator.getInstance(Locale.FRANCE));
+ (TEST_VERSION_CURRENT, Collator.getInstance(Locale.FRANCE));
Analyzer swedenAnalyzer = new ICUCollationKeyAnalyzer
- (Collator.getInstance(new Locale("sv", "se")));
+ (TEST_VERSION_CURRENT, Collator.getInstance(new Locale("sv", "se")));
Analyzer denmarkAnalyzer = new ICUCollationKeyAnalyzer
- (Collator.getInstance(new Locale("da", "dk")));
+ (TEST_VERSION_CURRENT, Collator.getInstance(new Locale("da", "dk")));
// The ICU Collator and java.text.Collator implementations differ in their
// orderings - "BFJHD" is the ordering for the ICU Collator for Locale.US.
@@ -76,4 +84,14 @@ public class TestICUCollationKeyAnalyzer
(usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer,
"BFJHD", "ECAGI", "BJDFH", "BJDHF");
}
+
+ public void testThreadSafe() throws Exception {
+ int iters = 20 * RANDOM_MULTIPLIER;
+ for (int i = 0; i < iters; i++) {
+ Locale locale = randomLocale(random);
+ Collator collator = Collator.getInstance(locale);
+ collator.setStrength(Collator.IDENTICAL);
+ assertThreadSafe(new ICUCollationKeyAnalyzer(TEST_VERSION_CURRENT, collator));
+ }
+ }
}
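The range bounds in this test change from index-encoded Strings to raw collation-key bytes wrapped in BytesRef. A minimal sketch of the pattern the hunk adopts (the literal text is a placeholder for illustration):

    Collator collator = Collator.getInstance(new Locale("fa"));
    BytesRef bound = new BytesRef(collator.getCollationKey("some term").toByteArray());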
Modified: lucene/dev/branches/realtime_search/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java (original)
+++ lucene/dev/branches/realtime_search/modules/analysis/icu/src/test/org/apache/lucene/collation/TestICUCollationKeyFilter.java Wed Mar 30 09:17:25 2011
@@ -22,24 +22,26 @@ import com.ibm.icu.text.Collator;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.util.BytesRef;
import java.io.Reader;
import java.util.Locale;
-
+/** @deprecated remove this when ICUCollationKeyFilter is removed */
+@Deprecated
public class TestICUCollationKeyFilter extends CollationTestBase {
private Collator collator = Collator.getInstance(new Locale("fa"));
private Analyzer analyzer = new TestAnalyzer(collator);
- private String firstRangeBeginning = encodeCollationKey
- (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray());
- private String firstRangeEnd = encodeCollationKey
- (collator.getCollationKey(firstRangeEndOriginal).toByteArray());
- private String secondRangeBeginning = encodeCollationKey
- (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray());
- private String secondRangeEnd = encodeCollationKey
- (collator.getCollationKey(secondRangeEndOriginal).toByteArray());
+ private BytesRef firstRangeBeginning = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray()));
+ private BytesRef firstRangeEnd = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(firstRangeEndOriginal).toByteArray()));
+ private BytesRef secondRangeBeginning = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray()));
+ private BytesRef secondRangeEnd = new BytesRef(encodeCollationKey
+ (collator.getCollationKey(secondRangeEndOriginal).toByteArray()));
public final class TestAnalyzer extends Analyzer {
Modified: lucene/dev/branches/realtime_search/modules/benchmark/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/benchmark/CHANGES.txt?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/benchmark/CHANGES.txt (original)
+++ lucene/dev/branches/realtime_search/modules/benchmark/CHANGES.txt Wed Mar 30 09:17:25 2011
@@ -2,12 +2,54 @@ Lucene Benchmark Contrib Change Log
The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
+03/24/2011
+ LUCENE-2977: WriteLineDocTask now automatically detects how to write -
+ GZip, BZip2, or plain text - according to the output file's extension.
+ The bzip.compression property of WriteLineDocTask was removed. (Doron Cohen)
+
+03/23/2011
+ LUCENE-2980: Benchmark's ContentSource no longer requires lower-case file suffixes
+ for detecting the file type (gzip/bzip2/text). As part of this fix, an issue with
+ gzip input streams remaining open was worked around (see COMPRESS-127).
+ (Doron Cohen)
+
+03/22/2011
+ LUCENE-2978: Upgrade benchmark's commons-compress from 1.0 to 1.1 as
+ the move of gzip decompression in LUCENE-1540 from Java's GZipInputStream
+ to commons-compress 1.0 made it 15 times slower. In 1.1 no such slow-down
+ is observed. (Doron Cohen)
+
+03/21/2011
+ LUCENE-2958: WriteLineDocTask improvements - allow emitting line docs even for empty
+ docs, and be flexible about which fields are added to the line file. For this, a header
+ line was added to the line file; that header is examined by LineDocSource. Old line
+ files which have no header line are handled as before, imposing the default header.
+ (Doron Cohen, Shai Erera, Mike McCandless)
+
+03/21/2011
+ LUCENE-2964: Allow benchmark tasks from alternative packages,
+ specified through a new property "alt.tasks.packages".
+ (Doron Cohen, Shai Erera)
+
+03/20/2011
+ LUCENE-2963: Easier way to run a benchmark, by calling Benchmark.exec(alg-file).
+ (Doron Cohen)
+
+03/10/2011
+ LUCENE-2961: Removed lib/xml-apis.jar, since JVM 1.5+ already contains the
+ JAXP 1.3 interface classes it provides.
+
02/05/2011
LUCENE-1540: Improvements to contrib.benchmark for TREC collections.
ContentSource can now process plain text files, gzip files, and bzip2 files.
TREC doc parsing now handles the TREC gov2 collection and TREC disks 4&5-CR
collection (both used by many TREC tasks). (Shai Erera, Doron Cohen)
-
+
+01/31/2011
+ LUCENE-1591: Roll back to xerces-2.9.1-patched-XERCESJ-1257.jar to work around
+ XERCESJ-1257, which we hit on current Wikipedia XML export
+ (ENWIKI-20110115-pages-articles.xml) with xerces-2.10.0.jar. (Mike McCandless)
+
01/26/2011
LUCENE-929: ExtractReuters first extracts to a tmp dir and then renames. That
way, if a previous extract attempt failed, "ant extract-reuters" will still
@@ -33,7 +75,7 @@ The Benchmark contrib package contains c
4/27/2010: WriteLineDocTask now supports multi-threading. Also,
StringBufferReader was renamed to StringBuilderReader and works on
- StringBuilder now. In addition, LongToEnglishCountentSource starts from 0
+ StringBuilder now. In addition, LongToEnglishContentSource starts from 0
(instead of Long.MIN_VAL+10) and wraps around to MIN_VAL (if you ever hit
Long.MAX_VAL). (Shai Erera)
Modified: lucene/dev/branches/realtime_search/modules/benchmark/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/benchmark/build.xml?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/benchmark/build.xml (original)
+++ lucene/dev/branches/realtime_search/modules/benchmark/build.xml Wed Mar 30 09:17:25 2011
@@ -265,5 +265,8 @@
/>
</target>
- <target name="dist-maven" depends="jar-core,javadocs,contrib-build.dist-maven"/>
+ <target name="dist-maven" depends="jar-core,javadocs,contrib-build.dist-maven">
+ <m2-deploy-with-pom-template pom.xml="lib/lucene-xercesImpl-pom.xml.template"
+ jar.file="lib/xercesImpl-2.9.1-patched-XERCESJ-1257.jar" />
+ </target>
</project>
Modified: lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java (original)
+++ lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java Wed Mar 30 09:17:25 2011
@@ -64,6 +64,9 @@ public class Benchmark {
}
}
+ /**
+ * Execute this benchmark
+ */
public synchronized void execute() throws Exception {
if (executed) {
throw new IllegalStateException("Benchmark was already executed");
@@ -78,6 +81,14 @@ public class Benchmark {
* @param args benchmark config and algorithm files
*/
public static void main(String[] args) {
+ exec(args);
+ }
+
+ /**
+ * Utility: execute benchmark from command line
+ * @param args single argument is expected: algorithm-file
+ */
+ public static void exec(String[] args) {
// verify command line args
if (args.length < 1) {
System.err.println("Usage: java Benchmark <algorithm file>");
@@ -115,7 +126,6 @@ public class Benchmark {
System.out.println("####################");
System.out.println("### D O N E !!! ###");
System.out.println("####################");
-
}
/**
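The new static exec(String[]) added above lets a benchmark be driven programmatically as well as from the command line; a minimal usage sketch (the .alg path is a hypothetical example):

    // runs the algorithm file exactly as "java Benchmark <algorithm file>" would
    Benchmark.exec(new String[] { "conf/my-benchmark.alg" });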
Modified: lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java (original)
+++ lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ContentSource.java Wed Mar 30 09:17:25 2011
@@ -17,18 +17,11 @@ package org.apache.lucene.benchmark.byTa
* limitations under the License.
*/
-import java.io.BufferedInputStream;
import java.io.File;
-import java.io.FileInputStream;
import java.io.IOException;
-import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Map;
-import org.apache.commons.compress.compressors.CompressorException;
-import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.benchmark.byTask.utils.Config;
/**
@@ -55,19 +48,6 @@ import org.apache.lucene.benchmark.byTas
*/
public abstract class ContentSource {
- private static final int BZIP = 0;
- private static final int GZIP = 1;
- private static final int OTHER = 2;
- private static final Map<String,Integer> extensionToType = new HashMap<String,Integer>();
- static {
- extensionToType.put(".bz2", Integer.valueOf(BZIP));
- extensionToType.put(".bzip", Integer.valueOf(BZIP));
- extensionToType.put(".gz", Integer.valueOf(GZIP));
- extensionToType.put(".gzip", Integer.valueOf(GZIP));
- }
-
- protected static final int BUFFER_SIZE = 1 << 16; // 64K
-
private long bytesCount;
private long totalBytesCount;
private int docsCount;
@@ -79,8 +59,6 @@ public abstract class ContentSource {
protected boolean verbose;
protected String encoding;
- private CompressorStreamFactory csFactory = new CompressorStreamFactory();
-
/** update count of bytes generated by this source */
protected final synchronized void addBytes(long numBytes) {
bytesCount += numBytes;
@@ -115,51 +93,7 @@ public abstract class ContentSource {
}
}
- /**
- * Returns an {@link InputStream} over the requested file. This method
- * attempts to identify the appropriate {@link InputStream} instance to return
- * based on the file name (e.g., if it ends with .bz2 or .bzip, return a
- * 'bzip' {@link InputStream}).
- */
- protected InputStream getInputStream(File file) throws IOException {
- // First, create a FileInputStream, as this will be required by all types.
- // Wrap with BufferedInputStream for better performance
- InputStream is = new BufferedInputStream(new FileInputStream(file), BUFFER_SIZE);
-
- String fileName = file.getName();
- int idx = fileName.lastIndexOf('.');
- int type = OTHER;
- if (idx != -1) {
- Integer typeInt = extensionToType.get(fileName.substring(idx));
- if (typeInt != null) {
- type = typeInt.intValue();
- }
- }
-
- try {
- switch (type) {
- case BZIP:
- // According to BZip2CompressorInputStream's code, it reads the first
- // two file header chars ('B' and 'Z'). It is important to wrap the
- // underlying input stream with a buffered one since
- // Bzip2CompressorInputStream uses the read() method exclusively.
- is = csFactory.createCompressorInputStream("bzip2", is);
- break;
- case GZIP:
- is = csFactory.createCompressorInputStream("gz", is);
- break;
- default: // Do nothing, stay with FileInputStream
- }
- } catch (CompressorException e) {
- IOException ioe = new IOException(e.getMessage());
- ioe.initCause(e);
- throw ioe;
- }
-
- return is;
- }
-
- /**
+ /**
* Returns true whether it's time to log a message (depending on verbose and
* the number of documents generated).
*/
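The extension-sniffing logic removed here is centralized behind StreamUtils.inputStream(file), which EnwikiContentSource switches to below. StreamUtils itself is not part of this hunk; the sketch below only illustrates the behaviour being centralized - suffix-based selection of a plain, gzip, or bzip2 stream, case-insensitively per LUCENE-2980 above - and is not the actual StreamUtils source (exception handling omitted):

    // Illustrative only - mirrors the removed getInputStream(File), not the real StreamUtils.
    InputStream in = new BufferedInputStream(new FileInputStream(file), 1 << 16);
    String name = file.getName().toLowerCase(Locale.ENGLISH);
    if (name.endsWith(".bz2") || name.endsWith(".bzip")) {
      in = new CompressorStreamFactory().createCompressorInputStream("bzip2", in);
    } else if (name.endsWith(".gz") || name.endsWith(".gzip")) {
      in = new CompressorStreamFactory().createCompressorInputStream("gz", in);
    } // otherwise plain text: use the buffered FileInputStream as-is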
Modified: lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java?rev=1086876&r1=1086875&r2=1086876&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java (original)
+++ lucene/dev/branches/realtime_search/modules/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java Wed Mar 30 09:17:25 2011
@@ -24,6 +24,7 @@ import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
import org.apache.lucene.util.ThreadInterruptedException;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
@@ -189,7 +190,7 @@ public class EnwikiContentSource extends
return;
} else if (localFileIS == is) {
// If file is not already re-opened then re-open it now
- is = getInputStream(file);
+ is = StreamUtils.inputStream(file);
}
}
}
@@ -290,7 +291,7 @@ public class EnwikiContentSource extends
@Override
public void resetInputs() throws IOException {
super.resetInputs();
- is = getInputStream(file);
+ is = StreamUtils.inputStream(file);
}
@Override