You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2012/01/23 19:34:08 UTC
svn commit: r1234932 [2/5] - in /lucene/dev/branches/solrcloud: ./
dev-tools/idea/lucene/contrib/ dev-tools/maven/ dev-tools/maven/solr/
dev-tools/maven/solr/contrib/analysis-extras/
dev-tools/maven/solr/contrib/clustering/ dev-tools/maven/solr/contrib...
Modified: lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java?rev=1234932&r1=1234931&r2=1234932&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java (original)
+++ lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java Mon Jan 23 18:34:04 2012
@@ -695,12 +695,12 @@ public class TestIndexWriterExceptions e
MockDirectoryWrapper dir = newDirectory();
{
- final IndexWriter writer = new IndexWriter(
- dir,
- newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer).
- setMaxBufferedDocs(-1).
- setMergePolicy(newLogMergePolicy(10))
- );
+ final IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
+ TEST_VERSION_CURRENT, analyzer).setMaxBufferedDocs(-1)
+ .setMergePolicy(
+ random.nextBoolean() ? NoMergePolicy.COMPOUND_FILES
+ : NoMergePolicy.NO_COMPOUND_FILES));
+ // don't use a real merge policy here: merge policies depend on the DWPThreadPool and its max thread states etc.
final int finalI = i;
Thread[] threads = new Thread[NUM_THREAD];
Modified: lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/util/TestSentinelIntSet.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/util/TestSentinelIntSet.java?rev=1234932&r1=1234931&r2=1234932&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/util/TestSentinelIntSet.java (original)
+++ lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/util/TestSentinelIntSet.java Mon Jan 23 18:34:04 2012
@@ -20,6 +20,8 @@ package org.apache.lucene.util;
import org.junit.Test;
+import java.util.HashSet;
+
/**
*
*
@@ -45,4 +47,32 @@ public class TestSentinelIntSet extends
assertEquals(20, set.size());
assertEquals(24, set.rehashCount);
}
+
+
+ @Test // Randomized cross-check: SentinelIntSet must agree with java.util.HashSet on membership and size.
+ public void testRandom() throws Exception {
+ for (int i=0; i<10000; i++) { // 10000 independent rounds, each with freshly built sets
+ int initSz = random.nextInt(20); // initial size handed to both sets: 0..19
+ int num = random.nextInt(30); // number of insert attempts this round: 0..29
+ int maxVal = (random.nextBoolean() ? random.nextInt(50) : random.nextInt(Integer.MAX_VALUE)) + 1; // exclusive bound for values: either small (1..50, so repeats are likely) or large
+
+ HashSet<Integer> a = new HashSet<Integer>(initSz); // reference implementation
+ SentinelIntSet b = new SentinelIntSet(initSz, -1); // set under test; -1 is presumably the "empty slot" marker (confirm against SentinelIntSet) -- values below are never negative
+
+ for (int j=0; j<num; j++) {
+ int val = random.nextInt(maxVal); // value in [0, maxVal)
+ boolean exists = !a.add(val); // HashSet.add returns false when val was already present
+ boolean existsB = b.exists(val); // queried before val is put into b
+ assertEquals(exists, existsB);
+ int slot = b.find(val);
+ assertEquals(exists, slot>=0); // test expects a non-negative slot exactly when val is already present
+ b.put(val);
+
+ assertEquals(a.size(), b.size()); // both sets must hold the same number of distinct values
+ }
+
+ }
+
+ }
+
}
Modified: lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java?rev=1234932&r1=1234931&r2=1234932&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java (original)
+++ lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java Mon Jan 23 18:34:04 2012
@@ -161,7 +161,7 @@ public class TestFSTs extends LuceneTest
for(IntsRef term : terms2) {
pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
}
- FST<Object> fst = new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
+ FST<Object> fst = new FSTTester<Object>(random, dir, inputMode, pairs, outputs, false).doTest(0, 0, false);
assertNotNull(fst);
assertEquals(22, fst.getNodeCount());
assertEquals(27, fst.getArcCount());
@@ -174,7 +174,7 @@ public class TestFSTs extends LuceneTest
for(int idx=0;idx<terms2.length;idx++) {
pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], outputs.get(idx)));
}
- final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
+ final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs, true).doTest(0, 0, false);
assertNotNull(fst);
assertEquals(22, fst.getNodeCount());
assertEquals(27, fst.getArcCount());
@@ -189,7 +189,7 @@ public class TestFSTs extends LuceneTest
final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
pairs.add(new FSTTester.InputOutput<BytesRef>(terms2[idx], output));
}
- final FST<BytesRef> fst = new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
+ final FST<BytesRef> fst = new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs, false).doTest(0, 0, false);
assertNotNull(fst);
assertEquals(24, fst.getNodeCount());
assertEquals(30, fst.getArcCount());
@@ -222,7 +222,7 @@ public class TestFSTs extends LuceneTest
for(IntsRef term : terms) {
pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
}
- new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest();
+ new FSTTester<Object>(random, dir, inputMode, pairs, outputs, false).doTest();
}
// PositiveIntOutput (ord)
@@ -232,12 +232,13 @@ public class TestFSTs extends LuceneTest
for(int idx=0;idx<terms.length;idx++) {
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(idx)));
}
- new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
+ new FSTTester<Long>(random, dir, inputMode, pairs, outputs, true).doTest();
}
// PositiveIntOutput (random monotonically increasing positive number)
{
- final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
+ final boolean doShare = random.nextBoolean();
+ final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(doShare);
final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
long lastOutput = 0;
for(int idx=0;idx<terms.length;idx++) {
@@ -245,7 +246,7 @@ public class TestFSTs extends LuceneTest
lastOutput = value;
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(value)));
}
- new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
+ new FSTTester<Long>(random, dir, inputMode, pairs, outputs, doShare).doTest();
}
// PositiveIntOutput (random positive number)
@@ -255,7 +256,7 @@ public class TestFSTs extends LuceneTest
for(int idx=0;idx<terms.length;idx++) {
pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(random.nextLong()) & Long.MAX_VALUE));
}
- new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
+ new FSTTester<Long>(random, dir, inputMode, pairs, outputs, false).doTest();
}
// Pair<ord, (random monotonically increasing positive number>
@@ -272,7 +273,7 @@ public class TestFSTs extends LuceneTest
outputs.get(o1.get(idx),
o2.get(value))));
}
- new FSTTester<PairOutputs.Pair<Long,Long>>(random, dir, inputMode, pairs, outputs).doTest();
+ new FSTTester<PairOutputs.Pair<Long,Long>>(random, dir, inputMode, pairs, outputs, false).doTest();
}
// Sequence-of-bytes
@@ -284,7 +285,7 @@ public class TestFSTs extends LuceneTest
final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
pairs.add(new FSTTester.InputOutput<BytesRef>(terms[idx], output));
}
- new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest();
+ new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs, false).doTest();
}
// Sequence-of-ints
@@ -300,7 +301,7 @@ public class TestFSTs extends LuceneTest
}
pairs.add(new FSTTester.InputOutput<IntsRef>(terms[idx], output));
}
- new FSTTester<IntsRef>(random, dir, inputMode, pairs, outputs).doTest();
+ new FSTTester<IntsRef>(random, dir, inputMode, pairs, outputs, false).doTest();
}
// Up to two positive ints, shared, generally but not
@@ -330,7 +331,7 @@ public class TestFSTs extends LuceneTest
}
pairs.add(new FSTTester.InputOutput<Object>(terms[idx], output));
}
- new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest();
+ new FSTTester<Object>(random, dir, inputMode, pairs, outputs, false).doTest();
}
}
@@ -341,13 +342,15 @@ public class TestFSTs extends LuceneTest
final int inputMode;
final Outputs<T> outputs;
final Directory dir;
+ final boolean doReverseLookup;
- public FSTTester(Random random, Directory dir, int inputMode, List<InputOutput<T>> pairs, Outputs<T> outputs) {
+ public FSTTester(Random random, Directory dir, int inputMode, List<InputOutput<T>> pairs, Outputs<T> outputs, boolean doReverseLookup) {
this.random = random;
this.dir = dir;
this.inputMode = inputMode;
this.pairs = pairs;
this.outputs = outputs;
+ this.doReverseLookup = doReverseLookup;
}
private static class InputOutput<T> implements Comparable<InputOutput<T>> {
@@ -525,6 +528,26 @@ public class TestFSTs extends LuceneTest
// FST is complete
private void verifyUnPruned(int inputMode, FST<T> fst) throws IOException {
+ final FST<Long> fstLong;
+ final Set<Long> validOutputs;
+ long minLong = Long.MAX_VALUE;
+ long maxLong = Long.MIN_VALUE;
+
+ if (doReverseLookup) {
+ @SuppressWarnings("unchecked") FST<Long> fstLong0 = (FST<Long>) fst;
+ fstLong = fstLong0;
+ validOutputs = new HashSet<Long>();
+ for(InputOutput<T> pair: pairs) {
+ Long output = (Long) pair.output;
+ maxLong = Math.max(maxLong, output);
+ minLong = Math.min(minLong, output);
+ validOutputs.add(output);
+ }
+ } else {
+ fstLong = null;
+ validOutputs = null;
+ }
+
if (pairs.size() == 0) {
assertNull(fst);
return;
@@ -542,7 +565,7 @@ public class TestFSTs extends LuceneTest
assertNotNull(fst);
- // visit valid paris in order -- make sure all words
+ // visit valid pairs in order -- make sure all words
// are accepted, and FSTEnum's next() steps through
// them correctly
if (VERBOSE) {
@@ -556,7 +579,6 @@ public class TestFSTs extends LuceneTest
System.out.println("TEST: check term=" + inputToString(inputMode, term) + " output=" + fst.outputs.outputToString(pair.output));
}
Object output = run(fst, term, null);
-
assertNotNull("term " + inputToString(inputMode, term) + " is not accepted", output);
assertEquals(pair.output, output);
@@ -574,6 +596,20 @@ public class TestFSTs extends LuceneTest
termsMap.put(pair.input, pair.output);
}
+ if (doReverseLookup && maxLong > minLong) {
+ // Do random lookups so we test null (output doesn't
+ // exist) case:
+ assertNull(Util.getByOutput(fstLong, minLong-7));
+ assertNull(Util.getByOutput(fstLong, maxLong+7));
+
+ final int num = atLeast(100);
+ for(int iter=0;iter<num;iter++) {
+ Long v = minLong + random.nextLong() % (maxLong - minLong);
+ IntsRef input = Util.getByOutput(fstLong, v);
+ assertTrue(validOutputs.contains(v) || input == null);
+ }
+ }
+
// find random matching word and make sure it's valid
if (VERBOSE) {
System.out.println("TEST: verify random accepted terms");
@@ -584,6 +620,14 @@ public class TestFSTs extends LuceneTest
T output = randomAcceptedWord(fst, scratch);
assertTrue("accepted word " + inputToString(inputMode, scratch) + " is not valid", termsMap.containsKey(scratch));
assertEquals(termsMap.get(scratch), output);
+
+ if (doReverseLookup) {
+ //System.out.println("lookup output=" + output + " outs=" + fst.outputs);
+ IntsRef input = Util.getByOutput(fstLong, (Long) output);
+ assertNotNull(input);
+ //System.out.println(" got " + Util.toBytesRef(input, new BytesRef()).utf8ToString());
+ assertEquals(scratch, input);
+ }
}
// test IntsRefFSTEnum.seek:
@@ -887,7 +931,7 @@ public class TestFSTs extends LuceneTest
if (VERBOSE) {
System.out.println("TEST: after prune");
for(Map.Entry<IntsRef,CountMinOutput<T>> ent : prefixes.entrySet()) {
- System.out.println(" " + inputToString(inputMode, ent.getKey()) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal);
+ System.out.println(" " + inputToString(inputMode, ent.getKey(), false) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal);
if (ent.getValue().isFinal) {
System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput));
}
@@ -951,7 +995,7 @@ public class TestFSTs extends LuceneTest
//testRandomWords(20, 100);
}
- private String inputModeToString(int mode) {
+ String inputModeToString(int mode) {
if (mode == 0) {
return "utf8";
} else {
@@ -995,7 +1039,7 @@ public class TestFSTs extends LuceneTest
testRandomWords(_TestUtil.nextInt(random, 50000, 60000), 1);
}
- private static String inputToString(int inputMode, IntsRef term) {
+ static String inputToString(int inputMode, IntsRef term) {
return inputToString(inputMode, term, true);
}
@@ -1011,6 +1055,50 @@ public class TestFSTs extends LuceneTest
}
}
+ // NOTE: this test shows a case where our current builder
+ // fails to produce minimal FST:
+ /*
+ public void test3() throws Exception {
+ final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
+ Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
+ IntsRef scratchIntsRef = new IntsRef();
+ builder.add(Util.toIntsRef(new BytesRef("aa$"), scratchIntsRef), outputs.get(0));
+ builder.add(Util.toIntsRef(new BytesRef("aab$"), scratchIntsRef), 1L);
+ builder.add(Util.toIntsRef(new BytesRef("bbb$"), scratchIntsRef), 2L);
+ final FST<Long> fst = builder.finish();
+ //System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount());
+ // NOTE: we produce 7 nodes today
+ assertEquals(6, fst.getNodeCount());
+ // NOTE: we produce 8 arcs today
+ assertEquals(7, fst.getArcCount());
+ //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
+ //Util.toDot(fst, w, false, false);
+ //w.close();
+ }
+ */
+
+ // NOTE: this test shows a case where our current builder
+ // fails to produce minimal FST:
+ /*
+ public void test4() throws Exception {
+ final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
+ Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs);
+ IntsRef scratchIntsRef = new IntsRef();
+ builder.add(Util.toIntsRef(new BytesRef("aa$"), scratchIntsRef), outputs.getNoOutput());
+ builder.add(Util.toIntsRef(new BytesRef("aab$"), scratchIntsRef), new BytesRef("1"));
+ builder.add(Util.toIntsRef(new BytesRef("bbb$"), scratchIntsRef), new BytesRef("11"));
+ final FST<BytesRef> fst = builder.finish();
+ //System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount());
+ // NOTE: we produce 7 nodes today
+ assertEquals(6, fst.getNodeCount());
+ // NOTE: we produce 8 arcs today
+ assertEquals(7, fst.getArcCount());
+ //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
+ //Util.toDot(fst, w, false, false);
+ //w.close();
+ }
+ */
+
// Build FST for all unique terms in the test line docs
// file, up until a time limit
public void testRealTerms() throws Exception {
@@ -1422,6 +1510,14 @@ public class TestFSTs extends LuceneTest
assertNotNull(seekResult);
assertEquals(b, seekResult.input);
assertEquals(42, (long) seekResult.output);
+
+ assertEquals(Util.toIntsRef(new BytesRef("c"), new IntsRef()),
+ Util.getByOutput(fst, 13824324872317238L));
+ assertNull(Util.getByOutput(fst, 47));
+ assertEquals(Util.toIntsRef(new BytesRef("b"), new IntsRef()),
+ Util.getByOutput(fst, 42));
+ assertEquals(Util.toIntsRef(new BytesRef("a"), new IntsRef()),
+ Util.getByOutput(fst, 17));
}
public void testPrimaryKeys() throws Exception {
Modified: lucene/dev/branches/solrcloud/modules/analysis/common/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/modules/analysis/common/build.xml?rev=1234932&r1=1234931&r2=1234932&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/modules/analysis/common/build.xml (original)
+++ lucene/dev/branches/solrcloud/modules/analysis/common/build.xml Mon Jan 23 18:34:04 2012
@@ -31,14 +31,38 @@
<target name="compile-core" depends="jflex-notice, common.compile-core"/>
<target name="jflex" depends="jflex-check,clean-jflex,gen-uax29-supp-macros,
- jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,jflex-wiki-tokenizer"/>
+ jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,
+ jflex-wiki-tokenizer,jflex-HTMLStripCharFilter"/>
<target name="gen-uax29-supp-macros">
<subant target="gen-uax29-supp-macros">
<fileset dir="../icu" includes="build.xml"/>
</subant>
</target>
-
+
+ <target name="jflex-HTMLStripCharFilter"
+ depends="init,jflex-check,generate-jflex-html-char-entities"
+ if="jflex.present">
+ <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
+ <classpath refid="jflex.classpath"/>
+ </taskdef>
+ <jflex file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex"
+ outdir="src/java/org/apache/lucene/analysis/charfilter"
+ nobak="on"/>
+ <!-- Remove the inappropriate JFlex-generated constructors -->
+ <replaceregexp file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java"
+ match="/\*\*\s*\*\s*Creates a new scanner.*this\(new java\.io\.InputStreamReader\(in\)\);\s*\}"
+ replace="" flags="sg"/>
+ </target>
+
+ <target name="generate-jflex-html-char-entities">
+ <exec dir="src/java/org/apache/lucene/analysis/charfilter"
+ output="src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex"
+ executable="${python.exe}" failonerror="true" logerror="true">
+ <arg value="htmlentity.py"/>
+ </exec>
+ </target>
+
<target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>
Modified: lucene/dev/branches/solrcloud/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java?rev=1234932&r1=1234931&r2=1234932&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java (original)
+++ lucene/dev/branches/solrcloud/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java Mon Jan 23 18:34:04 2012
@@ -20,6 +20,8 @@ package org.apache.lucene.analysis.charf
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.util.ArrayUtil;
+import java.util.Arrays;
+
/**
* Base utility class for implementing a {@link CharFilter}.
* You subclass this, and then record mappings by calling
@@ -71,6 +73,19 @@ public abstract class BaseCharFilter ext
0 : diffs[size-1];
}
+ /**
+ * <p>
+ * Adds an offset correction mapping at the given output stream offset.
+ * </p>
+ * <p>
+ * Assumption: the offset given with each successive call to this method
+ * will not be smaller than the offset given at the previous invocation.
+ * </p>
+ *
+ * @param off The output stream offset at which to apply the correction
+ * @param cumulativeDiff The input offset is given by adding this
+ * to the output offset
+ */
protected void addOffCorrectMap(int off, int cumulativeDiff) {
if (offsets == null) {
offsets = new int[64];
@@ -80,7 +95,15 @@ public abstract class BaseCharFilter ext
diffs = ArrayUtil.grow(diffs);
}
- offsets[size] = off;
- diffs[size++] = cumulativeDiff;
+ assert (size == 0 || off >= offsets[size - 1]) // compare with the last recorded offset; slot [size] has not been written yet
+ : "Offset #" + size + "(" + off + ") is less than the last recorded offset "
+ + offsets[size - 1] + "\n" + Arrays.toString(offsets) + "\n" + Arrays.toString(diffs);
+
+ if (size == 0 || off != offsets[size - 1]) { // new offset: append a fresh (offset, diff) mapping
+ offsets[size] = off;
+ diffs[size++] = cumulativeDiff;
+ } else { // Overwrite the diff at the last recorded offset
+ diffs[size - 1] = cumulativeDiff;
+ }
}
}