You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2012/01/23 19:34:08 UTC

svn commit: r1234932 [2/5] - in /lucene/dev/branches/solrcloud: ./ dev-tools/idea/lucene/contrib/ dev-tools/maven/ dev-tools/maven/solr/ dev-tools/maven/solr/contrib/analysis-extras/ dev-tools/maven/solr/contrib/clustering/ dev-tools/maven/solr/contrib...

Modified: lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java?rev=1234932&r1=1234931&r2=1234932&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java (original)
+++ lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/index/TestIndexWriterExceptions.java Mon Jan 23 18:34:04 2012
@@ -695,12 +695,12 @@ public class TestIndexWriterExceptions e
       MockDirectoryWrapper dir = newDirectory();
 
       {
-        final  IndexWriter writer = new IndexWriter(
-            dir,
-            newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer).
-                setMaxBufferedDocs(-1).
-                setMergePolicy(newLogMergePolicy(10))
-        );
+        final IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
+            TEST_VERSION_CURRENT, analyzer).setMaxBufferedDocs(-1)
+            .setMergePolicy(
+                random.nextBoolean() ? NoMergePolicy.COMPOUND_FILES
+                    : NoMergePolicy.NO_COMPOUND_FILES));
+        // Don't use a merge policy here: merge policies depend on the DWPThreadPool and its max thread states, etc.
         final int finalI = i;
 
         Thread[] threads = new Thread[NUM_THREAD];

Modified: lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/util/TestSentinelIntSet.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/util/TestSentinelIntSet.java?rev=1234932&r1=1234931&r2=1234932&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/util/TestSentinelIntSet.java (original)
+++ lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/util/TestSentinelIntSet.java Mon Jan 23 18:34:04 2012
@@ -20,6 +20,8 @@ package org.apache.lucene.util;
 
 import org.junit.Test;
 
+import java.util.HashSet;
+
 /**
  *
  *
@@ -45,4 +47,32 @@ public class TestSentinelIntSet extends 
     assertEquals(20, set.size());
     assertEquals(24, set.rehashCount);
   }
+  
+
+  @Test
+  public void testRandom() throws Exception {
+    for (int i=0; i<10000; i++) {
+      int initSz = random.nextInt(20);
+      int num = random.nextInt(30);
+      int maxVal = (random.nextBoolean() ? random.nextInt(50) : random.nextInt(Integer.MAX_VALUE)) + 1;
+
+      HashSet<Integer> a = new HashSet<Integer>(initSz);
+      SentinelIntSet b = new SentinelIntSet(initSz, -1);
+      
+      for (int j=0; j<num; j++) {
+        int val = random.nextInt(maxVal);
+        boolean exists = !a.add(val);
+        boolean existsB = b.exists(val);
+        assertEquals(exists, existsB);
+        int slot = b.find(val);
+        assertEquals(exists, slot>=0);
+        b.put(val);
+        
+        assertEquals(a.size(), b.size());
+      }
+      
+    }
+
+  }
+  
 }

Modified: lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java?rev=1234932&r1=1234931&r2=1234932&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java (original)
+++ lucene/dev/branches/solrcloud/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java Mon Jan 23 18:34:04 2012
@@ -161,7 +161,7 @@ public class TestFSTs extends LuceneTest
         for(IntsRef term : terms2) {
           pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
         }
-        FST<Object> fst = new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
+        FST<Object> fst = new FSTTester<Object>(random, dir, inputMode, pairs, outputs, false).doTest(0, 0, false);
         assertNotNull(fst);
         assertEquals(22, fst.getNodeCount());
         assertEquals(27, fst.getArcCount());
@@ -174,7 +174,7 @@ public class TestFSTs extends LuceneTest
         for(int idx=0;idx<terms2.length;idx++) {
           pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], outputs.get(idx)));
         }
-        final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
+        final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs, true).doTest(0, 0, false);
         assertNotNull(fst);
         assertEquals(22, fst.getNodeCount());
         assertEquals(27, fst.getArcCount());
@@ -189,7 +189,7 @@ public class TestFSTs extends LuceneTest
           final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
           pairs.add(new FSTTester.InputOutput<BytesRef>(terms2[idx], output));
         }
-        final FST<BytesRef> fst = new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
+        final FST<BytesRef> fst = new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs, false).doTest(0, 0, false);
         assertNotNull(fst);
         assertEquals(24, fst.getNodeCount());
         assertEquals(30, fst.getArcCount());
@@ -222,7 +222,7 @@ public class TestFSTs extends LuceneTest
       for(IntsRef term : terms) {
         pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
       }
-      new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest();
+      new FSTTester<Object>(random, dir, inputMode, pairs, outputs, false).doTest();
     }
 
     // PositiveIntOutput (ord)
@@ -232,12 +232,13 @@ public class TestFSTs extends LuceneTest
       for(int idx=0;idx<terms.length;idx++) {
         pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(idx)));
       }
-      new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
+      new FSTTester<Long>(random, dir, inputMode, pairs, outputs, true).doTest();
     }
 
     // PositiveIntOutput (random monotonically increasing positive number)
     {
-      final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
+      final boolean doShare = random.nextBoolean();
+      final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(doShare);
       final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
       long lastOutput = 0;
       for(int idx=0;idx<terms.length;idx++) {
@@ -245,7 +246,7 @@ public class TestFSTs extends LuceneTest
         lastOutput = value;
         pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(value)));
       }
-      new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
+      new FSTTester<Long>(random, dir, inputMode, pairs, outputs, doShare).doTest();
     }
 
     // PositiveIntOutput (random positive number)
@@ -255,7 +256,7 @@ public class TestFSTs extends LuceneTest
       for(int idx=0;idx<terms.length;idx++) {
         pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(random.nextLong()) & Long.MAX_VALUE));
       }
-      new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
+      new FSTTester<Long>(random, dir, inputMode, pairs, outputs, false).doTest();
     }
 
     // Pair<ord, (random monotonically increasing positive number)>
@@ -272,7 +273,7 @@ public class TestFSTs extends LuceneTest
                                                                          outputs.get(o1.get(idx),
                                                                                      o2.get(value))));
       }
-      new FSTTester<PairOutputs.Pair<Long,Long>>(random, dir, inputMode, pairs, outputs).doTest();
+      new FSTTester<PairOutputs.Pair<Long,Long>>(random, dir, inputMode, pairs, outputs, false).doTest();
     }
 
     // Sequence-of-bytes
@@ -284,7 +285,7 @@ public class TestFSTs extends LuceneTest
         final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
         pairs.add(new FSTTester.InputOutput<BytesRef>(terms[idx], output));
       }
-      new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest();
+      new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs, false).doTest();
     }
 
     // Sequence-of-ints
@@ -300,7 +301,7 @@ public class TestFSTs extends LuceneTest
         }
         pairs.add(new FSTTester.InputOutput<IntsRef>(terms[idx], output));
       }
-      new FSTTester<IntsRef>(random, dir, inputMode, pairs, outputs).doTest();
+      new FSTTester<IntsRef>(random, dir, inputMode, pairs, outputs, false).doTest();
     }
 
     // Up to two positive ints, shared, generally but not
@@ -330,7 +331,7 @@ public class TestFSTs extends LuceneTest
         }
         pairs.add(new FSTTester.InputOutput<Object>(terms[idx], output));
       }
-      new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest();
+      new FSTTester<Object>(random, dir, inputMode, pairs, outputs, false).doTest();
     }
   }
 
@@ -341,13 +342,15 @@ public class TestFSTs extends LuceneTest
     final int inputMode;
     final Outputs<T> outputs;
     final Directory dir;
+    final boolean doReverseLookup;
 
-    public FSTTester(Random random, Directory dir, int inputMode, List<InputOutput<T>> pairs, Outputs<T> outputs) {
+    public FSTTester(Random random, Directory dir, int inputMode, List<InputOutput<T>> pairs, Outputs<T> outputs, boolean doReverseLookup) {
       this.random = random;
       this.dir = dir;
       this.inputMode = inputMode;
       this.pairs = pairs;
       this.outputs = outputs;
+      this.doReverseLookup = doReverseLookup;
     }
 
     private static class InputOutput<T> implements Comparable<InputOutput<T>> {
@@ -525,6 +528,26 @@ public class TestFSTs extends LuceneTest
     // FST is complete
     private void verifyUnPruned(int inputMode, FST<T> fst) throws IOException {
 
+      final FST<Long> fstLong;
+      final Set<Long> validOutputs;
+      long minLong = Long.MAX_VALUE;
+      long maxLong = Long.MIN_VALUE;
+
+      if (doReverseLookup) {
+        @SuppressWarnings("unchecked") FST<Long> fstLong0 = (FST<Long>) fst;
+        fstLong = fstLong0;
+        validOutputs = new HashSet<Long>();
+        for(InputOutput<T> pair: pairs) {
+          Long output = (Long) pair.output;
+          maxLong = Math.max(maxLong, output);
+          minLong = Math.min(minLong, output);
+          validOutputs.add(output);
+        }
+      } else {
+        fstLong = null;
+        validOutputs = null;
+      }
+
       if (pairs.size() == 0) {
         assertNull(fst);
         return;
@@ -542,7 +565,7 @@ public class TestFSTs extends LuceneTest
 
       assertNotNull(fst);
 
-      // visit valid paris in order -- make sure all words
+      // visit valid pairs in order -- make sure all words
       // are accepted, and FSTEnum's next() steps through
       // them correctly
       if (VERBOSE) {
@@ -556,7 +579,6 @@ public class TestFSTs extends LuceneTest
             System.out.println("TEST: check term=" + inputToString(inputMode, term) + " output=" + fst.outputs.outputToString(pair.output));
           }
           Object output = run(fst, term, null);
-
           assertNotNull("term " + inputToString(inputMode, term) + " is not accepted", output);
           assertEquals(pair.output, output);
 
@@ -574,6 +596,20 @@ public class TestFSTs extends LuceneTest
         termsMap.put(pair.input, pair.output);
       }
 
+      if (doReverseLookup && maxLong > minLong) {
+        // Do random lookups so we test null (output doesn't
+        // exist) case:
+        assertNull(Util.getByOutput(fstLong, minLong-7));
+        assertNull(Util.getByOutput(fstLong, maxLong+7));
+
+        final int num = atLeast(100);
+        for(int iter=0;iter<num;iter++) {
+          Long v = minLong + random.nextLong() % (maxLong - minLong);
+          IntsRef input = Util.getByOutput(fstLong, v);
+          assertTrue(validOutputs.contains(v) || input == null);
+        }
+      }
+
       // find random matching word and make sure it's valid
       if (VERBOSE) {
         System.out.println("TEST: verify random accepted terms");
@@ -584,6 +620,14 @@ public class TestFSTs extends LuceneTest
         T output = randomAcceptedWord(fst, scratch);
         assertTrue("accepted word " + inputToString(inputMode, scratch) + " is not valid", termsMap.containsKey(scratch));
         assertEquals(termsMap.get(scratch), output);
+
+        if (doReverseLookup) {
+          //System.out.println("lookup output=" + output + " outs=" + fst.outputs);
+          IntsRef input = Util.getByOutput(fstLong, (Long) output);
+          assertNotNull(input);
+          //System.out.println("  got " + Util.toBytesRef(input, new BytesRef()).utf8ToString());
+          assertEquals(scratch, input);
+        }
       }
     
       // test IntsRefFSTEnum.seek:
@@ -887,7 +931,7 @@ public class TestFSTs extends LuceneTest
       if (VERBOSE) {
         System.out.println("TEST: after prune");
         for(Map.Entry<IntsRef,CountMinOutput<T>> ent : prefixes.entrySet()) {
-          System.out.println("  " + inputToString(inputMode, ent.getKey()) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal);
+          System.out.println("  " + inputToString(inputMode, ent.getKey(), false) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal);
           if (ent.getValue().isFinal) {
             System.out.println("    finalOutput=" + outputs.outputToString(ent.getValue().finalOutput));
           }
@@ -951,7 +995,7 @@ public class TestFSTs extends LuceneTest
     //testRandomWords(20, 100);
   }
 
-  private String inputModeToString(int mode) {
+  String inputModeToString(int mode) {
     if (mode == 0) {
       return "utf8";
     } else {
@@ -995,7 +1039,7 @@ public class TestFSTs extends LuceneTest
     testRandomWords(_TestUtil.nextInt(random, 50000, 60000), 1);
   }
   
-  private static String inputToString(int inputMode, IntsRef term) {
+  static String inputToString(int inputMode, IntsRef term) {
     return inputToString(inputMode, term, true);
   }
 
@@ -1011,6 +1055,50 @@ public class TestFSTs extends LuceneTest
     }
   }
 
+  // NOTE: this test shows a case where our current builder
+  // fails to produce minimal FST:
+  /*
+  public void test3() throws Exception {
+    final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
+    Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
+    IntsRef scratchIntsRef = new IntsRef();
+    builder.add(Util.toIntsRef(new BytesRef("aa$"), scratchIntsRef), outputs.get(0));
+    builder.add(Util.toIntsRef(new BytesRef("aab$"), scratchIntsRef), 1L);
+    builder.add(Util.toIntsRef(new BytesRef("bbb$"), scratchIntsRef), 2L);
+    final FST<Long> fst = builder.finish();
+    //System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount());
+    // NOTE: we produce 7 nodes today
+    assertEquals(6, fst.getNodeCount());
+    // NOTE: we produce 8 arcs today
+    assertEquals(7, fst.getArcCount());
+    //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
+    //Util.toDot(fst, w, false, false);
+    //w.close();
+  }
+  */
+
+  // NOTE: this test shows a case where our current builder
+  // fails to produce minimal FST:
+  /*
+  public void test4() throws Exception {
+    final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
+    Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE1, outputs);
+    IntsRef scratchIntsRef = new IntsRef();
+    builder.add(Util.toIntsRef(new BytesRef("aa$"), scratchIntsRef), outputs.getNoOutput());
+    builder.add(Util.toIntsRef(new BytesRef("aab$"), scratchIntsRef), new BytesRef("1"));
+    builder.add(Util.toIntsRef(new BytesRef("bbb$"), scratchIntsRef), new BytesRef("11"));
+    final FST<BytesRef> fst = builder.finish();
+    //System.out.println("NODES " + fst.getNodeCount() + " ARCS " + fst.getArcCount());
+    // NOTE: we produce 7 nodes today
+    assertEquals(6, fst.getNodeCount());
+    // NOTE: we produce 8 arcs today
+    assertEquals(7, fst.getArcCount());
+    //Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
+    //Util.toDot(fst, w, false, false);
+    //w.close();
+  }
+  */
+
   // Build FST for all unique terms in the test line docs
   // file, up until a time limit
   public void testRealTerms() throws Exception {
@@ -1422,6 +1510,14 @@ public class TestFSTs extends LuceneTest
     assertNotNull(seekResult);
     assertEquals(b, seekResult.input);
     assertEquals(42, (long) seekResult.output);
+
+    assertEquals(Util.toIntsRef(new BytesRef("c"), new IntsRef()),
+                 Util.getByOutput(fst, 13824324872317238L));
+    assertNull(Util.getByOutput(fst, 47));
+    assertEquals(Util.toIntsRef(new BytesRef("b"), new IntsRef()),
+                 Util.getByOutput(fst, 42));
+    assertEquals(Util.toIntsRef(new BytesRef("a"), new IntsRef()),
+                 Util.getByOutput(fst, 17));
   }
 
   public void testPrimaryKeys() throws Exception {

Modified: lucene/dev/branches/solrcloud/modules/analysis/common/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/modules/analysis/common/build.xml?rev=1234932&r1=1234931&r2=1234932&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/modules/analysis/common/build.xml (original)
+++ lucene/dev/branches/solrcloud/modules/analysis/common/build.xml Mon Jan 23 18:34:04 2012
@@ -31,14 +31,38 @@
   <target name="compile-core" depends="jflex-notice, common.compile-core"/>
 
   <target name="jflex" depends="jflex-check,clean-jflex,gen-uax29-supp-macros,
-                                jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,jflex-wiki-tokenizer"/>
+                                jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,
+                                jflex-wiki-tokenizer,jflex-HTMLStripCharFilter"/>
 
   <target name="gen-uax29-supp-macros">
     <subant target="gen-uax29-supp-macros">
        <fileset dir="../icu" includes="build.xml"/>
     </subant>
   </target>
-  
+
+  <target name="jflex-HTMLStripCharFilter"
+          depends="init,jflex-check,generate-jflex-html-char-entities"
+          if="jflex.present">
+    <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
+      <classpath refid="jflex.classpath"/>
+    </taskdef>
+    <jflex file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex"
+           outdir="src/java/org/apache/lucene/analysis/charfilter"
+           nobak="on"/>
+    <!-- Remove the inappropriate JFlex-generated constructors -->
+    <replaceregexp file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java"
+                   match="/\*\*\s*\*\s*Creates a new scanner.*this\(new java\.io\.InputStreamReader\(in\)\);\s*\}"
+                   replace="" flags="sg"/>
+  </target>
+
+  <target name="generate-jflex-html-char-entities">
+    <exec dir="src/java/org/apache/lucene/analysis/charfilter"
+          output="src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex"
+          executable="${python.exe}" failonerror="true" logerror="true">
+      <arg value="htmlentity.py"/>
+    </exec>
+  </target>
+
   <target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
     <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
       <classpath refid="jflex.classpath"/>

Modified: lucene/dev/branches/solrcloud/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solrcloud/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java?rev=1234932&r1=1234931&r2=1234932&view=diff
==============================================================================
--- lucene/dev/branches/solrcloud/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java (original)
+++ lucene/dev/branches/solrcloud/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java Mon Jan 23 18:34:04 2012
@@ -20,6 +20,8 @@ package org.apache.lucene.analysis.charf
 import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.util.ArrayUtil;
 
+import java.util.Arrays;
+
 /**
  * Base utility class for implementing a {@link CharFilter}.
  * You subclass this, and then record mappings by calling
@@ -71,6 +73,19 @@ public abstract class BaseCharFilter ext
       0 : diffs[size-1];
   }
 
+  /**
+   * <p>
+   *   Adds an offset correction mapping at the given output stream offset.
+   * </p>
+   * <p>
+   *   Assumption: the offset given with each successive call to this method
+   *   will not be smaller than the offset given at the previous invocation.
+   * </p>
+   *
+   * @param off The output stream offset at which to apply the correction
+   * @param cumulativeDiff The input offset is given by adding this
+   *                       to the output offset
+   */
   protected void addOffCorrectMap(int off, int cumulativeDiff) {
     if (offsets == null) {
       offsets = new int[64];
@@ -80,7 +95,15 @@ public abstract class BaseCharFilter ext
       diffs = ArrayUtil.grow(diffs);
     }
     
-    offsets[size] = off;
-    diffs[size++] = cumulativeDiff; 
+    assert (size == 0 || off >= offsets[size - 1])
+        : "Offset #" + size + "(" + off + ") is less than the last recorded offset "
+          + offsets[size - 1] + "\n" + Arrays.toString(offsets) + "\n" + Arrays.toString(diffs);
+    
+    if (size == 0 || off != offsets[size - 1]) {
+      offsets[size] = off;
+      diffs[size++] = cumulativeDiff;
+    } else { // Overwrite the diff at the last recorded offset
+      diffs[size - 1] = cumulativeDiff;
+    }
   }
 }