You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/05/23 18:50:04 UTC

svn commit: r1126578 [3/4] - in /lucene/dev/branches/flexscoring: ./ dev-tools/eclipse/ dev-tools/idea/ dev-tools/idea/.idea/ dev-tools/idea/lucene/ dev-tools/idea/lucene/contrib/ant/ dev-tools/idea/lucene/contrib/db/bdb-je/ dev-tools/idea/lucene/contr...

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java Mon May 23 16:49:59 2011
@@ -84,7 +84,7 @@ public class TestTeeSinkTokenFilter exte
   // with BaseTokenStreamTestCase now...
   public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
     Directory dir = newDirectory();
-    Analyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
+    Analyzer analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false);
     IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
     Document doc = new Document();
     TeeSinkTokenFilter tee = new TeeSinkTokenFilter(analyzer.tokenStream("field", new StringReader("abcd   ")));
@@ -108,7 +108,7 @@ public class TestTeeSinkTokenFilter exte
   }
   
   public void testGeneral() throws IOException {
-    final TeeSinkTokenFilter source = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer1.toString())));
+    final TeeSinkTokenFilter source = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.toString()), MockTokenizer.WHITESPACE, false));
     final TokenStream sink1 = source.newSinkTokenStream();
     final TokenStream sink2 = source.newSinkTokenStream(theFilter);
     
@@ -122,16 +122,17 @@ public class TestTeeSinkTokenFilter exte
   }
 
   public void testMultipleSources() throws Exception {
-    final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer1.toString())));
+    final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.toString()), MockTokenizer.WHITESPACE, false));
     final TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
     final TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter);
+    tee1.reset();
     final TokenStream source1 = new CachingTokenFilter(tee1);
     
     tee1.addAttribute(CheckClearAttributesAttribute.class);
     dogDetector.addAttribute(CheckClearAttributesAttribute.class);
     theDetector.addAttribute(CheckClearAttributesAttribute.class);
 
-    final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer2.toString())));
+    final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer2.toString()), MockTokenizer.WHITESPACE, false));
     tee2.addSinkTokenStream(dogDetector);
     tee2.addSinkTokenStream(theDetector);
     final TokenStream source2 = tee2;

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java Mon May 23 16:49:59 2011
@@ -20,14 +20,14 @@ import java.io.IOException;
 import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.MockTokenizer;
 
 public class TokenRangeSinkTokenizerTest extends BaseTokenStreamTestCase {
 
   public void test() throws IOException {
     TokenRangeSinkFilter sinkFilter = new TokenRangeSinkFilter(2, 4);
     String test = "The quick red fox jumped over the lazy brown dogs";
-    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)));
+    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false));
     TeeSinkTokenFilter.SinkTokenStream rangeToks = tee.newSinkTokenStream(sinkFilter);
     
     int count = 0;

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java Mon May 23 16:49:59 2011
@@ -20,9 +20,9 @@ import java.io.IOException;
 import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 
@@ -32,7 +32,7 @@ public class TokenTypeSinkTokenizerTest 
     TokenTypeSinkFilter sinkFilter = new TokenTypeSinkFilter("D");
     String test = "The quick red fox jumped over the lazy brown dogs";
 
-    TeeSinkTokenFilter ttf = new TeeSinkTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test))));
+    TeeSinkTokenFilter ttf = new TeeSinkTokenFilter(new WordTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false)));
     TeeSinkTokenFilter.SinkTokenStream sink = ttf.newSinkTokenStream(sinkFilter);
     
     boolean seenDogs = false;

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java Mon May 23 16:49:59 2011
@@ -22,8 +22,8 @@ import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
 
 import static org.apache.lucene.analysis.util.VocabularyAssert.*;
@@ -36,7 +36,7 @@ public class TestSwedishLightStemFilter 
     @Override
     protected TokenStreamComponents createComponents(String fieldName,
         Reader reader) {
-      Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
       return new TokenStreamComponents(source, new SwedishLightStemFilter(source));
     }
   };

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilter.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilter.java Mon May 23 16:49:59 2011
@@ -25,6 +25,7 @@ import java.util.Collection;
 import java.util.List;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
@@ -43,14 +44,14 @@ public class TestSynonymFilter extends B
 
   static void assertTokenizesTo(SynonymMap dict, String input,
       String expected[]) throws IOException {
-    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
     SynonymFilter stream = new SynonymFilter(tokenizer, dict);
     assertTokenStreamContents(stream, expected);
   }
   
   static void assertTokenizesTo(SynonymMap dict, String input,
       String expected[], int posIncs[]) throws IOException {
-    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+    Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
     SynonymFilter stream = new SynonymFilter(tokenizer, dict);
     assertTokenStreamContents(stream, expected, posIncs);
   }

Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java Mon May 23 16:49:59 2011
@@ -20,8 +20,8 @@ package org.apache.lucene.analysis.tr;
 import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 
 /**
  * Test the Turkish lowercase filter.
@@ -32,8 +32,8 @@ public class TestTurkishLowerCaseFilter 
    * Test composed forms
    */
   public void testTurkishLowerCaseFilter() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
-        "\u0130STANBUL \u0130ZM\u0130R ISPARTA"));
+    TokenStream stream = new MockTokenizer(new StringReader(
+        "\u0130STANBUL \u0130ZM\u0130R ISPARTA"), MockTokenizer.WHITESPACE, false);
     TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
     assertTokenStreamContents(filter, new String[] {"istanbul", "izmir",
         "\u0131sparta",});
@@ -43,8 +43,8 @@ public class TestTurkishLowerCaseFilter 
    * Test decomposed forms
    */
   public void testDecomposed() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
-        "\u0049\u0307STANBUL \u0049\u0307ZM\u0049\u0307R ISPARTA"));
+    TokenStream stream = new MockTokenizer(new StringReader(
+        "\u0049\u0307STANBUL \u0049\u0307ZM\u0049\u0307R ISPARTA"), MockTokenizer.WHITESPACE, false);
     TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
     assertTokenStreamContents(filter, new String[] {"istanbul", "izmir",
         "\u0131sparta",});
@@ -56,8 +56,8 @@ public class TestTurkishLowerCaseFilter 
    * to U+0130 + U+0316, and is lowercased the same way.
    */
   public void testDecomposed2() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
-        "\u0049\u0316\u0307STANBUL \u0049\u0307ZM\u0049\u0307R I\u0316SPARTA"));
+    TokenStream stream = new MockTokenizer(new StringReader(
+        "\u0049\u0316\u0307STANBUL \u0049\u0307ZM\u0049\u0307R I\u0316SPARTA"), MockTokenizer.WHITESPACE, false);
     TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
     assertTokenStreamContents(filter, new String[] {"i\u0316stanbul", "izmir",
         "\u0131\u0316sparta",});

Modified: lucene/dev/branches/flexscoring/modules/grouping/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/grouping/build.xml?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/grouping/build.xml (original)
+++ lucene/dev/branches/flexscoring/modules/grouping/build.xml Mon May 23 16:49:59 2011
@@ -10,4 +10,6 @@
 
     <import file="../../lucene/contrib/contrib-build.xml"/>
     <property name="working.dir" location="work"/>
+
+    <target name="dist-maven" depends="jar-core,javadocs,contrib-build.dist-maven" />
 </project>

Modified: lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java (original)
+++ lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java Mon May 23 16:49:59 2011
@@ -34,9 +34,12 @@ import org.apache.lucene.search.SortFiel
 import org.apache.lucene.util.BytesRef;
 
 /** FirstPassGroupingCollector is the first of two passes necessary
- *  to collected grouped hits.  This pass gathers the top N sorted
+ *  to collect grouped hits.  This pass gathers the top N sorted
  *  groups.
  *
+ *  <p>See {@link org.apache.lucene.search.grouping} for more
+ *  details including a full code example.</p>
+ *
  * @lucene.experimental
  */
 
@@ -229,7 +232,9 @@ public class FirstPassGroupingCollector 
       // We already tested that the document is competitive, so replace
       // the bottom group with this new group.
 
-      final CollectedSearchGroup bottomGroup = orderedGroups.pollLast();
+      // java 6-only: final CollectedSearchGroup bottomGroup = orderedGroups.pollLast();
+      final CollectedSearchGroup bottomGroup = orderedGroups.last();
+      orderedGroups.remove(bottomGroup);
       assert orderedGroups.size() == topNGroups -1;
 
       groupMap.remove(bottomGroup.groupValue);

Modified: lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/SearchGroup.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/SearchGroup.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/SearchGroup.java (original)
+++ lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/SearchGroup.java Mon May 23 16:49:59 2011
@@ -20,7 +20,7 @@ package org.apache.lucene.search.groupin
 import org.apache.lucene.util.BytesRef;
 
 /** @lucene.experimental */
-class SearchGroup {
+public class SearchGroup {
   public BytesRef groupValue;
   public Comparable[] sortValues;
 }

Modified: lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/SecondPassGroupingCollector.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/SecondPassGroupingCollector.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/SecondPassGroupingCollector.java (original)
+++ lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/SecondPassGroupingCollector.java Mon May 23 16:49:59 2011
@@ -33,7 +33,14 @@ import org.apache.lucene.search.TopScore
 import org.apache.lucene.util.BytesRef;
 
 /**
- * See {@link FirstPassGroupingCollector}.
+ * SecondPassGroupingCollector is the second of two passes
+ * necessary to collect grouped docs.  This pass gathers the
+ * top N documents per top group computed from the
+ * first pass.
+ *
+ * <p>See {@link org.apache.lucene.search.grouping} for more
+ * details including a full code example.</p>
+ *
  * @lucene.experimental
  */
 public class SecondPassGroupingCollector extends Collector {

Modified: lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java (original)
+++ lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java Mon May 23 16:49:59 2011
@@ -21,9 +21,6 @@ import org.apache.lucene.search.SortFiel
 
 /** Represents result returned by a grouping search.
  *
- * Note that we do not return the total number of unique
- * groups; doing so would be costly.
- * 
  * @lucene.experimental */
 public class TopGroups {
   /** Number of documents matching the search */
@@ -32,6 +29,9 @@ public class TopGroups {
   /** Number of documents grouped into the topN groups */
   public final int totalGroupedHitCount;
 
+  /** The total number of unique groups. If <code>null</code> this value is not computed. */
+  public final Integer totalGroupCount;
+
   /** Group results in groupSort order */
   public final GroupDocs[] groups;
 
@@ -47,5 +47,15 @@ public class TopGroups {
     this.totalHitCount = totalHitCount;
     this.totalGroupedHitCount = totalGroupedHitCount;
     this.groups = groups;
+    this.totalGroupCount = null;
+  }
+
+  public TopGroups(TopGroups oldTopGroups, Integer totalGroupCount) {
+    this.groupSort = oldTopGroups.groupSort;
+    this.withinGroupSort = oldTopGroups.withinGroupSort;
+    this.totalHitCount = oldTopGroups.totalHitCount;
+    this.totalGroupedHitCount = oldTopGroups.totalGroupedHitCount;
+    this.groups = oldTopGroups.groups;
+    this.totalGroupCount = totalGroupCount;
   }
 }

Modified: lucene/dev/branches/flexscoring/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java (original)
+++ lucene/dev/branches/flexscoring/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java Mon May 23 16:49:59 2011
@@ -17,13 +17,7 @@
 
 package org.apache.lucene.search.grouping;
 
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.List;
+import java.util.*;
 
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
@@ -32,14 +26,7 @@ import org.apache.lucene.document.Numeri
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.search.Collector;
-import org.apache.lucene.search.FieldCache;
-import org.apache.lucene.search.FieldDoc;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.Sort;
-import org.apache.lucene.search.SortField;
-import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.*;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
@@ -120,7 +107,7 @@ public class TestGrouping extends Lucene
 
     final SecondPassGroupingCollector c2 = new SecondPassGroupingCollector(groupField, c1.getTopGroups(0, true), groupSort, null, 5, true, false, true);
     indexSearcher.search(new TermQuery(new Term("content", "random")), c2);
-    
+
     final TopGroups groups = c2.getTopGroups(0);
 
     assertEquals(7, groups.totalHitCount);
@@ -218,11 +205,11 @@ public class TestGrouping extends Lucene
     };
   }
 
-  private Comparable[] fillFields(GroupDoc d, Sort sort) {
+  private Comparable<?>[] fillFields(GroupDoc d, Sort sort) {
     final SortField[] sortFields = sort.getSort();
-    final Comparable[] fields = new Comparable[sortFields.length];
+    final Comparable<?>[] fields = new Comparable[sortFields.length];
     for(int fieldIDX=0;fieldIDX<sortFields.length;fieldIDX++) {
-      final Comparable c;
+      final Comparable<?> c;
       final SortField sf = sortFields[fieldIDX];
       if (sf.getField().equals("sort1")) {
         c = d.sort1;
@@ -242,6 +229,7 @@ public class TestGrouping extends Lucene
                                  boolean fillFields,
                                  boolean getScores,
                                  boolean getMaxScores,
+                                 boolean doAllGroups,
                                  Sort groupSort,
                                  Sort docSort,
                                  int topNGroups,
@@ -254,9 +242,10 @@ public class TestGrouping extends Lucene
     Arrays.sort(groupDocs, groupSortComp);
     final HashMap<BytesRef,List<GroupDoc>> groups = new HashMap<BytesRef,List<GroupDoc>>();
     final List<BytesRef> sortedGroups = new ArrayList<BytesRef>();
-    final List<Comparable[]> sortedGroupFields = new ArrayList<Comparable[]>();
+    final List<Comparable<?>[]> sortedGroupFields = new ArrayList<Comparable<?>[]>();
 
     int totalHitCount = 0;
+    Set<BytesRef> knownGroups = new HashSet<BytesRef>();
 
     for(GroupDoc d : groupDocs) {
       // TODO: would be better to filter by searchTerm before sorting!
@@ -264,6 +253,13 @@ public class TestGrouping extends Lucene
         continue;
       }
       totalHitCount++;
+
+      if (doAllGroups) {
+        if (!knownGroups.contains(d.group)) {
+          knownGroups.add(d.group);
+        }
+      }
+
       List<GroupDoc> l = groups.get(d.group);
       if (l == null) {
         sortedGroups.add(d.group);
@@ -316,7 +312,14 @@ public class TestGrouping extends Lucene
                                               fillFields ? sortedGroupFields.get(idx) : null);
     }
 
-    return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result);
+    if (doAllGroups) {
+      return new TopGroups(
+          new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result),
+          knownGroups.size()
+      );
+    } else {
+      return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result);
+    }
   }
 
   public void testRandom() throws Exception {
@@ -334,7 +337,7 @@ public class TestGrouping extends Lucene
       if (VERBOSE) {
         System.out.println("TEST: numDocs=" + numDocs + " numGroups=" + numGroups);
       }
-      
+
       final List<BytesRef> groups = new ArrayList<BytesRef>();
       for(int i=0;i<numGroups;i++) {
         groups.add(new BytesRef(_TestUtil.randomRealisticUnicodeString(random)));
@@ -427,25 +430,69 @@ public class TestGrouping extends Lucene
         //final int docOffset = 0;
 
         final boolean doCache = random.nextBoolean();
+        final boolean doAllGroups = random.nextBoolean();
         if (VERBOSE) {
-          System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup);
+          System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup + " doAllGroups=" + doAllGroups);
+        }
+
+        final AllGroupsCollector allGroupsCollector;
+        if (doAllGroups) {
+          allGroupsCollector = new AllGroupsCollector("group");
+        } else {
+          allGroupsCollector = null;
         }
 
         final FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("group", groupSort, groupOffset+topNGroups);
         final CachingCollector cCache;
         final Collector c;
+        
+        final boolean useWrappingCollector = random.nextBoolean();
+        
         if (doCache) {
           final double maxCacheMB = random.nextDouble();
           if (VERBOSE) {
             System.out.println("TEST: maxCacheMB=" + maxCacheMB);
           }
-          c = cCache = new CachingCollector(c1, true, maxCacheMB);
+
+          if (useWrappingCollector) {
+            if (doAllGroups) {
+              cCache = CachingCollector.create(c1, true, maxCacheMB);              
+              c = MultiCollector.wrap(cCache, allGroupsCollector);
+            } else {
+              c = cCache = CachingCollector.create(c1, true, maxCacheMB);              
+            }
+          } else {
+            // Collect only into cache, then replay multiple times:
+            c = cCache = CachingCollector.create(false, true, maxCacheMB);
+          }
         } else {
-          c = c1;
           cCache = null;
+          if (doAllGroups) {
+            c = MultiCollector.wrap(c1, allGroupsCollector);
+          } else {
+            c = c1;
+          }
         }
+        
         s.search(new TermQuery(new Term("content", searchTerm)), c);
 
+        if (doCache && !useWrappingCollector) {
+          if (cCache.isCached()) {
+            // Replay for first-pass grouping
+            cCache.replay(c1);
+            if (doAllGroups) {
+              // Replay for all groups:
+              cCache.replay(allGroupsCollector);
+            }
+          } else {
+            // Replay by re-running search:
+            s.search(new TermQuery(new Term("content", searchTerm)), c1);
+            if (doAllGroups) {
+              s.search(new TermQuery(new Term("content", searchTerm)), allGroupsCollector);
+            }
+          }
+        }
+
         final Collection<SearchGroup> topGroups = c1.getTopGroups(groupOffset, fillFields);
         final TopGroups groupsResult;
 
@@ -474,8 +521,13 @@ public class TestGrouping extends Lucene
           } else {
             s.search(new TermQuery(new Term("content", searchTerm)), c2);
           }
-        
-          groupsResult = c2.getTopGroups(docOffset);
+
+          if (doAllGroups) {
+            TopGroups tempTopGroups = c2.getTopGroups(docOffset);
+            groupsResult = new TopGroups(tempTopGroups, allGroupsCollector.getGroupCount());
+          } else {
+            groupsResult = c2.getTopGroups(docOffset);
+          }
         } else {
           groupsResult = null;
           if (VERBOSE) {
@@ -483,7 +535,7 @@ public class TestGrouping extends Lucene
           }
         }
 
-        final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);
+        final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, doAllGroups, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);
 
         try {
           // NOTE: intentional but temporary field cache insanity!
@@ -508,7 +560,10 @@ public class TestGrouping extends Lucene
     assertEquals(expected.groups.length, actual.groups.length);
     assertEquals(expected.totalHitCount, actual.totalHitCount);
     assertEquals(expected.totalGroupedHitCount, actual.totalGroupedHitCount);
-    
+    if (expected.totalGroupCount != null) {
+      assertEquals(expected.totalGroupCount, actual.totalGroupCount);
+    }
+
     for(int groupIDX=0;groupIDX<expected.groups.length;groupIDX++) {
       if (VERBOSE) {
         System.out.println("  check groupIDX=" + groupIDX);
@@ -516,7 +571,7 @@ public class TestGrouping extends Lucene
       final GroupDocs expectedGroup = expected.groups[groupIDX];
       final GroupDocs actualGroup = actual.groups[groupIDX];
       assertEquals(expectedGroup.groupValue, actualGroup.groupValue);
-      assertEquals(expectedGroup.groupSortValues, actualGroup.groupSortValues);
+      assertArrayEquals(expectedGroup.groupSortValues, actualGroup.groupSortValues);
 
       // TODO
       // assertEquals(expectedGroup.maxScore, actualGroup.maxScore);
@@ -532,7 +587,7 @@ public class TestGrouping extends Lucene
         assertEquals(expectedFD.doc, docIDtoID[actualFD.doc]);
         // TODO
         // assertEquals(expectedFD.score, actualFD.score);
-        assertEquals(expectedFD.fields, actualFD.fields);
+        assertArrayEquals(expectedFD.fields, actualFD.fields);
       }
     }
   }

Modified: lucene/dev/branches/flexscoring/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/CHANGES.txt?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/CHANGES.txt (original)
+++ lucene/dev/branches/flexscoring/solr/CHANGES.txt Mon May 23 16:49:59 2011
@@ -26,7 +26,7 @@ Versions of Major Components
 ---------------------
 Apache Lucene trunk
 Apache Tika 0.8
-Carrot2 3.4.2
+Carrot2 3.5.0
 Velocity 1.6.4 and Velocity Tools 2.0
 Apache UIMA 2.3.1-SNAPSHOT
 
@@ -329,6 +329,10 @@ Bug Fixes
 * SOLR-2495: The JSON parser could hang on corrupted input and could fail
   to detect numbers that were too large to fit in a long.  (yonik)
 
+* SOLR-2520: Make JSON response format escape \u2029 as well as \u2028
+  in strings since those characters are not valid in javascript strings
+  (although they are valid in JSON strings).  (yonik)
+
 
 Other Changes
 ----------------------
@@ -336,6 +340,9 @@ Other Changes
 * SOLR-2105: Rename RequestHandler param 'update.processor' to 'update.chain'.
 	(Jan Høydahl via Mark Miller)
 
+* SOLR-2528: Remove default="true" from HtmlEncoder in example solrconfig.xml,
+  because html encoding confuses non-ascii users. (koji)
+
 Build
 ----------------------
 

Modified: lucene/dev/branches/flexscoring/solr/contrib/clustering/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/contrib/clustering/CHANGES.txt?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/contrib/clustering/CHANGES.txt (original)
+++ lucene/dev/branches/flexscoring/solr/contrib/clustering/CHANGES.txt Mon May 23 16:49:59 2011
@@ -9,11 +9,19 @@ CHANGES
 $Id$
 ================== Release 4.0.0-dev ==================
 
-(No Changes)
+* SOLR-2448: Search results clustering updates: bisecting k-means
+  clustering algorithm added, loading of Carrot2 stop words from
+  <solr.home>/conf/carrot2 (SOLR-2449), using Solr's stopwords.txt
+  for clustering (SOLR-2450), output of cluster scores (SOLR-2505)
+  (Stanislaw Osinski, Dawid Weiss).
 
 ================== Release 3.2.0-dev ==================
 
-(No Changes)
+* SOLR-2448: Search results clustering updates: bisecting k-means
+  clustering algorithm added, loading of Carrot2 stop words from
+  <solr.home>/conf/carrot2 (SOLR-2449), using Solr's stopwords.txt
+  for clustering (SOLR-2450), output of cluster scores (SOLR-2505)
+  (Stanislaw Osinski, Dawid Weiss).
 
 ================== Release 3.1.0-dev ==================
 

Modified: lucene/dev/branches/flexscoring/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java (original)
+++ lucene/dev/branches/flexscoring/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java Mon May 23 16:49:59 2011
@@ -18,9 +18,11 @@ package org.apache.solr.handler.clusteri
  */
 
 import java.io.IOException;
+import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@@ -37,6 +39,7 @@ import org.apache.solr.common.params.Sol
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.SimpleOrderedMap;
 import org.apache.solr.core.SolrCore;
+import org.apache.solr.core.SolrResourceLoader;
 import org.apache.solr.handler.clustering.SearchClusteringEngine;
 import org.apache.solr.handler.component.HighlightComponent;
 import org.apache.solr.highlight.SolrHighlighter;
@@ -52,9 +55,17 @@ import org.carrot2.core.ControllerFactor
 import org.carrot2.core.Document;
 import org.carrot2.core.IClusteringAlgorithm;
 import org.carrot2.core.attribute.AttributeNames;
+import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
+import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
+import org.carrot2.util.resource.ClassLoaderLocator;
+import org.carrot2.util.resource.IResource;
+import org.carrot2.util.resource.IResourceLocator;
+import org.carrot2.util.resource.ResourceLookup;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
 import com.google.common.collect.Sets;
 
 /**
@@ -64,19 +75,33 @@ import com.google.common.collect.Sets;
  *
  * @link http://project.carrot2.org
  */
-@SuppressWarnings("unchecked")
 public class CarrotClusteringEngine extends SearchClusteringEngine {
-  private transient static Logger log = LoggerFactory
+	private transient static Logger log = LoggerFactory
           .getLogger(CarrotClusteringEngine.class);
 
+	/**
+	 * The subdirectory in Solr config dir to read customized Carrot2 resources from.
+	 */
+	private static final String CARROT_RESOURCES_PREFIX = "clustering/carrot2";
+
+  /**
+   * Name of Carrot2 document's field containing Solr document's identifier.
+   */
+  private static final String SOLR_DOCUMENT_ID = "solrId";
+
+  /**
+   * Name of Solr document's field containing the document's identifier. To avoid
+   * repeating the content of documents in clusters on output, each cluster contains
+   * identifiers of documents it contains.
+   */
+  private String idFieldName;
+
   /**
    * Carrot2 controller that manages instances of clustering algorithms
    */
   private Controller controller = ControllerFactory.createPooling();
   private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
 
-  private String idFieldName;
-
   @Override
   @Deprecated
   public Object cluster(Query query, DocList docList, SolrQueryRequest sreq) {
@@ -101,6 +126,10 @@ public class CarrotClusteringEngine exte
       attributes.put(AttributeNames.DOCUMENTS, documents);
       attributes.put(AttributeNames.QUERY, query.toString());
 
+      // Pass the fields on which clustering runs to the
+      // SolrStopwordsCarrot2LexicalDataFactory
+      attributes.put("solrFieldNames", getFieldsForClustering(sreq));
+
       // Pass extra overriding attributes from the request, if any
       extractCarrotAttributes(sreq.getParams(), attributes);
 
@@ -113,22 +142,68 @@ public class CarrotClusteringEngine exte
     }
   }
 
-  @Override
+	@Override
+	@SuppressWarnings({ "unchecked", "rawtypes" })
   public String init(NamedList config, final SolrCore core) {
     String result = super.init(config, core);
-    SolrParams initParams = SolrParams.toSolrParams(config);
+    final SolrParams initParams = SolrParams.toSolrParams(config);
 
     // Initialize Carrot2 controller. Pass initialization attributes, if any.
     HashMap<String, Object> initAttributes = new HashMap<String, Object>();
     extractCarrotAttributes(initParams, initAttributes);
-    
-    // Customize the language model factory. The implementation we provide here
-    // is included in the code base of Solr, so that it's possible to refactor
-    // the Lucene APIs the factory relies on if needed.
-    initAttributes.put("PreprocessingPipeline.languageModelFactory",
-      LuceneLanguageModelFactory.class);
-    this.controller.init(initAttributes);
 
+    // Customize the stemmer and tokenizer factories. The implementations we provide here
+    // are included in the code base of Solr, so that it's possible to refactor
+    // the Lucene APIs the factories rely on if needed.
+    // Additionally, we set a custom lexical resource factory for Carrot2 that
+    // will use both Carrot2 default stop words as well as stop words from
+    // the StopFilter defined on the field.
+		BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes)
+				.stemmerFactory(LuceneCarrot2StemmerFactory.class)
+				.tokenizerFactory(LuceneCarrot2TokenizerFactory.class)
+				.lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
+
+		// Pass the schema to SolrStopwordsCarrot2LexicalDataFactory.
+		initAttributes.put("solrIndexSchema", core.getSchema());
+
+    // Customize Carrot2's resource lookup to first look for resources
+    // using Solr's resource loader. If that fails, try loading from the classpath.
+    DefaultLexicalDataFactoryDescriptor.attributeBuilder(initAttributes)
+        .resourceLookup(new ResourceLookup(new IResourceLocator() {
+          @Override
+          public IResource[] getAll(final String resource) {
+            final SolrResourceLoader resourceLoader = core.getResourceLoader();
+            final String carrot2ResourcesDir = resourceLoader.getConfigDir()
+                + initParams.get(CarrotParams.LEXICAL_RESOURCES_DIR, CARROT_RESOURCES_PREFIX);
+            try {
+              log.debug("Looking for " + resource + " in "
+                  + carrot2ResourcesDir);
+              final InputStream resourceStream = resourceLoader
+                  .openResource(carrot2ResourcesDir + "/" + resource);
+
+              log.info(resource + " loaded from " + carrot2ResourcesDir);
+              final IResource foundResource = new IResource() {
+                @Override
+                public InputStream open() throws IOException {
+                  return resourceStream;
+                }
+              };
+              return new IResource[] { foundResource };
+            } catch (RuntimeException e) {
+              // No way to distinguish if the resource was found but failed
+              // to load or wasn't found at all, so we simply fall back
+              // to Carrot2 defaults here by returning an empty locations array.
+              log.debug(resource + " not found in " + carrot2ResourcesDir
+                  + ". Using the default " + resource + " from Carrot JAR.");
+              return new IResource[] {};
+            }
+          }
+        },
+
+        // Using the class loader directly because this time we want to omit the prefix
+        new ClassLoaderLocator(core.getResourceLoader().getClassLoader())));
+
+    this.controller.init(initAttributes);
     this.idFieldName = core.getSchema().getUniqueKeyField().getName();
 
     // Make sure the requested Carrot2 clustering algorithm class is available
@@ -148,17 +223,29 @@ public class CarrotClusteringEngine exte
   protected Set<String> getFieldsToLoad(SolrQueryRequest sreq){
     SolrParams solrParams = sreq.getParams();
 
-    // Names of fields to deliver content for clustering
-    String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
+    HashSet<String> fields = Sets.newHashSet(getFieldsForClustering(sreq));
+    fields.add(idFieldName);
+    fields.add(solrParams.get(CarrotParams.URL_FIELD_NAME, "url"));
+		return fields;
+  }
+
+	/**
+	 * Returns the names of fields that will be delivering the actual
+	 * content for clustering. Currently, there are two such fields: document
+	 * title and document content.
+	 */
+	private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
+    SolrParams solrParams = sreq.getParams();
+
     String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
     String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
     if (StringUtils.isBlank(snippetField)) {
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
               + " must not be blank.");
     }
-    return Sets.newHashSet(urlField, titleField, snippetField, idFieldName);
-  }
-  
+    return Sets.newHashSet(titleField, snippetField);
+	}
+
   /**
    * Prepares Carrot2 documents for clustering.
    */
@@ -180,7 +267,7 @@ public class CarrotClusteringEngine exte
     if (produceSummary == true) {
       highlighter = HighlightComponent.getHighlighter(core);
       if (highlighter != null){
-        Map args = new HashMap();
+        Map<String, Object> args = Maps.newHashMap();
         snippetFieldAry = new String[]{snippetField};
         args.put(HighlightParams.FIELDS, snippetFieldAry);
         args.put(HighlightParams.HIGHLIGHT, "true");
@@ -214,11 +301,12 @@ public class CarrotClusteringEngine exte
       if (produceSummary && docIds != null) {
         docsHolder[0] = docIds.get(sdoc).intValue();
         DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
-        NamedList highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
+        NamedList<Object> highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
         if (highlights != null && highlights.size() == 1) {//should only be one value given our setup
           //should only be one document with one field
-          NamedList tmp = (NamedList) highlights.getVal(0);
-          String [] highlt = (String[]) tmp.get(snippetField);
+          @SuppressWarnings("unchecked")
+					NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
+          String [] highlt = tmp.get(snippetField);
           if (highlt != null && highlt.length == 1) {
             snippet = highlt[0];
           }
@@ -226,27 +314,13 @@ public class CarrotClusteringEngine exte
       }
       Document carrotDocument = new Document(getValue(sdoc, titleField),
               snippet, (String)sdoc.getFieldValue(urlField));
-      carrotDocument.setField("solrId", sdoc.getFieldValue(idFieldName));
+      carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
       result.add(carrotDocument);
     }
 
     return result;
   }
 
-  @Deprecated
-  protected String getValue(org.apache.lucene.document.Document doc,
-                            String field) {
-    StringBuilder result = new StringBuilder();
-    String[] vals = doc.getValues(field);
-    for (int i = 0; i < vals.length; i++) {
-      // Join multiple values with a period so that Carrot2 does not pick up
-      // phrases that cross field value boundaries (in most cases it would
-      // create useless phrases).
-      result.append(vals[i]).append(" . ");
-    }
-    return result.toString().trim();
-  }
-
   protected String getValue(SolrDocument sdoc, String field) {
     StringBuilder result = new StringBuilder();
     Collection<Object> vals = sdoc.getFieldValues(field);
@@ -261,9 +335,9 @@ public class CarrotClusteringEngine exte
     return result.toString().trim();
   }
 
-  private List clustersToNamedList(List<Cluster> carrotClusters,
+  private List<NamedList<Object>> clustersToNamedList(List<Cluster> carrotClusters,
                                    SolrParams solrParams) {
-    List result = new ArrayList();
+    List<NamedList<Object>> result = Lists.newArrayList();
     clustersToNamedList(carrotClusters, result, solrParams.getBool(
             CarrotParams.OUTPUT_SUB_CLUSTERS, true), solrParams.getInt(
             CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE));
@@ -271,25 +345,40 @@ public class CarrotClusteringEngine exte
   }
 
   private void clustersToNamedList(List<Cluster> outputClusters,
-                                   List parent, boolean outputSubClusters, int maxLabels) {
+                                   List<NamedList<Object>> parent, boolean outputSubClusters, int maxLabels) {
     for (Cluster outCluster : outputClusters) {
-      NamedList cluster = new SimpleOrderedMap();
+      NamedList<Object> cluster = new SimpleOrderedMap<Object>();
       parent.add(cluster);
 
+      // Add labels
       List<String> labels = outCluster.getPhrases();
-      if (labels.size() > maxLabels)
+      if (labels.size() > maxLabels) {
         labels = labels.subList(0, maxLabels);
+      }
       cluster.add("labels", labels);
 
+      // Add cluster score
+      final Double score = outCluster.getScore();
+      if (score != null) {
+        cluster.add("score", score);
+      }
+
+      // Add other topics marker
+      if (outCluster.isOtherTopics()) {
+        cluster.add("other-topics", outCluster.isOtherTopics());
+      }
+
+      // Add documents
       List<Document> docs = outputSubClusters ? outCluster.getDocuments() : outCluster.getAllDocuments();
-      List docList = new ArrayList();
+      List<Object> docList = Lists.newArrayList();
       cluster.add("docs", docList);
       for (Document doc : docs) {
-        docList.add(doc.getField("solrId"));
+        docList.add(doc.getField(SOLR_DOCUMENT_ID));
       }
 
-      if (outputSubClusters) {
-        List subclusters = new ArrayList();
+      // Add subclusters
+      if (outputSubClusters && !outCluster.getSubclusters().isEmpty()) {
+        List<NamedList<Object>> subclusters = Lists.newArrayList();
         cluster.add("clusters", subclusters);
         clustersToNamedList(outCluster.getSubclusters(), subclusters,
                 outputSubClusters, maxLabels);

Modified: lucene/dev/branches/flexscoring/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java (original)
+++ lucene/dev/branches/flexscoring/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java Mon May 23 16:49:59 2011
@@ -35,6 +35,8 @@ public interface CarrotParams {
   String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
   String SUMMARY_FRAGSIZE = CARROT_PREFIX + "fragzise";
 
+  String LEXICAL_RESOURCES_DIR = CARROT_PREFIX + "lexicalResourcesDir";
+
   public static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of(
           ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME,
           PRODUCE_SUMMARY, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS, SUMMARY_FRAGSIZE);

Modified: lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java (original)
+++ lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java Mon May 23 16:49:59 2011
@@ -17,6 +17,11 @@ package org.apache.solr.handler.clusteri
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.Query;
@@ -37,15 +42,11 @@ import org.apache.solr.util.SolrPluginUt
 import org.carrot2.util.attribute.AttributeUtils;
 import org.junit.Test;
 
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import com.google.common.collect.ImmutableList;
 
 /**
  *
  */
-@SuppressWarnings("unchecked")
 public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
   @Test
   public void testCarrotLingo() throws Exception {
@@ -74,7 +75,7 @@ public class CarrotClusteringEngineTest 
 
   @Test
   public void testWithoutSubclusters() throws Exception {
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs),
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs),
             1, 1, 0);
   }
 
@@ -82,7 +83,7 @@ public class CarrotClusteringEngineTest 
   public void testWithSubclusters() throws Exception {
     ModifiableSolrParams params = new ModifiableSolrParams();
     params.set(CarrotParams.OUTPUT_SUB_CLUSTERS, true);
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs), 1, 1, 2);
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs), 1, 1, 2);
   }
 
   @Test
@@ -90,19 +91,107 @@ public class CarrotClusteringEngineTest 
     ModifiableSolrParams params = new ModifiableSolrParams();
     params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 5);
     params.set(CarrotParams.NUM_DESCRIPTIONS, 3);
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
             params), 1, 3, 0);
   }
 
   @Test
+  public void testClusterScores() throws Exception {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
+    List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
+        AbstractClusteringTestCase.numberOfDocs, params);
+    int i = 1;
+    for (NamedList<Object> cluster : clusters) {
+      final Double score = getScore(cluster);
+      assertNotNull(score);
+      assertEquals(0.25 * i++, score, 0);
+    }
+  }
+
+  @Test
+  public void testOtherTopics() throws Exception {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
+    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "otherTopicsModulo"), 2);
+    List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
+        AbstractClusteringTestCase.numberOfDocs, params);
+    int i = 1;
+    for (NamedList<Object> cluster : clusters) {
+      assertEquals(i++ % 2 == 0 ? true : null, isOtherTopics(cluster));
+    }
+  }
+
+  @Test
   public void testCarrotAttributePassing() throws Exception {
     ModifiableSolrParams params = new ModifiableSolrParams();
     params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
     params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 3);
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
             params), 1, 3, 0);
   }
 
+	@Test
+	public void testLexicalResourcesFromSolrConfigDefaultDir() throws Exception {
+		checkLexicalResourcesFromSolrConfig("lexical-resource-check",
+				"online,customsolrstopword,customsolrstoplabel");
+	}
+
+	@Test
+	public void testLexicalResourcesFromSolrConfigCustomDir() throws Exception {
+		checkLexicalResourcesFromSolrConfig("lexical-resource-check-custom-resource-dir",
+				"online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
+	}
+
+	private void checkLexicalResourcesFromSolrConfig(String engineName, String wordsToCheck)
+			throws IOException {
+		ModifiableSolrParams params = new ModifiableSolrParams();
+		params.set("merge-resources", false);
+		params.set(AttributeUtils.getKey(
+				LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+				wordsToCheck);
+
+		// "customsolrstopword" is in stopwords.en, "customsolrstoplabel" is in
+		// stoplabels.en, so we're expecting only one cluster with label "online".
+		final List<NamedList<Object>> clusters = checkEngine(
+				getClusteringEngine(engineName), 1, params);
+		assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
+	}
+
+	@Test
+	public void solrStopWordsUsedInCarrot2Clustering() throws Exception {
+		ModifiableSolrParams params = new ModifiableSolrParams();
+		params.set("merge-resources", false);
+		params.set(AttributeUtils.getKey(
+				LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+		"online,solrownstopword");
+
+		// "solrownstopword" is in stopwords.txt, so we're expecting
+		// only one cluster with label "online".
+		final List<NamedList<Object>> clusters = checkEngine(
+				getClusteringEngine("lexical-resource-check"), 1, params);
+		assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
+	}
+
+	@Test
+	public void solrStopWordsNotDefinedOnAFieldForClustering() throws Exception {
+		ModifiableSolrParams params = new ModifiableSolrParams();
+		// Force string fields to be used for clustering. Does not make sense
+		// in a real word, but does the job in the test.
+		params.set(CarrotParams.TITLE_FIELD_NAME, "url");
+		params.set(CarrotParams.SNIPPET_FIELD_NAME, "url");
+		params.set("merge-resources", false);
+		params.set(AttributeUtils.getKey(
+				LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+		"online,solrownstopword");
+
+		final List<NamedList<Object>> clusters = checkEngine(
+				getClusteringEngine("lexical-resource-check"), 2, params);
+		assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0)));
+		assertEquals(ImmutableList.of("solrownstopword"),
+				getLabels(clusters.get(1)));
+	}
+
   private CarrotClusteringEngine getClusteringEngine(String engineName) {
     ClusteringComponent comp = (ClusteringComponent) h.getCore()
             .getSearchComponent("clustering");
@@ -114,18 +203,18 @@ public class CarrotClusteringEngineTest 
     return engine;
   }
 
-  private List checkEngine(CarrotClusteringEngine engine,
+  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
                             int expectedNumClusters) throws IOException {
     return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), new ModifiableSolrParams());
   }
 
-  private List checkEngine(CarrotClusteringEngine engine,
+  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
                             int expectedNumClusters, SolrParams clusteringParams) throws IOException {
     return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), clusteringParams);
   }
 
 
-  private List checkEngine(CarrotClusteringEngine engine, int expectedNumDocs,
+  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine, int expectedNumDocs,
                            int expectedNumClusters, Query query, SolrParams clusteringParams) throws IOException {
     // Get all documents to cluster
     RefCounted<SolrIndexSearcher> ref = h.getCore().getSearcher();
@@ -145,7 +234,9 @@ public class CarrotClusteringEngineTest 
       LocalSolrQueryRequest req = new LocalSolrQueryRequest(h.getCore(), solrParams);
       Map<SolrDocument,Integer> docIds = new HashMap<SolrDocument, Integer>(docList.size());
       SolrDocumentList solrDocList = SolrPluginUtils.docListToSolrDocumentList( docList, searcher, engine.getFieldsToLoad(req), docIds );
-      List results = (List)engine.cluster(query, solrDocList, docIds, req);
+
+      @SuppressWarnings("unchecked")
+			List<NamedList<Object>> results = (List<NamedList<Object>>) engine.cluster(query, solrDocList, docIds, req);
       req.close();
       assertEquals("number of clusters: " + results, expectedNumClusters, results.size());
       checkClusters(results, false);
@@ -155,51 +246,74 @@ public class CarrotClusteringEngineTest 
     }
   }
 
-  private void checkClusters(List results, int expectedDocCount,
+  private void checkClusters(List<NamedList<Object>> results, int expectedDocCount,
                              int expectedLabelCount, int expectedSubclusterCount) {
     for (int i = 0; i < results.size(); i++) {
-      NamedList cluster = (NamedList) results.get(i);
+      NamedList<Object> cluster = results.get(i);
       checkCluster(cluster, expectedDocCount, expectedLabelCount,
               expectedSubclusterCount);
     }
   }
 
-  private void checkClusters(List results, boolean hasSubclusters) {
+  private void checkClusters(List<NamedList<Object>> results, boolean hasSubclusters) {
     for (int i = 0; i < results.size(); i++) {
-      checkCluster((NamedList) results.get(i), hasSubclusters);
+      checkCluster(results.get(i), hasSubclusters);
     }
   }
 
-  private void checkCluster(NamedList cluster, boolean hasSubclusters) {
-    List docs = (List) cluster.get("docs");
+  private void checkCluster(NamedList<Object> cluster, boolean hasSubclusters) {
+    List<Object> docs = getDocs(cluster);
     assertNotNull("docs is null and it shouldn't be", docs);
     for (int j = 0; j < docs.size(); j++) {
       String id = (String) docs.get(j);
       assertNotNull("id is null and it shouldn't be", id);
     }
 
-    List labels = (List) cluster.get("labels");
+    List<String> labels = getLabels(cluster);
     assertNotNull("labels is null but it shouldn't be", labels);
 
     if (hasSubclusters) {
-      List subclusters = (List) cluster.get("clusters");
+      List<NamedList<Object>> subclusters = getSubclusters(cluster);
       assertNotNull("subclusters is null but it shouldn't be", subclusters);
     }
   }
 
-  private void checkCluster(NamedList cluster, int expectedDocCount,
+  private void checkCluster(NamedList<Object> cluster, int expectedDocCount,
                             int expectedLabelCount, int expectedSubclusterCount) {
     checkCluster(cluster, expectedSubclusterCount > 0);
     assertEquals("number of docs in cluster", expectedDocCount,
-            ((List) cluster.get("docs")).size());
+            getDocs(cluster).size());
     assertEquals("number of labels in cluster", expectedLabelCount,
-            ((List) cluster.get("labels")).size());
+            getLabels(cluster).size());
 
     if (expectedSubclusterCount > 0) {
-      List subclusters = (List) cluster.get("clusters");
+      List<NamedList<Object>> subclusters = getSubclusters(cluster);
       assertEquals("numClusters", expectedSubclusterCount, subclusters.size());
       assertEquals("number of subclusters in cluster",
               expectedSubclusterCount, subclusters.size());
     }
   }
+
+	@SuppressWarnings("unchecked")
+	private List<NamedList<Object>> getSubclusters(NamedList<Object> cluster) {
+		return (List<NamedList<Object>>) cluster.get("clusters");
+	}
+
+	@SuppressWarnings("unchecked")
+	private List<String> getLabels(NamedList<Object> cluster) {
+		return (List<String>) cluster.get("labels");
+	}
+
+	private Double getScore(NamedList<Object> cluster) {
+	  return (Double) cluster.get("score");
+	}
+
+	private Boolean isOtherTopics(NamedList<Object> cluster) {
+	  return (Boolean)cluster.get("other-topics");
+	}
+
+	@SuppressWarnings("unchecked")
+	private List<Object> getDocs(NamedList<Object> cluster) {
+		return (List<Object>) cluster.get("docs");
+	}
 }

Modified: lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java (original)
+++ lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java Mon May 23 16:49:59 2011
@@ -49,6 +49,11 @@ public class MockClusteringAlgorithm ext
   @IntRange(min = 1, max = 5)
   private int labels = 1;
 
+  @Input
+  @Processing
+  @Attribute
+  private int otherTopicsModulo = 0;
+
   @Override
   public void process() throws ProcessingException {
     clusters = Lists.newArrayList();
@@ -59,21 +64,26 @@ public class MockClusteringAlgorithm ext
     int documentIndex = 1;
     for (Document document : documents) {
       StringBuilder label = new StringBuilder("Cluster " + documentIndex);
-      Cluster cluster = createCluster(label.toString(), document);
+      Cluster cluster = createCluster(label.toString(), documentIndex, document);
       clusters.add(cluster);
       for (int i = 1; i <= depth; i++) {
         label.append(".");
         label.append(i);
-        Cluster newCluster = createCluster(label.toString(), document);
-        cluster.addSubclusters(createCluster(label.toString(), document), newCluster);
+        Cluster newCluster = createCluster(label.toString(), documentIndex, document);
+        cluster.addSubclusters(createCluster(label.toString(), documentIndex, document), newCluster);
         cluster = newCluster;
       }
       documentIndex++;
     }
   }
 
-  private Cluster createCluster(String labelBase, Document... documents) {
+  private Cluster createCluster(String labelBase, int documentIndex, Document... documents) {
     Cluster cluster = new Cluster();
+    cluster.setScore(documentIndex * 0.25);
+    if (otherTopicsModulo != 0 && documentIndex % otherTopicsModulo == 0)
+    {
+      cluster.setOtherTopics(true);
+    }
     for (int i = 0; i < labels; i++) {
       cluster.addPhrases(labelBase + "#" + (i + 1));
     }

Modified: lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml (original)
+++ lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml Mon May 23 16:49:59 2011
@@ -396,6 +396,15 @@
       <str name="name">mock</str>
       <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
     </lst>
+    <lst name="engine">
+      <str name="name">lexical-resource-check</str>
+      <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
+    </lst>
+    <lst name="engine">
+      <str name="name">lexical-resource-check-custom-resource-dir</str>
+      <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
+      <str name="carrot.lexicalResourcesDir">clustering/custom</str>
+    </lst>
   </searchComponent>
 
   <searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="doc-clustering">

Modified: lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt (original)
+++ lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt Mon May 23 16:49:59 2011
@@ -55,4 +55,5 @@ to
 was
 will
 with
+solrownstopword
 

Modified: lucene/dev/branches/flexscoring/solr/contrib/extraction/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/contrib/extraction/CHANGES.txt?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/contrib/extraction/CHANGES.txt (original)
+++ lucene/dev/branches/flexscoring/solr/contrib/extraction/CHANGES.txt Mon May 23 16:49:59 2011
@@ -22,7 +22,7 @@ to your Solr Home lib directory.  See ht
 
 Current Version: Tika 0.8 (released 11/07/2010)
 
-$Id:$
+$Id$
 
 ================== Release 4.0-dev ==================
 
@@ -30,7 +30,8 @@ $Id:$
 
 ================== Release 3.2-dev ==================
 
-(No Changes)
+* SOLR-2480: Add ignoreTikaException flag so that users can ignore TikaException but index
+  meta data. (Shinichiro Abe, koji)
 
 ================== Release 3.1-dev ==================
 

Modified: lucene/dev/branches/flexscoring/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/dev/branches/flexscoring/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Mon May 23 16:49:59 2011
@@ -16,20 +16,27 @@
  */
 package org.apache.solr.handler.extraction;
 
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.Locale;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.params.UpdateParams;
 import org.apache.solr.common.util.ContentStream;
 import org.apache.solr.common.util.NamedList;
+import org.apache.solr.handler.ContentStreamLoader;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.response.SolrQueryResponse;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.update.AddUpdateCommand;
 import org.apache.solr.update.processor.UpdateRequestProcessor;
-import org.apache.solr.handler.ContentStreamLoader;
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
@@ -37,26 +44,24 @@ import org.apache.tika.sax.XHTMLContentH
 import org.apache.tika.sax.xpath.Matcher;
 import org.apache.tika.sax.xpath.MatchingContentHandler;
 import org.apache.tika.sax.xpath.XPathParser;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.mime.MediaType;
-import org.apache.xml.serialize.OutputFormat;
 import org.apache.xml.serialize.BaseMarkupSerializer;
-import org.apache.xml.serialize.XMLSerializer;
+import org.apache.xml.serialize.OutputFormat;
 import org.apache.xml.serialize.TextSerializer;
+import org.apache.xml.serialize.XMLSerializer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.util.Locale;
-
 
 /**
  * The class responsible for loading extracted content into Solr.
  *
  **/
 public class ExtractingDocumentLoader extends ContentStreamLoader {
+
+  private static final Logger log = LoggerFactory.getLogger(ExtractingDocumentLoader.class);
+
   /**
    * Extract Only supported format
    */
@@ -74,6 +79,7 @@ public class ExtractingDocumentLoader ex
   final IndexSchema schema;
   final SolrParams params;
   final UpdateRequestProcessor processor;
+  final boolean ignoreTikaException;
   protected AutoDetectParser autoDetectParser;
 
   private final AddUpdateCommand templateAdd;
@@ -95,6 +101,8 @@ public class ExtractingDocumentLoader ex
     //this is lightweight
     autoDetectParser = new AutoDetectParser(config);
     this.factory = factory;
+    
+    ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);
   }
 
 
@@ -180,9 +188,17 @@ public class ExtractingDocumentLoader ex
           parsingHandler = new MatchingContentHandler(handler, matcher);
         } //else leave it as is
 
-        //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
-        ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
-        parser.parse(inputStream, parsingHandler, metadata, context);
+        try{
+          //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
+          ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
+          parser.parse(inputStream, parsingHandler, metadata, context);
+        } catch (TikaException e) {
+          if(ignoreTikaException)
+            log.warn(new StringBuilder("skip extracting text due to ").append(e.getLocalizedMessage())
+                .append(". metadata=").append(metadata.toString()).toString());
+          else
+            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+        }
         if (extractOnly == false) {
           addDoc(handler);
         } else {
@@ -202,8 +218,6 @@ public class ExtractingDocumentLoader ex
         }
       } catch (SAXException e) {
         throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
-      } catch (TikaException e) {
-        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
       } finally {
         IOUtils.closeQuietly(inputStream);
       }

Modified: lucene/dev/branches/flexscoring/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java (original)
+++ lucene/dev/branches/flexscoring/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java Mon May 23 16:49:59 2011
@@ -28,6 +28,11 @@ public interface ExtractingParams {
    */
   public static final String LOWERNAMES = "lowernames";
 
+  /**
+   * if true, ignore TikaException (give up to extract text but index meta data)
+   */
+  public static final String IGNORE_TIKA_EXCEPTION = "ignoreTikaException";
+
 
   /**
    * The param prefix for mapping Tika metadata to Solr fields.

Modified: lucene/dev/branches/flexscoring/solr/example/solr/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/example/solr/conf/solrconfig.xml?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/example/solr/conf/solrconfig.xml (original)
+++ lucene/dev/branches/flexscoring/solr/example/solr/conf/solrconfig.xml Mon May 23 16:49:59 2011
@@ -1198,17 +1198,20 @@
     <lst name="engine">
       <!-- The name, only one can be named "default" -->
       <str name="name">default</str>
-      <!-- Class name of Carrot2 clustering algorithm. 
-           
+
+      <!-- Class name of Carrot2 clustering algorithm.
+
            Currently available algorithms are:
            
            * org.carrot2.clustering.lingo.LingoClusteringAlgorithm
            * org.carrot2.clustering.stc.STCClusteringAlgorithm
+           * org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm
            
            See http://project.carrot2.org/algorithms.html for the
            algorithm's characteristics.
         -->
       <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
+
       <!-- Overriding values for Carrot2 default algorithm attributes.
 
            For a description of all available attributes, see:
@@ -1219,9 +1222,22 @@
            name and attribute value as parameter value.
         -->
       <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
-      
+
+      <!-- Location of Carrot2 lexical resources.
+
+           A directory from which to load Carrot2-specific stop words
+           and stop labels. Absolute or relative to Solr config directory.
+           If a specific resource (e.g. stopwords.en) is present in the
+           specified dir, it will completely override the corresponding
+           default one that ships with Carrot2.
+
+           For an overview of Carrot2 lexical resources, see:
+           http://download.carrot2.org/head/manual/#chapter.lexical-resources
+        -->
+      <str name="carrot.lexicalResourcesDir">clustering/carrot2</str>
+
       <!-- The language to assume for the documents.
-           
+
            For a list of allowed values, see:
            http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage
        -->
@@ -1360,7 +1376,6 @@
 
       <!-- Configure the standard encoder -->
       <encoder name="html" 
-               default="true"
                class="solr.highlight.HtmlEncoder" />
 
       <!-- Configure the standard fragListBuilder -->

Modified: lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/core/SolrConfig.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/core/SolrConfig.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/core/SolrConfig.java (original)
+++ lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/core/SolrConfig.java Mon May 23 16:49:59 2011
@@ -57,7 +57,6 @@ import java.util.regex.Pattern;
 import java.util.regex.Matcher;
 import java.io.FileFilter;
 import java.io.IOException;
-import java.io.InputStream;
 
 
 /**
@@ -130,12 +129,12 @@ public class SolrConfig extends Config {
   throws ParserConfigurationException, IOException, SAXException {
     super(loader, name, is, "/config/");
     initLibs();
+    luceneMatchVersion = getLuceneVersion("luceneMatchVersion");
     defaultIndexConfig = new SolrIndexConfig(this, null, null);
     mainIndexConfig = new SolrIndexConfig(this, "mainIndex", defaultIndexConfig);
     reopenReaders = getBool("mainIndex/reopenReaders", true);
     
     booleanQueryMaxClauseCount = getInt("query/maxBooleanClauses", BooleanQuery.getMaxClauseCount());
-    luceneMatchVersion = getLuceneVersion("luceneMatchVersion");
     log.info("Using Lucene MatchVersion: " + luceneMatchVersion);
 
     filtOptEnabled = getBool("query/boolTofilterOptimizer/@enabled", false);

Modified: lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/request/SimpleFacets.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/request/SimpleFacets.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/request/SimpleFacets.java (original)
+++ lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/request/SimpleFacets.java Mon May 23 16:49:59 2011
@@ -656,7 +656,6 @@ public class SimpleFacets {
       }
     }
 
-    Term template = new Term(field);
     DocsEnum docsEnum = null;
     CharArr spare = new CharArr();
 
@@ -676,10 +675,6 @@ public class SimpleFacets {
 
           if (df >= minDfFilterCache) {
             // use the filter cache
-            // TODO: need a term query that takes a BytesRef to handle binary terms
-            spare.reset();
-            ByteUtils.UTF8toUTF16(term, spare);
-            Term t = template.createTerm(spare.toString());
 
             if (deState==null) {
               deState = new SolrIndexSearcher.DocsEnumState();

Modified: lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/response/JSONResponseWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/response/JSONResponseWriter.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/response/JSONResponseWriter.java (original)
+++ lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/response/JSONResponseWriter.java Mon May 23 16:49:59 2011
@@ -442,7 +442,7 @@ class JSONWriter extends TextResponseWri
 
       for (int i=0; i<val.length(); i++) {
         char ch = val.charAt(i);
-        if ((ch > '#' && ch != '\\' && ch !=  '\u2028') || ch==' ') { // fast path
+        if ((ch > '#' && ch != '\\' && ch < '\u2028') || ch == ' ') { // fast path
           writer.write(ch);
           continue;
         }
@@ -457,7 +457,10 @@ class JSONWriter extends TextResponseWri
           case '\t': writer.write('\\'); writer.write('t'); break;
           case '\b': writer.write('\\'); writer.write('b'); break;
           case '\f': writer.write('\\'); writer.write('f'); break;
-          case '\u2028': unicodeEscape(writer,ch); break;
+          case '\u2028': // fallthrough
+          case '\u2029':
+            unicodeEscape(writer,ch);
+            break;
           // case '/':
           default: {
             if (ch <= 0x1F) {

Modified: lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/response/XMLWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/response/XMLWriter.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/response/XMLWriter.java (original)
+++ lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/response/XMLWriter.java Mon May 23 16:49:59 2011
@@ -19,9 +19,7 @@ package org.apache.solr.response;
 
 import java.io.IOException;
 import java.io.Writer;
-import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.Collection;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;
@@ -32,11 +30,13 @@ import org.apache.solr.common.params.Com
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.XML;
 import org.apache.solr.request.SolrQueryRequest;
-import org.apache.solr.schema.SchemaField;
 import org.apache.solr.search.ReturnFields;
 
 
-public final class XMLWriter extends TextResponseWriter {
+/**
+ * @lucene.internal
+ */
+public class XMLWriter extends TextResponseWriter {
 
   public static float CURRENT_VERSION=2.2f;
 
@@ -54,13 +54,8 @@ public final class XMLWriter extends Tex
   
   private static final char[] XML_START2_NOSCHEMA=("<response>\n").toCharArray();
 
-  private boolean defaultIndent=false;
   final int version;
 
-  // temporary working objects...
-  // be careful not to use these recursively...
-  private final ArrayList tlst = new ArrayList();
-
   public static void writeResponse(Writer writer, SolrQueryRequest req, SolrQueryResponse rsp) throws IOException {
     XMLWriter xmlWriter = null;
     try {
@@ -106,7 +101,7 @@ public final class XMLWriter extends Tex
     writer.write(XML_START2_NOSCHEMA);
 
     // dump response values
-    NamedList lst = rsp.getValues();
+    NamedList<?> lst = rsp.getValues();
     Boolean omitHeader = req.getParams().getBool(CommonParams.OMIT_HEADER);
     if(omitHeader != null && omitHeader) lst.remove("responseHeader");
     int sz = lst.size();

Modified: lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/search/JoinQParserPlugin.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/search/JoinQParserPlugin.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/search/JoinQParserPlugin.java (original)
+++ lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/search/JoinQParserPlugin.java Mon May 23 16:49:59 2011
@@ -457,7 +457,7 @@ class JoinQuery extends Query {
         return resultList.get(0);
       }
 
-      int sz = resultList.size();
+      int sz = 0;
 
       for (DocSet set : resultList)
         sz += set.size();

Modified: lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/search/SolrIndexSearcher.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/search/SolrIndexSearcher.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/search/SolrIndexSearcher.java (original)
+++ lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/search/SolrIndexSearcher.java Mon May 23 16:49:59 2011
@@ -811,7 +811,7 @@ public class SolrIndexSearcher extends I
       bitsSet += upto;
       result = new BitDocSet(obs, bitsSet);
     } else {
-      result = new SortedIntDocSet(Arrays.copyOf(docs, upto));
+      result = upto==0 ? DocSet.EMPTY : new SortedIntDocSet(Arrays.copyOf(docs, upto));
     }
 
     if (useCache) {

Modified: lucene/dev/branches/flexscoring/solr/src/test/org/apache/solr/TestJoin.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/src/test/org/apache/solr/TestJoin.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/src/test/org/apache/solr/TestJoin.java (original)
+++ lucene/dev/branches/flexscoring/solr/src/test/org/apache/solr/TestJoin.java Mon May 23 16:49:59 2011
@@ -101,6 +101,14 @@ public class TestJoin extends SolrTestCa
     int indexIter=50 * RANDOM_MULTIPLIER;
     int queryIter=50 * RANDOM_MULTIPLIER;
 
+    // groups of fields that have any chance of matching... used to
+    // increase test effectiveness by avoiding 0 resultsets much of the time.
+    String[][] compat = new String[][] {
+        {"small_s","small2_s","small2_ss","small3_ss"},
+        {"small_i","small2_i","small2_is","small3_is"}
+    };
+
+
     while (--indexIter >= 0) {
       int indexSize = random.nextInt(20 * RANDOM_MULTIPLIER);
 
@@ -121,8 +129,19 @@ public class TestJoin extends SolrTestCa
       Map<String, Map<Comparable, Set<Comparable>>> pivots = new HashMap<String, Map<Comparable, Set<Comparable>>>();
 
       for (int qiter=0; qiter<queryIter; qiter++) {
-        String fromField = types.get(random.nextInt(types.size())).fname;
-        String toField = types.get(random.nextInt(types.size())).fname;
+        String fromField;
+        String toField;
+        if (random.nextInt(100) < 5) {
+          // pick random fields 5% of the time
+          fromField = types.get(random.nextInt(types.size())).fname;
+          // pick the same field 50% of the time we pick a random field (since other fields won't match anything)
+          toField = (random.nextInt(100) < 50) ? fromField : types.get(random.nextInt(types.size())).fname;
+        } else {
+          // otherwise, pick compatible fields that have a chance of matching indexed tokens
+          String[] group = compat[random.nextInt(compat.length)];
+          fromField = group[random.nextInt(group.length)];
+          toField = group[random.nextInt(group.length)];
+        }
 
         Map<Comparable, Set<Comparable>> pivot = pivots.get(fromField+"/"+toField);
         if (pivot == null) {
@@ -146,7 +165,7 @@ public class TestJoin extends SolrTestCa
         resultSet.put("start", 0);
         resultSet.put("docs", sortedDocs);
 
-        // todo: use filters
+        // todo: use different join queries for better coverage
 
         SolrQueryRequest req = req("wt","json","indent","true", "echoParams","all",
             "q","{!join from="+fromField+" to="+toField
@@ -159,7 +178,7 @@ public class TestJoin extends SolrTestCa
         Object realResponse = ObjectBuilder.fromJSON(strResponse);
         String err = JSONTestUtil.matchObj("/response", realResponse, resultSet);
         if (err != null) {
-          log.error("GROUPING MISMATCH: " + err
+          log.error("JOIN MISMATCH: " + err
            + "\n\trequest="+req
            + "\n\tresult="+strResponse
            + "\n\texpected="+ JSONUtil.toJSON(resultSet)