You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/05/23 18:50:04 UTC
svn commit: r1126578 [3/4] - in /lucene/dev/branches/flexscoring: ./
dev-tools/eclipse/ dev-tools/idea/ dev-tools/idea/.idea/
dev-tools/idea/lucene/ dev-tools/idea/lucene/contrib/ant/
dev-tools/idea/lucene/contrib/db/bdb-je/ dev-tools/idea/lucene/contr...
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TestTeeSinkTokenFilter.java Mon May 23 16:49:59 2011
@@ -84,7 +84,7 @@ public class TestTeeSinkTokenFilter exte
// with BaseTokenStreamTestCase now...
public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
Directory dir = newDirectory();
- Analyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
+ Analyzer analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false);
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
Document doc = new Document();
TeeSinkTokenFilter tee = new TeeSinkTokenFilter(analyzer.tokenStream("field", new StringReader("abcd ")));
@@ -108,7 +108,7 @@ public class TestTeeSinkTokenFilter exte
}
public void testGeneral() throws IOException {
- final TeeSinkTokenFilter source = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer1.toString())));
+ final TeeSinkTokenFilter source = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.toString()), MockTokenizer.WHITESPACE, false));
final TokenStream sink1 = source.newSinkTokenStream();
final TokenStream sink2 = source.newSinkTokenStream(theFilter);
@@ -122,16 +122,17 @@ public class TestTeeSinkTokenFilter exte
}
public void testMultipleSources() throws Exception {
- final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer1.toString())));
+ final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer1.toString()), MockTokenizer.WHITESPACE, false));
final TeeSinkTokenFilter.SinkTokenStream dogDetector = tee1.newSinkTokenStream(dogFilter);
final TeeSinkTokenFilter.SinkTokenStream theDetector = tee1.newSinkTokenStream(theFilter);
+ tee1.reset();
final TokenStream source1 = new CachingTokenFilter(tee1);
tee1.addAttribute(CheckClearAttributesAttribute.class);
dogDetector.addAttribute(CheckClearAttributesAttribute.class);
theDetector.addAttribute(CheckClearAttributesAttribute.class);
- final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(buffer2.toString())));
+ final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(buffer2.toString()), MockTokenizer.WHITESPACE, false));
tee2.addSinkTokenStream(dogDetector);
tee2.addSinkTokenStream(theDetector);
final TokenStream source2 = tee2;
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TokenRangeSinkTokenizerTest.java Mon May 23 16:49:59 2011
@@ -20,14 +20,14 @@ import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.MockTokenizer;
public class TokenRangeSinkTokenizerTest extends BaseTokenStreamTestCase {
public void test() throws IOException {
TokenRangeSinkFilter sinkFilter = new TokenRangeSinkFilter(2, 4);
String test = "The quick red fox jumped over the lazy brown dogs";
- TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)));
+ TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false));
TeeSinkTokenFilter.SinkTokenStream rangeToks = tee.newSinkTokenStream(sinkFilter);
int count = 0;
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sinks/TokenTypeSinkTokenizerTest.java Mon May 23 16:49:59 2011
@@ -20,9 +20,9 @@ import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
@@ -32,7 +32,7 @@ public class TokenTypeSinkTokenizerTest
TokenTypeSinkFilter sinkFilter = new TokenTypeSinkFilter("D");
String test = "The quick red fox jumped over the lazy brown dogs";
- TeeSinkTokenFilter ttf = new TeeSinkTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test))));
+ TeeSinkTokenFilter ttf = new TeeSinkTokenFilter(new WordTokenFilter(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false)));
TeeSinkTokenFilter.SinkTokenStream sink = ttf.newSinkTokenStream(sinkFilter);
boolean seenDogs = false;
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilter.java Mon May 23 16:49:59 2011
@@ -22,8 +22,8 @@ import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
@@ -36,7 +36,7 @@ public class TestSwedishLightStemFilter
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
- Tokenizer source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
+ Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(source, new SwedishLightStemFilter(source));
}
};
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilter.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilter.java Mon May 23 16:49:59 2011
@@ -25,6 +25,7 @@ import java.util.Collection;
import java.util.List;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@@ -43,14 +44,14 @@ public class TestSynonymFilter extends B
static void assertTokenizesTo(SynonymMap dict, String input,
String expected[]) throws IOException {
- Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+ Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected);
}
static void assertTokenizesTo(SynonymMap dict, String input,
String expected[], int posIncs[]) throws IOException {
- Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
+ Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected, posIncs);
}
Modified: lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java (original)
+++ lucene/dev/branches/flexscoring/modules/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilter.java Mon May 23 16:49:59 2011
@@ -20,8 +20,8 @@ package org.apache.lucene.analysis.tr;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
* Test the Turkish lowercase filter.
@@ -32,8 +32,8 @@ public class TestTurkishLowerCaseFilter
* Test composed forms
*/
public void testTurkishLowerCaseFilter() throws Exception {
- TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
- "\u0130STANBUL \u0130ZM\u0130R ISPARTA"));
+ TokenStream stream = new MockTokenizer(new StringReader(
+ "\u0130STANBUL \u0130ZM\u0130R ISPARTA"), MockTokenizer.WHITESPACE, false);
TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
assertTokenStreamContents(filter, new String[] {"istanbul", "izmir",
"\u0131sparta",});
@@ -43,8 +43,8 @@ public class TestTurkishLowerCaseFilter
* Test decomposed forms
*/
public void testDecomposed() throws Exception {
- TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
- "\u0049\u0307STANBUL \u0049\u0307ZM\u0049\u0307R ISPARTA"));
+ TokenStream stream = new MockTokenizer(new StringReader(
+ "\u0049\u0307STANBUL \u0049\u0307ZM\u0049\u0307R ISPARTA"), MockTokenizer.WHITESPACE, false);
TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
assertTokenStreamContents(filter, new String[] {"istanbul", "izmir",
"\u0131sparta",});
@@ -56,8 +56,8 @@ public class TestTurkishLowerCaseFilter
* to U+0130 + U+0316, and is lowercased the same way.
*/
public void testDecomposed2() throws Exception {
- TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
- "\u0049\u0316\u0307STANBUL \u0049\u0307ZM\u0049\u0307R I\u0316SPARTA"));
+ TokenStream stream = new MockTokenizer(new StringReader(
+ "\u0049\u0316\u0307STANBUL \u0049\u0307ZM\u0049\u0307R I\u0316SPARTA"), MockTokenizer.WHITESPACE, false);
TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
assertTokenStreamContents(filter, new String[] {"i\u0316stanbul", "izmir",
"\u0131\u0316sparta",});
Modified: lucene/dev/branches/flexscoring/modules/grouping/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/grouping/build.xml?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/grouping/build.xml (original)
+++ lucene/dev/branches/flexscoring/modules/grouping/build.xml Mon May 23 16:49:59 2011
@@ -10,4 +10,6 @@
<import file="../../lucene/contrib/contrib-build.xml"/>
<property name="working.dir" location="work"/>
+
+ <target name="dist-maven" depends="jar-core,javadocs,contrib-build.dist-maven" />
</project>
Modified: lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java (original)
+++ lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java Mon May 23 16:49:59 2011
@@ -34,9 +34,12 @@ import org.apache.lucene.search.SortFiel
import org.apache.lucene.util.BytesRef;
/** FirstPassGroupingCollector is the first of two passes necessary
- * to collected grouped hits. This pass gathers the top N sorted
+ * to collect grouped hits. This pass gathers the top N sorted
* groups.
*
+ * <p>See {@link org.apache.lucene.search.grouping} for more
+ * details including a full code example.</p>
+ *
* @lucene.experimental
*/
@@ -229,7 +232,9 @@ public class FirstPassGroupingCollector
// We already tested that the document is competitive, so replace
// the bottom group with this new group.
- final CollectedSearchGroup bottomGroup = orderedGroups.pollLast();
+ // java 6-only: final CollectedSearchGroup bottomGroup = orderedGroups.pollLast();
+ final CollectedSearchGroup bottomGroup = orderedGroups.last();
+ orderedGroups.remove(bottomGroup);
assert orderedGroups.size() == topNGroups -1;
groupMap.remove(bottomGroup.groupValue);
Modified: lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/SearchGroup.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/SearchGroup.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/SearchGroup.java (original)
+++ lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/SearchGroup.java Mon May 23 16:49:59 2011
@@ -20,7 +20,7 @@ package org.apache.lucene.search.groupin
import org.apache.lucene.util.BytesRef;
/** @lucene.experimental */
-class SearchGroup {
+public class SearchGroup {
public BytesRef groupValue;
public Comparable[] sortValues;
}
Modified: lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/SecondPassGroupingCollector.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/SecondPassGroupingCollector.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/SecondPassGroupingCollector.java (original)
+++ lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/SecondPassGroupingCollector.java Mon May 23 16:49:59 2011
@@ -33,7 +33,14 @@ import org.apache.lucene.search.TopScore
import org.apache.lucene.util.BytesRef;
/**
- * See {@link FirstPassGroupingCollector}.
+ * SecondPassGroupingCollector is the second of two passes
+ * necessary to collect grouped docs. This pass gathers the
+ * top N documents per top group computed from the
+ * first pass.
+ *
+ * <p>See {@link org.apache.lucene.search.grouping} for more
+ * details including a full code example.</p>
+ *
* @lucene.experimental
*/
public class SecondPassGroupingCollector extends Collector {
Modified: lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java (original)
+++ lucene/dev/branches/flexscoring/modules/grouping/src/java/org/apache/lucene/search/grouping/TopGroups.java Mon May 23 16:49:59 2011
@@ -21,9 +21,6 @@ import org.apache.lucene.search.SortFiel
/** Represents result returned by a grouping search.
*
- * Note that we do not return the total number of unique
- * groups; doing so would be costly.
- *
* @lucene.experimental */
public class TopGroups {
/** Number of documents matching the search */
@@ -32,6 +29,9 @@ public class TopGroups {
/** Number of documents grouped into the topN groups */
public final int totalGroupedHitCount;
+ /** The total number of unique groups. If <code>null</code> this value is not computed. */
+ public final Integer totalGroupCount;
+
/** Group results in groupSort order */
public final GroupDocs[] groups;
@@ -47,5 +47,15 @@ public class TopGroups {
this.totalHitCount = totalHitCount;
this.totalGroupedHitCount = totalGroupedHitCount;
this.groups = groups;
+ this.totalGroupCount = null;
+ }
+
+ public TopGroups(TopGroups oldTopGroups, Integer totalGroupCount) {
+ this.groupSort = oldTopGroups.groupSort;
+ this.withinGroupSort = oldTopGroups.withinGroupSort;
+ this.totalHitCount = oldTopGroups.totalHitCount;
+ this.totalGroupedHitCount = oldTopGroups.totalGroupedHitCount;
+ this.groups = oldTopGroups.groups;
+ this.totalGroupCount = totalGroupCount;
}
}
Modified: lucene/dev/branches/flexscoring/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java (original)
+++ lucene/dev/branches/flexscoring/modules/grouping/src/test/org/apache/lucene/search/grouping/TestGrouping.java Mon May 23 16:49:59 2011
@@ -17,13 +17,7 @@
package org.apache.lucene.search.grouping;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.List;
+import java.util.*;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
@@ -32,14 +26,7 @@ import org.apache.lucene.document.Numeri
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
-import org.apache.lucene.search.Collector;
-import org.apache.lucene.search.FieldCache;
-import org.apache.lucene.search.FieldDoc;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.Sort;
-import org.apache.lucene.search.SortField;
-import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
@@ -120,7 +107,7 @@ public class TestGrouping extends Lucene
final SecondPassGroupingCollector c2 = new SecondPassGroupingCollector(groupField, c1.getTopGroups(0, true), groupSort, null, 5, true, false, true);
indexSearcher.search(new TermQuery(new Term("content", "random")), c2);
-
+
final TopGroups groups = c2.getTopGroups(0);
assertEquals(7, groups.totalHitCount);
@@ -218,11 +205,11 @@ public class TestGrouping extends Lucene
};
}
- private Comparable[] fillFields(GroupDoc d, Sort sort) {
+ private Comparable<?>[] fillFields(GroupDoc d, Sort sort) {
final SortField[] sortFields = sort.getSort();
- final Comparable[] fields = new Comparable[sortFields.length];
+ final Comparable<?>[] fields = new Comparable[sortFields.length];
for(int fieldIDX=0;fieldIDX<sortFields.length;fieldIDX++) {
- final Comparable c;
+ final Comparable<?> c;
final SortField sf = sortFields[fieldIDX];
if (sf.getField().equals("sort1")) {
c = d.sort1;
@@ -242,6 +229,7 @@ public class TestGrouping extends Lucene
boolean fillFields,
boolean getScores,
boolean getMaxScores,
+ boolean doAllGroups,
Sort groupSort,
Sort docSort,
int topNGroups,
@@ -254,9 +242,10 @@ public class TestGrouping extends Lucene
Arrays.sort(groupDocs, groupSortComp);
final HashMap<BytesRef,List<GroupDoc>> groups = new HashMap<BytesRef,List<GroupDoc>>();
final List<BytesRef> sortedGroups = new ArrayList<BytesRef>();
- final List<Comparable[]> sortedGroupFields = new ArrayList<Comparable[]>();
+ final List<Comparable<?>[]> sortedGroupFields = new ArrayList<Comparable<?>[]>();
int totalHitCount = 0;
+ Set<BytesRef> knownGroups = new HashSet<BytesRef>();
for(GroupDoc d : groupDocs) {
// TODO: would be better to filter by searchTerm before sorting!
@@ -264,6 +253,13 @@ public class TestGrouping extends Lucene
continue;
}
totalHitCount++;
+
+ if (doAllGroups) {
+ if (!knownGroups.contains(d.group)) {
+ knownGroups.add(d.group);
+ }
+ }
+
List<GroupDoc> l = groups.get(d.group);
if (l == null) {
sortedGroups.add(d.group);
@@ -316,7 +312,14 @@ public class TestGrouping extends Lucene
fillFields ? sortedGroupFields.get(idx) : null);
}
- return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result);
+ if (doAllGroups) {
+ return new TopGroups(
+ new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result),
+ knownGroups.size()
+ );
+ } else {
+ return new TopGroups(groupSort.getSort(), docSort.getSort(), totalHitCount, totalGroupedHitCount, result);
+ }
}
public void testRandom() throws Exception {
@@ -334,7 +337,7 @@ public class TestGrouping extends Lucene
if (VERBOSE) {
System.out.println("TEST: numDocs=" + numDocs + " numGroups=" + numGroups);
}
-
+
final List<BytesRef> groups = new ArrayList<BytesRef>();
for(int i=0;i<numGroups;i++) {
groups.add(new BytesRef(_TestUtil.randomRealisticUnicodeString(random)));
@@ -427,25 +430,69 @@ public class TestGrouping extends Lucene
//final int docOffset = 0;
final boolean doCache = random.nextBoolean();
+ final boolean doAllGroups = random.nextBoolean();
if (VERBOSE) {
- System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup);
+ System.out.println("TEST: groupSort=" + groupSort + " docSort=" + docSort + " searchTerm=" + searchTerm + " topNGroups=" + topNGroups + " groupOffset=" + groupOffset + " docOffset=" + docOffset + " doCache=" + doCache + " docsPerGroup=" + docsPerGroup + " doAllGroups=" + doAllGroups);
+ }
+
+ final AllGroupsCollector allGroupsCollector;
+ if (doAllGroups) {
+ allGroupsCollector = new AllGroupsCollector("group");
+ } else {
+ allGroupsCollector = null;
}
final FirstPassGroupingCollector c1 = new FirstPassGroupingCollector("group", groupSort, groupOffset+topNGroups);
final CachingCollector cCache;
final Collector c;
+
+ final boolean useWrappingCollector = random.nextBoolean();
+
if (doCache) {
final double maxCacheMB = random.nextDouble();
if (VERBOSE) {
System.out.println("TEST: maxCacheMB=" + maxCacheMB);
}
- c = cCache = new CachingCollector(c1, true, maxCacheMB);
+
+ if (useWrappingCollector) {
+ if (doAllGroups) {
+ cCache = CachingCollector.create(c1, true, maxCacheMB);
+ c = MultiCollector.wrap(cCache, allGroupsCollector);
+ } else {
+ c = cCache = CachingCollector.create(c1, true, maxCacheMB);
+ }
+ } else {
+ // Collect only into cache, then replay multiple times:
+ c = cCache = CachingCollector.create(false, true, maxCacheMB);
+ }
} else {
- c = c1;
cCache = null;
+ if (doAllGroups) {
+ c = MultiCollector.wrap(c1, allGroupsCollector);
+ } else {
+ c = c1;
+ }
}
+
s.search(new TermQuery(new Term("content", searchTerm)), c);
+ if (doCache && !useWrappingCollector) {
+ if (cCache.isCached()) {
+ // Replay for first-pass grouping
+ cCache.replay(c1);
+ if (doAllGroups) {
+ // Replay for all groups:
+ cCache.replay(allGroupsCollector);
+ }
+ } else {
+ // Replay by re-running search:
+ s.search(new TermQuery(new Term("content", searchTerm)), c1);
+ if (doAllGroups) {
+ s.search(new TermQuery(new Term("content", searchTerm)), allGroupsCollector);
+ }
+ }
+ }
+
final Collection<SearchGroup> topGroups = c1.getTopGroups(groupOffset, fillFields);
final TopGroups groupsResult;
@@ -474,8 +521,13 @@ public class TestGrouping extends Lucene
} else {
s.search(new TermQuery(new Term("content", searchTerm)), c2);
}
-
- groupsResult = c2.getTopGroups(docOffset);
+
+ if (doAllGroups) {
+ TopGroups tempTopGroups = c2.getTopGroups(docOffset);
+ groupsResult = new TopGroups(tempTopGroups, allGroupsCollector.getGroupCount());
+ } else {
+ groupsResult = c2.getTopGroups(docOffset);
+ }
} else {
groupsResult = null;
if (VERBOSE) {
@@ -483,7 +535,7 @@ public class TestGrouping extends Lucene
}
}
- final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);
+ final TopGroups expectedGroups = slowGrouping(groupDocs, searchTerm, fillFields, getScores, getMaxScores, doAllGroups, groupSort, docSort, topNGroups, docsPerGroup, groupOffset, docOffset);
try {
// NOTE: intentional but temporary field cache insanity!
@@ -508,7 +560,10 @@ public class TestGrouping extends Lucene
assertEquals(expected.groups.length, actual.groups.length);
assertEquals(expected.totalHitCount, actual.totalHitCount);
assertEquals(expected.totalGroupedHitCount, actual.totalGroupedHitCount);
-
+ if (expected.totalGroupCount != null) {
+ assertEquals(expected.totalGroupCount, actual.totalGroupCount);
+ }
+
for(int groupIDX=0;groupIDX<expected.groups.length;groupIDX++) {
if (VERBOSE) {
System.out.println(" check groupIDX=" + groupIDX);
@@ -516,7 +571,7 @@ public class TestGrouping extends Lucene
final GroupDocs expectedGroup = expected.groups[groupIDX];
final GroupDocs actualGroup = actual.groups[groupIDX];
assertEquals(expectedGroup.groupValue, actualGroup.groupValue);
- assertEquals(expectedGroup.groupSortValues, actualGroup.groupSortValues);
+ assertArrayEquals(expectedGroup.groupSortValues, actualGroup.groupSortValues);
// TODO
// assertEquals(expectedGroup.maxScore, actualGroup.maxScore);
@@ -532,7 +587,7 @@ public class TestGrouping extends Lucene
assertEquals(expectedFD.doc, docIDtoID[actualFD.doc]);
// TODO
// assertEquals(expectedFD.score, actualFD.score);
- assertEquals(expectedFD.fields, actualFD.fields);
+ assertArrayEquals(expectedFD.fields, actualFD.fields);
}
}
}
Modified: lucene/dev/branches/flexscoring/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/CHANGES.txt?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/CHANGES.txt (original)
+++ lucene/dev/branches/flexscoring/solr/CHANGES.txt Mon May 23 16:49:59 2011
@@ -26,7 +26,7 @@ Versions of Major Components
---------------------
Apache Lucene trunk
Apache Tika 0.8
-Carrot2 3.4.2
+Carrot2 3.5.0
Velocity 1.6.4 and Velocity Tools 2.0
Apache UIMA 2.3.1-SNAPSHOT
@@ -329,6 +329,10 @@ Bug Fixes
* SOLR-2495: The JSON parser could hang on corrupted input and could fail
to detect numbers that were too large to fit in a long. (yonik)
+* SOLR-2520: Make JSON response format escape \u2029 as well as \u2028
+ in strings since those characters are not valid in javascript strings
+ (although they are valid in JSON strings). (yonik)
+
Other Changes
----------------------
@@ -336,6 +340,9 @@ Other Changes
* SOLR-2105: Rename RequestHandler param 'update.processor' to 'update.chain'.
(Jan Høydahl via Mark Miller)
+* SOLR-2528: Remove default="true" from HtmlEncoder in example solrconfig.xml,
+ because html encoding confuses non-ascii users. (koji)
+
Build
----------------------
Modified: lucene/dev/branches/flexscoring/solr/contrib/clustering/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/contrib/clustering/CHANGES.txt?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/contrib/clustering/CHANGES.txt (original)
+++ lucene/dev/branches/flexscoring/solr/contrib/clustering/CHANGES.txt Mon May 23 16:49:59 2011
@@ -9,11 +9,19 @@ CHANGES
$Id$
================== Release 4.0.0-dev ==================
-(No Changes)
+* SOLR-2448: Search results clustering updates: bisecting k-means
+ clustering algorithm added, loading of Carrot2 stop words from
+ <solr.home>/conf/carrot2 (SOLR-2449), using Solr's stopwords.txt
+ for clustering (SOLR-2450), output of cluster scores (SOLR-2505)
+ (Stanislaw Osinski, Dawid Weiss).
================== Release 3.2.0-dev ==================
-(No Changes)
+* SOLR-2448: Search results clustering updates: bisecting k-means
+ clustering algorithm added, loading of Carrot2 stop words from
+ <solr.home>/conf/carrot2 (SOLR-2449), using Solr's stopwords.txt
+ for clustering (SOLR-2450), output of cluster scores (SOLR-2505)
+ (Stanislaw Osinski, Dawid Weiss).
================== Release 3.1.0-dev ==================
Modified: lucene/dev/branches/flexscoring/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java (original)
+++ lucene/dev/branches/flexscoring/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java Mon May 23 16:49:59 2011
@@ -18,9 +18,11 @@ package org.apache.solr.handler.clusteri
*/
import java.io.IOException;
+import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -37,6 +39,7 @@ import org.apache.solr.common.params.Sol
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrCore;
+import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.handler.clustering.SearchClusteringEngine;
import org.apache.solr.handler.component.HighlightComponent;
import org.apache.solr.highlight.SolrHighlighter;
@@ -52,9 +55,17 @@ import org.carrot2.core.ControllerFactor
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.attribute.AttributeNames;
+import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
+import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
+import org.carrot2.util.resource.ClassLoaderLocator;
+import org.carrot2.util.resource.IResource;
+import org.carrot2.util.resource.IResourceLocator;
+import org.carrot2.util.resource.ResourceLookup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
/**
@@ -64,19 +75,33 @@ import com.google.common.collect.Sets;
*
* @link http://project.carrot2.org
*/
-@SuppressWarnings("unchecked")
public class CarrotClusteringEngine extends SearchClusteringEngine {
- private transient static Logger log = LoggerFactory
+ private transient static Logger log = LoggerFactory
.getLogger(CarrotClusteringEngine.class);
+ /**
+ * The subdirectory in Solr config dir to read customized Carrot2 resources from.
+ */
+ private static final String CARROT_RESOURCES_PREFIX = "clustering/carrot2";
+
+ /**
+ * Name of Carrot2 document's field containing Solr document's identifier.
+ */
+ private static final String SOLR_DOCUMENT_ID = "solrId";
+
+ /**
+ * Name of Solr document's field containing the document's identifier. To avoid
+ * repeating the content of documents in clusters on output, each cluster contains
+ * identifiers of documents it contains.
+ */
+ private String idFieldName;
+
/**
* Carrot2 controller that manages instances of clustering algorithms
*/
private Controller controller = ControllerFactory.createPooling();
private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
- private String idFieldName;
-
@Override
@Deprecated
public Object cluster(Query query, DocList docList, SolrQueryRequest sreq) {
@@ -101,6 +126,10 @@ public class CarrotClusteringEngine exte
attributes.put(AttributeNames.DOCUMENTS, documents);
attributes.put(AttributeNames.QUERY, query.toString());
+ // Pass the fields on which clustering runs to the
+ // SolrStopwordsCarrot2LexicalDataFactory
+ attributes.put("solrFieldNames", getFieldsForClustering(sreq));
+
// Pass extra overriding attributes from the request, if any
extractCarrotAttributes(sreq.getParams(), attributes);
@@ -113,22 +142,68 @@ public class CarrotClusteringEngine exte
}
}
- @Override
+ @Override
+ @SuppressWarnings({ "unchecked", "rawtypes" })
public String init(NamedList config, final SolrCore core) {
String result = super.init(config, core);
- SolrParams initParams = SolrParams.toSolrParams(config);
+ final SolrParams initParams = SolrParams.toSolrParams(config);
// Initialize Carrot2 controller. Pass initialization attributes, if any.
HashMap<String, Object> initAttributes = new HashMap<String, Object>();
extractCarrotAttributes(initParams, initAttributes);
-
- // Customize the language model factory. The implementation we provide here
- // is included in the code base of Solr, so that it's possible to refactor
- // the Lucene APIs the factory relies on if needed.
- initAttributes.put("PreprocessingPipeline.languageModelFactory",
- LuceneLanguageModelFactory.class);
- this.controller.init(initAttributes);
+ // Customize the stemmer and tokenizer factories. The implementations we provide here
+ // are included in the code base of Solr, so that it's possible to refactor
+ // the Lucene APIs the factories rely on if needed.
+ // Additionally, we set a custom lexical resource factory for Carrot2 that
+ // will use both Carrot2 default stop words as well as stop words from
+ // the StopFilter defined on the field.
+ BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes)
+ .stemmerFactory(LuceneCarrot2StemmerFactory.class)
+ .tokenizerFactory(LuceneCarrot2TokenizerFactory.class)
+ .lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
+
+ // Pass the schema to SolrStopwordsCarrot2LexicalDataFactory.
+ initAttributes.put("solrIndexSchema", core.getSchema());
+
+ // Customize Carrot2's resource lookup to first look for resources
+ // using Solr's resource loader. If that fails, try loading from the classpath.
+ DefaultLexicalDataFactoryDescriptor.attributeBuilder(initAttributes)
+ .resourceLookup(new ResourceLookup(new IResourceLocator() {
+ @Override
+ public IResource[] getAll(final String resource) {
+ final SolrResourceLoader resourceLoader = core.getResourceLoader();
+ final String carrot2ResourcesDir = resourceLoader.getConfigDir()
+ + initParams.get(CarrotParams.LEXICAL_RESOURCES_DIR, CARROT_RESOURCES_PREFIX);
+ try {
+ log.debug("Looking for " + resource + " in "
+ + carrot2ResourcesDir);
+ final InputStream resourceStream = resourceLoader
+ .openResource(carrot2ResourcesDir + "/" + resource);
+
+ log.info(resource + " loaded from " + carrot2ResourcesDir);
+ final IResource foundResource = new IResource() {
+ @Override
+ public InputStream open() throws IOException {
+ return resourceStream;
+ }
+ };
+ return new IResource[] { foundResource };
+ } catch (RuntimeException e) {
+ // No way to distinguish if the resource was found but failed
+ // to load or wasn't found at all, so we simply fall back
+ // to Carrot2 defaults here by returning an empty locations array.
+ log.debug(resource + " not found in " + carrot2ResourcesDir
+ + ". Using the default " + resource + " from Carrot JAR.");
+ return new IResource[] {};
+ }
+ }
+ },
+
+ // Using the class loader directly because this time we want to omit the prefix
+ new ClassLoaderLocator(core.getResourceLoader().getClassLoader())));
+
+ this.controller.init(initAttributes);
this.idFieldName = core.getSchema().getUniqueKeyField().getName();
// Make sure the requested Carrot2 clustering algorithm class is available
@@ -148,17 +223,29 @@ public class CarrotClusteringEngine exte
protected Set<String> getFieldsToLoad(SolrQueryRequest sreq){
SolrParams solrParams = sreq.getParams();
- // Names of fields to deliver content for clustering
- String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
+ HashSet<String> fields = Sets.newHashSet(getFieldsForClustering(sreq));
+ fields.add(idFieldName);
+ fields.add(solrParams.get(CarrotParams.URL_FIELD_NAME, "url"));
+ return fields;
+ }
+
+ /**
+ * Returns the names of fields that will be delivering the actual
+ * content for clustering. Currently, there are two such fields: document
+ * title and document content.
+ */
+ private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
+ SolrParams solrParams = sreq.getParams();
+
String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
if (StringUtils.isBlank(snippetField)) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
+ " must not be blank.");
}
- return Sets.newHashSet(urlField, titleField, snippetField, idFieldName);
- }
-
+ return Sets.newHashSet(titleField, snippetField);
+ }
+
/**
* Prepares Carrot2 documents for clustering.
*/
@@ -180,7 +267,7 @@ public class CarrotClusteringEngine exte
if (produceSummary == true) {
highlighter = HighlightComponent.getHighlighter(core);
if (highlighter != null){
- Map args = new HashMap();
+ Map<String, Object> args = Maps.newHashMap();
snippetFieldAry = new String[]{snippetField};
args.put(HighlightParams.FIELDS, snippetFieldAry);
args.put(HighlightParams.HIGHLIGHT, "true");
@@ -214,11 +301,12 @@ public class CarrotClusteringEngine exte
if (produceSummary && docIds != null) {
docsHolder[0] = docIds.get(sdoc).intValue();
DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
- NamedList highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
+ NamedList<Object> highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
if (highlights != null && highlights.size() == 1) {//should only be one value given our setup
//should only be one document with one field
- NamedList tmp = (NamedList) highlights.getVal(0);
- String [] highlt = (String[]) tmp.get(snippetField);
+ @SuppressWarnings("unchecked")
+ NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
+ String [] highlt = tmp.get(snippetField);
if (highlt != null && highlt.length == 1) {
snippet = highlt[0];
}
@@ -226,27 +314,13 @@ public class CarrotClusteringEngine exte
}
Document carrotDocument = new Document(getValue(sdoc, titleField),
snippet, (String)sdoc.getFieldValue(urlField));
- carrotDocument.setField("solrId", sdoc.getFieldValue(idFieldName));
+ carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
result.add(carrotDocument);
}
return result;
}
- @Deprecated
- protected String getValue(org.apache.lucene.document.Document doc,
- String field) {
- StringBuilder result = new StringBuilder();
- String[] vals = doc.getValues(field);
- for (int i = 0; i < vals.length; i++) {
- // Join multiple values with a period so that Carrot2 does not pick up
- // phrases that cross field value boundaries (in most cases it would
- // create useless phrases).
- result.append(vals[i]).append(" . ");
- }
- return result.toString().trim();
- }
-
protected String getValue(SolrDocument sdoc, String field) {
StringBuilder result = new StringBuilder();
Collection<Object> vals = sdoc.getFieldValues(field);
@@ -261,9 +335,9 @@ public class CarrotClusteringEngine exte
return result.toString().trim();
}
- private List clustersToNamedList(List<Cluster> carrotClusters,
+ private List<NamedList<Object>> clustersToNamedList(List<Cluster> carrotClusters,
SolrParams solrParams) {
- List result = new ArrayList();
+ List<NamedList<Object>> result = Lists.newArrayList();
clustersToNamedList(carrotClusters, result, solrParams.getBool(
CarrotParams.OUTPUT_SUB_CLUSTERS, true), solrParams.getInt(
CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE));
@@ -271,25 +345,40 @@ public class CarrotClusteringEngine exte
}
private void clustersToNamedList(List<Cluster> outputClusters,
- List parent, boolean outputSubClusters, int maxLabels) {
+ List<NamedList<Object>> parent, boolean outputSubClusters, int maxLabels) {
for (Cluster outCluster : outputClusters) {
- NamedList cluster = new SimpleOrderedMap();
+ NamedList<Object> cluster = new SimpleOrderedMap<Object>();
parent.add(cluster);
+ // Add labels
List<String> labels = outCluster.getPhrases();
- if (labels.size() > maxLabels)
+ if (labels.size() > maxLabels) {
labels = labels.subList(0, maxLabels);
+ }
cluster.add("labels", labels);
+ // Add cluster score
+ final Double score = outCluster.getScore();
+ if (score != null) {
+ cluster.add("score", score);
+ }
+
+ // Add other topics marker
+ if (outCluster.isOtherTopics()) {
+ cluster.add("other-topics", outCluster.isOtherTopics());
+ }
+
+ // Add documents
List<Document> docs = outputSubClusters ? outCluster.getDocuments() : outCluster.getAllDocuments();
- List docList = new ArrayList();
+ List<Object> docList = Lists.newArrayList();
cluster.add("docs", docList);
for (Document doc : docs) {
- docList.add(doc.getField("solrId"));
+ docList.add(doc.getField(SOLR_DOCUMENT_ID));
}
- if (outputSubClusters) {
- List subclusters = new ArrayList();
+ // Add subclusters
+ if (outputSubClusters && !outCluster.getSubclusters().isEmpty()) {
+ List<NamedList<Object>> subclusters = Lists.newArrayList();
cluster.add("clusters", subclusters);
clustersToNamedList(outCluster.getSubclusters(), subclusters,
outputSubClusters, maxLabels);
Modified: lucene/dev/branches/flexscoring/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java (original)
+++ lucene/dev/branches/flexscoring/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java Mon May 23 16:49:59 2011
@@ -35,6 +35,8 @@ public interface CarrotParams {
String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
String SUMMARY_FRAGSIZE = CARROT_PREFIX + "fragzise";
+ String LEXICAL_RESOURCES_DIR = CARROT_PREFIX + "lexicalResourcesDir";
+
public static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of(
ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME,
PRODUCE_SUMMARY, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS, SUMMARY_FRAGSIZE);
Modified: lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java (original)
+++ lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java Mon May 23 16:49:59 2011
@@ -17,6 +17,11 @@ package org.apache.solr.handler.clusteri
* limitations under the License.
*/
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
import org.apache.lucene.index.Term;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
@@ -37,15 +42,11 @@ import org.apache.solr.util.SolrPluginUt
import org.carrot2.util.attribute.AttributeUtils;
import org.junit.Test;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import com.google.common.collect.ImmutableList;
/**
*
*/
-@SuppressWarnings("unchecked")
public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
@Test
public void testCarrotLingo() throws Exception {
@@ -74,7 +75,7 @@ public class CarrotClusteringEngineTest
@Test
public void testWithoutSubclusters() throws Exception {
- checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs),
+ checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs),
1, 1, 0);
}
@@ -82,7 +83,7 @@ public class CarrotClusteringEngineTest
public void testWithSubclusters() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(CarrotParams.OUTPUT_SUB_CLUSTERS, true);
- checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs), 1, 1, 2);
+ checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs), 1, 1, 2);
}
@Test
@@ -90,19 +91,107 @@ public class CarrotClusteringEngineTest
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 5);
params.set(CarrotParams.NUM_DESCRIPTIONS, 3);
- checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
+ checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
params), 1, 3, 0);
}
@Test
+ public void testClusterScores() throws Exception {
+ ModifiableSolrParams params = new ModifiableSolrParams();
+ params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
+ List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
+ AbstractClusteringTestCase.numberOfDocs, params);
+ int i = 1;
+ for (NamedList<Object> cluster : clusters) {
+ final Double score = getScore(cluster);
+ assertNotNull(score);
+ assertEquals(0.25 * i++, score, 0);
+ }
+ }
+
+ @Test
+ public void testOtherTopics() throws Exception {
+ ModifiableSolrParams params = new ModifiableSolrParams();
+ params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
+ params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "otherTopicsModulo"), 2);
+ List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
+ AbstractClusteringTestCase.numberOfDocs, params);
+ int i = 1;
+ for (NamedList<Object> cluster : clusters) {
+ assertEquals(i++ % 2 == 0 ? true : null, isOtherTopics(cluster));
+ }
+ }
+
+ @Test
public void testCarrotAttributePassing() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 3);
- checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
+ checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
params), 1, 3, 0);
}
+ @Test
+ public void testLexicalResourcesFromSolrConfigDefaultDir() throws Exception {
+ checkLexicalResourcesFromSolrConfig("lexical-resource-check",
+ "online,customsolrstopword,customsolrstoplabel");
+ }
+
+ @Test
+ public void testLexicalResourcesFromSolrConfigCustomDir() throws Exception {
+ checkLexicalResourcesFromSolrConfig("lexical-resource-check-custom-resource-dir",
+ "online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
+ }
+
+ private void checkLexicalResourcesFromSolrConfig(String engineName, String wordsToCheck)
+ throws IOException {
+ ModifiableSolrParams params = new ModifiableSolrParams();
+ params.set("merge-resources", false);
+ params.set(AttributeUtils.getKey(
+ LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+ wordsToCheck);
+
+ // "customsolrstopword" is in stopwords.en, "customsolrstoplabel" is in
+ // stoplabels.en, so we're expecting only one cluster with label "online".
+ final List<NamedList<Object>> clusters = checkEngine(
+ getClusteringEngine(engineName), 1, params);
+ assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
+ }
+
+ @Test
+ public void solrStopWordsUsedInCarrot2Clustering() throws Exception {
+ ModifiableSolrParams params = new ModifiableSolrParams();
+ params.set("merge-resources", false);
+ params.set(AttributeUtils.getKey(
+ LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+ "online,solrownstopword");
+
+ // "solrownstopword" is in stopwords.txt, so we're expecting
+ // only one cluster with label "online".
+ final List<NamedList<Object>> clusters = checkEngine(
+ getClusteringEngine("lexical-resource-check"), 1, params);
+ assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
+ }
+
+ @Test
+ public void solrStopWordsNotDefinedOnAFieldForClustering() throws Exception {
+ ModifiableSolrParams params = new ModifiableSolrParams();
+ // Force string fields to be used for clustering. Does not make sense
+ // in the real world, but does the job in the test.
+ params.set(CarrotParams.TITLE_FIELD_NAME, "url");
+ params.set(CarrotParams.SNIPPET_FIELD_NAME, "url");
+ params.set("merge-resources", false);
+ params.set(AttributeUtils.getKey(
+ LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+ "online,solrownstopword");
+
+ final List<NamedList<Object>> clusters = checkEngine(
+ getClusteringEngine("lexical-resource-check"), 2, params);
+ assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0)));
+ assertEquals(ImmutableList.of("solrownstopword"),
+ getLabels(clusters.get(1)));
+ }
+
private CarrotClusteringEngine getClusteringEngine(String engineName) {
ClusteringComponent comp = (ClusteringComponent) h.getCore()
.getSearchComponent("clustering");
@@ -114,18 +203,18 @@ public class CarrotClusteringEngineTest
return engine;
}
- private List checkEngine(CarrotClusteringEngine engine,
+ private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
int expectedNumClusters) throws IOException {
return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), new ModifiableSolrParams());
}
- private List checkEngine(CarrotClusteringEngine engine,
+ private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
int expectedNumClusters, SolrParams clusteringParams) throws IOException {
return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), clusteringParams);
}
- private List checkEngine(CarrotClusteringEngine engine, int expectedNumDocs,
+ private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine, int expectedNumDocs,
int expectedNumClusters, Query query, SolrParams clusteringParams) throws IOException {
// Get all documents to cluster
RefCounted<SolrIndexSearcher> ref = h.getCore().getSearcher();
@@ -145,7 +234,9 @@ public class CarrotClusteringEngineTest
LocalSolrQueryRequest req = new LocalSolrQueryRequest(h.getCore(), solrParams);
Map<SolrDocument,Integer> docIds = new HashMap<SolrDocument, Integer>(docList.size());
SolrDocumentList solrDocList = SolrPluginUtils.docListToSolrDocumentList( docList, searcher, engine.getFieldsToLoad(req), docIds );
- List results = (List)engine.cluster(query, solrDocList, docIds, req);
+
+ @SuppressWarnings("unchecked")
+ List<NamedList<Object>> results = (List<NamedList<Object>>) engine.cluster(query, solrDocList, docIds, req);
req.close();
assertEquals("number of clusters: " + results, expectedNumClusters, results.size());
checkClusters(results, false);
@@ -155,51 +246,74 @@ public class CarrotClusteringEngineTest
}
}
- private void checkClusters(List results, int expectedDocCount,
+ private void checkClusters(List<NamedList<Object>> results, int expectedDocCount,
int expectedLabelCount, int expectedSubclusterCount) {
for (int i = 0; i < results.size(); i++) {
- NamedList cluster = (NamedList) results.get(i);
+ NamedList<Object> cluster = results.get(i);
checkCluster(cluster, expectedDocCount, expectedLabelCount,
expectedSubclusterCount);
}
}
- private void checkClusters(List results, boolean hasSubclusters) {
+ private void checkClusters(List<NamedList<Object>> results, boolean hasSubclusters) {
for (int i = 0; i < results.size(); i++) {
- checkCluster((NamedList) results.get(i), hasSubclusters);
+ checkCluster(results.get(i), hasSubclusters);
}
}
- private void checkCluster(NamedList cluster, boolean hasSubclusters) {
- List docs = (List) cluster.get("docs");
+ private void checkCluster(NamedList<Object> cluster, boolean hasSubclusters) {
+ List<Object> docs = getDocs(cluster);
assertNotNull("docs is null and it shouldn't be", docs);
for (int j = 0; j < docs.size(); j++) {
String id = (String) docs.get(j);
assertNotNull("id is null and it shouldn't be", id);
}
- List labels = (List) cluster.get("labels");
+ List<String> labels = getLabels(cluster);
assertNotNull("labels is null but it shouldn't be", labels);
if (hasSubclusters) {
- List subclusters = (List) cluster.get("clusters");
+ List<NamedList<Object>> subclusters = getSubclusters(cluster);
assertNotNull("subclusters is null but it shouldn't be", subclusters);
}
}
- private void checkCluster(NamedList cluster, int expectedDocCount,
+ private void checkCluster(NamedList<Object> cluster, int expectedDocCount,
int expectedLabelCount, int expectedSubclusterCount) {
checkCluster(cluster, expectedSubclusterCount > 0);
assertEquals("number of docs in cluster", expectedDocCount,
- ((List) cluster.get("docs")).size());
+ getDocs(cluster).size());
assertEquals("number of labels in cluster", expectedLabelCount,
- ((List) cluster.get("labels")).size());
+ getLabels(cluster).size());
if (expectedSubclusterCount > 0) {
- List subclusters = (List) cluster.get("clusters");
+ List<NamedList<Object>> subclusters = getSubclusters(cluster);
assertEquals("numClusters", expectedSubclusterCount, subclusters.size());
assertEquals("number of subclusters in cluster",
expectedSubclusterCount, subclusters.size());
}
}
+
+ @SuppressWarnings("unchecked")
+ private List<NamedList<Object>> getSubclusters(NamedList<Object> cluster) {
+ return (List<NamedList<Object>>) cluster.get("clusters");
+ }
+
+ @SuppressWarnings("unchecked")
+ private List<String> getLabels(NamedList<Object> cluster) {
+ return (List<String>) cluster.get("labels");
+ }
+
+ private Double getScore(NamedList<Object> cluster) {
+ return (Double) cluster.get("score");
+ }
+
+ private Boolean isOtherTopics(NamedList<Object> cluster) {
+ return (Boolean)cluster.get("other-topics");
+ }
+
+ @SuppressWarnings("unchecked")
+ private List<Object> getDocs(NamedList<Object> cluster) {
+ return (List<Object>) cluster.get("docs");
+ }
}
Modified: lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java (original)
+++ lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java Mon May 23 16:49:59 2011
@@ -49,6 +49,11 @@ public class MockClusteringAlgorithm ext
@IntRange(min = 1, max = 5)
private int labels = 1;
+ @Input
+ @Processing
+ @Attribute
+ private int otherTopicsModulo = 0;
+
@Override
public void process() throws ProcessingException {
clusters = Lists.newArrayList();
@@ -59,21 +64,26 @@ public class MockClusteringAlgorithm ext
int documentIndex = 1;
for (Document document : documents) {
StringBuilder label = new StringBuilder("Cluster " + documentIndex);
- Cluster cluster = createCluster(label.toString(), document);
+ Cluster cluster = createCluster(label.toString(), documentIndex, document);
clusters.add(cluster);
for (int i = 1; i <= depth; i++) {
label.append(".");
label.append(i);
- Cluster newCluster = createCluster(label.toString(), document);
- cluster.addSubclusters(createCluster(label.toString(), document), newCluster);
+ Cluster newCluster = createCluster(label.toString(), documentIndex, document);
+ cluster.addSubclusters(createCluster(label.toString(), documentIndex, document), newCluster);
cluster = newCluster;
}
documentIndex++;
}
}
- private Cluster createCluster(String labelBase, Document... documents) {
+ private Cluster createCluster(String labelBase, int documentIndex, Document... documents) {
Cluster cluster = new Cluster();
+ cluster.setScore(documentIndex * 0.25);
+ if (otherTopicsModulo != 0 && documentIndex % otherTopicsModulo == 0)
+ {
+ cluster.setOtherTopics(true);
+ }
for (int i = 0; i < labels; i++) {
cluster.addPhrases(labelBase + "#" + (i + 1));
}
Modified: lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml (original)
+++ lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml Mon May 23 16:49:59 2011
@@ -396,6 +396,15 @@
<str name="name">mock</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
</lst>
+ <lst name="engine">
+ <str name="name">lexical-resource-check</str>
+ <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
+ </lst>
+ <lst name="engine">
+ <str name="name">lexical-resource-check-custom-resource-dir</str>
+ <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
+ <str name="carrot.lexicalResourcesDir">clustering/custom</str>
+ </lst>
</searchComponent>
<searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="doc-clustering">
Modified: lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt (original)
+++ lucene/dev/branches/flexscoring/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt Mon May 23 16:49:59 2011
@@ -55,4 +55,5 @@ to
was
will
with
+solrownstopword
Modified: lucene/dev/branches/flexscoring/solr/contrib/extraction/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/contrib/extraction/CHANGES.txt?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/contrib/extraction/CHANGES.txt (original)
+++ lucene/dev/branches/flexscoring/solr/contrib/extraction/CHANGES.txt Mon May 23 16:49:59 2011
@@ -22,7 +22,7 @@ to your Solr Home lib directory. See ht
Current Version: Tika 0.8 (released 11/07/2010)
-$Id:$
+$Id$
================== Release 4.0-dev ==================
@@ -30,7 +30,8 @@ $Id:$
================== Release 3.2-dev ==================
-(No Changes)
+* SOLR-2480: Add ignoreTikaException flag so that users can ignore TikaException but index
+ meta data. (Shinichiro Abe, koji)
================== Release 3.1-dev ==================
Modified: lucene/dev/branches/flexscoring/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/dev/branches/flexscoring/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Mon May 23 16:49:59 2011
@@ -16,20 +16,27 @@
*/
package org.apache.solr.handler.extraction;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.Locale;
+
import org.apache.commons.io.IOUtils;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.NamedList;
+import org.apache.solr.handler.ContentStreamLoader;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.processor.UpdateRequestProcessor;
-import org.apache.solr.handler.ContentStreamLoader;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -37,26 +44,24 @@ import org.apache.tika.sax.XHTMLContentH
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.mime.MediaType;
-import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.BaseMarkupSerializer;
-import org.apache.xml.serialize.XMLSerializer;
+import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.TextSerializer;
+import org.apache.xml.serialize.XMLSerializer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.util.Locale;
-
/**
* The class responsible for loading extracted content into Solr.
*
**/
public class ExtractingDocumentLoader extends ContentStreamLoader {
+
+ private static final Logger log = LoggerFactory.getLogger(ExtractingDocumentLoader.class);
+
/**
* Extract Only supported format
*/
@@ -74,6 +79,7 @@ public class ExtractingDocumentLoader ex
final IndexSchema schema;
final SolrParams params;
final UpdateRequestProcessor processor;
+ final boolean ignoreTikaException;
protected AutoDetectParser autoDetectParser;
private final AddUpdateCommand templateAdd;
@@ -95,6 +101,8 @@ public class ExtractingDocumentLoader ex
//this is lightweight
autoDetectParser = new AutoDetectParser(config);
this.factory = factory;
+
+ ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);
}
@@ -180,9 +188,17 @@ public class ExtractingDocumentLoader ex
parsingHandler = new MatchingContentHandler(handler, matcher);
} //else leave it as is
- //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
- ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
- parser.parse(inputStream, parsingHandler, metadata, context);
+ try{
+ //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
+ ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
+ parser.parse(inputStream, parsingHandler, metadata, context);
+ } catch (TikaException e) {
+ if(ignoreTikaException)
+ log.warn(new StringBuilder("skip extracting text due to ").append(e.getLocalizedMessage())
+ .append(". metadata=").append(metadata.toString()).toString());
+ else
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+ }
if (extractOnly == false) {
addDoc(handler);
} else {
@@ -202,8 +218,6 @@ public class ExtractingDocumentLoader ex
}
} catch (SAXException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
- } catch (TikaException e) {
- throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
} finally {
IOUtils.closeQuietly(inputStream);
}
Modified: lucene/dev/branches/flexscoring/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java (original)
+++ lucene/dev/branches/flexscoring/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java Mon May 23 16:49:59 2011
@@ -28,6 +28,11 @@ public interface ExtractingParams {
*/
public static final String LOWERNAMES = "lowernames";
+ /**
+ * If true, ignore TikaException (give up on extracting text, but still index the metadata)
+ */
+ public static final String IGNORE_TIKA_EXCEPTION = "ignoreTikaException";
+
/**
* The param prefix for mapping Tika metadata to Solr fields.
Modified: lucene/dev/branches/flexscoring/solr/example/solr/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/example/solr/conf/solrconfig.xml?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/example/solr/conf/solrconfig.xml (original)
+++ lucene/dev/branches/flexscoring/solr/example/solr/conf/solrconfig.xml Mon May 23 16:49:59 2011
@@ -1198,17 +1198,20 @@
<lst name="engine">
<!-- The name, only one can be named "default" -->
<str name="name">default</str>
- <!-- Class name of Carrot2 clustering algorithm.
-
+
+ <!-- Class name of Carrot2 clustering algorithm.
+
Currently available algorithms are:
* org.carrot2.clustering.lingo.LingoClusteringAlgorithm
* org.carrot2.clustering.stc.STCClusteringAlgorithm
+ * org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm
See http://project.carrot2.org/algorithms.html for the
algorithm's characteristics.
-->
<str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
+
<!-- Overriding values for Carrot2 default algorithm attributes.
For a description of all available attributes, see:
@@ -1219,9 +1222,22 @@
name and attribute value as parameter value.
-->
<str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
-
+
+ <!-- Location of Carrot2 lexical resources.
+
+ A directory from which to load Carrot2-specific stop words
+ and stop labels. Absolute or relative to Solr config directory.
+ If a specific resource (e.g. stopwords.en) is present in the
+ specified dir, it will completely override the corresponding
+ default one that ships with Carrot2.
+
+ For an overview of Carrot2 lexical resources, see:
+ http://download.carrot2.org/head/manual/#chapter.lexical-resources
+ -->
+ <str name="carrot.lexicalResourcesDir">clustering/carrot2</str>
+
<!-- The language to assume for the documents.
-
+
For a list of allowed values, see:
http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage
-->
@@ -1360,7 +1376,6 @@
<!-- Configure the standard encoder -->
<encoder name="html"
- default="true"
class="solr.highlight.HtmlEncoder" />
<!-- Configure the standard fragListBuilder -->
Modified: lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/core/SolrConfig.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/core/SolrConfig.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/core/SolrConfig.java (original)
+++ lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/core/SolrConfig.java Mon May 23 16:49:59 2011
@@ -57,7 +57,6 @@ import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.io.FileFilter;
import java.io.IOException;
-import java.io.InputStream;
/**
@@ -130,12 +129,12 @@ public class SolrConfig extends Config {
throws ParserConfigurationException, IOException, SAXException {
super(loader, name, is, "/config/");
initLibs();
+ luceneMatchVersion = getLuceneVersion("luceneMatchVersion");
defaultIndexConfig = new SolrIndexConfig(this, null, null);
mainIndexConfig = new SolrIndexConfig(this, "mainIndex", defaultIndexConfig);
reopenReaders = getBool("mainIndex/reopenReaders", true);
booleanQueryMaxClauseCount = getInt("query/maxBooleanClauses", BooleanQuery.getMaxClauseCount());
- luceneMatchVersion = getLuceneVersion("luceneMatchVersion");
log.info("Using Lucene MatchVersion: " + luceneMatchVersion);
filtOptEnabled = getBool("query/boolTofilterOptimizer/@enabled", false);
Modified: lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/request/SimpleFacets.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/request/SimpleFacets.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/request/SimpleFacets.java (original)
+++ lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/request/SimpleFacets.java Mon May 23 16:49:59 2011
@@ -656,7 +656,6 @@ public class SimpleFacets {
}
}
- Term template = new Term(field);
DocsEnum docsEnum = null;
CharArr spare = new CharArr();
@@ -676,10 +675,6 @@ public class SimpleFacets {
if (df >= minDfFilterCache) {
// use the filter cache
- // TODO: need a term query that takes a BytesRef to handle binary terms
- spare.reset();
- ByteUtils.UTF8toUTF16(term, spare);
- Term t = template.createTerm(spare.toString());
if (deState==null) {
deState = new SolrIndexSearcher.DocsEnumState();
Modified: lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/response/JSONResponseWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/response/JSONResponseWriter.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/response/JSONResponseWriter.java (original)
+++ lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/response/JSONResponseWriter.java Mon May 23 16:49:59 2011
@@ -442,7 +442,7 @@ class JSONWriter extends TextResponseWri
for (int i=0; i<val.length(); i++) {
char ch = val.charAt(i);
- if ((ch > '#' && ch != '\\' && ch != '\u2028') || ch==' ') { // fast path
+ if ((ch > '#' && ch != '\\' && ch < '\u2028') || ch == ' ') { // fast path
writer.write(ch);
continue;
}
@@ -457,7 +457,10 @@ class JSONWriter extends TextResponseWri
case '\t': writer.write('\\'); writer.write('t'); break;
case '\b': writer.write('\\'); writer.write('b'); break;
case '\f': writer.write('\\'); writer.write('f'); break;
- case '\u2028': unicodeEscape(writer,ch); break;
+ case '\u2028': // fallthrough
+ case '\u2029':
+ unicodeEscape(writer,ch);
+ break;
// case '/':
default: {
if (ch <= 0x1F) {
Modified: lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/response/XMLWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/response/XMLWriter.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/response/XMLWriter.java (original)
+++ lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/response/XMLWriter.java Mon May 23 16:49:59 2011
@@ -19,9 +19,7 @@ package org.apache.solr.response;
import java.io.IOException;
import java.io.Writer;
-import java.util.ArrayList;
import java.util.Arrays;
-import java.util.Collection;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
@@ -32,11 +30,13 @@ import org.apache.solr.common.params.Com
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.XML;
import org.apache.solr.request.SolrQueryRequest;
-import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.ReturnFields;
-public final class XMLWriter extends TextResponseWriter {
+/**
+ * @lucene.internal
+ */
+public class XMLWriter extends TextResponseWriter {
public static float CURRENT_VERSION=2.2f;
@@ -54,13 +54,8 @@ public final class XMLWriter extends Tex
private static final char[] XML_START2_NOSCHEMA=("<response>\n").toCharArray();
- private boolean defaultIndent=false;
final int version;
- // temporary working objects...
- // be careful not to use these recursively...
- private final ArrayList tlst = new ArrayList();
-
public static void writeResponse(Writer writer, SolrQueryRequest req, SolrQueryResponse rsp) throws IOException {
XMLWriter xmlWriter = null;
try {
@@ -106,7 +101,7 @@ public final class XMLWriter extends Tex
writer.write(XML_START2_NOSCHEMA);
// dump response values
- NamedList lst = rsp.getValues();
+ NamedList<?> lst = rsp.getValues();
Boolean omitHeader = req.getParams().getBool(CommonParams.OMIT_HEADER);
if(omitHeader != null && omitHeader) lst.remove("responseHeader");
int sz = lst.size();
Modified: lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/search/JoinQParserPlugin.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/search/JoinQParserPlugin.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/search/JoinQParserPlugin.java (original)
+++ lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/search/JoinQParserPlugin.java Mon May 23 16:49:59 2011
@@ -457,7 +457,7 @@ class JoinQuery extends Query {
return resultList.get(0);
}
- int sz = resultList.size();
+ int sz = 0;
for (DocSet set : resultList)
sz += set.size();
Modified: lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/search/SolrIndexSearcher.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/search/SolrIndexSearcher.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/search/SolrIndexSearcher.java (original)
+++ lucene/dev/branches/flexscoring/solr/src/java/org/apache/solr/search/SolrIndexSearcher.java Mon May 23 16:49:59 2011
@@ -811,7 +811,7 @@ public class SolrIndexSearcher extends I
bitsSet += upto;
result = new BitDocSet(obs, bitsSet);
} else {
- result = new SortedIntDocSet(Arrays.copyOf(docs, upto));
+ result = upto==0 ? DocSet.EMPTY : new SortedIntDocSet(Arrays.copyOf(docs, upto));
}
if (useCache) {
Modified: lucene/dev/branches/flexscoring/solr/src/test/org/apache/solr/TestJoin.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/flexscoring/solr/src/test/org/apache/solr/TestJoin.java?rev=1126578&r1=1126577&r2=1126578&view=diff
==============================================================================
--- lucene/dev/branches/flexscoring/solr/src/test/org/apache/solr/TestJoin.java (original)
+++ lucene/dev/branches/flexscoring/solr/src/test/org/apache/solr/TestJoin.java Mon May 23 16:49:59 2011
@@ -101,6 +101,14 @@ public class TestJoin extends SolrTestCa
int indexIter=50 * RANDOM_MULTIPLIER;
int queryIter=50 * RANDOM_MULTIPLIER;
+ // groups of fields that have any chance of matching... used to
+ // increase test effectiveness by avoiding 0 resultsets much of the time.
+ String[][] compat = new String[][] {
+ {"small_s","small2_s","small2_ss","small3_ss"},
+ {"small_i","small2_i","small2_is","small3_is"}
+ };
+
+
while (--indexIter >= 0) {
int indexSize = random.nextInt(20 * RANDOM_MULTIPLIER);
@@ -121,8 +129,19 @@ public class TestJoin extends SolrTestCa
Map<String, Map<Comparable, Set<Comparable>>> pivots = new HashMap<String, Map<Comparable, Set<Comparable>>>();
for (int qiter=0; qiter<queryIter; qiter++) {
- String fromField = types.get(random.nextInt(types.size())).fname;
- String toField = types.get(random.nextInt(types.size())).fname;
+ String fromField;
+ String toField;
+ if (random.nextInt(100) < 5) {
+ // pick random fields 5% of the time
+ fromField = types.get(random.nextInt(types.size())).fname;
+ // pick the same field 50% of the time we pick a random field (since other fields won't match anything)
+ toField = (random.nextInt(100) < 50) ? fromField : types.get(random.nextInt(types.size())).fname;
+ } else {
+ // otherwise, pick compatible fields that have a chance of matching indexed tokens
+ String[] group = compat[random.nextInt(compat.length)];
+ fromField = group[random.nextInt(group.length)];
+ toField = group[random.nextInt(group.length)];
+ }
Map<Comparable, Set<Comparable>> pivot = pivots.get(fromField+"/"+toField);
if (pivot == null) {
@@ -146,7 +165,7 @@ public class TestJoin extends SolrTestCa
resultSet.put("start", 0);
resultSet.put("docs", sortedDocs);
- // todo: use filters
+ // todo: use different join queries for better coverage
SolrQueryRequest req = req("wt","json","indent","true", "echoParams","all",
"q","{!join from="+fromField+" to="+toField
@@ -159,7 +178,7 @@ public class TestJoin extends SolrTestCa
Object realResponse = ObjectBuilder.fromJSON(strResponse);
String err = JSONTestUtil.matchObj("/response", realResponse, resultSet);
if (err != null) {
- log.error("GROUPING MISMATCH: " + err
+ log.error("JOIN MISMATCH: " + err
+ "\n\trequest="+req
+ "\n\tresult="+strResponse
+ "\n\texpected="+ JSONUtil.toJSON(resultSet)