You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ct...@apache.org on 2018/10/22 23:35:24 UTC
[39/50] [abbrv] lucene-solr:jira/solr-12746: SOLR-12879 - MinHash
query parser
SOLR-12879 - MinHash query parser
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/3093dc52
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/3093dc52
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/3093dc52
Branch: refs/heads/jira/solr-12746
Commit: 3093dc52ede69e14ed2fe04ecd8a98cb15d301a4
Parents: d83a6be
Author: Tommaso Teofili <te...@adobe.com>
Authored: Sat Oct 20 08:28:02 2018 +0200
Committer: Cassandra Targett <ct...@apache.org>
Committed: Sun Oct 21 15:46:48 2018 -0500
----------------------------------------------------------------------
solr/CHANGES.txt | 2 +
.../org/apache/solr/search/MinHashQParser.java | 154 +++++
.../solr/search/MinHashQParserPlugin.java | 32 +
.../solr/collection1/conf/schema-minhash.xml | 630 +++++++++++++++++++
.../collection1/conf/solrconfig-minhash.xml | 567 +++++++++++++++++
.../apache/solr/search/TestMinHashQParser.java | 358 +++++++++++
6 files changed, 1743 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3093dc52/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 53370a0..1b29554 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -65,6 +65,8 @@ New Features
locale was changed from ROOT to en_US since well-known patterns assume this locale.
(David Smiley, Bar Rotstein)
+* SOLR-12879: MinHash query parser that builds queries providing a measure of Jaccard similarity
+
* SOLR-12593: The default configSet now includes an "ignored_*" dynamic field. (David Smiley)
Optimizations
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3093dc52/solr/core/src/java/org/apache/solr/search/MinHashQParser.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/search/MinHashQParser.java b/solr/core/src/java/org/apache/solr/search/MinHashQParser.java
new file mode 100644
index 0000000..cdd625a
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/search/MinHashQParser.java
@@ -0,0 +1,154 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.search;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.request.SolrQueryRequest;
+
+/**
+ * The query parser can be used in two modes
+ * 1) where text is analysed and generates min hashes as part of normal lucene analysis
+ * 2) where text is pre-analysed and hashes are added as string to the index
+ * An analyzer can still be defined to support text based query against the text field
+ * <p>
+ * Options:
+ * sim - required similary - default is 1
+ * tp - required true positive rate - default is 1
+ * field - when providing text the analyser for this field is used to generate the finger print
+ * sep - a separator for provided hashes
+ * analyzer_field - the field to use for for analysing suppplied text - if not supplied defaults to field
+ *
+ */
+public class MinHashQParser extends QParser {
+ public MinHashQParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
+ super(qstr, localParams, params, req);
+ }
+
+ @Override
+ public Query parse() throws SyntaxError {
+
+ float similarity = localParams.getFloat("sim", 1.0f);
+ float expectedTruePositive = localParams.getFloat("tp", 1.0f);
+ String field = localParams.get("field", "min_hash");
+ String separator = localParams.get("sep", "");
+ String analyzerField = localParams.get("analyzer_field", field);
+
+
+ ArrayList<BytesRef> hashes = new ArrayList<>();
+ if (separator.isEmpty()) {
+ try {
+ getHashesFromTokenStream(analyzerField, hashes);
+ } catch (Exception e) {
+ throw new SyntaxError(e);
+ }
+ } else {
+ getHashesFromQueryString(separator, hashes);
+ }
+
+ return createFingerPrintQuery(field, hashes, similarity, expectedTruePositive);
+
+ }
+
+ private void getHashesFromQueryString(String separator, ArrayList<BytesRef> hashes) {
+ Arrays.stream(qstr.split(separator)).forEach(s -> {
+ hashes.add(new BytesRef(s));
+ });
+ }
+
+ private void getHashesFromTokenStream(String analyserField, ArrayList<BytesRef> hashes) throws Exception {
+ TokenStream ts = getReq().getSchema().getIndexAnalyzer().tokenStream(analyserField, qstr);
+ TermToBytesRefAttribute termAttribute = ts.getAttribute(TermToBytesRefAttribute.class);
+ ts.reset();
+ while (ts.incrementToken()) {
+ BytesRef term = termAttribute.getBytesRef();
+ hashes.add(BytesRef.deepCopyOf(term));
+ }
+ ts.end();
+ ts.close();
+ }
+
+ private Query createFingerPrintQuery(String field, List<BytesRef> minhashes, float similarity, float expectedTruePositive) {
+ int bandSize = 1;
+ if (expectedTruePositive < 1) {
+ bandSize = computeBandSize(minhashes.size(), similarity, expectedTruePositive);
+ }
+
+ BooleanQuery.Builder builder = new BooleanQuery.Builder();
+ BooleanQuery.Builder childBuilder = new BooleanQuery.Builder();
+ int rowInBand = 0;
+ for (BytesRef minHash : minhashes) {
+ TermQuery tq = new TermQuery(new Term(field, minHash));
+ if (bandSize == 1) {
+ builder.add(new ConstantScoreQuery(tq), Occur.SHOULD);
+ } else {
+ childBuilder.add(new ConstantScoreQuery(tq), Occur.MUST);
+ rowInBand++;
+ if (rowInBand == bandSize) {
+ builder.add(new ConstantScoreQuery(childBuilder.build()),
+ Occur.SHOULD);
+ childBuilder = new BooleanQuery.Builder();
+ rowInBand = 0;
+ }
+ }
+ }
+ // Avoid a dubious narrow band .... wrap around and pad with the
+ // start
+ if (childBuilder.build().clauses().size() > 0) {
+ for (BytesRef token : minhashes) {
+ TermQuery tq = new TermQuery(new Term(field, token.toString()));
+ childBuilder.add(new ConstantScoreQuery(tq), Occur.MUST);
+ rowInBand++;
+ if (rowInBand == bandSize) {
+ builder.add(new ConstantScoreQuery(childBuilder.build()),
+ Occur.SHOULD);
+ break;
+ }
+ }
+ }
+
+ if (expectedTruePositive >= 1.0 && similarity < 1) {
+ builder.setMinimumNumberShouldMatch((int) (Math.ceil(minhashes.size() * similarity)));
+ }
+ return builder.build();
+
+ }
+
+ static int computeBandSize(int numHash, double similarity, double expectedTruePositive) {
+ for (int bands = 1; bands <= numHash; bands++) {
+ int rowsInBand = numHash / bands;
+ double truePositive = 1 - Math.pow(1 - Math.pow(similarity, rowsInBand), bands);
+ if (truePositive > expectedTruePositive) {
+ return rowsInBand;
+ }
+ }
+ return 1;
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3093dc52/solr/core/src/java/org/apache/solr/search/MinHashQParserPlugin.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/search/MinHashQParserPlugin.java b/solr/core/src/java/org/apache/solr/search/MinHashQParserPlugin.java
new file mode 100644
index 0000000..4567e25
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/search/MinHashQParserPlugin.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.search;
+
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.request.SolrQueryRequest;
+
+/**
+ * {@link QParserPlugin} based on {@link MinHashQParser}.
+ */
+public class MinHashQParserPlugin extends QParserPlugin {
+ @Override
+ public QParser createParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
+ return new MinHashQParser(qstr, localParams, params, req);
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3093dc52/solr/core/src/test-files/solr/collection1/conf/schema-minhash.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/collection1/conf/schema-minhash.xml b/solr/core/src/test-files/solr/collection1/conf/schema-minhash.xml
new file mode 100644
index 0000000..a13ba35
--- /dev/null
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-minhash.xml
@@ -0,0 +1,630 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- The Solr schema file. This file should be named "schema.xml" and
+ should be located where the classloader for the Solr webapp can find it.
+
+ This schema is used for testing, and as such has everything and the
+ kitchen sink thrown in. See example/solr/conf/schema.xml for a
+ more concise example.
+
+ -->
+
+<schema name="test" version="1.6">
+
+
+ <!-- field type definitions... note that the "name" attribute is
+ just a label to be used by field definitions. The "class"
+ attribute and any other attributes determine the real type and
+ behavior of the fieldtype.
+ -->
+
+ <!-- numeric field types that store and index the text
+ value verbatim (and hence don't sort correctly or support range queries.)
+ These are provided more for backward compatability, allowing one
+ to create a schema that matches an existing lucene index.
+ -->
+
+ <fieldType name="int" class="${solr.tests.IntegerFieldType}" precisionStep="0" omitNorms="true" positionIncrementGap="0" docValues="true"/>
+ <fieldType name="float" class="${solr.tests.FloatFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="long" class="${solr.tests.LongFieldType}" precisionStep="0" omitNorms="true" positionIncrementGap="0" docValues="true"/>
+ <fieldType name="double" class="${solr.tests.DoubleFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+
+ <fieldType name="tint" class="${solr.tests.IntegerFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tfloat" class="${solr.tests.FloatFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tlong" class="${solr.tests.LongFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="tdouble" class="${solr.tests.DoubleFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+
+ <!-- numeric field types that manipulate the value into
+ a string value that isn't human readable in it's internal form,
+ but sorts correctly and supports range queries.
+
+ If sortMissingLast="true" then a sort on this field will cause documents
+ without the field to come after documents with the field,
+ regardless of the requested sort order.
+ If sortMissingFirst="true" then a sort on this field will cause documents
+ without the field to come before documents with the field,
+ regardless of the requested sort order.
+ If sortMissingLast="false" and sortMissingFirst="false" (the default),
+ then default lucene sorting will be used which places docs without the field
+ first in an ascending sort and last in a descending sort.
+ -->
+
+
+ <!-- Field type demonstrating an Analyzer failure -->
+ <fieldtype name="failtype1" class="solr.TextField">
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
+ catenateNumbers="0" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+
+ <!-- Demonstrating ignoreCaseChange -->
+ <fieldtype name="wdf_nocase" class="solr.TextField">
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
+ catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
+ catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+
+ <fieldtype name="wdf_preserve" class="solr.TextField">
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="0"
+ catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="0"
+ catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+
+
+ <fieldtype name="boolean" class="solr.BoolField" sortMissingLast="true"/>
+ <fieldtype name="string" class="solr.StrField" sortMissingLast="true" docValues="true"/>
+
+ <!-- format for date is 1995-12-31T23:59:59.999Z and only the fractional
+ seconds part (.999) is optional.
+ -->
+ <fieldtype name="date" class="${solr.tests.DateFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0"/>
+ <fieldtype name="tdate" class="${solr.tests.DateFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="6"/>
+
+
+ <!-- solr.TextField allows the specification of custom
+ text analyzers specified as a tokenizer and a list
+ of token filters.
+ -->
+ <fieldtype name="text" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+
+
+ <fieldtype name="nametext" class="solr.TextField">
+ <analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
+ </fieldtype>
+
+ <!-- fieldtypes in this section isolate tokenizers and tokenfilters for testing -->
+ <fieldtype name="keywordtok" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
+ </analyzer>
+ </fieldtype>
+ <fieldtype name="standardtok" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ </analyzer>
+ </fieldtype>
+ <fieldtype name="lettertok" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.LetterTokenizerFactory"/>
+ </analyzer>
+ </fieldtype>
+ <fieldtype name="whitetok" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ </analyzer>
+ </fieldtype>
+ <fieldtype name="HTMLstandardtok" class="solr.TextField">
+ <analyzer>
+ <charFilter class="solr.HTMLStripCharFilterFactory"/>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ </analyzer>
+ </fieldtype>
+ <fieldtype name="HTMLwhitetok" class="solr.TextField">
+ <analyzer>
+ <charFilter class="solr.HTMLStripCharFilterFactory"/>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ </analyzer>
+ </fieldtype>
+ <fieldtype name="standardtokfilt" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ </analyzer>
+ </fieldtype>
+ <fieldtype name="standardfilt" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ </analyzer>
+ </fieldtype>
+ <fieldtype name="lowerfilt" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+ <fieldtype name="lowerpunctfilt" class="solr.TextField">
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
+ catenateNumbers="1" catenateAll="1" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
+ catenateNumbers="1" catenateAll="1" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+ <fieldtype name="patternreplacefilt" class="solr.TextField">
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
+ <filter class="solr.PatternReplaceFilterFactory"
+ pattern="([^a-zA-Z])" replacement="_" replace="all"
+ />
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
+ </analyzer>
+ </fieldtype>
+ <fieldtype name="patterntok" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.PatternTokenizerFactory" pattern=","/>
+ </analyzer>
+ </fieldtype>
+ <fieldtype name="porterfilt" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+ <!-- fieldtype name="snowballfilt" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.SnowballPorterFilterFactory"/>
+ </analyzer>
+ </fieldtype -->
+ <fieldtype name="engporterfilt" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+ <fieldtype name="custengporterfilt" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+ <fieldtype name="stopfilt" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true"/>
+ </analyzer>
+ </fieldtype>
+ <fieldtype name="custstopfilt" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ </analyzer>
+ </fieldtype>
+ <fieldtype name="lengthfilt" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.LengthFilterFactory" min="2" max="5"/>
+ </analyzer>
+ </fieldtype>
+ <fieldType name="charfilthtmlmap" class="solr.TextField">
+ <analyzer>
+ <charFilter class="solr.HTMLStripCharFilterFactory"/>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldtype name="subword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
+ catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
+ catenateNumbers="0" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+
+ <fieldtype name="numericsubword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1"
+ generateNumberParts="0" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
+ <filter class="solr.StopFilterFactory"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1"
+ generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.StopFilterFactory"/>
+ <filter class="solr.PorterStemFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+
+ <fieldtype name="protectedsubword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1"
+ generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+
+
+ <!-- more flexible in matching skus, but more chance of a false match -->
+ <fieldtype name="skutype1" class="solr.TextField">
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
+ catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
+ catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+
+ <!-- less flexible in matching skus, but less chance of a false match -->
+ <fieldtype name="skutype2" class="solr.TextField">
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
+ catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
+ catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+
+ <!-- less flexible in matching skus, but less chance of a false match -->
+ <fieldtype name="syn" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ </analyzer>
+ </fieldtype>
+
+
+ <fieldtype name="unstored" class="solr.StrField" indexed="true" stored="false"/>
+
+
+ <fieldtype name="textgap" class="solr.TextField" multiValued="true" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+
+ <fieldType name="uuid" class="solr.UUIDField"/>
+
+ <!-- Try out some point types -->
+ <fieldType name="xy" class="solr.PointType" dimension="2" subFieldType="double"/>
+ <fieldType name="x" class="solr.PointType" dimension="1" subFieldType="double"/>
+ <fieldType name="tenD" class="solr.PointType" dimension="10" subFieldType="double"/>
+ <!-- Use the sub field suffix -->
+ <fieldType name="xyd" class="solr.PointType" dimension="2" subFieldSuffix="_d1"/>
+ <fieldtype name="geohash" class="solr.GeoHashField"/>
+
+
+ <fieldType name="latLon" class="solr.LatLonType" subFieldType="double"/>
+
+ <!-- Min hash analysed field type -->
+
+ <fieldType name="text_min_hash" class="solr.TextField" positionIncrementGap="100">
+ <analyzer>
+ <tokenizer class="solr.ICUTokenizerFactory"/>
+ <filter class="solr.ICUFoldingFilterFactory"/>
+ <filter class="solr.ShingleFilterFactory" minShingleSize="5" outputUnigrams="false" outputUnigramsIfNoShingles="false" maxShingleSize="5" tokenSeparator=" "/>
+ <filter class="org.apache.lucene.analysis.minhash.MinHashFilterFactory" bucketCount="512" hashSetSize="1" hashCount="1"/>
+ </analyzer>
+ </fieldType>
+
+ <fieldType name="strings" class="solr.StrField" sortMissingLast="true" multiValued="true"/>
+
+ <!-- some per-field similarity examples -->
+
+ <!-- specify a Similarity classname directly -->
+ <!--
+ <fieldType name="sim1" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ </analyzer>
+ <similarity class="org.apache.lucene.misc.SweetSpotSimilarity"/>
+ </fieldType>
+ -->
+ <!-- specify a Similarity factory -->
+ <!--
+ <fieldType name="sim2" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ </analyzer>
+ <similarity class="org.apache.solr.search.similarities.CustomSimilarityFactory">
+ <str name="echo">is there an echo?</str>
+ </similarity>
+ </fieldType>
+ -->
+ <!-- don't specify any sim at all: get the default -->
+ <!--
+ <fieldType name="sim3" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ </analyzer>
+ </fieldType>
+ -->
+
+
+ <field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
+ <field name="signatureField" type="string" indexed="true" stored="false"/>
+ <field name="uuid" type="uuid" stored="true"/>
+ <field name="name" type="nametext" indexed="true" stored="true"/>
+ <field name="text" type="text" indexed="true" stored="false"/>
+ <field name="subject" type="text" indexed="true" stored="true"/>
+ <field name="title" type="nametext" indexed="true" stored="true"/>
+ <field name="weight" type="float" indexed="true" stored="true" multiValued="false"/>
+ <field name="bday" type="date" indexed="true" stored="true" multiValued="false"/>
+
+ <field name="title_stemmed" type="text" indexed="true" stored="false"/>
+ <field name="title_lettertok" type="lettertok" indexed="true" stored="false"/>
+
+ <field name="syn" type="syn" indexed="true" stored="true"/>
+
+ <!-- to test property inheritance and overriding -->
+ <field name="shouldbeunstored" type="unstored"/>
+ <field name="shouldbestored" type="unstored" stored="true"/>
+ <field name="shouldbeunindexed" type="unstored" indexed="false" stored="true"/>
+
+ <!-- Test points -->
+ <!-- Test points -->
+ <field name="home" type="xy" indexed="true" stored="true" multiValued="false"/>
+ <field name="x" type="x" indexed="true" stored="true" multiValued="false"/>
+ <field name="homed" type="xyd" indexed="true" stored="true" multiValued="false"/>
+ <field name="home_ns" type="xy" indexed="true" stored="false" multiValued="false"/>
+ <field name="work" type="xy" indexed="true" stored="true" multiValued="false"/>
+
+ <field name="home_ll" type="latLon" indexed="true" stored="true" multiValued="false"/>
+ <field name="home_gh" type="geohash" indexed="true" stored="true" multiValued="false"/>
+
+
+ <field name="point10" type="tenD" indexed="true" stored="true" multiValued="false"/>
+
+
+ <!-- test different combinations of indexed and stored -->
+ <field name="bind" type="boolean" indexed="true" stored="false"/>
+ <field name="bsto" type="boolean" indexed="false" stored="true"/>
+ <field name="bindsto" type="boolean" indexed="true" stored="true"/>
+ <field name="isto" type="int" indexed="false" stored="true"/>
+ <field name="iind" type="int" indexed="true" stored="false"/>
+ <field name="ssto" type="string" indexed="false" stored="true"/>
+ <field name="sind" type="string" indexed="true" stored="false"/>
+ <field name="sindsto" type="string" indexed="true" stored="true"/>
+
+ <!-- test combinations of term vector settings -->
+ <field name="test_basictv" type="text" termVectors="true"/>
+ <field name="test_notv" type="text" termVectors="false"/>
+ <field name="test_postv" type="text" termVectors="true" termPositions="true"/>
+ <field name="test_offtv" type="text" termVectors="true" termOffsets="true"/>
+ <field name="test_posofftv" type="text" termVectors="true"
+ termPositions="true" termOffsets="true"/>
+
+ <!-- fields to test individual tokenizers and tokenfilters -->
+ <field name="keywordtok" type="keywordtok" indexed="true" stored="true"/>
+ <field name="standardtok" type="standardtok" indexed="true" stored="true"/>
+ <field name="HTMLstandardtok" type="HTMLstandardtok" indexed="true" stored="true"/>
+ <field name="lettertok" type="lettertok" indexed="true" stored="true"/>
+ <field name="whitetok" type="whitetok" indexed="true" stored="true"/>
+ <field name="HTMLwhitetok" type="HTMLwhitetok" indexed="true" stored="true"/>
+ <field name="standardtokfilt" type="standardtokfilt" indexed="true" stored="true"/>
+ <field name="standardfilt" type="standardfilt" indexed="true" stored="true"/>
+ <field name="lowerfilt" type="lowerfilt" indexed="true" stored="true"/>
+ <field name="lowerfilt1" type="lowerfilt" indexed="true" stored="true"/>
+ <field name="lowerfilt1and2" type="lowerfilt" indexed="true" stored="true"/>
+ <field name="patterntok" type="patterntok" indexed="true" stored="true"/>
+ <field name="patternreplacefilt" type="patternreplacefilt" indexed="true" stored="true"/>
+ <field name="porterfilt" type="porterfilt" indexed="true" stored="true"/>
+ <field name="engporterfilt" type="engporterfilt" indexed="true" stored="true"/>
+ <field name="custengporterfilt" type="custengporterfilt" indexed="true" stored="true"/>
+ <field name="stopfilt" type="stopfilt" indexed="true" stored="true"/>
+ <field name="custstopfilt" type="custstopfilt" indexed="true" stored="true"/>
+ <field name="lengthfilt" type="lengthfilt" indexed="true" stored="true"/>
+ <field name="wdf_nocase" type="wdf_nocase" indexed="true" stored="true"/>
+ <field name="wdf_preserve" type="wdf_preserve" indexed="true" stored="true"/>
+
+ <field name="numberpartfail" type="failtype1" indexed="true" stored="true"/>
+
+ <field name="nullfirst" type="string" indexed="true" stored="true" sortMissingFirst="true" multiValued="false"/>
+
+ <field name="subword" type="subword" indexed="true" stored="true"/>
+ <field name="subword_offsets" type="subword" indexed="true" stored="true" termOffsets="true"/>
+ <field name="numericsubword" type="numericsubword" indexed="true" stored="true"/>
+ <field name="protectedsubword" type="protectedsubword" indexed="true" stored="true"/>
+
+ <field name="sku1" type="skutype1" indexed="true" stored="true"/>
+ <field name="sku2" type="skutype2" indexed="true" stored="true"/>
+
+ <field name="textgap" type="textgap" indexed="true" stored="true"/>
+
+ <field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
+ <field name="multiDefault" type="string" indexed="true" stored="true" default="muLti-Default" multiValued="true"/>
+ <field name="intDefault" type="int" indexed="true" stored="true" default="42" multiValued="false"/>
+
+ <!--
+ <field name="sim1text" type="sim1" indexed="true" stored="true"/>
+ <field name="sim2text" type="sim2" indexed="true" stored="true"/>
+ <field name="sim3text" type="sim3" indexed="true" stored="true"/>
+ -->
+
+ <field name="tlong" type="tlong" indexed="true" stored="true"/>
+
+ <field name="_version_" type="long" indexed="true" stored="true"/>
+
+ <field name="min_hash_string" type="strings" multiValued="true" indexed="true" stored="true"/>
+ <field name="min_hash_analysed" type="text_min_hash" multiValued="false" indexed="true" stored="false"/>
+
+ <!-- Dynamic field definitions. If a field name is not found, dynamicFields
+ will be used if the name matches any of the patterns.
+ RESTRICTION: the glob-like pattern in the name attribute must have
+ a "*" only at the start or the end.
+ EXAMPLE: name="*_i" will match any field ending in _i (like myid_i, z_i)
+ Longer patterns will be matched first. If equal size patterns
+ both match, the first appearing in the schema will be used.
+ -->
+ <dynamicField name="*_i" type="int" indexed="true" stored="true"/>
+ <dynamicField name="*_i1" type="int" indexed="true" stored="true" multiValued="false"/>
+
+ <dynamicField name="*_s" type="string" indexed="true" stored="true"/>
+ <dynamicField name="*_s1" type="string" indexed="true" stored="true" multiValued="false"/>
+ <dynamicField name="*_l" type="long" indexed="true" stored="true"/>
+ <dynamicField name="*_l1" type="long" indexed="true" stored="true" multiValued="false"/>
+ <dynamicField name="*_t" type="text" indexed="true" stored="true"/>
+ <dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
+ <dynamicField name="*_f" type="float" indexed="true" stored="true"/>
+ <dynamicField name="*_f1" type="float" indexed="true" stored="true" multiValued="false"/>
+ <dynamicField name="*_d" type="double" indexed="true" stored="true"/>
+ <dynamicField name="*_d1" type="double" indexed="true" stored="true" multiValued="false"/>
+ <dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
+ <dynamicField name="*_dt1" type="date" indexed="true" stored="true" multiValued="false"/>
+
+ <!-- some trie-coded dynamic fields for faster range queries -->
+ <dynamicField name="*_ti" type="tint" indexed="true" stored="true"/>
+ <dynamicField name="*_ti1" type="tint" indexed="true" stored="true" multiValued="false"/>
+ <dynamicField name="*_tl" type="tlong" indexed="true" stored="true"/>
+ <dynamicField name="*_tl1" type="tlong" indexed="true" stored="true" multiValued="false"/>
+ <dynamicField name="*_tf" type="tfloat" indexed="true" stored="true"/>
+ <dynamicField name="*_tf1" type="tfloat" indexed="true" stored="true" multiValued="false"/>
+ <dynamicField name="*_td" type="tdouble" indexed="true" stored="true"/>
+ <dynamicField name="*_td1" type="tdouble" indexed="true" stored="true" multiValued="false"/>
+ <dynamicField name="*_tds" type="tdouble" indexed="true" stored="true" multiValued="false"/>
+ <dynamicField name="*_tdt" type="tdate" indexed="true" stored="true"/>
+ <dynamicField name="*_tdt1" type="tdate" indexed="true" stored="true" multiValued="false"/>
+
+
+ <dynamicField name="*_sI" type="string" indexed="true" stored="false"/>
+ <dynamicField name="*_sS" type="string" indexed="false" stored="true"/>
+ <dynamicField name="t_*" type="text" indexed="true" stored="true"/>
+ <dynamicField name="tv_*" type="text" indexed="true" stored="true"
+ termVectors="true" termPositions="true" termOffsets="true"/>
+ <dynamicField name="tv_mv_*" type="text" indexed="true" stored="true" multiValued="true"
+ termVectors="true" termPositions="true" termOffsets="true"/>
+
+ <dynamicField name="*_p" type="xyd" indexed="true" stored="true" multiValued="false"/>
+
+ <!-- special fields for dynamic copyField test -->
+ <dynamicField name="dynamic_*" type="string" indexed="true" stored="true"/>
+ <dynamicField name="*_dynamic" type="string" indexed="true" stored="true"/>
+
+ <!-- for testing to ensure that longer patterns are matched first -->
+ <dynamicField name="*aa" type="string" indexed="true" stored="true"/>
+
+ <!-- ignored because not stored or indexed -->
+ <dynamicField name="*_ignored" type="text" indexed="false" stored="false"/>
+
+ <dynamicField name="*_mfacet" type="string" indexed="true" stored="false" multiValued="true"/>
+
+ <!-- make sure custom sims work with dynamic fields -->
+ <!--
+ <dynamicField name="*_sim1" type="sim1" indexed="true" stored="true"/>
+ <dynamicField name="*_sim2" type="sim2" indexed="true" stored="true"/>
+ <dynamicField name="*_sim3" type="sim3" indexed="true" stored="true"/>
+ -->
+
+ <uniqueKey>id</uniqueKey>
+
+ <!-- copyField commands copy one field to another at the time a document
+ is added to the index. It's used either to index the same field different
+ ways, or to add multiple fields to the same field for easier/faster searching.
+ -->
+ <copyField source="title" dest="title_stemmed"/>
+ <copyField source="title" dest="title_lettertok"/>
+
+ <copyField source="title" dest="text"/>
+ <copyField source="subject" dest="text"/>
+
+ <copyField source="lowerfilt1" dest="lowerfilt1and2"/>
+ <copyField source="lowerfilt" dest="lowerfilt1and2"/>
+
+ <copyField source="*_t" dest="text"/>
+
+
+ <!-- dynamic destination -->
+ <copyField source="*_dynamic" dest="dynamic_*"/>
+
+</schema>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3093dc52/solr/core/src/test-files/solr/collection1/conf/solrconfig-minhash.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/collection1/conf/solrconfig-minhash.xml b/solr/core/src/test-files/solr/collection1/conf/solrconfig-minhash.xml
new file mode 100644
index 0000000..f814d34
--- /dev/null
+++ b/solr/core/src/test-files/solr/collection1/conf/solrconfig-minhash.xml
@@ -0,0 +1,567 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- This is a "kitchen sink" config file that tests can use.
+ When writing a new test, feel free to add *new* items (plugins,
+ config options, etc...) as long as they don't break any existing
+ tests. if you need to test something esoteric please add a new
+ "solrconfig-your-esoteric-purpose.xml" config file.
+
+ Note in particular that this test is used by MinimalSchemaTest so
+ anything added to this file needs to work correctly even if there
+ is no uniqueKey or defaultSearchField.
+ -->
+
+<config>
+
+ <jmx />
+
+ <!-- Used to specify an alternate directory to hold all index data.
+ It defaults to "index" if not present, and should probably
+ not be changed if replication is in use. -->
+ <dataDir>${solr.data.dir:}</dataDir>
+
+ <!-- The DirectoryFactory to use for indexes.
+ solr.StandardDirectoryFactory, the default, is filesystem based.
+ solr.RAMDirectoryFactory is memory based and not persistent. -->
+ <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}">
+ <double name="maxWriteMBPerSecDefault">1000000</double>
+ <double name="maxWriteMBPerSecFlush">2000000</double>
+ <double name="maxWriteMBPerSecMerge">3000000</double>
+ <double name="maxWriteMBPerSecRead">4000000</double>
+ <str name="solr.hdfs.home">${solr.hdfs.home:}</str>
+ <bool name="solr.hdfs.blockcache.enabled">${solr.hdfs.blockcache.enabled:true}</bool>
+ <bool name="solr.hdfs.blockcache.global">${solr.hdfs.blockcache.global:true}</bool>
+ <bool name="solr.hdfs.blockcache.write.enabled">${solr.hdfs.blockcache.write.enabled:false}</bool>
+ <int name="solr.hdfs.blockcache.blocksperbank">10</int>
+ <int name="solr.hdfs.blockcache.slab.count">1</int>
+ </directoryFactory>
+
+ <schemaFactory class="ClassicIndexSchemaFactory"/>
+
+ <luceneMatchVersion>${tests.luceneMatchVersion:LATEST}</luceneMatchVersion>
+
+ <statsCache class="${solr.statsCache:}"/>
+
+ <xi:include href="solrconfig.snippet.randomindexconfig.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
+
+ <updateHandler class="solr.DirectUpdateHandler2">
+
+ <autoCommit>
+ <maxTime>${solr.autoCommit.maxTime:-1}</maxTime>
+ </autoCommit>
+
+ <!-- autocommit pending docs if certain criteria are met
+ <autoCommit>
+ <maxDocs>10000</maxDocs>
+ <maxTime>3600000</maxTime>
+ </autoCommit>
+ -->
+
+ <updateLog enable="${enable.update.log:true}">
+ <str name="dir">${solr.ulog.dir:}</str>
+ </updateLog>
+
+ <commitWithin>
+ <softCommit>${solr.commitwithin.softcommit:true}</softCommit>
+ </commitWithin>
+
+ </updateHandler>
+
+ <query>
+ <!-- Maximum number of clauses in a boolean query... can affect
+ range or wildcard queries that expand to big boolean
+ queries. An exception is thrown if exceeded.
+ -->
+ <maxBooleanClauses>1024</maxBooleanClauses>
+
+ <!-- Cache specification for Filters or DocSets - unordered set of *all* documents
+ that match a particular query.
+ -->
+ <filterCache
+ class="solr.search.FastLRUCache"
+ size="512"
+ initialSize="512"
+ autowarmCount="2"/>
+
+ <queryResultCache
+ class="solr.search.LRUCache"
+ size="512"
+ initialSize="512"
+ autowarmCount="2"/>
+
+ <documentCache
+ class="solr.search.LRUCache"
+ size="512"
+ initialSize="512"
+ autowarmCount="0"/>
+
+ <cache name="perSegFilter"
+ class="solr.search.LRUCache"
+ size="10"
+ initialSize="0"
+ autowarmCount="10" />
+
+ <!-- If true, stored fields that are not requested will be loaded lazily.
+ -->
+ <enableLazyFieldLoading>true</enableLazyFieldLoading>
+
+ <!--
+
+ <cache name="myUserCache"
+ class="solr.search.LRUCache"
+ size="4096"
+ initialSize="1024"
+ autowarmCount="1024"
+ regenerator="MyRegenerator"
+ />
+ -->
+
+ <!--
+ <useFilterForSortedQuery>true</useFilterForSortedQuery>
+ -->
+
+ <queryResultWindowSize>10</queryResultWindowSize>
+
+ <!-- set maxSize artificially low to exercise both types of sets -->
+ <HashDocSet maxSize="3" loadFactor="0.75"/>
+
+ <!-- boolToFilterOptimizer converts boolean clauses with zero boost
+ into cached filters if the number of docs selected by the clause exceeds
+ the threshold (represented as a fraction of the total index)
+ -->
+ <boolTofilterOptimizer enabled="false" cacheSize="32" threshold=".05"/>
+
+ <!-- a newSearcher event is fired whenever a new searcher is being prepared
+ and there is a current searcher handling requests (aka registered). -->
+ <!-- QuerySenderListener takes an array of NamedList and executes a
+ local query request for each NamedList in sequence. -->
+ <!--
+ <listener event="newSearcher" class="solr.QuerySenderListener">
+ <arr name="queries">
+ <lst> <str name="q">solr</str> <str name="start">0</str> <str name="rows">10</str> </lst>
+ <lst> <str name="q">rocks</str> <str name="start">0</str> <str name="rows">10</str> </lst>
+ </arr>
+ </listener>
+ -->
+
+ <!-- a firstSearcher event is fired whenever a new searcher is being
+ prepared but there is no current registered searcher to handle
+ requests or to gain prewarming data from. -->
+ <!--
+ <listener event="firstSearcher" class="solr.QuerySenderListener">
+ <arr name="queries">
+ <lst> <str name="q">fast_warm</str> <str name="start">0</str> <str name="rows">10</str> </lst>
+ </arr>
+ </listener>
+ -->
+
+ <slowQueryThresholdMillis>2000</slowQueryThresholdMillis>
+
+ </query>
+
+ <queryResponseWriter name="xml" default="true"
+ class="solr.XMLResponseWriter" />
+
+ <!-- An alternate set representation that uses an integer hash to store filters (sets of docids).
+ If the set cardinality <= maxSize elements, then HashDocSet will be used instead of the bitset
+ based HashBitset. -->
+
+ <!-- requestHandler plugins
+ -->
+ <requestHandler name="/select" class="solr.SearchHandler">
+ <bool name="httpCaching">true</bool>
+ </requestHandler>
+
+ <requestHandler name="/dismax" class="solr.SearchHandler" >
+ <lst name="defaults">
+ <str name="defType">dismax</str>
+ <str name="q.alt">*:*</str>
+ <float name="tie">0.01</float>
+ <str name="qf">
+ text^0.5 features_t^1.0 subject^1.4 title_stemmed^2.0
+ </str>
+ <str name="pf">
+ text^0.2 features_t^1.1 subject^1.4 title_stemmed^2.0 title^1.5
+ </str>
+ <str name="bf">
+ weight^0.5 recip(rord(id),1,1000,1000)^0.3
+ </str>
+ <str name="mm">
+ 3<-1 5<-2 6<90%
+ </str>
+ <int name="ps">100</int>
+ </lst>
+ </requestHandler>
+
+ <requestHandler name="/mock" class="org.apache.solr.core.MockQuerySenderListenerReqHandler"/>
+
+ <!-- test query parameter defaults -->
+ <requestHandler name="/defaults" class="solr.SearchHandler">
+ <lst name="defaults">
+ <int name="rows">4</int>
+ <bool name="hl">true</bool>
+ <str name="hl.fl">text,name,subject,title,whitetok</str>
+ </lst>
+ </requestHandler>
+
+ <!-- test query parameter defaults -->
+ <requestHandler name="/lazy" class="solr.SearchHandler" startup="lazy">
+ <lst name="defaults">
+ <int name="rows">4</int>
+ <bool name="hl">true</bool>
+ <str name="hl.fl">text,name,subject,title,whitetok</str>
+ </lst>
+ </requestHandler>
+
+
+
+ <searchComponent name="spellcheck" class="org.apache.solr.handler.component.SpellCheckComponent">
+ <!-- This is slightly different from the field value so we can test dealing with token offset changes -->
+ <str name="queryAnalyzerFieldType">lowerpunctfilt</str>
+
+ <lst name="spellchecker">
+ <str name="name">default</str>
+ <str name="field">lowerfilt</str>
+ <str name="spellcheckIndexDir">spellchecker1</str>
+ <str name="buildOnCommit">false</str>
+ </lst>
+ <lst name="spellchecker">
+ <str name="name">direct</str>
+ <str name="classname">DirectSolrSpellChecker</str>
+ <str name="field">lowerfilt</str>
+ <int name="minQueryLength">3</int>
+ </lst>
+ <lst name="spellchecker">
+ <str name="name">wordbreak</str>
+ <str name="classname">solr.WordBreakSolrSpellChecker</str>
+ <str name="field">lowerfilt</str>
+ <str name="combineWords">true</str>
+ <str name="breakWords">true</str>
+ <int name="maxChanges">10</int>
+ </lst>
+ <lst name="spellchecker">
+ <str name="name">multipleFields</str>
+ <str name="field">lowerfilt1and2</str>
+ <str name="spellcheckIndexDir">spellcheckerMultipleFields</str>
+ <str name="buildOnCommit">false</str>
+ </lst>
+ <!-- Example of using different distance measure -->
+ <lst name="spellchecker">
+ <str name="name">jarowinkler</str>
+ <str name="field">lowerfilt</str>
+ <!-- Use a different Distance Measure -->
+ <str name="distanceMeasure">org.apache.lucene.search.spell.JaroWinklerDistance</str>
+ <str name="spellcheckIndexDir">spellchecker2</str>
+
+ </lst>
+ <lst name="spellchecker">
+ <str name="classname">solr.FileBasedSpellChecker</str>
+ <str name="name">external</str>
+ <str name="sourceLocation">spellings.txt</str>
+ <str name="characterEncoding">UTF-8</str>
+ <str name="spellcheckIndexDir">spellchecker3</str>
+ </lst>
+ <!-- Comparator -->
+ <lst name="spellchecker">
+ <str name="name">freq</str>
+ <str name="field">lowerfilt</str>
+ <str name="spellcheckIndexDir">spellcheckerFreq</str>
+ <!-- comparatorClass be one of:
+ 1. score (default)
+ 2. freq (Frequency first, then score)
+ 3. A fully qualified class name
+ -->
+ <str name="comparatorClass">freq</str>
+ <str name="buildOnCommit">false</str>
+ </lst>
+ <lst name="spellchecker">
+ <str name="name">fqcn</str>
+ <str name="field">lowerfilt</str>
+ <str name="spellcheckIndexDir">spellcheckerFQCN</str>
+ <str name="comparatorClass">org.apache.solr.spelling.SampleComparator</str>
+ <str name="buildOnCommit">false</str>
+ </lst>
+ <lst name="spellchecker">
+ <str name="name">perDict</str>
+ <str name="classname">org.apache.solr.handler.component.DummyCustomParamSpellChecker</str>
+ <str name="field">lowerfilt</str>
+ </lst>
+ </searchComponent>
+
+ <searchComponent name="termsComp" class="org.apache.solr.handler.component.TermsComponent"/>
+
+ <requestHandler name="/terms" class="org.apache.solr.handler.component.SearchHandler">
+ <arr name="components">
+ <str>termsComp</str>
+ </arr>
+ </requestHandler>
+
+
+ <!--
+ The SpellingQueryConverter to convert raw (CommonParams.Q) queries into tokens. Uses a simple regular expression
+ to strip off field markup, boosts, ranges, etc. but it is not guaranteed to match an exact parse from the query parser.
+ -->
+ <queryConverter name="queryConverter" class="org.apache.solr.spelling.SpellingQueryConverter"/>
+
+ <requestHandler name="/spellCheckCompRH" class="org.apache.solr.handler.component.SearchHandler">
+ <lst name="defaults">
+ <!-- omp = Only More Popular -->
+ <str name="spellcheck.onlyMorePopular">false</str>
+ <!-- exr = Extended Results -->
+ <str name="spellcheck.extendedResults">false</str>
+ <!-- The number of suggestions to return -->
+ <str name="spellcheck.count">1</str>
+ </lst>
+ <arr name="last-components">
+ <str>spellcheck</str>
+ </arr>
+ </requestHandler>
+ <requestHandler name="/spellCheckCompRH_Direct" class="org.apache.solr.handler.component.SearchHandler">
+ <lst name="defaults">
+ <str name="spellcheck.dictionary">direct</str>
+ <str name="spellcheck.onlyMorePopular">false</str>
+ <str name="spellcheck.extendedResults">false</str>
+ <str name="spellcheck.count">1</str>
+ </lst>
+ <arr name="last-components">
+ <str>spellcheck</str>
+ </arr>
+ </requestHandler>
+ <requestHandler name="/spellCheckWithWordbreak" class="org.apache.solr.handler.component.SearchHandler">
+ <lst name="defaults">
+ <str name="spellcheck.dictionary">default</str>
+ <str name="spellcheck.dictionary">wordbreak</str>
+ <str name="spellcheck.count">20</str>
+ </lst>
+ <arr name="last-components">
+ <str>spellcheck</str>
+ </arr>
+ </requestHandler>
+ <requestHandler name="/spellCheckWithWordbreak_Direct" class="org.apache.solr.handler.component.SearchHandler">
+ <lst name="defaults">
+ <str name="spellcheck.dictionary">direct</str>
+ <str name="spellcheck.dictionary">wordbreak</str>
+ <str name="spellcheck.count">20</str>
+ </lst>
+ <arr name="last-components">
+ <str>spellcheck</str>
+ </arr>
+ </requestHandler>
+ <requestHandler name="/spellCheckCompRH1" class="org.apache.solr.handler.component.SearchHandler">
+ <lst name="defaults">
+ <str name="defType">dismax</str>
+ <str name="qf">lowerfilt1^1</str>
+ </lst>
+ <arr name="last-components">
+ <str>spellcheck</str>
+ </arr>
+ </requestHandler>
+
+ <requestHandler name="/mltrh" class="org.apache.solr.handler.component.SearchHandler">
+
+ </requestHandler>
+
+ <searchComponent name="tvComponent" class="org.apache.solr.handler.component.TermVectorComponent"/>
+
+ <requestHandler name="/tvrh" class="org.apache.solr.handler.component.SearchHandler">
+ <lst name="defaults">
+
+ </lst>
+ <arr name="last-components">
+ <str>tvComponent</str>
+ </arr>
+ </requestHandler>
+
+ <requestHandler name="/mlt" class="solr.MoreLikeThisHandler">
+ </requestHandler>
+
+ <searchComponent class="solr.HighlightComponent" name="highlight">
+ <highlighting>
+ <!-- Configure the standard fragmenter -->
+ <fragmenter name="gap" class="org.apache.solr.highlight.GapFragmenter" default="true">
+ <lst name="defaults">
+ <int name="hl.fragsize">100</int>
+ </lst>
+ </fragmenter>
+
+ <fragmenter name="regex" class="org.apache.solr.highlight.RegexFragmenter">
+ <lst name="defaults">
+ <int name="hl.fragsize">70</int>
+ </lst>
+ </fragmenter>
+
+ <!-- Configure the standard formatter -->
+ <formatter name="html" class="org.apache.solr.highlight.HtmlFormatter" default="true">
+ <lst name="defaults">
+ <str name="hl.simple.pre"><![CDATA[<em>]]></str>
+ <str name="hl.simple.post"><![CDATA[</em>]]></str>
+ </lst>
+ </formatter>
+
+ <!-- Configure the standard fragListBuilder -->
+ <fragListBuilder name="simple" class="org.apache.solr.highlight.SimpleFragListBuilder" default="true"/>
+
+ <!-- Configure the standard fragmentsBuilder -->
+ <fragmentsBuilder name="simple" class="org.apache.solr.highlight.SimpleFragmentsBuilder" default="true"/>
+ <fragmentsBuilder name="scoreOrder" class="org.apache.solr.highlight.ScoreOrderFragmentsBuilder"/>
+
+ <boundaryScanner name="simple" class="solr.highlight.SimpleBoundaryScanner" default="true">
+ <lst name="defaults">
+ <str name="hl.bs.maxScan">10</str>
+ <str name="hl.bs.chars">.,!? 	 </str>
+ </lst>
+ </boundaryScanner>
+
+ <boundaryScanner name="breakIterator" class="solr.highlight.BreakIteratorBoundaryScanner">
+ <lst name="defaults">
+ <str name="hl.bs.type">WORD</str>
+ <str name="hl.bs.language">en</str>
+ <str name="hl.bs.country">US</str>
+ </lst>
+ </boundaryScanner>
+ </highlighting>
+ </searchComponent>
+
+ <requestDispatcher>
+ <requestParsers enableRemoteStreaming="true" enableStreamBody="true" multipartUploadLimitInKB="-1" />
+ <httpCaching lastModifiedFrom="openTime" etagSeed="Solr" never304="false">
+ <cacheControl>max-age=30, public</cacheControl>
+ </httpCaching>
+ </requestDispatcher>
+
+ <requestHandler name="/search-facet-def" class="solr.SearchHandler" >
+ <lst name="defaults">
+ <str name="facet.field">foo_s</str>
+ </lst>
+ <lst name="appends">
+ <str name="facet.query">foo_s:bar</str>
+ </lst>
+ </requestHandler>
+ <requestHandler name="/search-facet-invariants" class="solr.SearchHandler" >
+ <lst name="invariants">
+ <str name="facet.field">foo_s</str>
+ <str name="facet.query">foo_s:bar</str>
+ </lst>
+ </requestHandler>
+
+ <admin>
+ <defaultQuery>solr</defaultQuery>
+ <gettableFiles>solrconfig.xml schema.xml</gettableFiles>
+ </admin>
+
+ <!-- test getting system property -->
+ <propTest attr1="${solr.test.sys.prop1}-$${literal}"
+ attr2="${non.existent.sys.prop:default-from-config}">prefix-${solr.test.sys.prop2}-suffix</propTest>
+
+ <queryParser name="foo" class="FooQParserPlugin"/>
+
+ <updateRequestProcessorChain name="dedupe">
+ <processor class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
+ <bool name="enabled">false</bool>
+ <bool name="overwriteDupes">true</bool>
+ <str name="fields">v_t,t_field</str>
+ <str name="signatureClass">org.apache.solr.update.processor.TextProfileSignature</str>
+ </processor>
+ <processor class="solr.RunUpdateProcessorFactory" />
+ </updateRequestProcessorChain>
+ <updateRequestProcessorChain name="dedupe-allfields">
+ <processor class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
+ <bool name="enabled">false</bool>
+ <bool name="overwriteDupes">false</bool>
+ <str name="signatureField">id</str>
+ <str name="fields"></str>
+ <str name="signatureClass">org.apache.solr.update.processor.Lookup3Signature</str>
+ </processor>
+ <processor class="solr.RunUpdateProcessorFactory" />
+ </updateRequestProcessorChain>
+ <updateRequestProcessorChain name="stored_sig">
+ <!-- this chain is valid even though the signature field is not
+ indexed, because we are not asking for dups to be overwritten
+ -->
+ <processor class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
+ <bool name="enabled">true</bool>
+ <str name="signatureField">non_indexed_signature_sS</str>
+ <bool name="overwriteDupes">false</bool>
+ <str name="fields">v_t,t_field</str>
+ <str name="signatureClass">org.apache.solr.update.processor.TextProfileSignature</str>
+ </processor>
+ <processor class="solr.RunUpdateProcessorFactory" />
+ </updateRequestProcessorChain>
+ <updateRequestProcessorChain name="uniq-fields">
+ <processor class="org.apache.solr.update.processor.UniqFieldsUpdateProcessorFactory">
+ <arr name="fieldName">
+ <str>uniq</str>
+ <str>uniq2</str>
+ <str>uniq3</str>
+ </arr>
+ </processor>
+ <processor class="solr.RunUpdateProcessorFactory" />
+ </updateRequestProcessorChain>
+
+ <updateRequestProcessorChain name="distrib-dup-test-chain-explicit">
+ <!-- explicit test using processors before and after distrib -->
+ <processor class="solr.RegexReplaceProcessorFactory">
+ <str name="fieldName">regex_dup_A_s</str>
+ <str name="pattern">x</str>
+ <str name="replacement">x_x</str>
+ </processor>
+ <processor class="solr.DistributedUpdateProcessorFactory" />
+ <processor class="solr.RegexReplaceProcessorFactory">
+ <str name="fieldName">regex_dup_B_s</str>
+ <str name="pattern">x</str>
+ <str name="replacement">x_x</str>
+ </processor>
+ <processor class="solr.RunUpdateProcessorFactory" />
+ </updateRequestProcessorChain>
+
+ <updateRequestProcessorChain name="distrib-dup-test-chain-implicit">
+ <!-- implicit test w/o distrib declared-->
+ <processor class="solr.RegexReplaceProcessorFactory">
+ <str name="fieldName">regex_dup_A_s</str>
+ <str name="pattern">x</str>
+ <str name="replacement">x_x</str>
+ </processor>
+ <processor class="solr.RegexReplaceProcessorFactory">
+ <str name="fieldName">regex_dup_B_s</str>
+ <str name="pattern">x</str>
+ <str name="replacement">x_x</str>
+ </processor>
+ <processor class="solr.RunUpdateProcessorFactory" />
+ </updateRequestProcessorChain>
+
+ <restManager>
+ <!--
+ IMPORTANT: Due to the Lucene SecurityManager, tests can only write to their runtime directory or below.
+ But it's easier to just keep everything in memory for testing so no remnants are left behind.
+ -->
+ <str name="storageIO">org.apache.solr.rest.ManagedResourceStorage$InMemoryStorageIO</str>
+ </restManager>
+
+ <!-- warning: not a best practice; requests generally ought to be explicit to thus not require this -->
+ <initParams path="/select,/dismax,/defaults,/lazy,/spellCheckCompRH,/spellCheckWithWordbreak,/spellCheckCompRH_Direct,/spellCheckCompRH1,/mltrh,/tvrh,/search-facet-def,/search-facet-invariants,/terms">
+ <lst name="defaults">
+ <str name="df">text</str>
+ </lst>
+ </initParams>
+
+ <queryParser name="minhash" class="org.apache.solr.search.MinHashQParserPlugin" />
+
+</config>
+
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3093dc52/solr/core/src/test/org/apache/solr/search/TestMinHashQParser.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/search/TestMinHashQParser.java b/solr/core/src/test/org/apache/solr/search/TestMinHashQParser.java
new file mode 100644
index 0000000..78027cb
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/search/TestMinHashQParser.java
@@ -0,0 +1,358 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.search;
+
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.Query;
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.request.SolrQueryRequest;
+import org.junit.After;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class TestMinHashQParser extends SolrTestCaseJ4 {
+
+ /**
+ * Initializes a core with the MinHash-specific config and schema
+ * (solrconfig-minhash.xml / schema-minhash.xml) before any test runs.
+ */
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ initCore("solrconfig-minhash.xml", "schema-minhash.xml");
+ }
+
+ /** Deletes all documents and commits so each test starts from an empty index. */
+ @After
+ public void afterTest() {
+ assertU(delQ("*:*"));
+ assertU(commit());
+ }
+
+ /**
+ * Verifies the LSH band-size computation for 100 hashes across a range of
+ * similarity thresholds. NOTE(review): every case expects a band size of 5;
+ * presumably computeBandSize picks the smallest band size satisfying the
+ * requested true-positive rate (tp) at the given similarity - confirm
+ * against MinHashQParser.computeBandSize.
+ */
+ @Test
+ public void testBandSize() {
+ // Examples from mining massive data sets
+ assertEquals(5, MinHashQParser.computeBandSize(100, 0.8, 0.9995));
+ assertEquals(5, MinHashQParser.computeBandSize(100, 0.7, 0.974));
+ assertEquals(5, MinHashQParser.computeBandSize(100, 0.6, 0.8));
+ assertEquals(5, MinHashQParser.computeBandSize(100, 0.5, 0.465));
+ assertEquals(5, MinHashQParser.computeBandSize(100, 0.4, 0.185));
+ assertEquals(5, MinHashQParser.computeBandSize(100, 0.3, 0.046));
+ assertEquals(5, MinHashQParser.computeBandSize(100, 0.2, 0.005));
+ }
+
+
+ /**
+ * Exercises the parser against a field whose MinHash tokens are produced by
+ * an index-time analysis chain (defined in schema-minhash.xml, not visible
+ * here). Scores count matching hash clauses; 512.0 for an exact match
+ * suggests 512 hashes per document - confirm against the field type.
+ */
+ @Test
+ public void testAnalysedMinHash() {
+ // Two near-duplicate sentences differing only in a few middle words.
+ assertU(adoc("id", "doc_1", "min_hash_analysed", "Min Hashing is great for spotted strings of exact matching words"));
+ assertU(adoc("id", "doc_2", "min_hash_analysed", "Min Hashing is great for rabbits who like to spot strings of exact matching words"));
+ assertU(commit());
+
+ String gQuery = "*:*";
+ SolrQueryRequest qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='2']");
+
+ // Querying with doc_1's exact text: doc_1 matches every hash (512),
+ // doc_2 only a subset (255).
+ gQuery = "{!minhash field=\"min_hash_analysed\"}Min Hashing is great for spotted strings of exact matching words";
+ qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='2']",
+ "//result/doc[1]/str[@name='id'][.='doc_1']",
+ "//result/doc[1]/float[@name='score'][.=512.0]",
+ "//result/doc[2]/str[@name='id'][.='doc_2']",
+ "//result/doc[2]/float[@name='score'][.=255.0]");
+
+ // The shared prefix alone scores 512 for both docs - its hashes are
+ // apparently present in each.
+ gQuery = "{!minhash field=\"min_hash_analysed\"}Min Hashing is great for";
+ qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='2']",
+ "//result/doc[1]/str[@name='id'][.='doc_1']",
+ "//result/doc[1]/float[@name='score'][.=512.0]",
+ "//result/doc[2]/str[@name='id'][.='doc_2']",
+ "//result/doc[2]/float[@name='score'][.=512.0]");
+
+ // With sim/tp set the parser builds a banded (LSH) query; scores now count
+ // matching bands, hence the much smaller values.
+ gQuery = "{!minhash field=\"min_hash_analysed\" sim=\"0.9\" tp=\"0.9\"}Min Hashing is great for spotted strings of exact matching words";
+ qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='2']",
+ "//result/doc[1]/str[@name='id'][.='doc_1']",
+ "//result/doc[1]/float[@name='score'][.=23.0]",
+ "//result/doc[2]/str[@name='id'][.='doc_2']",
+ "//result/doc[2]/float[@name='score'][.=10.0]");
+
+ // sim alone (default tp) filters out doc_2, which is below 0.9 similarity.
+ gQuery = "{!minhash field=\"min_hash_analysed\" sim=\"0.9\"}Min Hashing is great for spotted strings of exact matching words";
+ qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='1']",
+ "//result/doc[1]/str[@name='id'][.='doc_1']",
+ "//result/doc[1]/float[@name='score'][.=512.0]");
+
+ // analyzer_field pointing at the same field is a no-op.
+ gQuery = "{!minhash field=\"min_hash_analysed\" sim=\"0.9\" analyzer_field=\"min_hash_analysed\"}Min Hashing is great for spotted strings of exact matching words";
+ qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='1']",
+ "//result/doc[1]/str[@name='id'][.='doc_1']",
+ "//result/doc[1]/float[@name='score'][.=512.0]");
+
+ // Borrowing the string field's analyzer yields hashes that never match the
+ // analysed field - presumably a keyword-style analyzer; verify in schema.
+ gQuery = "{!minhash field=\"min_hash_analysed\" sim=\"0.9\" analyzer_field=\"min_hash_string\"}Min Hashing is great for spotted strings of exact matching words";
+ qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='0']");
+ }
+
+ /**
+ * Exercises the parser against a string field holding pre-computed hash
+ * values; each matching hash token contributes 1.0 to the score.
+ */
+ @Test
+ public void testPreAnalysedMinHash() {
+ // The docs share HASH1/HASH2 and differ in their third hash.
+ assertU(adoc("id", "doc_1", "min_hash_string", "HASH1", "min_hash_string", "HASH2", "min_hash_string", "HASH3"));
+ assertU(adoc("id", "doc_2", "min_hash_string", "HASH1", "min_hash_string", "HASH2", "min_hash_string", "HASH4"));
+ assertU(commit());
+
+ String gQuery = "*:*";
+ SolrQueryRequest qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='2']",
+ "//result/doc[1]/str[@name='id'][.='doc_1']",
+ "//result/doc[1]/float[@name='score'][.=1.0]",
+ "//result/doc[2]/str[@name='id'][.='doc_2']",
+ "//result/doc[2]/float[@name='score'][.=1.0]");
+
+ // A single shared hash matches both docs with score 1.0.
+ gQuery = "{!minhash field=\"min_hash_string\"}HASH1";
+ qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='2']",
+ "//result/doc[1]/str[@name='id'][.='doc_1']",
+ "//result/doc[1]/float[@name='score'][.=1.0]",
+ "//result/doc[2]/str[@name='id'][.='doc_2']",
+ "//result/doc[2]/float[@name='score'][.=1.0]");
+
+
+ // sep splits the query text into individual hash tokens: doc_1 matches all
+ // three, doc_2 only HASH1 and HASH2.
+ gQuery = "{!minhash field=\"min_hash_string\" sep=\",\"}HASH1,HASH2,HASH3";
+ qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='2']",
+ "//result/doc[1]/str[@name='id'][.='doc_1']",
+ "//result/doc[1]/float[@name='score'][.=3.0]",
+ "//result/doc[2]/str[@name='id'][.='doc_2']",
+ "//result/doc[2]/float[@name='score'][.=2.0]");
+ }
+
+ /**
+ * Verifies the minhash parser works when nested via the _query_ magic field,
+ * combined with a *:* clause; each result score is 1.0 (*:*) plus the
+ * minhash contribution.
+ */
+ @Test
+ public void testNestedQuery() {
+
+ assertU(adoc("id", "doc_1", "min_hash_string", "HASH1", "min_hash_string", "HASH2", "min_hash_string", "HASH3"));
+ assertU(adoc("id", "doc_2", "min_hash_string", "HASH1", "min_hash_string", "HASH2", "min_hash_string", "HASH4"));
+ assertU(commit());
+
+ // Sanity check: both docs are indexed.
+ String gQuery = "*:*";
+ SolrQueryRequest qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='2']",
+ "//result/doc[1]/str[@name='id'][.='doc_1']",
+ "//result/doc[1]/float[@name='score'][.=1.0]",
+ "//result/doc[2]/str[@name='id'][.='doc_2']",
+ "//result/doc[2]/float[@name='score'][.=1.0]");
+
+ // Only doc_1 contains HASH3: score 1.0 (*:*) + 1.0 (hash match).
+ gQuery = "*:* AND _query_:{!minhash field=\"min_hash_string\" sep=\",\"}HASH3";
+ qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='1']",
+ "//result/doc[1]/str[@name='id'][.='doc_1']",
+ "//result/doc[1]/float[@name='score'][.=2.0]");
+
+ // Was sep="," sep="0.9" - the duplicated sep key was a typo for sim and
+ // silently overrode the separator, so the sim/tp path was never exercised.
+ // With a single hash the banded query still requires that one hash, so the
+ // expectations are unchanged.
+ gQuery = "*:* AND _query_:{!minhash field=\"min_hash_string\" sep=\",\" sim=\"0.9\" tp=\"0.9\"}HASH3";
+ qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='1']",
+ "//result/doc[1]/str[@name='id'][.='doc_1']",
+ "//result/doc[1]/float[@name='score'][.=2.0]");
+
+ // Same typo fixed here (was sep="0.1"): a low threshold must also match.
+ gQuery = "*:* AND _query_:{!minhash field=\"min_hash_string\" sep=\",\" sim=\"0.1\" tp=\"0.1\"}HASH3";
+ qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='1']",
+ "//result/doc[1]/str[@name='id'][.='doc_1']",
+ "//result/doc[1]/float[@name='score'][.=2.0]");
+
+ }
+
+ /**
+ * Basic ranking checks on the analysed field with three overlapping "woof"
+ * documents, plus direct queries using raw hash-token literals.
+ */
+ @Test
+ public void testBasic() {
+
+ assertU(adoc("id", "doc_1", "min_hash_analysed", "woof woof woof woof woof"));
+ assertU(adoc("id", "doc_2", "min_hash_analysed", "woof woof woof woof woof puff"));
+ assertU(adoc("id", "doc_3", "min_hash_analysed", "woof woof woof woof puff"));
+ assertU(commit());
+
+ String gQuery = "*:*";
+ SolrQueryRequest qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='3']",
+ "//result/doc[1]/str[@name='id'][.='doc_1']",
+ "//result/doc[1]/float[@name='score'][.=1.0]",
+ "//result/doc[2]/str[@name='id'][.='doc_2']",
+ "//result/doc[2]/float[@name='score'][.=1.0]",
+ "//result/doc[3]/str[@name='id'][.='doc_3']",
+ "//result/doc[3]/float[@name='score'][.=1.0]");
+
+
+ // Exact text of doc_2 ranks doc_2 first with a full 512 match; the other
+ // docs get partial hash overlap.
+ gQuery = "{!minhash field=\"min_hash_analysed\"}woof woof woof woof woof puff";
+ qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='3']",
+ "//result/doc[1]/str[@name='id'][.='doc_2']",
+ "//result/doc[1]/float[@name='score'][.=512.0]",
+ "//result/doc[2]/str[@name='id'][.='doc_1']",
+ "//result/doc[2]/float[@name='score'][.=295.0]",
+ "//result/doc[3]/str[@name='id'][.='doc_3']",
+ "//result/doc[3]/float[@name='score'][.=217.0]");
+
+ // The literals below appear to be raw MinHash token strings (as stored in
+ // the index) queried verbatim via sep splitting - do not re-encode them.
+ gQuery = "{!minhash field=\"min_hash_analysed\" sep=\",\"}℁팽徭聙↝ꇁ홱杯,跻\uF7E1ꠅ�찼薷\uE24Eꔾ";
+ qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='3']",
+ "//result/doc[1]/str[@name='id'][.='doc_2']",
+ "//result/doc[1]/float[@name='score'][.=2.0]",
+ "//result/doc[2]/str[@name='id'][.='doc_1']",
+ "//result/doc[2]/float[@name='score'][.=1.0]",
+ "//result/doc[3]/str[@name='id'][.='doc_3']",
+ "//result/doc[3]/float[@name='score'][.=1.0]");
+
+ // With the string field's analyzer the single token is matched as-is.
+ gQuery = "{!minhash field=\"min_hash_analysed\" analyzer_field=\"min_hash_string\"}℁팽徭聙↝ꇁ홱杯";
+ qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='2']",
+ "//result/doc[1]/str[@name='id'][.='doc_1']",
+ "//result/doc[1]/float[@name='score'][.=1.0]",
+ "//result/doc[2]/str[@name='id'][.='doc_2']",
+ "//result/doc[2]/float[@name='score'][.=1.0]");
+
+ }
+
+
+ /**
+ * Indexes every contiguous word window of length >= 5 drawn from a ten-word
+ * sequence (21 docs total: 6+5+4+3+2+1) and checks that minhash recall
+ * grows with the query window's overlap, then verifies the full ranking for
+ * the complete ten-word query. Doc ids are doc_&lt;start&gt;_&lt;endOffset&gt;.
+ */
+ @Test
+ public void test() {
+
+ String[] parts = new String[]{"one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"};
+
+ // For each start position i, grow the window word by word and index it
+ // once it reaches 5 words (j is the zero-based end offset).
+ for (int i = 0; i < parts.length; i++) {
+ StringBuilder builder = new StringBuilder();
+ for (int j = 0; j < parts.length - i; j++) {
+ if (builder.length() > 0) {
+ builder.append(" ");
+ }
+ builder.append(parts[i + j]);
+ if (j >= 5 - 1) {
+ assertU(adoc("id", "doc_" + i + "_" + j, "min_hash_analysed", builder.toString()));
+ }
+ }
+ }
+
+ assertU(commit());
+
+ String gQuery = "*:*";
+ SolrQueryRequest qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='21']");
+
+ // Five-word queries near the middle of the sequence overlap more windows
+ // and therefore match more documents (6, 10, 12, 12, 10, 6).
+ gQuery = "{!minhash field=\"min_hash_analysed\"}one two three four five";
+ qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='6']");
+
+ gQuery = "{!minhash field=\"min_hash_analysed\"}two three four five six";
+ qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='10']");
+
+ gQuery = "{!minhash field=\"min_hash_analysed\"}three four five six seven";
+ qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='12']");
+
+ gQuery = "{!minhash field=\"min_hash_analysed\"}four five six seven eight";
+ qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='12']");
+
+ gQuery = "{!minhash field=\"min_hash_analysed\"}five six seven eight nine";
+ qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='10']");
+
+ gQuery = "{!minhash field=\"min_hash_analysed\"}six seven eight nine ten";
+ qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='6']");
+
+
+ // The full ten-word query matches all 21 windows; the whole sequence
+ // (doc_0_9) scores a perfect 512 and longer/more-central windows rank
+ // higher than short edge windows.
+ gQuery = "{!minhash field=\"min_hash_analysed\"}one two three four five six seven eight nine ten";
+ qr = createRequest(gQuery);
+ assertQ(qr, "//*[@numFound='21']",
+ "//result/doc[1]/str[@name='id'][.='doc_0_9']",
+ "//result/doc[1]/float[@name='score'][.=512.0]",
+ "//result/doc[2]/str[@name='id'][.='doc_1_8']",
+ "//result/doc[2]/float[@name='score'][.=425.0]",
+ "//result/doc[3]/str[@name='id'][.='doc_0_8']",
+ "//result/doc[3]/float[@name='score'][.=341.0]",
+ "//result/doc[4]/str[@name='id'][.='doc_2_7']",
+ "//result/doc[4]/float[@name='score'][.=331.0]",
+ "//result/doc[5]/str[@name='id'][.='doc_0_7']",
+ "//result/doc[5]/float[@name='score'][.=305.0]",
+ "//result/doc[6]/str[@name='id'][.='doc_3_6']",
+ "//result/doc[6]/float[@name='score'][.=274.0]",
+ "//result/doc[7]/str[@name='id'][.='doc_1_7']",
+ "//result/doc[7]/float[@name='score'][.=254.0]",
+ "//result/doc[8]/str[@name='id'][.='doc_0_6']",
+ "//result/doc[8]/float[@name='score'][.=238.0]",
+ "//result/doc[9]/str[@name='id'][.='doc_1_6']",
+ "//result/doc[9]/float[@name='score'][.=218.0]",
+ "//result/doc[10]/str[@name='id'][.='doc_4_5']",
+ "//result/doc[10]/float[@name='score'][.=207.0]",
+ "//result/doc[11]/str[@name='id'][.='doc_0_5']",
+ "//result/doc[11]/float[@name='score'][.=181.0]",
+
+ "//result/doc[12]/str[@name='id'][.='doc_5_4']",
+ "//result/doc[12]/float[@name='score'][.=171.0]",
+ "//result/doc[13]/str[@name='id'][.='doc_2_6']",
+ "//result/doc[13]/float[@name='score'][.=160.0]",
+ "//result/doc[14]/str[@name='id'][.='doc_1_5']",
+ "//result/doc[14]/float[@name='score'][.=151.0]",
+ "//result/doc[15]/str[@name='id'][.='doc_2_5']",
+ "//result/doc[15]/float[@name='score'][.=124.0]",
+ "//result/doc[16]/str[@name='id'][.='doc_3_5']",
+ "//result/doc[16]/float[@name='score'][.=103.0]",
+ "//result/doc[17]/str[@name='id'][.='doc_1_4']",
+ "//result/doc[17]/float[@name='score'][.=94.0]",
+ "//result/doc[18]/str[@name='id'][.='doc_0_4']",
+ "//result/doc[18]/float[@name='score'][.=87.0]",
+ "//result/doc[19]/str[@name='id'][.='doc_3_4']",
+ "//result/doc[19]/float[@name='score'][.=67.0]",
+ "//result/doc[20]/str[@name='id'][.='doc_2_4']",
+ "//result/doc[20]/float[@name='score'][.=57.0]"
+ // "//result/doc[21]/str[@name='id'][.='doc_0_8']",
+ // "//result/doc[21]/float[@name='score'][.=341.0]"
+ );
+ }
+
+ /**
+ * Builds a banded query over 10 comma-separated hashes with sim=0.8 and
+ * tp=0.694 and asserts the structure: 4 bands of 3 hashes each. 4 bands of
+ * 3 cover 12 slots over only 10 hashes, so the last band wraps around to
+ * the start - hence the test name.
+ */
+ @Test
+ public void testBandsWrap() throws SyntaxError {
+
+ NamedList<Object> par = new NamedList<>();
+ par.add("sim", "0.8");
+ par.add("tp", "0.694");
+ par.add("sep", ",");
+ par.add("debug", "false");
+
+ // Drive the parser directly (no request) to inspect the built query tree.
+ QParser qparser = h.getCore().getQueryPlugin("minhash").createParser("1, 2, 3, 4, 5, 6, 7, 8, 9, 10", SolrParams.toSolrParams(par), null, null);
+ Query query = qparser.getQuery();
+
+ // Top level: one ConstantScoreQuery clause per band.
+ BooleanQuery bq = (BooleanQuery)query;
+ assertEquals(4, bq.clauses().size());
+ for(BooleanClause clause : bq.clauses()) {
+ assertEquals(3, ((BooleanQuery)((ConstantScoreQuery)clause.getQuery()).getQuery()) .clauses().size());
+ }
+
+ }
+
+ /**
+ * Wraps a query string in a SolrQueryRequest with the parameters the tests
+ * rely on: debug off, up to 30 rows (all fixtures fit), only id and score
+ * returned, and any qt removed so the default handler is used.
+ */
+ private SolrQueryRequest createRequest(String query) {
+ SolrQueryRequest qr = req(query);
+ NamedList<Object> par = qr.getParams().toNamedList();
+ par.add("debug", "false");
+ par.add("rows", "30");
+ par.add("fl", "id,score");
+ par.remove("qt");
+ SolrParams newp = SolrParams.toSolrParams(par);
+ qr.setParams(newp);
+ return qr;
+ }
+}