You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ab...@apache.org on 2017/04/03 18:45:08 UTC
[02/52] [abbrv] lucene-solr:jira/solr-9959: SOLR-10344: Update Solr
default/example and test configs to use WordDelimiterGraphFilterFactory
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/core/src/test-files/solr/configsets/cloud-dynamic/conf/schema.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/configsets/cloud-dynamic/conf/schema.xml b/solr/core/src/test-files/solr/configsets/cloud-dynamic/conf/schema.xml
index af201c0..d15c199 100644
--- a/solr/core/src/test-files/solr/configsets/cloud-dynamic/conf/schema.xml
+++ b/solr/core/src/test-files/solr/configsets/cloud-dynamic/conf/schema.xml
@@ -45,26 +45,41 @@
<fieldType name="failtype1" class="solr.TextField">
<analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
catenateNumbers="0" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
</analyzer>
</fieldType>
<!-- Demonstrating ignoreCaseChange -->
<fieldType name="wdf_nocase" class="solr.TextField">
- <analyzer>
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
+ catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="wdf_preserve" class="solr.TextField">
- <analyzer>
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="0"
+ catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="0"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
@@ -86,14 +101,15 @@
<fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
<analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
@@ -111,14 +127,15 @@
<fieldType name="text_np" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/core/src/test-files/solr/configsets/doc-expiry/conf/schema.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/configsets/doc-expiry/conf/schema.xml b/solr/core/src/test-files/solr/configsets/doc-expiry/conf/schema.xml
index 8400fe8..c622eba 100644
--- a/solr/core/src/test-files/solr/configsets/doc-expiry/conf/schema.xml
+++ b/solr/core/src/test-files/solr/configsets/doc-expiry/conf/schema.xml
@@ -45,26 +45,41 @@
<fieldType name="failtype1" class="solr.TextField">
<analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
catenateNumbers="0" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
</analyzer>
</fieldType>
<!-- Demonstrating ignoreCaseChange -->
<fieldType name="wdf_nocase" class="solr.TextField">
- <analyzer>
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
+ catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="wdf_preserve" class="solr.TextField">
- <analyzer>
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="0"
+ catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="0"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
@@ -86,14 +101,15 @@
<fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
<analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
@@ -111,14 +127,15 @@
<fieldType name="text_np" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/core/src/test/org/apache/solr/ConvertedLegacyTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/ConvertedLegacyTest.java b/solr/core/src/test/org/apache/solr/ConvertedLegacyTest.java
index 767b811..bf7925a 100644
--- a/solr/core/src/test/org/apache/solr/ConvertedLegacyTest.java
+++ b/solr/core/src/test/org/apache/solr/ConvertedLegacyTest.java
@@ -872,7 +872,7 @@ public class ConvertedLegacyTest extends SolrTestCaseJ4 {
);
- // intra-word delimiter testing (WordDelimiterFilter)
+ // intra-word delimiter testing (WordDelimiterGraphFilter)
assertU("<add><doc><field name=\"id\">42</field><field name=\"subword\">foo bar</field></doc></add>");
assertU("<commit/>");
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java b/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
index d2ef555..fc0f6be 100644
--- a/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
+++ b/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
@@ -382,7 +382,7 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
}
@Test
- public void testPositionHistoryWithWDF() throws Exception {
+ public void testPositionHistoryWithWDGF() throws Exception {
FieldAnalysisRequest request = new FieldAnalysisRequest();
request.addFieldType("skutype1");
@@ -407,12 +407,12 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
assertToken(tokenList.get(1), new TokenInfo("3456-12", null, "word", 4, 11, 2, new int[]{2}, null, false));
assertToken(tokenList.get(2), new TokenInfo("a", null, "word", 12, 13, 3, new int[]{3}, null, false));
assertToken(tokenList.get(3), new TokenInfo("Test", null, "word", 14, 18, 4, new int[]{4}, null, false));
- tokenList = indexPart.get("org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter");
- assertNotNull("Expcting WordDelimiterFilter analysis breakdown", tokenList);
+ tokenList = indexPart.get("org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter");
+ assertNotNull("Expcting WordDelimiterGraphFilter analysis breakdown", tokenList);
assertEquals(6, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("hi", null, "word", 0, 2, 1, new int[]{1,1}, null, false));
- assertToken(tokenList.get(1), new TokenInfo("3456", null, "word", 4, 8, 2, new int[]{2,2}, null, false));
- assertToken(tokenList.get(2), new TokenInfo("345612", null, "word", 4, 11, 2, new int[]{2,2}, null, false));
+ assertToken(tokenList.get(1), new TokenInfo("345612", null, "word", 4, 11, 2, new int[]{2,2}, null, false));
+ assertToken(tokenList.get(2), new TokenInfo("3456", null, "word", 4, 8, 2, new int[]{2,2}, null, false));
assertToken(tokenList.get(3), new TokenInfo("12", null, "word", 9, 11, 3, new int[]{2,3}, null, false));
assertToken(tokenList.get(4), new TokenInfo("a", null, "word", 12, 13, 4, new int[]{3,4}, null, false));
assertToken(tokenList.get(5), new TokenInfo("Test", null, "word", 14, 18, 5, new int[]{4,5}, null, false));
@@ -420,8 +420,8 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
assertNotNull("Expcting LowerCaseFilter analysis breakdown", tokenList);
assertEquals(6, tokenList.size());
assertToken(tokenList.get(0), new TokenInfo("hi", null, "word", 0, 2, 1, new int[]{1,1,1}, null, false));
- assertToken(tokenList.get(1), new TokenInfo("3456", null, "word", 4, 8, 2, new int[]{2,2,2}, null, false));
- assertToken(tokenList.get(2), new TokenInfo("345612", null, "word", 4, 11, 2, new int[]{2,2,2}, null, false));
+ assertToken(tokenList.get(1), new TokenInfo("345612", null, "word", 4, 11, 2, new int[]{2,2,2}, null, false));
+ assertToken(tokenList.get(2), new TokenInfo("3456", null, "word", 4, 8, 2, new int[]{2,2,2}, null, false));
assertToken(tokenList.get(3), new TokenInfo("12", null, "word", 9, 11, 3, new int[]{2,3,3}, null, false));
assertToken(tokenList.get(4), new TokenInfo("a", null, "word", 12, 13, 4, new int[]{3,4,4}, null, false));
assertToken(tokenList.get(5), new TokenInfo("test", null, "word", 14, 18, 5, new int[]{4,5,5}, null, false));
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/core/src/test/org/apache/solr/rest/schema/TestBulkSchemaAPI.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/rest/schema/TestBulkSchemaAPI.java b/solr/core/src/test/org/apache/solr/rest/schema/TestBulkSchemaAPI.java
index 166d1fc..9f37967 100644
--- a/solr/core/src/test/org/apache/solr/rest/schema/TestBulkSchemaAPI.java
+++ b/solr/core/src/test/org/apache/solr/rest/schema/TestBulkSchemaAPI.java
@@ -396,7 +396,7 @@ public class TestBulkSchemaAPI extends RestTestBase {
" 'name' : 'myNewTxtField',\n" +
" 'class':'solr.TextField',\n" +
" 'positionIncrementGap':'100',\n" +
- " 'analyzer' : {\n" +
+ " 'indexAnalyzer' : {\n" +
" 'charFilters':[\n" +
" {\n" +
" 'class':'solr.PatternReplaceCharFilterFactory',\n" +
@@ -407,7 +407,32 @@ public class TestBulkSchemaAPI extends RestTestBase {
" 'tokenizer':{'class':'solr.WhitespaceTokenizerFactory'},\n" +
" 'filters':[\n" +
" {\n" +
- " 'class':'solr.WordDelimiterFilterFactory',\n" +
+ " 'class':'solr.WordDelimiterGraphFilterFactory',\n" +
+ " 'preserveOriginal':'0'\n" +
+ " },\n" +
+ " {\n" +
+ " 'class':'solr.StopFilterFactory',\n" +
+ " 'words':'stopwords.txt',\n" +
+ " 'ignoreCase':'true'\n" +
+ " },\n" +
+ " {'class':'solr.LowerCaseFilterFactory'},\n" +
+ " {'class':'solr.ASCIIFoldingFilterFactory'},\n" +
+ " {'class':'solr.KStemFilterFactory'},\n" +
+ " {'class':'solr.FlattenGraphFilterFactory'}\n" +
+ " ]\n" +
+ " },\n" +
+ " 'queryAnalyzer' : {\n" +
+ " 'charFilters':[\n" +
+ " {\n" +
+ " 'class':'solr.PatternReplaceCharFilterFactory',\n" +
+ " 'replacement':'$1$1',\n" +
+ " 'pattern':'([a-zA-Z])\\\\\\\\1+'\n" +
+ " }\n" +
+ " ],\n" +
+ " 'tokenizer':{'class':'solr.WhitespaceTokenizerFactory'},\n" +
+ " 'filters':[\n" +
+ " {\n" +
+ " 'class':'solr.WordDelimiterGraphFilterFactory',\n" +
" 'preserveOriginal':'0'\n" +
" },\n" +
" {\n" +
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/example/example-DIH/solr/db/conf/managed-schema
----------------------------------------------------------------------
diff --git a/solr/example/example-DIH/solr/db/conf/managed-schema b/solr/example/example-DIH/solr/db/conf/managed-schema
index eead56f..1a1012f 100644
--- a/solr/example/example-DIH/solr/db/conf/managed-schema
+++ b/solr/example/example-DIH/solr/db/conf/managed-schema
@@ -500,7 +500,7 @@
<!-- A text field with defaults appropriate for English, plus
aggressive word-splitting and autophrase features enabled.
This field is just like text_en, except it adds
- WordDelimiterFilter to enable splitting and matching of
+ WordDelimiterGraphFilter to enable splitting and matching of
words on case-change, alpha numeric boundaries, and
non-alphanumeric chars. This means certain compound word
cases will work, for example query "wi fi" will match
@@ -518,10 +518,11 @@
ignoreCase="true"
words="lang/stopwords_en.txt"
/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
@@ -530,7 +531,7 @@
ignoreCase="true"
words="lang/stopwords_en.txt"
/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
@@ -540,16 +541,29 @@
<!-- Less flexible matching, but less false matches. Probably not ideal for product names,
but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
<fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
- <analyzer>
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
+ possible with WordDelimiterGraphFilter in conjunction with stemming. -->
+ <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory" />
+ </analyzer>
+ <analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.EnglishMinimalStemFilterFactory"/>
<!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
- possible with WordDelimiterFilter in conjuncton with stemming. -->
+ possible with WordDelimiterGraphFilter in conjunction with stemming. -->
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/example/example-DIH/solr/db/conf/synonyms.txt
----------------------------------------------------------------------
diff --git a/solr/example/example-DIH/solr/db/conf/synonyms.txt b/solr/example/example-DIH/solr/db/conf/synonyms.txt
index 7f72128..eab4ee8 100644
--- a/solr/example/example-DIH/solr/db/conf/synonyms.txt
+++ b/solr/example/example-DIH/solr/db/conf/synonyms.txt
@@ -21,7 +21,7 @@ fooaaa,baraaa,bazaaa
GB,gib,gigabyte,gigabytes
MB,mib,megabyte,megabytes
Television, Televisions, TV, TVs
-#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
+#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming
#after us won't split it into two words.
# Synonym mappings can be used for spelling correction too
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/example/example-DIH/solr/mail/conf/managed-schema
----------------------------------------------------------------------
diff --git a/solr/example/example-DIH/solr/mail/conf/managed-schema b/solr/example/example-DIH/solr/mail/conf/managed-schema
index 076f83f..016f105 100644
--- a/solr/example/example-DIH/solr/mail/conf/managed-schema
+++ b/solr/example/example-DIH/solr/mail/conf/managed-schema
@@ -419,7 +419,7 @@
<!-- A text field with defaults appropriate for English, plus
aggressive word-splitting and autophrase features enabled.
This field is just like text_en, except it adds
- WordDelimiterFilter to enable splitting and matching of
+ WordDelimiterGraphFilter to enable splitting and matching of
words on case-change, alpha numeric boundaries, and
non-alphanumeric chars. This means certain compound word
cases will work, for example query "wi fi" will match
@@ -437,10 +437,11 @@
ignoreCase="true"
words="lang/stopwords_en.txt"
/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
@@ -449,7 +450,7 @@
ignoreCase="true"
words="lang/stopwords_en.txt"
/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
@@ -459,16 +460,29 @@
<!-- Less flexible matching, but less false matches. Probably not ideal for product names,
but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
<fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
- <analyzer>
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
+ possible with WordDelimiterGraphFilter in conjunction with stemming. -->
+ <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory" />
+ </analyzer>
+ <analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.EnglishMinimalStemFilterFactory"/>
<!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
- possible with WordDelimiterFilter in conjuncton with stemming. -->
+ possible with WordDelimiterGraphFilter in conjunction with stemming. -->
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/example/example-DIH/solr/mail/conf/synonyms.txt
----------------------------------------------------------------------
diff --git a/solr/example/example-DIH/solr/mail/conf/synonyms.txt b/solr/example/example-DIH/solr/mail/conf/synonyms.txt
index 7f72128..eab4ee8 100644
--- a/solr/example/example-DIH/solr/mail/conf/synonyms.txt
+++ b/solr/example/example-DIH/solr/mail/conf/synonyms.txt
@@ -21,7 +21,7 @@ fooaaa,baraaa,bazaaa
GB,gib,gigabyte,gigabytes
MB,mib,megabyte,megabytes
Television, Televisions, TV, TVs
-#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
+#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming
#after us won't split it into two words.
# Synonym mappings can be used for spelling correction too
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/example/example-DIH/solr/rss/conf/managed-schema
----------------------------------------------------------------------
diff --git a/solr/example/example-DIH/solr/rss/conf/managed-schema b/solr/example/example-DIH/solr/rss/conf/managed-schema
index e35f49d..2064c58 100644
--- a/solr/example/example-DIH/solr/rss/conf/managed-schema
+++ b/solr/example/example-DIH/solr/rss/conf/managed-schema
@@ -242,18 +242,19 @@
<filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
-->
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory" />
</analyzer>
<analyzer type="query">
<charFilter class="solr.HTMLStripCharFilterFactory"/>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
@@ -450,7 +451,7 @@
<!-- A text field with defaults appropriate for English, plus
aggressive word-splitting and autophrase features enabled.
This field is just like text_en, except it adds
- WordDelimiterFilter to enable splitting and matching of
+ WordDelimiterGraphFilter to enable splitting and matching of
words on case-change, alpha numeric boundaries, and
non-alphanumeric chars. This means certain compound word
cases will work, for example query "wi fi" will match
@@ -468,10 +469,11 @@
ignoreCase="true"
words="lang/stopwords_en.txt"
/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
@@ -480,7 +482,7 @@
ignoreCase="true"
words="lang/stopwords_en.txt"
/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
@@ -490,16 +492,29 @@
<!-- Less flexible matching, but less false matches. Probably not ideal for product names,
but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
<fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
- <analyzer>
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
+ possible with WordDelimiterGraphFilter in conjunction with stemming. -->
+ <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory" />
+ </analyzer>
+ <analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.EnglishMinimalStemFilterFactory"/>
<!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
- possible with WordDelimiterFilter in conjuncton with stemming. -->
+ possible with WordDelimiterGraphFilter in conjunction with stemming. -->
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/example/example-DIH/solr/rss/conf/synonyms.txt
----------------------------------------------------------------------
diff --git a/solr/example/example-DIH/solr/rss/conf/synonyms.txt b/solr/example/example-DIH/solr/rss/conf/synonyms.txt
index 7f72128..eab4ee8 100644
--- a/solr/example/example-DIH/solr/rss/conf/synonyms.txt
+++ b/solr/example/example-DIH/solr/rss/conf/synonyms.txt
@@ -21,7 +21,7 @@ fooaaa,baraaa,bazaaa
GB,gib,gigabyte,gigabytes
MB,mib,megabyte,megabytes
Television, Televisions, TV, TVs
-#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
+#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming
#after us won't split it into two words.
# Synonym mappings can be used for spelling correction too
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/example/example-DIH/solr/solr/conf/managed-schema
----------------------------------------------------------------------
diff --git a/solr/example/example-DIH/solr/solr/conf/managed-schema b/solr/example/example-DIH/solr/solr/conf/managed-schema
index 6be0ad9..04c85c0 100644
--- a/solr/example/example-DIH/solr/solr/conf/managed-schema
+++ b/solr/example/example-DIH/solr/solr/conf/managed-schema
@@ -500,7 +500,7 @@
<!-- A text field with defaults appropriate for English, plus
aggressive word-splitting and autophrase features enabled.
This field is just like text_en, except it adds
- WordDelimiterFilter to enable splitting and matching of
+ WordDelimiterGraphFilter to enable splitting and matching of
words on case-change, alpha numeric boundaries, and
non-alphanumeric chars. This means certain compound word
cases will work, for example query "wi fi" will match
@@ -518,10 +518,11 @@
ignoreCase="true"
words="lang/stopwords_en.txt"
/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
@@ -530,7 +531,7 @@
ignoreCase="true"
words="lang/stopwords_en.txt"
/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
@@ -540,16 +541,29 @@
<!-- Less flexible matching, but less false matches. Probably not ideal for product names,
but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
<fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
- <analyzer>
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
+ possible with WordDelimiterGraphFilter in conjunction with stemming. -->
+ <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory" />
+ </analyzer>
+ <analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.EnglishMinimalStemFilterFactory"/>
<!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
- possible with WordDelimiterFilter in conjuncton with stemming. -->
+ possible with WordDelimiterGraphFilter in conjunction with stemming. -->
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/example/example-DIH/solr/solr/conf/synonyms.txt
----------------------------------------------------------------------
diff --git a/solr/example/example-DIH/solr/solr/conf/synonyms.txt b/solr/example/example-DIH/solr/solr/conf/synonyms.txt
index 7f72128..eab4ee8 100644
--- a/solr/example/example-DIH/solr/solr/conf/synonyms.txt
+++ b/solr/example/example-DIH/solr/solr/conf/synonyms.txt
@@ -21,7 +21,7 @@ fooaaa,baraaa,bazaaa
GB,gib,gigabyte,gigabytes
MB,mib,megabyte,megabytes
Television, Televisions, TV, TVs
-#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
+#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming
#after us won't split it into two words.
# Synonym mappings can be used for spelling correction too
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/example/example-DIH/solr/tika/conf/managed-schema
----------------------------------------------------------------------
diff --git a/solr/example/example-DIH/solr/tika/conf/managed-schema b/solr/example/example-DIH/solr/tika/conf/managed-schema
index c4dccb2..58b2a80 100644
--- a/solr/example/example-DIH/solr/tika/conf/managed-schema
+++ b/solr/example/example-DIH/solr/tika/conf/managed-schema
@@ -353,7 +353,7 @@
<!-- A text field with defaults appropriate for English, plus
aggressive word-splitting and autophrase features enabled.
This field is just like text_en, except it adds
- WordDelimiterFilter to enable splitting and matching of
+ WordDelimiterGraphFilter to enable splitting and matching of
words on case-change, alpha numeric boundaries, and
non-alphanumeric chars. This means certain compound word
cases will work, for example query "wi fi" will match
@@ -362,13 +362,14 @@
<fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
@@ -377,13 +378,23 @@
<!-- Less flexible matching, but less false matches. Probably not ideal for product names,
but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
<fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
- <analyzer>
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
+ possible with WordDelimiterGraphFilter in conjunction with stemming. -->
+ <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory" />
+ </analyzer>
+ <analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishMinimalStemFilterFactory"/>
<!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
- possible with WordDelimiterFilter in conjuncton with stemming. -->
+ possible with WordDelimiterGraphFilter in conjunction with stemming. -->
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/example/files/conf/managed-schema
----------------------------------------------------------------------
diff --git a/solr/example/files/conf/managed-schema b/solr/example/files/conf/managed-schema
index e936bcd..ff209be 100644
--- a/solr/example/files/conf/managed-schema
+++ b/solr/example/files/conf/managed-schema
@@ -145,27 +145,39 @@
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_en.txt" ignoreCase="true"/>
- <filter class="solr.WordDelimiterFilterFactory" catenateNumbers="1" generateNumberParts="1" splitOnCaseChange="1" generateWordParts="1" catenateAll="0" catenateWords="1"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" catenateNumbers="1" generateNumberParts="1" splitOnCaseChange="1" generateWordParts="1" catenateAll="0" catenateWords="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" expand="true" ignoreCase="true" synonyms="synonyms.txt"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_en.txt" ignoreCase="true"/>
- <filter class="solr.WordDelimiterFilterFactory" catenateNumbers="0" generateNumberParts="1" splitOnCaseChange="1" generateWordParts="1" catenateAll="0" catenateWords="0"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" catenateNumbers="0" generateNumberParts="1" splitOnCaseChange="1" generateWordParts="1" catenateAll="0" catenateWords="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_en_splitting_tight" class="solr.TextField" autoGeneratePhraseQueries="true" positionIncrementGap="100">
- <analyzer>
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" expand="false" ignoreCase="true" synonyms="synonyms.txt"/>
+ <filter class="solr.StopFilterFactory" words="lang/stopwords_en.txt" ignoreCase="true"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" catenateNumbers="1" generateNumberParts="0" generateWordParts="0" catenateAll="0" catenateWords="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory" />
+ </analyzer>
+ <analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" expand="false" ignoreCase="true" synonyms="synonyms.txt"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_en.txt" ignoreCase="true"/>
- <filter class="solr.WordDelimiterFilterFactory" catenateNumbers="1" generateNumberParts="0" generateWordParts="0" catenateAll="0" catenateWords="1"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" catenateNumbers="1" generateNumberParts="0" generateWordParts="0" catenateAll="0" catenateWords="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.EnglishMinimalStemFilterFactory"/>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/example/files/conf/synonyms.txt
----------------------------------------------------------------------
diff --git a/solr/example/files/conf/synonyms.txt b/solr/example/files/conf/synonyms.txt
index 7f72128..eab4ee8 100644
--- a/solr/example/files/conf/synonyms.txt
+++ b/solr/example/files/conf/synonyms.txt
@@ -21,7 +21,7 @@ fooaaa,baraaa,bazaaa
GB,gib,gigabyte,gigabytes
MB,mib,megabyte,megabytes
Television, Televisions, TV, TVs
-#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
+#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming
#after us won't split it into two words.
# Synonym mappings can be used for spelling correction too
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/server/solr/configsets/basic_configs/conf/managed-schema
----------------------------------------------------------------------
diff --git a/solr/server/solr/configsets/basic_configs/conf/managed-schema b/solr/server/solr/configsets/basic_configs/conf/managed-schema
index d7aacc4..09aaae3 100644
--- a/solr/server/solr/configsets/basic_configs/conf/managed-schema
+++ b/solr/server/solr/configsets/basic_configs/conf/managed-schema
@@ -436,7 +436,7 @@
<!-- A text field with defaults appropriate for English, plus
aggressive word-splitting and autophrase features enabled.
This field is just like text_en, except it adds
- WordDelimiterFilter to enable splitting and matching of
+ WordDelimiterGraphFilter to enable splitting and matching of
words on case-change, alpha numeric boundaries, and
non-alphanumeric chars. This means certain compound word
cases will work, for example query "wi fi" will match
@@ -455,10 +455,11 @@
ignoreCase="true"
words="lang/stopwords_en.txt"
/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
@@ -467,7 +468,7 @@
ignoreCase="true"
words="lang/stopwords_en.txt"
/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
@@ -478,16 +479,29 @@
but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
<dynamicField name="*_txt_en_split_tight" type="text_en_splitting_tight" indexed="true" stored="true"/>
<fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
- <analyzer>
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
+ possible with WordDelimiterGraphFilter in conjunction with stemming. -->
+ <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory" />
+ </analyzer>
+ <analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.EnglishMinimalStemFilterFactory"/>
<!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
- possible with WordDelimiterFilter in conjuncton with stemming. -->
+ possible with WordDelimiterGraphFilter in conjunction with stemming. -->
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/server/solr/configsets/basic_configs/conf/synonyms.txt
----------------------------------------------------------------------
diff --git a/solr/server/solr/configsets/basic_configs/conf/synonyms.txt b/solr/server/solr/configsets/basic_configs/conf/synonyms.txt
index 7f72128..eab4ee8 100644
--- a/solr/server/solr/configsets/basic_configs/conf/synonyms.txt
+++ b/solr/server/solr/configsets/basic_configs/conf/synonyms.txt
@@ -21,7 +21,7 @@ fooaaa,baraaa,bazaaa
GB,gib,gigabyte,gigabytes
MB,mib,megabyte,megabytes
Television, Televisions, TV, TVs
-#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
+#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming
#after us won't split it into two words.
# Synonym mappings can be used for spelling correction too
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/server/solr/configsets/data_driven_schema_configs/conf/managed-schema
----------------------------------------------------------------------
diff --git a/solr/server/solr/configsets/data_driven_schema_configs/conf/managed-schema b/solr/server/solr/configsets/data_driven_schema_configs/conf/managed-schema
index b716f9c..0319eb0 100644
--- a/solr/server/solr/configsets/data_driven_schema_configs/conf/managed-schema
+++ b/solr/server/solr/configsets/data_driven_schema_configs/conf/managed-schema
@@ -435,7 +435,7 @@
<!-- A text field with defaults appropriate for English, plus
aggressive word-splitting and autophrase features enabled.
This field is just like text_en, except it adds
- WordDelimiterFilter to enable splitting and matching of
+ WordDelimiterGraphFilter to enable splitting and matching of
words on case-change, alpha numeric boundaries, and
non-alphanumeric chars. This means certain compound word
cases will work, for example query "wi fi" will match
@@ -454,10 +454,11 @@
ignoreCase="true"
words="lang/stopwords_en.txt"
/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
@@ -466,7 +467,7 @@
ignoreCase="true"
words="lang/stopwords_en.txt"
/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
@@ -477,16 +478,29 @@
but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
<dynamicField name="*_txt_en_split_tight" type="text_en_splitting_tight" indexed="true" stored="true"/>
<fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
- <analyzer>
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
+ possible with WordDelimiterGraphFilter in conjunction with stemming. -->
+ <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory" />
+ </analyzer>
+ <analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.EnglishMinimalStemFilterFactory"/>
<!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
- possible with WordDelimiterFilter in conjuncton with stemming. -->
+ possible with WordDelimiterGraphFilter in conjunction with stemming. -->
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/server/solr/configsets/data_driven_schema_configs/conf/synonyms.txt
----------------------------------------------------------------------
diff --git a/solr/server/solr/configsets/data_driven_schema_configs/conf/synonyms.txt b/solr/server/solr/configsets/data_driven_schema_configs/conf/synonyms.txt
index 7f72128..eab4ee8 100644
--- a/solr/server/solr/configsets/data_driven_schema_configs/conf/synonyms.txt
+++ b/solr/server/solr/configsets/data_driven_schema_configs/conf/synonyms.txt
@@ -21,7 +21,7 @@ fooaaa,baraaa,bazaaa
GB,gib,gigabyte,gigabytes
MB,mib,megabyte,megabytes
Television, Televisions, TV, TVs
-#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
+#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming
#after us won't split it into two words.
# Synonym mappings can be used for spelling correction too
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/server/solr/configsets/sample_techproducts_configs/conf/managed-schema
----------------------------------------------------------------------
diff --git a/solr/server/solr/configsets/sample_techproducts_configs/conf/managed-schema b/solr/server/solr/configsets/sample_techproducts_configs/conf/managed-schema
index 9caf3d6..17dadd4 100644
--- a/solr/server/solr/configsets/sample_techproducts_configs/conf/managed-schema
+++ b/solr/server/solr/configsets/sample_techproducts_configs/conf/managed-schema
@@ -544,7 +544,7 @@
<!-- A text field with defaults appropriate for English, plus
aggressive word-splitting and autophrase features enabled.
This field is just like text_en, except it adds
- WordDelimiterFilter to enable splitting and matching of
+ WordDelimiterGraphFilter to enable splitting and matching of
words on case-change, alpha numeric boundaries, and
non-alphanumeric chars. This means certain compound word
cases will work, for example query "wi fi" will match
@@ -562,10 +562,11 @@
ignoreCase="true"
words="lang/stopwords_en.txt"
/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
@@ -574,7 +575,7 @@
ignoreCase="true"
words="lang/stopwords_en.txt"
/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
@@ -584,16 +585,29 @@
<!-- Less flexible matching, but less false matches. Probably not ideal for product names,
but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
<fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
- <analyzer>
+ <analyzer type="index">
+ <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+ <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
+ possible with WordDelimiterGraphFilter in conjunction with stemming. -->
+ <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory" />
+ </analyzer>
+ <analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.EnglishMinimalStemFilterFactory"/>
<!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
- possible with WordDelimiterFilter in conjuncton with stemming. -->
+ possible with WordDelimiterGraphFilter in conjunction with stemming. -->
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/server/solr/configsets/sample_techproducts_configs/conf/synonyms.txt
----------------------------------------------------------------------
diff --git a/solr/server/solr/configsets/sample_techproducts_configs/conf/synonyms.txt b/solr/server/solr/configsets/sample_techproducts_configs/conf/synonyms.txt
index 7f72128..eab4ee8 100644
--- a/solr/server/solr/configsets/sample_techproducts_configs/conf/synonyms.txt
+++ b/solr/server/solr/configsets/sample_techproducts_configs/conf/synonyms.txt
@@ -21,7 +21,7 @@ fooaaa,baraaa,bazaaa
GB,gib,gigabyte,gigabytes
MB,mib,megabyte,megabytes
Television, Televisions, TV, TVs
-#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
+#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming
#after us won't split it into two words.
# Synonym mappings can be used for spelling correction too
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/b46b8278/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema-sql.xml
----------------------------------------------------------------------
diff --git a/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema-sql.xml b/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema-sql.xml
index 56bf625..96bbcd8 100644
--- a/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema-sql.xml
+++ b/solr/solrj/src/test-files/solrj/solr/collection1/conf/schema-sql.xml
@@ -72,26 +72,41 @@
<fieldtype name="failtype1" class="solr.TextField">
<analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
catenateNumbers="0" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
</analyzer>
</fieldtype>
<!-- Demonstrating ignoreCaseChange -->
<fieldtype name="wdf_nocase" class="solr.TextField">
- <analyzer>
+ <analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
+ catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="0"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="0" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="wdf_preserve" class="solr.TextField">
- <analyzer>
+ <analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="0"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="0"
+ catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" preserveOriginal="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
@@ -191,9 +206,16 @@
</analyzer>
</fieldtype>
<fieldtype name="lowerpunctfilt" class="solr.TextField">
- <analyzer>
+ <analyzer type="index">
+ <tokenizer class="solr.MockTokenizerFactory"/>
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
+ catenateNumbers="1" catenateAll="1" splitOnCaseChange="1"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="1" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
@@ -265,15 +287,16 @@
<fieldtype name="subword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory"/>
@@ -285,15 +308,16 @@
<analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1"
+ <filter class="solr.WordDelimiterGraphFilterFactory" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1"
generateNumberParts="0" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
<filter class="solr.StopFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1"
+ <filter class="solr.WordDelimiterGraphFilterFactory" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1"
generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.StopFilterFactory"/>
<filter class="solr.PorterStemFilterFactory"/>
@@ -304,8 +328,9 @@
<analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1"
+ <filter class="solr.WordDelimiterGraphFilterFactory" splitOnNumerics="0" splitOnCaseChange="0" generateWordParts="1"
generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory"/>
@@ -318,13 +343,14 @@
<fieldtype name="skutype1" class="solr.TextField">
<analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
@@ -334,13 +360,14 @@
<fieldtype name="skutype2" class="solr.TextField">
<analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.FlattenGraphFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory"/>
- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
+ <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>