You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2011/06/17 22:01:05 UTC

svn commit: r1137007 [3/5] - in /incubator/stanbol/trunk: enhancer/clerezza/org.apache.stanbol.enhancer.clerezza/ enhancer/engines/autotagging/ enhancer/engines/autotagging/src/main/java/org/apache/stanbol/enhancer/engines/autotagging/impl/ entityhub/g...

Propchange: incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/mapping-FoldToASCII.txt
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/mapping-ISOLatin1Accent.txt
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/mapping-ISOLatin1Accent.txt?rev=1137007&r1=1137006&r2=1137007&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/mapping-ISOLatin1Accent.txt (original)
+++ incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/mapping-ISOLatin1Accent.txt Fri Jun 17 20:01:04 2011
@@ -1,246 +1,246 @@
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Syntax:
-#   "source" => "target"
-#     "source".length() > 0 (source cannot be empty.)
-#     "target".length() >= 0 (target can be empty.)
-
-# example:
-#   "À" => "A"
-#   "\u00C0" => "A"
-#   "\u00C0" => "\u0041"
-#   "ß" => "ss"
-#   "\t" => " "
-#   "\n" => ""
-
-# À => A
-"\u00C0" => "A"
-
-# Á => A
-"\u00C1" => "A"
-
-# Â => A
-"\u00C2" => "A"
-
-# Ã => A
-"\u00C3" => "A"
-
-# Ä => A
-"\u00C4" => "A"
-
-# Å => A
-"\u00C5" => "A"
-
-# Æ => AE
-"\u00C6" => "AE"
-
-# Ç => C
-"\u00C7" => "C"
-
-# È => E
-"\u00C8" => "E"
-
-# É => E
-"\u00C9" => "E"
-
-# Ê => E
-"\u00CA" => "E"
-
-# Ë => E
-"\u00CB" => "E"
-
-# Ì => I
-"\u00CC" => "I"
-
-# Í => I
-"\u00CD" => "I"
-
-# Î => I
-"\u00CE" => "I"
-
-# Ï => I
-"\u00CF" => "I"
-
-# IJ => IJ
-"\u0132" => "IJ"
-
-# Ð => D
-"\u00D0" => "D"
-
-# Ñ => N
-"\u00D1" => "N"
-
-# Ò => O
-"\u00D2" => "O"
-
-# Ó => O
-"\u00D3" => "O"
-
-# Ô => O
-"\u00D4" => "O"
-
-# Õ => O
-"\u00D5" => "O"
-
-# Ö => O
-"\u00D6" => "O"
-
-# Ø => O
-"\u00D8" => "O"
-
-# Π=> OE
-"\u0152" => "OE"
-
-# Þ
-"\u00DE" => "TH"
-
-# Ù => U
-"\u00D9" => "U"
-
-# Ú => U
-"\u00DA" => "U"
-
-# Û => U
-"\u00DB" => "U"
-
-# Ü => U
-"\u00DC" => "U"
-
-# Ý => Y
-"\u00DD" => "Y"
-
-# Ÿ => Y
-"\u0178" => "Y"
-
-# à => a
-"\u00E0" => "a"
-
-# á => a
-"\u00E1" => "a"
-
-# â => a
-"\u00E2" => "a"
-
-# ã => a
-"\u00E3" => "a"
-
-# ä => a
-"\u00E4" => "a"
-
-# å => a
-"\u00E5" => "a"
-
-# æ => ae
-"\u00E6" => "ae"
-
-# ç => c
-"\u00E7" => "c"
-
-# è => e
-"\u00E8" => "e"
-
-# é => e
-"\u00E9" => "e"
-
-# ê => e
-"\u00EA" => "e"
-
-# ë => e
-"\u00EB" => "e"
-
-# ì => i
-"\u00EC" => "i"
-
-# í => i
-"\u00ED" => "i"
-
-# î => i
-"\u00EE" => "i"
-
-# ï => i
-"\u00EF" => "i"
-
-# ij => ij
-"\u0133" => "ij"
-
-# ð => d
-"\u00F0" => "d"
-
-# ñ => n
-"\u00F1" => "n"
-
-# ò => o
-"\u00F2" => "o"
-
-# ó => o
-"\u00F3" => "o"
-
-# ô => o
-"\u00F4" => "o"
-
-# õ => o
-"\u00F5" => "o"
-
-# ö => o
-"\u00F6" => "o"
-
-# ø => o
-"\u00F8" => "o"
-
-# œ => oe
-"\u0153" => "oe"
-
-# ß => ss
-"\u00DF" => "ss"
-
-# þ => th
-"\u00FE" => "th"
-
-# ù => u
-"\u00F9" => "u"
-
-# ú => u
-"\u00FA" => "u"
-
-# û => u
-"\u00FB" => "u"
-
-# ü => u
-"\u00FC" => "u"
-
-# ý => y
-"\u00FD" => "y"
-
-# ÿ => y
-"\u00FF" => "y"
-
-# ff => ff
-"\uFB00" => "ff"
-
-# fi => fi
-"\uFB01" => "fi"
-
-# fl => fl
-"\uFB02" => "fl"
-
-# ffi => ffi
-"\uFB03" => "ffi"
-
-# ffl => ffl
-"\uFB04" => "ffl"
-
-# ſt => ft
-"\uFB05" => "ft"
-
-# st => st
-"\uFB06" => "st"
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Syntax:
+#   "source" => "target"
+#     "source".length() > 0 (source cannot be empty.)
+#     "target".length() >= 0 (target can be empty.)
+
+# example:
+#   "À" => "A"
+#   "\u00C0" => "A"
+#   "\u00C0" => "\u0041"
+#   "ß" => "ss"
+#   "\t" => " "
+#   "\n" => ""
+
+# À => A
+"\u00C0" => "A"
+
+# Á => A
+"\u00C1" => "A"
+
+# Â => A
+"\u00C2" => "A"
+
+# Ã => A
+"\u00C3" => "A"
+
+# Ä => A
+"\u00C4" => "A"
+
+# Å => A
+"\u00C5" => "A"
+
+# Æ => AE
+"\u00C6" => "AE"
+
+# Ç => C
+"\u00C7" => "C"
+
+# È => E
+"\u00C8" => "E"
+
+# É => E
+"\u00C9" => "E"
+
+# Ê => E
+"\u00CA" => "E"
+
+# Ë => E
+"\u00CB" => "E"
+
+# Ì => I
+"\u00CC" => "I"
+
+# Í => I
+"\u00CD" => "I"
+
+# Î => I
+"\u00CE" => "I"
+
+# Ï => I
+"\u00CF" => "I"
+
+# IJ => IJ
+"\u0132" => "IJ"
+
+# Ð => D
+"\u00D0" => "D"
+
+# Ñ => N
+"\u00D1" => "N"
+
+# Ò => O
+"\u00D2" => "O"
+
+# Ó => O
+"\u00D3" => "O"
+
+# Ô => O
+"\u00D4" => "O"
+
+# Õ => O
+"\u00D5" => "O"
+
+# Ö => O
+"\u00D6" => "O"
+
+# Ø => O
+"\u00D8" => "O"
+
+# Π=> OE
+"\u0152" => "OE"
+
+# Þ
+"\u00DE" => "TH"
+
+# Ù => U
+"\u00D9" => "U"
+
+# Ú => U
+"\u00DA" => "U"
+
+# Û => U
+"\u00DB" => "U"
+
+# Ü => U
+"\u00DC" => "U"
+
+# Ý => Y
+"\u00DD" => "Y"
+
+# Ÿ => Y
+"\u0178" => "Y"
+
+# à => a
+"\u00E0" => "a"
+
+# á => a
+"\u00E1" => "a"
+
+# â => a
+"\u00E2" => "a"
+
+# ã => a
+"\u00E3" => "a"
+
+# ä => a
+"\u00E4" => "a"
+
+# å => a
+"\u00E5" => "a"
+
+# æ => ae
+"\u00E6" => "ae"
+
+# ç => c
+"\u00E7" => "c"
+
+# è => e
+"\u00E8" => "e"
+
+# é => e
+"\u00E9" => "e"
+
+# ê => e
+"\u00EA" => "e"
+
+# ë => e
+"\u00EB" => "e"
+
+# ì => i
+"\u00EC" => "i"
+
+# í => i
+"\u00ED" => "i"
+
+# î => i
+"\u00EE" => "i"
+
+# ï => i
+"\u00EF" => "i"
+
+# ij => ij
+"\u0133" => "ij"
+
+# ð => d
+"\u00F0" => "d"
+
+# ñ => n
+"\u00F1" => "n"
+
+# ò => o
+"\u00F2" => "o"
+
+# ó => o
+"\u00F3" => "o"
+
+# ô => o
+"\u00F4" => "o"
+
+# õ => o
+"\u00F5" => "o"
+
+# ö => o
+"\u00F6" => "o"
+
+# ø => o
+"\u00F8" => "o"
+
+# œ => oe
+"\u0153" => "oe"
+
+# ß => ss
+"\u00DF" => "ss"
+
+# þ => th
+"\u00FE" => "th"
+
+# ù => u
+"\u00F9" => "u"
+
+# ú => u
+"\u00FA" => "u"
+
+# û => u
+"\u00FB" => "u"
+
+# ü => u
+"\u00FC" => "u"
+
+# ý => y
+"\u00FD" => "y"
+
+# ÿ => y
+"\u00FF" => "y"
+
+# ff => ff
+"\uFB00" => "ff"
+
+# fi => fi
+"\uFB01" => "fi"
+
+# fl => fl
+"\uFB02" => "fl"
+
+# ffi => ffi
+"\uFB03" => "ffi"
+
+# ffl => ffl
+"\uFB04" => "ffl"
+
+# ſt => ft
+"\uFB05" => "ft"
+
+# st => st
+"\uFB06" => "st"

Modified: incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/protwords.txt
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/protwords.txt?rev=1137007&r1=1137006&r2=1137007&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/protwords.txt (original)
+++ incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/protwords.txt Fri Jun 17 20:01:04 2011
@@ -1,21 +1,19 @@
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#-----------------------------------------------------------------------
-# Use a protected word file to protect against the stemmer reducing two
-# unrelated words to the same base word.
-
-# Some non-words that normally won't be encountered,
-# just to test that they won't be stemmed.
-dontstems
-zwhacky
-
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#-----------------------------------------------------------------------
+# Use a protected word file to protect against the stemmer reducing two
+# unrelated words to the same base word.
+
+# Some non-words that normally won't be encountered,
+# just to test that they won't be stemmed.
+

Modified: incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/schema.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/schema.xml?rev=1137007&r1=1137006&r2=1137007&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/schema.xml (original)
+++ incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/schema.xml Fri Jun 17 20:01:04 2011
@@ -1,21 +1,21 @@
-<?xml version="1.0" encoding="UTF-8" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
 <!--  
  This is the Solr schema file. This file should be named "schema.xml" and
  should be in the conf directory under the solr home
@@ -32,44 +32,48 @@
  to specific requirements. See the comments within this schema for more
  details!
 
- For more information, on how to customize the Solr schema.xml in general, 
- please see http://wiki.apache.org/solr/SchemaXml.
-
--->
-
-<schema name="Apache Stanbol SolrYard Schema" version="1.2">
+ For more information, on how to customize this file, please see
+ http://wiki.apache.org/solr/SchemaXml
+
+-->
+
+<schema name="Apache Stanbol SolrYard Schema" version="1.3">
   <!--
     The SolrYard supports a list of types that is reflected by
     "fieldType" specifications within this schema.
     See the specific fieldType definition for more information
   -->
-  <types>
+  <types>
     <!-- 
       This fieldType is used to store values with the dataType "xsd:string".
       It is NOT used for natural language texts. Assume that this data type is
       used for ISBN numbers, article numbers, string representations of
       unsupported data types ...
     -->
-    <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
+    <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
+    <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>    
+
     <!-- 
       This can be used as alternative to "string" to enable case insensitive
       searches on string values.
       The KeywordTokenizerFactory ensures that the whole string is preserved as
       a single token.
     -->
+    <!-- lowercases the entire field value, keeping it as a single token.  -->
     <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
       <analyzer>
         <tokenizer class="solr.KeywordTokenizerFactory"/>
         <filter class="solr.LowerCaseFilterFactory" />
       </analyzer>
     </fieldType>
-
-
+
+
     <!-- boolean type: "true" or "false" used to store values with the datatype "xsd:boolean" -->
-    <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
+    <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
     <!--Binary data type. The data should be sent/retrieved in as Base64 encoded Strings.
         Currently not used by the SolrYard implementation, but reserved for future use. -->
-    <fieldtype name="binary" class="solr.BinaryField"/>
+    <fieldtype name="binary" class="solr.BinaryField"/>
+
     <!--
       Default numeric and date field types. By default used to index numeric values.
       Note that the "solr.TrieIntField" does support indexing values at various
@@ -79,24 +83,32 @@
       for all numeric fields of that types. See Solr documentation for
       suitable values and examples.
     -->
-    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
-    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
-    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
-    <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
     <fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/>
-
-    <!--
-      Numeric and date field types that do activate indexing values at various
-      levels of precision to accelerate range queries.
-      This can be used to activate hierarchival indexing for specific
-      fields. See Notes within the field section.
-    -->
-    <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
-    <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
-    <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
-    <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+
+    <!--
+     Numeric field types that index each value at various levels of precision
+     to accelerate range queries when the number of values between the range
+     endpoints is large. See the javadoc for NumericRangeQuery for internal
+     implementation details.
+
+     Smaller precisionStep values (specified in bits) will lead to more tokens
+     indexed per value, slightly larger index size, and faster range queries.
+     A precisionStep of 0 disables indexing at different precision levels.
+    -->
+    <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
     <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>
-
+
+    <!-- not used
+    <fieldType name="random" class="solr.RandomSortField" indexed="true" />
+     -->
+
     <!-- 
       Natural Language Texts
       
@@ -117,141 +129,158 @@
       together with string values within a special field to support searches for
       texts without an specified language.
     -->
-    <!-- 
-      A general unstemmed text field - good if one does not know the language of the field.
-      This is used as the default fieldType for fields that store values of different
-      languages.
-      It is also the default fieldType for languages that do not define special fieldTypes.
-    -->
-    <fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
-      <analyzer type="index">
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
-        <filter class="solr.LowerCaseFilterFactory"/>
-      </analyzer>
-      <analyzer type="query">
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
-        <filter class="solr.LowerCaseFilterFactory"/>
-      </analyzer>
-    </fieldType>
-    
-    <!-- 
-      A text field that only splits on whitespace for exact matching of words.
-      Currently not used. May be used as an alternative to the textgen fieldType.
-    -->
+
+    <!-- A text field that only splits on whitespace for exact matching of words 
+        Currently not used. May be used as an alternative to the textgen fieldType.
+    <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
+      <analyzer>
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+      </analyzer>
+    </fieldType> -->
+
     <!--
-    <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
-      <analyzer>
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-      </analyzer>
-    </fieldType>
-    -->
-    
+        Default Text field configuration that comes with the Solr distribution. Currently
+        Not used by the SolrYard
+        
+          
+        A text field that uses WordDelimiterFilter to enable splitting and matching of
+        words on case-change, alpha numeric boundaries, and non-alphanumeric chars,
+        so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".
+        Synonyms and stopwords are customized by external files, and stemming is enabled.
+        The attribute autoGeneratePhraseQueries="true" (the default) causes words that get split to
+        form phrase queries. For example, WordDelimiterFilter splitting text:pdp-11 will cause the parser
+        to generate text:"pdp 11" rather than (text:PDP OR text:11).
+        NOTE: autoGeneratePhraseQueries="true" tends to not work well for non whitespace delimited languages.
+    <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+      <analyzer type="index">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+    </fieldType>  -->
+
+
     <!-- 
-      This is the default fieldType used for english language texts.
-      
-      Less flexible matching than the text_en field type, but less false matches.  
-      Probably not ideal for product names, but may be good for SKUs. 
-      Can insert dashes in the wrong place and still match.
-    -->
-    <fieldType name="text_en_Tight" class="solr.TextField" positionIncrementGap="100" >
-      <analyzer>
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
-        <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
-        <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
-             possible with WordDelimiterFilter in conjuncton with stemming. -->
-        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
-      </analyzer>
-    </fieldType>
-
-
+         This is the default fieldType used for english language texts.
+         
+         Less flexible matching, but less false matches.  Probably not ideal for product names,
+         but may be good for SKUs.  Can insert dashes in the wrong place and still match. -->
+    <fieldType name="text_en_Tight" class="solr.TextField" positionIncrementGap="100" >
+      <analyzer>
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.EnglishMinimalStemFilterFactory"/>
+        <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
+             possible with WordDelimiterFilter in conjuncton with stemming. -->
+        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+
     <!-- 
-      This can be used as an alternative to the "text_en_Tight" fieldTpye for
-      english langauge texts.
-      
-      A text field that uses WordDelimiterFilter to enable splitting and matching of
-      words on case-change, alpha numeric boundaries, and non-alphanumeric chars,
-      so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".
-      Synonyms and stopwords are customized by external files, and stemming is enabled.
-    -->
-    <!--
-    <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
-      <analyzer type="index">
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
-        <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
-      </analyzer>
-      <analyzer type="query">
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
-        <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
-      </analyzer>
-    </fieldType>
-    -->
-    
-    <!--
-      The SolrYard allows leading Wildcards (e.g. "*aris"). To provide
-      good query performance for such queries one need to configure
-      fieldTypes that use the ReversedWildcardFilterFactory as shown by
-      this example.
-      See Solr documentation for details
-      
-      A general unstemmed text field that indexes tokens normally and also
-      reversed (via ReversedWildcardFilterFactory), to enable more efficient 
-	  leading wildcard queries. 
-    -->
-    <!--
-    <fieldType name="text_rev" class="solr.TextField" positionIncrementGap="100">
-      <analyzer type="index">
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
-        <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true" maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
-      </analyzer>
-      <analyzer type="query">
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
-        <filter class="solr.LowerCaseFilterFactory"/>
-      </analyzer>
-    </fieldType>
-    -->
-    <!-- charFilter + WhitespaceTokenizer  -->
-    <!--
-    <fieldType name="textCharNorm" class="solr.TextField" positionIncrementGap="100" >
-      <analyzer>
-        <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-      </analyzer>
-    </fieldType>
-    -->
+         The default for any language without a special field definition.
 
-    <!--
-      This can be used to deactivate some functionality of the SolrYard or
-      to configure that some fields of a data set are not stored nor indexed
-      regardless of the Apache Stanbol Entityhub configuration!
-    --> 
-    <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" /> 
-
- </types>
-
-
- <fields>
+         A general unstemmed text field - good if one does not know the language of the field -->
+    <fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
+      <analyzer type="index">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+
+    <!-- A general unstemmed text field that indexes tokens normally and also
+         reversed (via ReversedWildcardFilterFactory), to enable more efficient 
+	 leading wildcard queries.
+     
+     Not used 
+    <fieldType name="text_rev" class="solr.TextField" positionIncrementGap="100">
+      <analyzer type="index">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true" maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType> -->
+
+    <!-- A KeywordTokenizer that does not include some properties of the source text.
+         
+         TODO:
+          - This might be usefull for searching labels
+          - Rename to label if used for that
+          - Add 0-9 to the regex patter to preserve numbers
+         
+      -->
+    <fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true">
+      <analyzer>
+        <!-- KeywordTokenizer does not tokenize -->
+        <tokenizer class="solr.KeywordTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory" />
+        <filter class="solr.TrimFilterFactory" />
+        <filter class="solr.PatternReplaceFilterFactory" pattern="([^a-z])" replacement="" replace="all" />
+      </analyzer>
+    </fieldType>
+    
+    <!-- not used
+    <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
+      </analyzer>
+    </fieldtype> -->
+
+    <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100">
+      <analyzer>
+        <tokenizer class="solr.PathHierarchyTokenizerFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- since fields of this type are by default not stored or indexed,
+         any data added to them will be ignored outright.  --> 
+    <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
+
+    <!-- Spatial features are not yet supported by the Entityhub
+    <fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/>
+    <fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/>
+    <fieldtype name="geohash" class="solr.GeoHashField"/>
+     -->
+ </types>
+
+
+ <fields>
    <!-- 
      For Information about the different attributes for fields
      see http://wiki.apache.org/solr/SchemaXml. 
@@ -280,7 +309,7 @@
      Do not change this definition!
    -->
    <field name="_domain" type="string" indexed="true" stored="false" multiValued="true"/>
-
+   
    <!--
      DBPedia specific Field definitions 
    -->
@@ -300,6 +329,7 @@
    <field name="@de/dbp-ont:abstract/"  type="textgen"  indexed="true" stored="false" multiValued="true" termVectors="true"/>
    <field name="@it/dbp-ont:abstract/"  type="textgen"  indexed="true" stored="false" multiValued="true" termVectors="true"/>
    <field name="@fr/dbp-ont:abstract/"  type="textgen"  indexed="true" stored="false" multiValued="true" termVectors="true"/>
+
 
    <!-- 
      Dynamic field definitions (used if a field name is not found)
@@ -332,7 +362,6 @@
    <dynamicField name="dou/*"  type="double"  indexed="true"  stored="true" multiValued="true"/>
    <dynamicField name="cal/*"  type="date"    indexed="true"  stored="true" multiValued="true"/>
    <dynamicField name="dur/*"  type="string"  indexed="true"  stored="true" multiValued="true"/>
-      
    <!-- 
      String fields that are not natural language
      To support case insensitive searches in such fields change 
@@ -380,8 +409,6 @@
    -->
    <dynamicField name="@*"  type="textgen"  indexed="true"  stored="true" multiValued="true"/>
 
-
-
    <!--
      To add special configurations for specific fields one
      has to include the fieldName within the prefix of the
@@ -423,28 +450,29 @@
      This field need not to be stored. The type can be changed to alternatives
      as described in the types section of this configuration.
    -->
-   <dynamicField name="_!@*"  type="textgen"  indexed="true"  stored="false" multiValued="true" termVectors="true"/>
+   <dynamicField name="_!@*"  type="textgen"  indexed="true"  stored="false"
+     multiValued="true" />
    <!-- 
      fields starting with "_config/" are used to store configurations about how the
      index was created within the index (e.g. used namespace prefixes).
      Do not change this definition!
    -->
    <dynamicField name="_config/*" type="string" indexed="false" multiValued="true"/>
-      
- </fields>
-
+   
+ </fields>
+
  <!-- 
    Field to use to determine and enforce document uniqueness.
    -->
- <uniqueKey>uri</uniqueKey>
-
+ <uniqueKey>uri</uniqueKey>
+
  <!-- 
    field for the QueryParser to use when an explicit fieldname is absent.
    The SolrYard does currently not take advantage of this. However it can
    be used when directly accessing the SolrYard.
  -->
- <defaultSearchField>_text</defaultSearchField>
-
+ <defaultSearchField>_text</defaultSearchField>
+
  <!--
    The SolrYard explizitly adds AND and OR for all boolean terms in
    generated queries. So changing that should have no influence on
@@ -452,13 +480,13 @@
    
    SolrQueryParser configuration: defaultOperator="AND|OR" 
  -->
- <solrQueryParser defaultOperator="OR"/>
-
+ <solrQueryParser defaultOperator="OR"/>
+
   <!--
     The SolrYard Implementation assumes the following copyField commands.
     This commands MUST NOT be removed! 
    -->
-
+
    <!-- 
      Values of all fields that represent natural language texts
      or string values are copied to the default search field
@@ -477,5 +505,7 @@
      all references to it)
    -->
    <copyField source="ref/*" dest="_ref"/>
-   
-</schema>
+	
+
+
+</schema>