You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2011/05/29 12:39:43 UTC

svn commit: r1128859 - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/backwards/ solr/ solr/example/solr/conf/ solr/site/ solr/site/skin/ solr/src/java/org/apache/solr/schema/ solr/src/site/src/documentation/content/xdocs/ solr/src/test-files/sol...

Author: mikemccand
Date: Sun May 29 10:39:43 2011
New Revision: 1128859

URL: http://svn.apache.org/viewvc?rev=1128859&view=rev
Log:
SOLR-2519: improve defaults for text_* field types

Added:
    lucene/dev/branches/branch_3x/solr/example/solr/conf/stopwords_en.txt
      - copied unchanged from r1128856, lucene/dev/trunk/solr/example/solr/conf/stopwords_en.txt
Modified:
    lucene/dev/branches/branch_3x/   (props changed)
    lucene/dev/branches/branch_3x/lucene/   (props changed)
    lucene/dev/branches/branch_3x/lucene/backwards/   (props changed)
    lucene/dev/branches/branch_3x/solr/   (props changed)
    lucene/dev/branches/branch_3x/solr/CHANGES.txt
    lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml
    lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml
    lucene/dev/branches/branch_3x/solr/example/solr/conf/stopwords.txt
    lucene/dev/branches/branch_3x/solr/site/skin/basic.css
    lucene/dev/branches/branch_3x/solr/site/skin/print.css
    lucene/dev/branches/branch_3x/solr/site/skin/profile.css
    lucene/dev/branches/branch_3x/solr/site/skin/screen.css
    lucene/dev/branches/branch_3x/solr/site/tutorial.html
    lucene/dev/branches/branch_3x/solr/site/tutorial.pdf
    lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/schema/TextField.java
    lucene/dev/branches/branch_3x/solr/src/site/src/documentation/content/xdocs/tutorial.xml
    lucene/dev/branches/branch_3x/solr/src/test-files/solr/conf/schema12.xml
    lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/client/solrj/SolrExampleTests.java

Modified: lucene/dev/branches/branch_3x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/CHANGES.txt?rev=1128859&r1=1128858&r2=1128859&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/CHANGES.txt Sun May 29 10:39:43 2011
@@ -23,6 +23,14 @@ on how to get started.
 
 ======================= 3.x (not yet released) ================
 
+Bug Fixes
+----------------------
+
+* SOLR-2519: Improve text_* fieldTypes in example schema.xml: improve
+  cross-language defaults for text_general; break out separate
+  English-specific fieldTypes (Jan Høydahl, hossman, Robert Muir,
+  yonik, Mike McCandless)
+
 ==================  3.2.0  ==================
 Versions of Major Components
 ---------------------
@@ -121,7 +129,6 @@ Bug Fixes
 * SOLR-2539: VectorValueSource.floatVal incorrectly used byteVal on sub-sources.
   (Tom Liu via yonik)
 
-
 Other Changes
 ----------------------
 

Modified: lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml?rev=1128859&r1=1128858&r2=1128859&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml (original)
+++ lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml Sun May 29 10:39:43 2011
@@ -45,15 +45,16 @@
     that avoids logging every request
 -->
 
-<schema name="example" version="1.3">
+<schema name="example" version="1.4">
   <!-- attribute "name" is the name of this schema and is only used for display purposes.
        Applications should change this to reflect the nature of the search collection.
-       version="1.2" is Solr's version number for the schema syntax and semantics.  It should
+       version="1.4" is Solr's version number for the schema syntax and semantics.  It should
        not normally be changed by applications.
        1.0: multiValued attribute did not exist, all fields are multiValued by nature
        1.1: multiValued attribute introduced, false by default 
        1.2: omitTermFreqAndPositions attribute introduced, true by default except for text fields.
        1.3: removed optional field compress feature
+       1.4: default auto-phrase (QueryParser feature) to off
      -->
 
   <types>
@@ -209,16 +210,87 @@
       </analyzer>
     </fieldType>
 
-    <!-- A text field that uses WordDelimiterFilter to enable splitting and matching of
-        words on case-change, alpha numeric boundaries, and non-alphanumeric chars,
-        so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".
-        Synonyms and stopwords are customized by external files, and stemming is enabled.
-        The attribute autoGeneratePhraseQueries="true" (the default) causes words that get split to
-        form phrase queries. For example, WordDelimiterFilter splitting text:pdp-11 will cause the parser
-        to generate text:"pdp 11" rather than (text:PDP OR text:11).
-        NOTE: autoGeneratePhraseQueries="true" tends to not work well for non whitespace delimited languages.
+    <!-- A general text field that has reasonable, generic
+         cross-language defaults: it tokenizes with StandardTokenizer,
+	 removes stop words from case-insensitive "stopwords.txt"
+	 (empty by default), and down cases.  At query time only, it
+	 also applies synonyms. -->
+    <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
+      <analyzer type="index">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+        <!-- in this example, we will only use synonyms at query time
+        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+        -->
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- A text field with defaults appropriate for English: it
+         tokenizes with StandardTokenizer, removes English stop words
+         (stopwords_en.txt), down cases, protects words from protwords.txt, and
+         finally applies Porter's stemming.  The query time analyzer
+         also applies synonyms from synonyms.txt. -->
+    <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
+      <analyzer type="index">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <!-- in this example, we will only use synonyms at query time
+        <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+        -->
+        <!-- Case insensitive stop word removal.
+          add enablePositionIncrements=true in both the index and query
+          analyzers to leave a 'gap' for more accurate phrase queries.
+        -->
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords_en.txt"
+                enablePositionIncrements="true"
+                />
+        <filter class="solr.LowerCaseFilterFactory"/>
+	<filter class="solr.EnglishPossessiveFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+	<!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
+        <filter class="solr.EnglishMinimalStemFilterFactory"/>
+	-->
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+        <filter class="solr.StopFilterFactory"
+                ignoreCase="true"
+                words="stopwords_en.txt"
+                enablePositionIncrements="true"
+                />
+        <filter class="solr.LowerCaseFilterFactory"/>
+	<filter class="solr.EnglishPossessiveFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+	<!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
+        <filter class="solr.EnglishMinimalStemFilterFactory"/>
+	-->
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- A text field with defaults appropriate for English, plus
+	 aggressive word-splitting and autophrase features enabled.
+	 This field is just like text_en, except it adds
+	 WordDelimiterFilter to enable splitting and matching of
+	 words on case-change, alpha numeric boundaries, and
+	 non-alphanumeric chars.  This means certain compound word
+	 cases will work, for example query "wi fi" will match
+	 document "WiFi" or "wi-fi".  However, other cases will still
+	 not match, for example if the query is "wifi" and the
+	 document is "wi fi" or if the query is "wi-fi" and the
+	 document is "wifi".
         -->
-    <fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+    <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
       <analyzer type="index">
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
         <!-- in this example, we will only use synonyms at query time
@@ -230,7 +302,7 @@
         -->
         <filter class="solr.StopFilterFactory"
                 ignoreCase="true"
-                words="stopwords.txt"
+                words="stopwords_en.txt"
                 enablePositionIncrements="true"
                 />
         <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
@@ -243,7 +315,7 @@
         <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
         <filter class="solr.StopFilterFactory"
                 ignoreCase="true"
-                words="stopwords.txt"
+                words="stopwords_en.txt"
                 enablePositionIncrements="true"
                 />
         <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
@@ -253,14 +325,13 @@
       </analyzer>
     </fieldType>
 
-
     <!-- Less flexible matching, but fewer false matches.  Probably not ideal for product names,
          but may be good for SKUs.  Can insert dashes in the wrong place and still match. -->
-    <fieldType name="textTight" class="solr.TextField" positionIncrementGap="100" >
+    <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
       <analyzer>
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
         <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_en.txt"/>
         <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
         <filter class="solr.LowerCaseFilterFactory"/>
         <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
@@ -271,57 +342,27 @@
       </analyzer>
     </fieldType>
 
-
-    <!-- A general unstemmed text field - good if one does not know the language of the field -->
-    <fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
-      <analyzer type="index">
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
-        <filter class="solr.LowerCaseFilterFactory"/>
-      </analyzer>
-      <analyzer type="query">
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
-        <filter class="solr.StopFilterFactory"
-                ignoreCase="true"
-                words="stopwords.txt"
-                enablePositionIncrements="true"
-                />
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
-        <filter class="solr.LowerCaseFilterFactory"/>
-      </analyzer>
-    </fieldType>
-
-
-    <!-- A general unstemmed text field that indexes tokens normally and also
-         reversed (via ReversedWildcardFilterFactory), to enable more efficient 
-	 leading wildcard queries. -->
-    <fieldType name="text_rev" class="solr.TextField" positionIncrementGap="100">
+    <!-- Just like text_general except it reverses the characters of
+	 each token, to enable more efficient leading wildcard queries. -->
+    <fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
       <analyzer type="index">
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
         <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
         <filter class="solr.LowerCaseFilterFactory"/>
         <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
            maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
       </analyzer>
       <analyzer type="query">
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
         <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
-        <filter class="solr.StopFilterFactory"
-                ignoreCase="true"
-                words="stopwords.txt"
-                enablePositionIncrements="true"
-                />
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
         <filter class="solr.LowerCaseFilterFactory"/>
       </analyzer>
     </fieldType>
 
     <!-- charFilter + WhitespaceTokenizer  -->
     <!--
-    <fieldType name="textCharNorm" class="solr.TextField" positionIncrementGap="100" >
+    <fieldType name="text_char_norm" class="solr.TextField" positionIncrementGap="100" >
       <analyzer>
         <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
@@ -454,13 +495,13 @@
    -->
 
    <field name="id" type="string" indexed="true" stored="true" required="true" /> 
-   <field name="sku" type="textTight" indexed="true" stored="true" omitNorms="true"/>
-   <field name="name" type="textgen" indexed="true" stored="true"/>
+   <field name="sku" type="text_en_splitting_tight" indexed="true" stored="true" omitNorms="true"/>
+   <field name="name" type="text_general" indexed="true" stored="true"/>
    <field name="alphaNameSort" type="alphaOnlySort" indexed="true" stored="false"/>
-   <field name="manu" type="textgen" indexed="true" stored="true" omitNorms="true"/>
+   <field name="manu" type="text_general" indexed="true" stored="true" omitNorms="true"/>
    <field name="cat" type="string" indexed="true" stored="true" multiValued="true"/>
-   <field name="features" type="text" indexed="true" stored="true" multiValued="true"/>
-   <field name="includes" type="text" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />
+   <field name="features" type="text_general" indexed="true" stored="true" multiValued="true"/>
+   <field name="includes" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />
 
    <field name="weight" type="float" indexed="true" stored="true"/>
    <field name="price"  type="float" indexed="true" stored="true"/>
@@ -478,13 +519,13 @@
      Some fields are multiValued only because Tika currently may return
      multiple values for them.
    -->
-   <field name="title" type="text" indexed="true" stored="true" multiValued="true"/>
-   <field name="subject" type="text" indexed="true" stored="true"/>
-   <field name="description" type="text" indexed="true" stored="true"/>
-   <field name="comments" type="text" indexed="true" stored="true"/>
-   <field name="author" type="textgen" indexed="true" stored="true"/>
-   <field name="keywords" type="textgen" indexed="true" stored="true"/>
-   <field name="category" type="textgen" indexed="true" stored="true"/>
+   <field name="title" type="text_general" indexed="true" stored="true" multiValued="true"/>
+   <field name="subject" type="text_general" indexed="true" stored="true"/>
+   <field name="description" type="text_general" indexed="true" stored="true"/>
+   <field name="comments" type="text_general" indexed="true" stored="true"/>
+   <field name="author" type="text_general" indexed="true" stored="true"/>
+   <field name="keywords" type="text_general" indexed="true" stored="true"/>
+   <field name="category" type="text_general" indexed="true" stored="true"/>
    <field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/>
    <field name="last_modified" type="date" indexed="true" stored="true"/>
    <field name="links" type="string" indexed="true" stored="true" multiValued="true"/>
@@ -492,11 +533,11 @@
 
    <!-- catchall field, containing all other searchable text fields (implemented
         via copyField further on in this schema) -->
-   <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
+   <field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>
 
    <!-- catchall text field that indexes tokens both normally and in reverse for efficient
         leading wildcard queries. -->
-   <field name="text_rev" type="text_rev" indexed="true" stored="false" multiValued="true"/>
+   <field name="text_rev" type="text_general_rev" indexed="true" stored="false" multiValued="true"/>
 
    <!-- non-tokenized version of manufacturer to make it easier to sort or group
         results by manufacturer.  copied from "manu" via copyField -->
@@ -522,8 +563,8 @@
    <dynamicField name="*_i"  type="int"    indexed="true"  stored="true"/>
    <dynamicField name="*_s"  type="string"  indexed="true"  stored="true"/>
    <dynamicField name="*_l"  type="long"   indexed="true"  stored="true"/>
-   <dynamicField name="*_t"  type="text"    indexed="true"  stored="true"/>
-   <dynamicField name="*_txt" type="text"    indexed="true"  stored="true" multiValued="true"/>
+   <dynamicField name="*_t"  type="text_general"    indexed="true"  stored="true"/>
+   <dynamicField name="*_txt" type="text_general"    indexed="true"  stored="true" multiValued="true"/>
    <dynamicField name="*_b"  type="boolean" indexed="true"  stored="true"/>
    <dynamicField name="*_f"  type="float"  indexed="true"  stored="true"/>
    <dynamicField name="*_d"  type="double" indexed="true"  stored="true"/>
@@ -544,7 +585,7 @@
    <dynamicField name="*_pi"  type="pint"    indexed="true"  stored="true"/>
 
    <dynamicField name="ignored_*" type="ignored" multiValued="true"/>
-   <dynamicField name="attr_*" type="textgen" indexed="true" stored="true" multiValued="true"/>
+   <dynamicField name="attr_*" type="text_general" indexed="true" stored="true" multiValued="true"/>
 
    <dynamicField name="random_*" type="random" />
 

Modified: lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml?rev=1128859&r1=1128858&r2=1128859&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml (original)
+++ lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml Sun May 29 10:39:43 2011
@@ -251,7 +251,7 @@
          of detailed information when indexing.
 
          Setting the value to true will instruct the underlying Lucene
-         IndexWriter to write it's debugging info the specified file
+         IndexWriter to write its debugging info to the specified file
       -->
      <infoStream file="INFOSTREAM.txt">false</infoStream> 
 

Modified: lucene/dev/branches/branch_3x/solr/example/solr/conf/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/example/solr/conf/stopwords.txt?rev=1128859&r1=1128858&r2=1128859&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/example/solr/conf/stopwords.txt (original)
+++ lucene/dev/branches/branch_3x/solr/example/solr/conf/stopwords.txt Sun May 29 10:39:43 2011
@@ -12,47 +12,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-#-----------------------------------------------------------------------
-# a couple of test stopwords to test that the words are really being
-# configured from this file:
-stopworda
-stopwordb
-
-#Standard english stop words taken from Lucene's StopAnalyzer
-a
-an
-and
-are
-as
-at
-be
-but
-by
-for
-if
-in
-into
-is
-it
-no
-not
-of
-on
-or
-s
-such
-t
-that
-the
-their
-then
-there
-these
-they
-this
-to
-was
-will
-with
-

Modified: lucene/dev/branches/branch_3x/solr/site/skin/basic.css
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/site/skin/basic.css?rev=1128859&r1=1128858&r2=1128859&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/site/skin/basic.css (original)
+++ lucene/dev/branches/branch_3x/solr/site/skin/basic.css Sun May 29 10:39:43 2011
@@ -163,4 +163,4 @@ p {
 .codefrag {
   font-family: "Courier New", Courier, monospace;
   font-size: 110%;
-}
+}
\ No newline at end of file

Modified: lucene/dev/branches/branch_3x/solr/site/skin/print.css
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/site/skin/print.css?rev=1128859&r1=1128858&r2=1128859&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/site/skin/print.css (original)
+++ lucene/dev/branches/branch_3x/solr/site/skin/print.css Sun May 29 10:39:43 2011
@@ -51,4 +51,4 @@ a:link, a:visited {
 
 acronym {
   border: 0;
-}
+}
\ No newline at end of file

Modified: lucene/dev/branches/branch_3x/solr/site/skin/profile.css
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/site/skin/profile.css?rev=1128859&r1=1128858&r2=1128859&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/site/skin/profile.css (original)
+++ lucene/dev/branches/branch_3x/solr/site/skin/profile.css Sun May 29 10:39:43 2011
@@ -172,4 +172,4 @@ a:hover { color:#6587ff} 
     }
       
     
-  
+  
\ No newline at end of file

Modified: lucene/dev/branches/branch_3x/solr/site/skin/screen.css
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/site/skin/screen.css?rev=1128859&r1=1128858&r2=1128859&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/site/skin/screen.css (original)
+++ lucene/dev/branches/branch_3x/solr/site/skin/screen.css Sun May 29 10:39:43 2011
@@ -584,4 +584,4 @@ p.instruction {
   list-style-image: url('../images/instruction_arrow.png');
   list-style-position: outside;
   margin-left: 2em;
-} 
+} 
\ No newline at end of file

Modified: lucene/dev/branches/branch_3x/solr/site/tutorial.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/site/tutorial.html?rev=1128859&r1=1128858&r2=1128859&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/site/tutorial.html (original)
+++ lucene/dev/branches/branch_3x/solr/site/tutorial.html Sun May 29 10:39:43 2011
@@ -437,9 +437,9 @@ SimplePostTool: COMMITting Solr index ch
 <p>
 You may have noticed that even though the file <span class="codefrag">solr.xml</span> has now
 been POSTed to the server twice, you still only get 1 result when searching for
-"solr".  This is because the example schema.xml specifies a "uniqueKey" field
+"solr".  This is because the example <span class="codefrag">schema.xml</span> specifies a "<span class="codefrag">uniqueKey</span>" field
 called "<span class="codefrag">id</span>".  Whenever you POST instructions to Solr to add a
-document with the same value for the uniqueKey as an existing document, it
+document with the same value for the <span class="codefrag">uniqueKey</span> as an existing document, it
 automatically replaces it for you.  You can see that that has happened by
 looking at the values for <span class="codefrag">numDocs</span> and <span class="codefrag">maxDoc</span> in the
 "CORE"/searcher section of the statistics page...  </p>
@@ -450,20 +450,20 @@ looking at the values for <span class="c
 </p>
 <p>
   
-<strong>numDocs</strong> represents the number of searchable documents in the
+<strong><span class="codefrag">numDocs</span></strong> represents the number of searchable documents in the
   index (and will be larger than the number of XML files since some files
-  contained more than one <span class="codefrag">&lt;doc&gt;</span>). <strong>maxDoc</strong>
-  may be larger as the maxDoc count includes logically deleted documents that
+  contained more than one <span class="codefrag">&lt;doc&gt;</span>). <strong><span class="codefrag">maxDoc</span></strong>
+  may be larger as the <span class="codefrag">maxDoc</span> count includes logically deleted documents that
   have not yet been removed from the index. You can re-post the sample XML
-  files over and over again as much as you want and numDocs will never
-  increase,because the new documents will constantly be replacing the old.
+  files over and over again as much as you want and <span class="codefrag">numDocs</span> will never
+  increase, because the new documents will constantly be replacing the old.
 </p>
 <p>
 Go ahead and edit the existing XML files to change some of the data, and re-run
 the <span class="codefrag">java -jar post.jar</span> command, you'll see your changes reflected
 in subsequent searches.
 </p>
-<a name="N1011B"></a><a name="Deleting+Data"></a>
+<a name="N1012C"></a><a name="Deleting+Data"></a>
 <h3 class="boxed">Deleting Data</h3>
 <p>You can delete data by POSTing a delete command to the update URL and specifying the value
       of the document's unique key field, or a query that matches multiple documents (be careful with that one!).  Since these commands
@@ -474,7 +474,7 @@ in subsequent searches.
 <p>Now if you go to the <a href="http://localhost:8983/solr/admin/stats.jsp">statistics</a> page and scroll down
        to the UPDATE_HANDLERS section and verify that "<span class="codefrag">deletesById : 1</span>"</p>
 <p>If you search for <a href="http://localhost:8983/solr/select?q=id:SP2514N">id:SP2514N</a> it will still be found,
-       because index changes are not visible until, and a new searcher is opened.  To cause
+       because index changes are not visible until changes are committed and a new searcher is opened.  To cause
        this to happen, send a commit command to Solr (post.jar does this for you by default):</p>
 <pre class="code">java -jar post.jar</pre>
 <p>Now re-execute the previous search and verify that no matching documents are found.  Also revisit the
@@ -483,7 +483,7 @@ in subsequent searches.
       <a href="http://localhost:8983/solr/select?q=name:DDR&fl=name">DDR</a> in the name:</p>
 <pre class="code">java -Ddata=args -jar post.jar "&lt;delete&gt;&lt;query&gt;name:DDR&lt;/query&gt;&lt;/delete&gt;"</pre>
 <p>Commit can be an expensive operation so it's best to make many changes to an index in a batch and
-      then send the commit command at the end.  There is also an optimize command that does the same thing as commit,
+      then send the <span class="codefrag">commit</span> command at the end.  There is also an <span class="codefrag">optimize</span> command that does the same thing as <span class="codefrag">commit</span>,
       in addition to merging all index segments into a single segment, making it faster to search and causing any
       deleted documents to be removed.  All of the update commands are documented <a href="http://wiki.apache.org/solr/UpdateXmlMessages">here</a>.
     </p>
@@ -492,14 +492,14 @@ in subsequent searches.
 </div>
 
 
-<a name="N10161"></a><a name="Querying+Data"></a>
+<a name="N1017B"></a><a name="Querying+Data"></a>
 <h2 class="boxed">Querying Data</h2>
 <div class="section">
 <p>
-    Searches are done via HTTP GET on the select URL with the query string in the q parameter.
+    Searches are done via HTTP GET on the <span class="codefrag">select</span> URL with the query string in the <span class="codefrag">q</span> parameter.
     You can pass a number of optional <a href="http://wiki.apache.org/solr/StandardRequestHandler">request parameters</a>
-    to the request handler to control what information is returned.  For example, you can use the "fl" parameter
-    to control what stored fields are returned, and if the relevancy score is returned...
+    to the request handler to control what information is returned.  For example, you can use the "<span class="codefrag">fl</span>" parameter
+    to control what stored fields are returned, and if the relevancy score is returned:
   </p>
 <ul>
       
@@ -521,13 +521,13 @@ in subsequent searches.
 </ul>
 <p>
     Solr provides a <a href="http://localhost:8983/solr/admin/form.jsp">query form</a> within the web admin interface
-    that allows setting the various request parameters and is useful when trying out or debugging queries.
+    that allows setting the various request parameters and is useful when testing or debugging queries.
   </p>
-<a name="N10196"></a><a name="Sorting"></a>
+<a name="N101B9"></a><a name="Sorting"></a>
 <h3 class="boxed">Sorting</h3>
 <p>
       Solr provides a simple method to sort on one or more indexed fields.
-      Use the 'sort' parameter to specify "field direction" pairs...
+      Use the "<span class="codefrag">sort</span>" parameter to specify "field direction" pairs, separated by commas if there's more than one sort field:
     </p>
 <ul>
       
@@ -545,7 +545,7 @@ in subsequent searches.
     
 </ul>
 <p>
-      "score" can also be used as a field name when specifying a sort...
+      "<span class="codefrag">score</span>" can also be used as a field name when specifying a sort:
     </p>
 <ul>
       
@@ -559,7 +559,7 @@ in subsequent searches.
     
 </ul>
 <p>
-      Complex functions may also be used to sort results...
+      Complex functions may also be used to sort results:
     </p>
 <ul>
       
@@ -575,12 +575,12 @@ in subsequent searches.
 
 
 
-<a name="N101D4"></a><a name="Highlighting"></a>
+<a name="N101FD"></a><a name="Highlighting"></a>
 <h2 class="boxed">Highlighting</h2>
 <div class="section">
 <p>
     Hit highlighting returns relevent snippets of each returned document, and highlights
-    keywords from the query within those context snippets.
+    terms from the query within those context snippets.
   </p>
 <p>
     The following example searches for <span class="codefrag">video card</span> and requests
@@ -602,7 +602,7 @@ in subsequent searches.
 
 
 
-<a name="N101FD"></a><a name="Faceted+Search"></a>
+<a name="N10226"></a><a name="Faceted+Search"></a>
 <h2 class="boxed">Faceted Search</h2>
 <div class="section">
 <p>
@@ -661,7 +661,7 @@ in subsequent searches.
 
 
 
-<a name="N1024E"></a><a name="Search+UI"></a>
+<a name="N10277"></a><a name="Search+UI"></a>
 <h2 class="boxed">Search UI</h2>
 <div class="section">
 <p>
@@ -679,28 +679,44 @@ in subsequent searches.
 
 
 
-<a name="N10261"></a><a name="Text+Analysis"></a>
+<a name="N1028A"></a><a name="Text+Analysis"></a>
 <h2 class="boxed">Text Analysis</h2>
 <div class="section">
 <p>
-    Text fields are typically indexed by breaking the field into words and applying various transformations such as
+    Text fields are typically indexed by breaking the text into words and applying various transformations such as
     lowercasing, removing plurals, or stemming to increase relevancy.  The same text transformations are normally
     applied to any queries in order to match what is indexed.
   </p>
-<p>Example queries demonstrating relevancy improving transformations:</p>
+<p>
+    The <a href="http://wiki.apache.org/solr/SchemaXml">schema</a> defines
+    the fields in the index and what type of analysis is applied to them.  The current schema your server is using
+    may be accessed via the <span class="codefrag">[SCHEMA]</span> link on the <a href="http://localhost:8983/solr/admin/">admin</a> page.
+  </p>
+<p>
+    The best analysis components (tokenization and filtering) for your textual content depend heavily on language.
+    As you can see in the above <span class="codefrag">[SCHEMA]</span> link, the fields in the example schema are using a <span class="codefrag">fieldType</span>
+    named <span class="codefrag">text_general</span>, which has defaults appropriate for all languages.
+  </p>
+<p>
+    If you know your textual content is English, as is the case for the example documents in this tutorial,
+    and you'd like to apply English-specific stemming and stop word removal, as well as split compound words, you can use the <span class="codefrag">text_en_splitting</span> fieldType instead.
+    Go ahead and edit the <span class="codefrag">schema.xml</span> under the <span class="codefrag">solr/example/solr/conf</span> directory,
+    and change the <span class="codefrag">type</span> for fields <span class="codefrag">text</span> and <span class="codefrag">features</span> from <span class="codefrag">text_general</span> to <span class="codefrag">text_en_splitting</span>.
+    Restart the server and then re-post all of the documents, and then these queries will show the English-specific transformations:
+  </p>
 <ul>
     
 <li>A search for
        <a href="http://localhost:8983/solr/select/?indent=on&q=power-shot&fl=name">power-shot</a>
        matches <span class="codefrag">PowerShot</span>, and
       <a href="http://localhost:8983/solr/select/?indent=on&q=adata&fl=name">adata</a>
-      matches <span class="codefrag">A-DATA</span> due to the use of WordDelimiterFilter and LowerCaseFilter.
+      matches <span class="codefrag">A-DATA</span> due to the use of <span class="codefrag">WordDelimiterFilter</span> and <span class="codefrag">LowerCaseFilter</span>.
     </li>
 
     
 <li>A search for
       <a href="http://localhost:8983/solr/select/?indent=on&q=features:recharging&fl=name,features">features:recharging</a>
-       matches <span class="codefrag">Rechargeable</span> due to stemming with the EnglishPorterFilter.
+       matches <span class="codefrag">Rechargeable</span> due to stemming with the <span class="codefrag">EnglishPorterFilter</span>.
     </li>
 
     
@@ -708,20 +724,15 @@ in subsequent searches.
        <a href="http://localhost:8983/solr/select/?indent=on&q=%221 gigabyte%22&fl=name">"1 gigabyte"</a>
        matches things with <span class="codefrag">GB</span>, and the misspelled
       <a href="http://localhost:8983/solr/select/?indent=on&q=pixima&fl=name">pixima</a>
-       matches <span class="codefrag">Pixma</span> due to use of a SynonymFilter.
+       matches <span class="codefrag">Pixma</span> due to use of a <span class="codefrag">SynonymFilter</span>.
     </li>
 
   
 </ul>
-<p>
-    The <a href="http://wiki.apache.org/solr/SchemaXml">schema</a> defines
-    the fields in the index and what type of analysis is applied to them.  The current schema your server is using
-    may be accessed via the <span class="codefrag">[SCHEMA]</span> link on the <a href="http://localhost:8983/solr/admin/">admin</a> page.
-  </p>
 <p>A full description of the analysis components, Analyzers, Tokenizers, and TokenFilters
     available for use is <a href="http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters">here</a>.
   </p>
-<a name="N102B1"></a><a name="Analysis+Debugging"></a>
+<a name="N1030A"></a><a name="Analysis+Debugging"></a>
 <h3 class="boxed">Analysis Debugging</h3>
 <p>There is a handy <a href="http://localhost:8983/solr/admin/analysis.jsp">analysis</a>
       debugging page where you can see how a text value is broken down into words,
@@ -731,7 +742,7 @@ in subsequent searches.
       
 <a href="http://localhost:8983/solr/admin/analysis.jsp?name=name&val=Canon+Power-Shot+SD500">This</a>
       shows how "<span class="codefrag">Canon Power-Shot SD500</span>" would be indexed as a value in the name field.  Each row of
-      the table shows the resulting tokens after having passed through the next TokenFilter in the Analyzer for the <span class="codefrag">name</span> field.
+      the table shows the resulting tokens after having passed through the next <span class="codefrag">TokenFilter</span> in the analyzer for the <span class="codefrag">name</span> field.
       Notice how both <span class="codefrag">powershot</span> and <span class="codefrag">power</span>, <span class="codefrag">shot</span> are indexed.  Tokens generated at the same position
       are shown in the same column, in this case <span class="codefrag">shot</span> and <span class="codefrag">powershot</span>.
     </p>
@@ -750,12 +761,12 @@ in subsequent searches.
 </div>
 
 
-<a name="N102F0"></a><a name="Conclusion"></a>
+<a name="N1034C"></a><a name="Conclusion"></a>
 <h2 class="boxed">Conclusion</h2>
 <div class="section">
 <p>
   Congratulations!  You successfully ran a small Solr instance, added some
-  documents, and made changes to the index.  You learned about queries, text
+  documents, and made changes to the index and schema.  You learned about queries, text
   analysis, and the Solr admin interface.  You're ready to start using Solr on
   your own project!  Continue on with the following steps:
 </p>
@@ -763,20 +774,20 @@ in subsequent searches.
   
 <li>Subscribe to the Solr <a href="mailing_lists.html">mailing lists</a>!</li>
   
-<li>Make a copy of the Solr example directory as a template for your project.</li>
+<li>Make a copy of the Solr <span class="codefrag">example</span> directory as a template for your project.</li>
   
-<li>Customize the schema and other config in solr/conf/ to meet your needs.</li> 
+<li>Customize the schema and other config in <span class="codefrag">solr/conf/</span> to meet your needs.</li> 
 
 </ul>
 <p>
-  Solr as a ton of other features that we haven't touched on here, including
+  Solr has a ton of other features that we haven't touched on here, including
   <a href="http://wiki.apache.org/solr/DistributedSearch">distributed search</a>
   to handle huge document collections,
   <a href="http://wiki.apache.org/solr/FunctionQuery">function queries</a>,
   <a href="http://wiki.apache.org/solr/StatsComponent">numeric field statistics</a>,
   and
   <a href="http://wiki.apache.org/solr/ClusteringComponent">search results clustering</a>.
-  Explore the <a href="http://wiki.apache.org/solr/">Solr Wiki</a> to find out
+  Explore the <a href="http://wiki.apache.org/solr/">Solr Wiki</a> to find
   more details about Solr's many
   <a href="features.html">features</a>.
 </p>

Modified: lucene/dev/branches/branch_3x/solr/site/tutorial.pdf
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/site/tutorial.pdf?rev=1128859&r1=1128858&r2=1128859&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/schema/TextField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/schema/TextField.java?rev=1128859&r1=1128858&r2=1128859&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/schema/TextField.java (original)
+++ lucene/dev/branches/branch_3x/solr/src/java/org/apache/solr/schema/TextField.java Sun May 29 10:39:43 2011
@@ -46,12 +46,17 @@ import java.io.StringReader;
  * @version $Id$
  */
 public class TextField extends FieldType {
-  protected boolean autoGeneratePhraseQueries = true;
+  protected boolean autoGeneratePhraseQueries;
 
   @Override
   protected void init(IndexSchema schema, Map<String,String> args) {
     properties |= TOKENIZED;
     if (schema.getVersion()> 1.1f) properties &= ~OMIT_TF_POSITIONS;
+    if (schema.getVersion() > 1.3f) {
+      autoGeneratePhraseQueries = false;
+    } else {
+      autoGeneratePhraseQueries = true;
+    }
     String autoGeneratePhraseQueriesStr = args.remove("autoGeneratePhraseQueries");
     if (autoGeneratePhraseQueriesStr != null)
       autoGeneratePhraseQueries = Boolean.parseBoolean(autoGeneratePhraseQueriesStr);

Modified: lucene/dev/branches/branch_3x/solr/src/site/src/documentation/content/xdocs/tutorial.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/src/site/src/documentation/content/xdocs/tutorial.xml?rev=1128859&r1=1128858&r2=1128859&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/src/site/src/documentation/content/xdocs/tutorial.xml (original)
+++ lucene/dev/branches/branch_3x/solr/src/site/src/documentation/content/xdocs/tutorial.xml Sun May 29 10:39:43 2011
@@ -210,9 +210,9 @@ SimplePostTool: COMMITting Solr index ch
 <p>
 You may have noticed that even though the file <code>solr.xml</code> has now
 been POSTed to the server twice, you still only get 1 result when searching for
-"solr".  This is because the example schema.xml specifies a "uniqueKey" field
+"solr".  This is because the example <code>schema.xml</code> specifies a "<code>uniqueKey</code>" field
 called "<code>id</code>".  Whenever you POST instructions to Solr to add a
-document with the same value for the uniqueKey as an existing document, it
+document with the same value for the <code>uniqueKey</code> as an existing document, it
 automatically replaces it for you.  You can see that that has happened by
 looking at the values for <code>numDocs</code> and <code>maxDoc</code> in the
 "CORE"/searcher section of the statistics page...  </p>
@@ -221,13 +221,13 @@ looking at the values for <code>numDocs<
 </p>
 
 <p>
-  <strong>numDocs</strong> represents the number of searchable documents in the
+  <strong><code>numDocs</code></strong> represents the number of searchable documents in the
   index (and will be larger than the number of XML files since some files
-  contained more than one <code>&lt;doc&gt;</code>). <strong>maxDoc</strong>
-  may be larger as the maxDoc count includes logically deleted documents that
+  contained more than one <code>&lt;doc&gt;</code>). <strong><code>maxDoc</code></strong>
+  may be larger as the <code>maxDoc</code> count includes logically deleted documents that
   have not yet been removed from the index. You can re-post the sample XML
-  files over and over again as much as you want and numDocs will never
-  increase,because the new documents will constantly be replacing the old.
+  files over and over again as much as you want and <code>numDocs</code> will never
+  increase, because the new documents will constantly be replacing the old.
 </p>
 <p>
 Go ahead and edit the existing XML files to change some of the data, and re-run
@@ -246,7 +246,7 @@ in subsequent searches.
     <p>Now if you go to the <a href="http://localhost:8983/solr/admin/stats.jsp">statistics</a> page and scroll down
        to the UPDATE_HANDLERS section and verify that "<code>deletesById : 1</code>"</p>
     <p>If you search for <a href="http://localhost:8983/solr/select?q=id:SP2514N">id:SP2514N</a> it will still be found,
-       because index changes are not visible until, and a new searcher is opened.  To cause
+       because index changes are not visible until changes are committed and a new searcher is opened.  To cause
        this to happen, send a commit command to Solr (post.jar does this for you by default):</p>
     <source>java -jar post.jar</source>
     <p>Now re-execute the previous search and verify that no matching documents are found.  Also revisit the
@@ -256,7 +256,7 @@ in subsequent searches.
     <source>java -Ddata=args -jar post.jar "&lt;delete>&lt;query>name:DDR&lt;/query>&lt;/delete>"</source>
 
     <p>Commit can be an expensive operation so it's best to make many changes to an index in a batch and
-      then send the commit command at the end.  There is also an optimize command that does the same thing as commit,
+      then send the <code>commit</code> command at the end.  There is also an <code>optimize</code> command that does the same thing as <code>commit</code>,
       in addition to merging all index segments into a single segment, making it faster to search and causing any
       deleted documents to be removed.  All of the update commands are documented <a href="http://wiki.apache.org/solr/UpdateXmlMessages">here</a>.
     </p>
@@ -272,10 +272,10 @@ in subsequent searches.
   <title>Querying Data</title>
 
   <p>
-    Searches are done via HTTP GET on the select URL with the query string in the q parameter.
+    Searches are done via HTTP GET on the <code>select</code> URL with the query string in the <code>q</code> parameter.
     You can pass a number of optional <a href="http://wiki.apache.org/solr/StandardRequestHandler">request parameters</a>
-    to the request handler to control what information is returned.  For example, you can use the "fl" parameter
-    to control what stored fields are returned, and if the relevancy score is returned...
+    to the request handler to control what information is returned.  For example, you can use the "<code>fl</code>" parameter
+    to control what stored fields are returned, and if the relevancy score is returned:
   </p>
 
     <ul>
@@ -288,7 +288,7 @@ in subsequent searches.
 
   <p>
     Solr provides a <a href="http://localhost:8983/solr/admin/form.jsp">query form</a> within the web admin interface
-    that allows setting the various request parameters and is useful when trying out or debugging queries.
+    that allows setting the various request parameters and is useful when testing or debugging queries.
   </p>
 
   <section>
@@ -296,7 +296,7 @@ in subsequent searches.
 
     <p>
       Solr provides a simple method to sort on one or more indexed fields.
-      Use the 'sort' parameter to specify "field direction" pairs...
+      Use the "<code>sort</code>" parameter to specify "field direction" pairs, separated by commas if there's more than one sort field:
     </p>
 
     <ul>
@@ -306,7 +306,7 @@ in subsequent searches.
     </ul>
 
     <p>
-      "score" can also be used as a field name when specifying a sort...
+      "<code>score</code>" can also be used as a field name when specifying a sort:
     </p>
     <ul>
       <li><a href="http://localhost:8983/solr/select/?indent=on&amp;q=video&amp;sort=score+desc">q=video&amp;sort=score desc</a></li>
@@ -314,7 +314,7 @@ in subsequent searches.
     </ul>
 
     <p>
-      Complex functions may also be used to sort results...
+      Complex functions may also be used to sort results:
     </p>
     <ul>
       <li><a href="http://localhost:8983/solr/select/?indent=on&amp;q=*:*&amp;sort=div(popularity,add(price,1))+desc">q=video&amp;sort=div(popularity,add(price,1)) desc</a></li>
@@ -334,7 +334,7 @@ in subsequent searches.
   <title>Highlighting</title>
   <p>
     Hit highlighting returns relevent snippets of each returned document, and highlights
-    keywords from the query within those context snippets.
+    terms from the query within those context snippets.
   </p>
   <p>
     The following example searches for <code>video card</code> and requests
@@ -429,42 +429,52 @@ in subsequent searches.
   <title>Text Analysis</title>
 
   <p>
-    Text fields are typically indexed by breaking the field into words and applying various transformations such as
+    Text fields are typically indexed by breaking the text into words and applying various transformations such as
     lowercasing, removing plurals, or stemming to increase relevancy.  The same text transformations are normally
     applied to any queries in order to match what is indexed.
   </p>
 
-  <p>Example queries demonstrating relevancy improving transformations:</p>
+  <p>
+    The <a href="http://wiki.apache.org/solr/SchemaXml">schema</a> defines
+    the fields in the index and what type of analysis is applied to them.  The current schema your server is using
+    may be accessed via the <code>[SCHEMA]</code> link on the <a href="http://localhost:8983/solr/admin/">admin</a> page.
+  </p>
+
+  <p>
+    The best analysis components (tokenization and filtering) for your textual content depend heavily on language.
+    As you can see in the above <code>[SCHEMA]</code> link, the fields in the example schema are using a <code>fieldType</code>
+    named <code>text_general</code>, which has defaults appropriate for all languages.
+  </p>
+
+  <p>
+    If you know your textual content is English, as is the case for the example documents in this tutorial,
+    and you'd like to apply English-specific stemming and stop word removal, as well as split compound words, you can use the <code>text_en_splitting</code> fieldType instead.
+    Go ahead and edit the <code>schema.xml</code> under the <code>solr/example/solr/conf</code> directory,
+    and change the <code>type</code> for fields <code>text</code> and <code>features</code> from <code>text_general</code> to <code>text_en_splitting</code>.
+    Restart the server and then re-post all of the documents, and then these queries will show the English-specific transformations:
+  </p>
   <ul>
     <li>A search for
        <a href="http://localhost:8983/solr/select/?indent=on&amp;q=power-shot&amp;fl=name">power-shot</a>
        matches <code>PowerShot</code>, and
       <a href="http://localhost:8983/solr/select/?indent=on&amp;q=adata&amp;fl=name">adata</a>
-      matches <code>A-DATA</code> due to the use of WordDelimiterFilter and LowerCaseFilter.
+      matches <code>A-DATA</code> due to the use of <code>WordDelimiterFilter</code> and <code>LowerCaseFilter</code>.
     </li>
 
     <li>A search for
       <a href="http://localhost:8983/solr/select/?indent=on&amp;q=features:recharging&amp;fl=name,features">features:recharging</a>
-       matches <code>Rechargeable</code> due to stemming with the EnglishPorterFilter.
+       matches <code>Rechargeable</code> due to stemming with the <code>EnglishPorterFilter</code>.
     </li>
 
     <li>A search for
        <a href="http://localhost:8983/solr/select/?indent=on&amp;q=&quot;1 gigabyte&quot;&amp;fl=name">"1 gigabyte"</a>
        matches things with <code>GB</code>, and the misspelled
       <a href="http://localhost:8983/solr/select/?indent=on&amp;q=pixima&amp;fl=name">pixima</a>
-       matches <code>Pixma</code> due to use of a SynonymFilter.
+       matches <code>Pixma</code> due to use of a <code>SynonymFilter</code>.
     </li>
 
   </ul>
 
-
-  <p>
-    The <a href="http://wiki.apache.org/solr/SchemaXml">schema</a> defines
-    the fields in the index and what type of analysis is applied to them.  The current schema your server is using
-    may be accessed via the <code>[SCHEMA]</code> link on the <a href="http://localhost:8983/solr/admin/">admin</a> page.
-  </p>
-
-
   <p>A full description of the analysis components, Analyzers, Tokenizers, and TokenFilters
     available for use is <a href="http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters">here</a>.
   </p>
@@ -479,7 +489,7 @@ in subsequent searches.
     <p>
       <a href="http://localhost:8983/solr/admin/analysis.jsp?name=name&amp;val=Canon+Power-Shot+SD500">This</a>
       shows how "<code>Canon Power-Shot SD500</code>" would be indexed as a value in the name field.  Each row of
-      the table shows the resulting tokens after having passed through the next TokenFilter in the Analyzer for the <code>name</code> field.
+      the table shows the resulting tokens after having passed through the next <code>TokenFilter</code> in the analyzer for the <code>name</code> field.
       Notice how both <code>powershot</code> and <code>power</code>, <code>shot</code> are indexed.  Tokens generated at the same position
       are shown in the same column, in this case <code>shot</code> and <code>powershot</code>.
     </p>
@@ -501,25 +511,25 @@ in subsequent searches.
   <title>Conclusion</title>
   <p>
   Congratulations!  You successfully ran a small Solr instance, added some
-  documents, and made changes to the index.  You learned about queries, text
+  documents, and made changes to the index and schema.  You learned about queries, text
   analysis, and the Solr admin interface.  You're ready to start using Solr on
   your own project!  Continue on with the following steps:
 </p>
 <ul>
   <li>Subscribe to the Solr <a href="mailing_lists.html">mailing lists</a>!</li>
-  <li>Make a copy of the Solr example directory as a template for your project.</li>
-  <li>Customize the schema and other config in solr/conf/ to meet your needs.</li> 
+  <li>Make a copy of the Solr <code>example</code> directory as a template for your project.</li>
+  <li>Customize the schema and other config in <code>solr/conf/</code> to meet your needs.</li> 
 </ul>
 
 <p>
-  Solr as a ton of other features that we haven't touched on here, including
+  Solr has a ton of other features that we haven't touched on here, including
   <a href="http://wiki.apache.org/solr/DistributedSearch">distributed search</a>
   to handle huge document collections,
   <a href="http://wiki.apache.org/solr/FunctionQuery">function queries</a>,
   <a href="http://wiki.apache.org/solr/StatsComponent">numeric field statistics</a>,
   and
   <a href="http://wiki.apache.org/solr/ClusteringComponent">search results clustering</a>.
-  Explore the <a href="http://wiki.apache.org/solr/">Solr Wiki</a> to find out
+  Explore the <a href="http://wiki.apache.org/solr/">Solr Wiki</a> to find
   more details about Solr's many
   <a href="features.html">features</a>.
 </p>

Modified: lucene/dev/branches/branch_3x/solr/src/test-files/solr/conf/schema12.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/src/test-files/solr/conf/schema12.xml?rev=1128859&r1=1128858&r2=1128859&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/src/test-files/solr/conf/schema12.xml (original)
+++ lucene/dev/branches/branch_3x/solr/src/test-files/solr/conf/schema12.xml Sun May 29 10:39:43 2011
@@ -28,7 +28,7 @@
      $Name:  $
   -->
 
-<schema name="test" version="1.2">
+<schema name="test" version="1.4">
   <types>
 
     <!-- field type definitions... note that the "name" attribute is
@@ -104,7 +104,7 @@
 
 
     <!-- HighlitText optimizes storage for (long) columns which will be highlit -->
-    <fieldtype name="highlittext" class="solr.TextField" compressThreshold="345" />
+    <fieldtype name="highlittext" class="solr.TextField"/>
 
     <fieldtype name="boolean" class="solr.BoolField" sortMissingLast="true"/>
     <fieldtype name="string" class="solr.StrField" sortMissingLast="true"/>
@@ -116,7 +116,7 @@
     <fieldtype name="tdate" class="solr.TrieDateField" sortMissingLast="true" precisionStep="6"/>
     <fieldtype name="pdate" class="solr.DateField" sortMissingLast="true"/>
 
-  <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
+  <fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true" >
       <analyzer type="index">
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
         <filter class="solr.StopFilterFactory"
@@ -146,7 +146,7 @@
 
 
     <!-- field type that doesn't generate phrases from unquoted multiple tokens per analysis unit -->
-   <fieldType name="text_np" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false" >
+   <fieldType name="text_np" class="solr.TextField" positionIncrementGap="100">
       <analyzer type="index">
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
         <filter class="solr.StopFilterFactory"

Modified: lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/client/solrj/SolrExampleTests.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/client/solrj/SolrExampleTests.java?rev=1128859&r1=1128858&r2=1128859&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/client/solrj/SolrExampleTests.java (original)
+++ lucene/dev/branches/branch_3x/solr/src/test/org/apache/solr/client/solrj/SolrExampleTests.java Sun May 29 10:39:43 2011
@@ -20,7 +20,6 @@ package org.apache.solr.client.solrj;
 
 import java.io.IOException;
 import java.io.StringWriter;
-import java.io.File;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
@@ -648,4 +647,27 @@ abstract public class SolrExampleTests e
     
     // System.out.println( rsp.getResults().getNumFound() + " :::: 444: "+ff.getValues() );
   }
+
+  @Test
+  public void testChineseDefaults() throws Exception {
+    // Start from an empty index so the hit count asserted below is exact.
+    server.deleteByQuery( "*:*" );// delete everything!
+    server.commit();
+    assertNumFound( "*:*", 0 ); // make sure the index is now empty
+
+    // Index a single document whose text is "Beijing Medical University"
+    UpdateRequest req = new UpdateRequest();
+    SolrInputDocument doc = new SolrInputDocument();
+    doc.addField("id", "42");
+    doc.addField("text", "北京医科大学");
+    req.add(doc);
+
+    req.setAction(AbstractUpdateRequest.ACTION.COMMIT, true, true ); // commit as part of this update request
+    req.process( server );
+
+    // A query for "Beijing University" (no field given) should match the document:
+    SolrQuery query = new SolrQuery("北京大学");
+    QueryResponse rsp = server.query( query );
+    assertEquals(1, rsp.getResults().getNumFound());
+  }
 }