You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2009/02/27 07:21:38 UTC

svn commit: r748408 - in /lucene/nutch/trunk: CHANGES.txt conf/schema.xml

Author: siren
Date: Fri Feb 27 06:21:37 2009
New Revision: 748408

URL: http://svn.apache.org/viewvc?rev=748408&view=rev
Log:
NUTCH-699 - Add an "official" solr schema for solr integration. Contributed by dogacan, Dmitry Lihachev

Added:
    lucene/nutch/trunk/conf/schema.xml
Modified:
    lucene/nutch/trunk/CHANGES.txt

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=748408&r1=748407&r2=748408&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Fri Feb 27 06:21:37 2009
@@ -361,6 +361,9 @@
 
 135. NUTCH-698 - CrawlDb is corrupted after a few crawl cycles (dogacan
      via siren)
+     
+136. NUTCH-699 - Add an "official" solr schema for solr integration (dogacan,
+     Dmitry Lihachev via siren)
 
 Release 0.9 - 2007-04-02
 

Added: lucene/nutch/trunk/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/schema.xml?rev=748408&view=auto
==============================================================================
--- lucene/nutch/trunk/conf/schema.xml (added)
+++ lucene/nutch/trunk/conf/schema.xml Fri Feb 27 06:21:37 2009
@@ -0,0 +1,109 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+    <!--
+        Licensed to the Apache Software Foundation (ASF) under one or
+        more contributor license agreements. See the NOTICE file
+        distributed with this work for additional information regarding
+        copyright ownership. The ASF licenses this file to You under the
+        Apache License, Version 2.0 (the "License"); you may not use
+        this file except in compliance with the License. You may obtain
+        a copy of the License at
+        http://www.apache.org/licenses/LICENSE-2.0 Unless required by
+        applicable law or agreed to in writing, software distributed
+        under the License is distributed on an "AS IS" BASIS, WITHOUT
+        WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+        See the License for the specific language governing permissions
+        and limitations under the License.
+    -->
+    <!--
+        Description: This document contains the Solr schema definition
+        to be used with the Solr integration currently built into Nutch.
+        See https://issues.apache.org/jira/browse/NUTCH-442 and
+        https://issues.apache.org/jira/browse/NUTCH-699 for more info.
+    -->
+<schema name="nutch" version="1.1">
+    <types> <!-- field types; the field entries below select one by name through their type attribute -->
+        <fieldType name="string" class="solr.StrField"
+            sortMissingLast="true" omitNorms="true"/> <!-- used by id and by most of the plugin fields -->
+        <fieldType name="long" class="solr.LongField"
+            omitNorms="true"/> <!-- used by tstamp, contentLength and lastModified -->
+        <fieldType name="float" class="solr.FloatField"
+            omitNorms="true"/> <!-- used by boost -->
+        <fieldType name="text" class="solr.TextField"
+            positionIncrementGap="100"> <!-- analyzed type used by content and title -->
+            <analyzer>
+                <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+                <filter class="solr.StopFilterFactory"
+                    ignoreCase="true" words="stopwords.txt"/> <!-- stopwords.txt is referenced by name only; this commit does not add it -->
+                <filter class="solr.WordDelimiterFilterFactory"
+                    generateWordParts="1" generateNumberParts="1"
+                    catenateWords="1" catenateNumbers="1" catenateAll="0"
+                    splitOnCaseChange="1"/>
+                <filter class="solr.LowerCaseFilterFactory"/>
+                <filter class="solr.EnglishPorterFilterFactory"
+                    protected="protwords.txt"/> <!-- protwords.txt is likewise not added by this commit -->
+                <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+            </analyzer>
+        </fieldType>
+        <fieldType name="url" class="solr.TextField"
+            positionIncrementGap="100"> <!-- analyzed type used by host and url -->
+            <analyzer>
+                <tokenizer class="solr.StandardTokenizerFactory"/>
+                <filter class="solr.LowerCaseFilterFactory"/>
+                <filter class="solr.WordDelimiterFilterFactory"
+                    generateWordParts="1" generateNumberParts="1"/>
+                <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+            </analyzer>
+        </fieldType>
+    </types>
+    <fields>
+        <field name="id" type="string" stored="true" indexed="true"/> <!-- named as uniqueKey and as the copyField destination for url at the end of this schema -->
+
+        <!-- core fields: all three are stored but not indexed -->
+        <field name="segment" type="string" stored="true" indexed="false"/>
+        <field name="digest" type="string" stored="true" indexed="false"/>
+        <field name="boost" type="float" stored="true" indexed="false"/>
+
+        <!-- fields for the index-basic plugin; url is the only field marked required -->
+        <field name="host" type="url" stored="false" indexed="true"/>
+        <field name="site" type="string" stored="false" indexed="true"/>
+        <field name="url" type="url" stored="true" indexed="true"
+            required="true"/>
+        <field name="content" type="text" stored="false" indexed="true"/>
+        <field name="title" type="text" stored="true" indexed="true"/>
+        <field name="cache" type="string" stored="true" indexed="false"/>
+        <field name="tstamp" type="long" stored="true" indexed="false"/>
+
+        <!-- fields for the index-anchor plugin (multi-valued) -->
+        <field name="anchor" type="string" stored="true" indexed="true"
+            multiValued="true"/>
+
+        <!-- fields for the index-more plugin; date is declared as a string, not a numeric type -->
+        <field name="type" type="string" stored="true" indexed="true"
+            multiValued="true"/>
+        <field name="contentLength" type="long" stored="true"
+            indexed="false"/>
+        <field name="lastModified" type="long" stored="true"
+            indexed="false"/>
+        <field name="date" type="string" stored="true" indexed="true"/>
+
+        <!-- fields for the languageidentifier plugin -->
+        <field name="lang" type="string" stored="true" indexed="true"/>
+
+        <!-- fields for the subcollection plugin -->
+        <field name="subcollection" type="string" stored="true"
+            indexed="true"/>
+
+        <!-- fields for the feed plugin; all are declared as stored, indexed strings -->
+        <field name="author" type="string" stored="true" indexed="true"/>
+        <field name="tag" type="string" stored="true" indexed="true"/>
+        <field name="feed" type="string" stored="true" indexed="true"/>
+        <field name="publishedDate" type="string" stored="true"
+            indexed="true"/>
+        <field name="updatedDate" type="string" stored="true"
+            indexed="true"/>
+    </fields>
+    <uniqueKey>id</uniqueKey>
+    <defaultSearchField>content</defaultSearchField> <!-- content is declared above as indexed but not stored -->
+    <solrQueryParser defaultOperator="OR"/>
+    <copyField source="url" dest="id"/> <!-- fills id, the uniqueKey field, from url -->
+</schema>
\ No newline at end of file