You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@nutch.apache.org by "lu_jin_hong@163.com" <lu...@163.com> on 2014/08/12 10:32:31 UTC

How to index the plugin field in nutch with solr?










Hi, everyone:
I integrated Nutch, Solr, and HBase to build a search engine. It works well, except that some fields in the schema.xml are not indexed to Solr.
The fields in " <!-- core fields -->" and " <!-- fields for index-basic plugin -->" are indexed to solr, but other fields, such as the fields in "<!-- fields for index-anchor plugin -->" and "<!-- fields for index-more plugin -->", are not.
What is the problem? Is there any other work that should be done for that?

The schema.xml looks like this:<schema name="nutch" version="1.5">    <types>        <fieldType name="string" class="solr.StrField" sortMissingLast="true"            omitNorms="true"/>        <fieldType name="long" class="solr.TrieLongField" precisionStep="0"            omitNorms="true" positionIncrementGap="0"/>        <fieldType name="float" class="solr.TrieFloatField" precisionStep="0"            omitNorms="true" positionIncrementGap="0"/>        <fieldType name="date" class="solr.TrieDateField" precisionStep="0"            omitNorms="true" positionIncrementGap="0"/>        <fieldType name="text" class="solr.TextField"            positionIncrementGap="100">            <analyzer>                <tokenizer class="solr.WhitespaceTokenizerFactory"/>                <filter class="solr.StopFilterFactory"                    ignoreCase="true" words="stopwords.txt"/>                <filter class="solr.WordDelimiterFilterFactory"                    generateWordParts="1" generateNumberParts="1"                    catenateWords="1" catenateNumbers="1" catenateAll="0"                    splitOnCaseChange="1"/>                <filter class="solr.LowerCaseFilterFactory"/>                <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>            </analyzer>        </fieldType>        <fieldType name="url" class="solr.TextField"            positionIncrementGap="100">            <analyzer>                <tokenizer class="solr.StandardTokenizerFactory"/>                <filter class="solr.LowerCaseFilterFactory"/>                <filter class="solr.WordDelimiterFilterFactory"                    generateWordParts="1" generateNumberParts="1"/>            </analyzer>        </fieldType>    </types>    <fields>        <field 
name="id" type="string" stored="true" indexed="true"/>
        <!-- core fields -->        <field name="batchId" type="string" stored="true" indexed="false"/>        <field name="digest" type="string" stored="true" indexed="false"/>        <field name="boost" type="float" stored="true" indexed="false"/>
        <!-- fields for index-basic plugin -->        <field name="host" type="url" stored="false" indexed="true"/>        <field name="url" type="url" stored="true" indexed="true"            required="true"/>        <field name="content" type="text" stored="true" indexed="true"/>        <field name="title" type="text" stored="true" indexed="true"/>        <field name="cache" type="string" stored="true" indexed="false"/>        <field name="tstamp" type="date" stored="true" indexed="true"/>
        <field name="_version_" type="long" indexed="true" stored="true"/>        <!-- fields for index-anchor plugin -->        <field name="anchor" type="string" stored="true" indexed="true"            multiValued="true"/>
        <!-- fields for index-more plugin -->        <field name="type" type="string" stored="true" indexed="true"            multiValued="true"/>        <field name="contentLength" type="long" stored="true"            indexed="true"/>        <field name="lastModified" type="date" stored="true"            indexed="true"/>        <field name="date" type="date" stored="true" indexed="true"/>
        <!-- fields for languageidentifier plugin -->        <field name="lang" type="string" stored="true" indexed="true"/>
        <!-- fields for subcollection plugin -->        <field name="subcollection" type="string" stored="true"            indexed="true" multiValued="true"/>
        <!-- fields for feed plugin (tag is also used by microformats-reltag)-->        <field name="author" type="string" stored="true" indexed="true"/>        <field name="tag" type="string" stored="true" indexed="true" multiValued="true"/>        <field name="feed" type="string" stored="true" indexed="true"/>        <field name="publishedDate" type="date" stored="true"            indexed="true"/>        <field name="updatedDate" type="date" stored="true"            indexed="true"/>
        <!-- fields for creativecommons plugin -->        <field name="cc" type="string" stored="true" indexed="true"            multiValued="true"/>
        <!-- fields for tld plugin -->        <field name="tld" type="string" stored="false" indexed="false"/>    </fields>    <uniqueKey>id</uniqueKey>    <defaultSearchField>content</defaultSearchField>    <solrQueryParser defaultOperator="OR"/></schema>


thanks,lu_jin_hong@163.com


Re: How to index the plugin field in nutch with solr?

Posted by Sebastian Nagel <wa...@googlemail.com>.
Hi,

> except that some fileds in the schma.xml are not indexed to solr.
> The fields in " <!-- core fields -->" and " <!-- fields for index-basic plugin -->" are indexed
> to solr, but other fields, such as the fields in "<!-- fields for index-anchor plugin -->"  <!--
> fields for index-more plugin --> , are not.
> what is the problem? Or any other work should be do for that?

Of course, these plugins must be also activated in property "plugin.includes".
Per default, only index-basic and index-anchor are active.

For index-anchor you should also have a look at the property
"db.ignore.internal.links": if true (the default), anchor texts
from the same host are not recorded, and hence, not indexed.

Best,
Sebastian

On 08/12/2014 10:32 AM, lu_jin_hong@163.com wrote:
> 
> 
> Hi, everyone:
> I integrate nutch/solr/hbase to construct a search engine, it work well, except that some fileds in the schma.xml are not indexed to solr.
> The fields in " <!-- core fields -->" and " <!-- fields for index-basic plugin -->" are indexed to solr, but other fields, such as the fields in "<!-- fields for index-anchor plugin -->"  <!-- fields for index-more plugin --> , are not.
> what is the problem? Or any other work should be do for that?
> 
> The schema.xml likes this:<schema name="nutch" version="1.5">    <types>        <fieldType name="string" class="solr.StrField" sortMissingLast="true"            omitNorms="true"/>        <fieldType name="long" class="solr.TrieLongField" precisionStep="0"            omitNorms="true" positionIncrementGap="0"/>        <fieldType name="float" class="solr.TrieFloatField" precisionStep="0"            omitNorms="true" positionIncrementGap="0"/>        <fieldType name="date" class="solr.TrieDateField" precisionStep="0"            omitNorms="true" positionIncrementGap="0"/>        <fieldType name="text" class="solr.TextField"            positionIncrementGap="100">            <analyzer>                <tokenizer class="solr.WhitespaceTokenizerFactory"/>                <filter class="solr.StopFilterFactory"                    ignoreCase="true" words="stopwords.txt"/>                <filter class="solr.WordDelimiterFilterFactory"                    generateWordParts="1" generateNumberParts="1"
  
                   catenateWords="1" catenateNumbers="1" catenateAll="0"                    splitOnCaseChange="1"/>                <filter class="solr.LowerCaseFilterFactory"/>                <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>            </analyzer>        </fieldType>        <fieldType name="url" class="solr.TextField"            positionIncrementGap="100">            <analyzer>                <tokenizer class="solr.StandardTokenizerFactory"/>                <filter class="solr.LowerCaseFilterFactory"/>                <filter class="solr.WordDelimiterFilterFactory"/last            positionIncrementGap="100">            <analyzer>                <tokenizer class="solr.StandardTokenizerFactory"/>                <filter class="solr.LowerCaseFilterFactory"/>                <filter class="solr.WordDelimiterFilterFactory"                    generateWordParts="1" generateNumberParts="1"/>            </analyzer>        </fieldType>    </types>    <fields>        <fiel
 d
 name="id" type="string" stored="true" indexed="true"/>
>         <!-- core fields -->        <field name="batchId" type="string" stored="true" indexed="false"/>        <field name="digest" type="string" stored="true" indexed="false"/>        <field name="boost" type="float" stored="true" indexed="false"/>
>         <!-- fields for index-basic plugin -->        <field name="host" type="url" stored="false" indexed="true"/>        <field name="url" type="url" stored="true" indexed="true"            required="true"/>        <field name="content" type="text" stored="true" indexed="true"/>        <field name="title" type="text" stored="true" indexed="true"/>        <field name="cache" type="string" stored="true" indexed="false"/>        <field name="tstamp" type="date" stored="true" indexed="true"/>
>         <field name="_version_" type="long" indexed="true" stored="true"/>        <!-- fields for index-anchor plugin -->        <field name="anchor" type="string" stored="true" indexed="true"            multiValued="true"/>
>         <!-- fields for index-more plugin -->        <field name="type" type="string" stored="true" indexed="true"            multiValued="true"/>        <field name="contentLength" type="long" stored="true"            indexed="true"/>        <field name="lastModified" type="date" stored="true"            indexed="true"/>        <field name="date" type="date" stored="true" indexed="true"/>
>         <!-- fields for languageidentifier plugin -->        <field name="lang" type="string" stored="true" indexed="true"/>
>         <!-- fields for subcollection plugin -->        <field name="subcollection" type="string" stored="true"            indexed="true" multiValued="true"/>
>         <!-- fields for feed plugin (tag is also used by microformats-reltag)-->        <field name="author" type="string" stored="true" indexed="true"/>        <field name="tag" type="string" stored="true" indexed="true" multiValued="true"/>        <field name="feed" type="string" stored="true" indexed="true"/>        <field name="publishedDate" type="date" stored="true"            indexed="true"/>        <field name="updatedDate" type="date" stored="true"            indexed="true"/>
>         <!-- fields for creativecommons plugin -->        <field name="cc" type="string" stored="true" indexed="true"            multiValued="true"/>
>         <!-- fields for tld plugin -->        <field name="tld" type="string" stored="false" indexed="false"/>    </fields>    <uniqueKey>id</uniqueKey>    <defaultSearchField>content</defaultSearchField>    <solrQueryParser defaultOperator="OR"/></schema>
> 
> 
> thanks,lu_jin_hong@163.com
>