You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@nutch.apache.org by Marek Bachmann <m....@uni-kassel.de> on 2011/10/12 02:05:34 UTC
All boost values are 1.0 in solr

Hey ho,

first of all: I am not sure whether this topic belongs on the Solr or the
Nutch list, so apologies for the double post.

For some reason, all of the Solr documents have a boost value of 1.0.

I indexed them using the solrindex command from Nutch 1.3. The pages
were scored with WebGraph, and the statistics for the crawldb used are:

11/10/12 01:55:05 INFO crawl.CrawlDbReader: Statistics for CrawlDb: crawldb
11/10/12 01:55:05 INFO crawl.CrawlDbReader: TOTAL urls: 243751
11/10/12 01:55:05 INFO crawl.CrawlDbReader: retry 0:    242738
11/10/12 01:55:05 INFO crawl.CrawlDbReader: retry 1:    627
11/10/12 01:55:05 INFO crawl.CrawlDbReader: retry 2:    127
11/10/12 01:55:05 INFO crawl.CrawlDbReader: retry 3:    148
11/10/12 01:55:05 INFO crawl.CrawlDbReader: retry 4:    111
11/10/12 01:55:05 INFO crawl.CrawlDbReader: min score:  0.0
11/10/12 01:55:05 INFO crawl.CrawlDbReader: avg score:  0.4357474
11/10/12 01:55:05 INFO crawl.CrawlDbReader: max score:  2764.215
11/10/12 01:55:05 INFO crawl.CrawlDbReader: status 1 (db_unfetched):    32425
11/10/12 01:55:05 INFO crawl.CrawlDbReader: status 2 (db_fetched):      182141
11/10/12 01:55:05 INFO crawl.CrawlDbReader: status 3 (db_gone): 17783
11/10/12 01:55:05 INFO crawl.CrawlDbReader: status 4 (db_redir_temp):   8506
11/10/12 01:55:05 INFO crawl.CrawlDbReader: status 5 (db_redir_perm):   2302
11/10/12 01:55:05 INFO crawl.CrawlDbReader: status 6 (db_notmodified):  594
11/10/12 01:55:05 INFO crawl.CrawlDbReader: CrawlDb statistics: done

As you can see, the URLs have a score. Shouldn't these values appear in
the boost field of the Solr documents after indexing?
The version of the solr server is 3.4

Does anybody have any suggestions?

Thanks in advance

schema.xml:

<schema name="nutch" version="1.3">
  <types>
    <!-- Scalar types: a plain string plus Trie-based long, float and date.
         Norms are omitted on all of them. -->
    <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true"
               positionIncrementGap="0"/>
    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true"
               positionIncrementGap="0"/>
    <fieldType name="date" class="solr.TrieDateField" precisionStep="0" omitNorms="true"
               positionIncrementGap="0"/>

    <!-- Free-text type. Filter order: whitespace tokenizer, stop words,
         word delimiter, lower-casing, English Porter stemming (terms listed
         in protwords.txt are protected), duplicate-token removal. -->
    <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
        <filter class="solr.WordDelimiterFilterFactory"
                generateWordParts="1"
                generateNumberParts="1"
                catenateWords="1"
                catenateNumbers="1"
                catenateAll="0"
                splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldType>

    <!-- URL type: standard tokenizer, lower-casing, then word-delimiter
         splitting into word and number parts. -->
    <fieldType name="url" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.WordDelimiterFilterFactory"
                generateWordParts="1"
                generateNumberParts="1"/>
      </analyzer>
    </fieldType>
  </types>

  <fields>
    <!-- Document key; referenced by uniqueKey below. -->
    <field name="id" type="string" stored="true" indexed="true"/>

    <!-- Core fields: all three are stored but not indexed. -->
    <field name="segment" type="string" stored="true" indexed="false"/>
    <field name="digest" type="string" stored="true" indexed="false"/>
    <field name="boost" type="float" stored="true" indexed="false"/>

    <!-- Fields for the index-basic plugin. -->
    <field name="host" type="url" stored="false" indexed="true"/>
    <field name="site" type="string" stored="false" indexed="true"/>
    <field name="url" type="url" stored="true" indexed="true" required="true"/>
    <field name="content" type="text" stored="true" indexed="true"/>
    <field name="title" type="text" stored="true" indexed="true" multiValued="true"/>
    <field name="cache" type="string" stored="true" indexed="false"/>
    <field name="tstamp" type="date" stored="true" indexed="false"/>

    <!-- Fields for the index-anchor plugin. -->
    <field name="anchor" type="string" stored="true" indexed="true" multiValued="true"/>

    <!-- Fields for the index-more plugin. -->
    <field name="type" type="string" stored="true" indexed="true" multiValued="true"/>
    <field name="contentLength" type="long" stored="true" indexed="false"/>
    <field name="lastModified" type="date" stored="true" indexed="false"/>
    <field name="date" type="date" stored="true" indexed="true"/>

    <!-- Fields for the languageidentifier plugin. -->
    <field name="lang" type="string" stored="true" indexed="true"/>

    <!-- Fields for the subcollection plugin. -->
    <field name="subcollection" type="string" stored="true" indexed="true" multiValued="true"/>

    <!-- Fields for the feed plugin ("tag" is also used by microformats-reltag). -->
    <field name="author" type="string" stored="true" indexed="true"/>
    <field name="tag" type="string" stored="true" indexed="true" multiValued="true"/>
    <field name="feed" type="string" stored="true" indexed="true"/>
    <field name="publishedDate" type="date" stored="true" indexed="true"/>
    <field name="updatedDate" type="date" stored="true" indexed="true"/>

    <!-- Fields for the creativecommons plugin. -->
    <field name="cc" type="string" stored="true" indexed="true" multiValued="true"/>
  </fields>

  <uniqueKey>id</uniqueKey>
  <defaultSearchField>content</defaultSearchField>
  <solrQueryParser defaultOperator="OR"/>
</schema>

solrindex-mapping.xml:

<mapping>
  <!--
    Maps fields produced by the Nutch IndexingFilters onto the fields that
    the Solr schema.xml defines (and expects).

    * field: a NutchDocument field whose name equals field/@source is
      renamed to field/@dest.
    * copyField: if a field's name (before mapping) equals copyField/@source,
      its values are additionally copied to copyField/@dest.
    * uniqueKey: same meaning as in the Solr schema.xml; defaults to "id"
      when not defined.
  -->
  <fields>
    <field dest="content" source="content"/>
    <field dest="site" source="site"/>
    <field dest="title" source="title"/>
    <field dest="host" source="host"/>
    <field dest="segment" source="segment"/>
    <field dest="boost" source="boost"/>
    <field dest="digest" source="digest"/>
    <field dest="tstamp" source="tstamp"/>
    <!-- "url" is renamed to "id" and, via the copyField, also kept as "url". -->
    <field dest="id" source="url"/>
    <copyField source="url" dest="url"/>
  </fields>
  <uniqueKey>id</uniqueKey>
</mapping>