You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-user@lucene.apache.org by Marek Bachmann <m....@uni-kassel.de> on 2011/10/12 02:05:34 UTC
All boost values are 1.0 in solr
Hey ho,
first of all: I am not sure whether this topic belongs on the Solr or the
Nutch list, so sorry for the double post.
For some reason, all of the Solr documents have a boost value of 1.0.
I indexed them using the solrindex command from Nutch 1.3. The pages
were scored with Webgraph, and the statistics output for the crawldb used is:
11/10/12 01:55:05 INFO crawl.CrawlDbReader: Statistics for CrawlDb: crawldb
11/10/12 01:55:05 INFO crawl.CrawlDbReader: TOTAL urls: 243751
11/10/12 01:55:05 INFO crawl.CrawlDbReader: retry 0: 242738
11/10/12 01:55:05 INFO crawl.CrawlDbReader: retry 1: 627
11/10/12 01:55:05 INFO crawl.CrawlDbReader: retry 2: 127
11/10/12 01:55:05 INFO crawl.CrawlDbReader: retry 3: 148
11/10/12 01:55:05 INFO crawl.CrawlDbReader: retry 4: 111
11/10/12 01:55:05 INFO crawl.CrawlDbReader: min score: 0.0
11/10/12 01:55:05 INFO crawl.CrawlDbReader: avg score: 0.4357474
11/10/12 01:55:05 INFO crawl.CrawlDbReader: max score: 2764.215
11/10/12 01:55:05 INFO crawl.CrawlDbReader: status 1 (db_unfetched): 32425
11/10/12 01:55:05 INFO crawl.CrawlDbReader: status 2 (db_fetched): 182141
11/10/12 01:55:05 INFO crawl.CrawlDbReader: status 3 (db_gone): 17783
11/10/12 01:55:05 INFO crawl.CrawlDbReader: status 4 (db_redir_temp): 8506
11/10/12 01:55:05 INFO crawl.CrawlDbReader: status 5 (db_redir_perm): 2302
11/10/12 01:55:05 INFO crawl.CrawlDbReader: status 6 (db_notmodified): 594
11/10/12 01:55:05 INFO crawl.CrawlDbReader: CrawlDb statistics: done
As you can see, the URLs have a score. Shouldn't these values appear in
the boost field of the Solr documents after indexing?
The version of the Solr server is 3.4.
Does anybody have any suggestions?
Thanks in advance
schema.xml:
<schema name="nutch" version="1.3">
  <types>
    <!-- Primitive types; norms are omitted on all of them. -->
    <fieldType name="string" class="solr.StrField"
               sortMissingLast="true" omitNorms="true"/>
    <fieldType name="long" class="solr.TrieLongField"
               precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
    <fieldType name="float" class="solr.TrieFloatField"
               precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
    <fieldType name="date" class="solr.TrieDateField"
               precisionStep="0" omitNorms="true" positionIncrementGap="0"/>

    <!-- General text: whitespace tokens, stop words removed, split on
         word delimiters, lower-cased, stemmed, duplicates removed. -->
    <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.StopFilterFactory"
                ignoreCase="true" words="stopwords.txt"/>
        <filter class="solr.WordDelimiterFilterFactory"
                generateWordParts="1" generateNumberParts="1"
                catenateWords="1" catenateNumbers="1" catenateAll="0"
                splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <!-- SnowballPorterFilterFactory with language="English" replaces
             solr.EnglishPorterFilterFactory, which is deprecated in
             Solr 3.x. Words listed in protwords.txt are still left
             unstemmed. -->
        <filter class="solr.SnowballPorterFilterFactory"
                language="English" protected="protwords.txt"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldType>

    <!-- URL-like text: standard tokenizer, lower-cased, then split into
         word and number parts. -->
    <fieldType name="url" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.WordDelimiterFilterFactory"
                generateWordParts="1" generateNumberParts="1"/>
      </analyzer>
    </fieldType>
  </types>

  <fields>
    <field name="id" type="string" stored="true" indexed="true"/>

    <!-- core fields -->
    <field name="segment" type="string" stored="true" indexed="false"/>
    <field name="digest" type="string" stored="true" indexed="false"/>
    <!-- boost is stored only (indexed="false"): it is returned with the
         document but cannot be queried. -->
    <field name="boost" type="float" stored="true" indexed="false"/>

    <!-- fields for index-basic plugin -->
    <field name="host" type="url" stored="false" indexed="true"/>
    <field name="site" type="string" stored="false" indexed="true"/>
    <field name="url" type="url" stored="true" indexed="true"
           required="true"/>
    <field name="content" type="text" stored="true" indexed="true"/>
    <field name="title" type="text" stored="true" indexed="true"
           multiValued="true"/>
    <field name="cache" type="string" stored="true" indexed="false"/>
    <field name="tstamp" type="date" stored="true" indexed="false"/>

    <!-- fields for index-anchor plugin -->
    <field name="anchor" type="string" stored="true" indexed="true"
           multiValued="true"/>

    <!-- fields for index-more plugin -->
    <field name="type" type="string" stored="true" indexed="true"
           multiValued="true"/>
    <field name="contentLength" type="long" stored="true"
           indexed="false"/>
    <field name="lastModified" type="date" stored="true"
           indexed="false"/>
    <field name="date" type="date" stored="true" indexed="true"/>

    <!-- fields for languageidentifier plugin -->
    <field name="lang" type="string" stored="true" indexed="true"/>

    <!-- fields for subcollection plugin -->
    <field name="subcollection" type="string" stored="true"
           indexed="true" multiValued="true"/>

    <!-- fields for feed plugin (tag is also used by
         microformats-reltag) -->
    <field name="author" type="string" stored="true" indexed="true"/>
    <field name="tag" type="string" stored="true" indexed="true"
           multiValued="true"/>
    <field name="feed" type="string" stored="true" indexed="true"/>
    <field name="publishedDate" type="date" stored="true"
           indexed="true"/>
    <field name="updatedDate" type="date" stored="true"
           indexed="true"/>

    <!-- fields for creativecommons plugin -->
    <field name="cc" type="string" stored="true" indexed="true"
           multiValued="true"/>
  </fields>

  <uniqueKey>id</uniqueKey>
  <defaultSearchField>content</defaultSearchField>
  <solrQueryParser defaultOperator="OR"/>
</schema>
solrindex-mapping.xml:
<mapping>
  <!--
    Maps fields produced by the Nutch IndexingFilters onto the fields
    that the Solr schema.xml defines (and expects).

    field:     a NutchDocument field whose name equals field/@source
               is renamed to field/@dest.
    copyField: a field whose name (before mapping) equals
               copyField/@source has its values copied to
               copyField/@dest as well.
    uniqueKey: same meaning as in the Solr schema.xml; defaults to
               "id" when not defined.
  -->
  <fields>
    <field source="content" dest="content"/>
    <field source="site" dest="site"/>
    <field source="title" dest="title"/>
    <field source="host" dest="host"/>
    <field source="segment" dest="segment"/>
    <field source="boost" dest="boost"/>
    <field source="digest" dest="digest"/>
    <field source="tstamp" dest="tstamp"/>
    <field source="url" dest="id"/>
    <copyField source="url" dest="url"/>
  </fields>
  <uniqueKey>id</uniqueKey>
</mapping>