You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2013/05/09 00:04:04 UTC
svn commit: r1480484 - in /nutch/branches/2.x: CHANGES.txt
conf/schema-solr4.xml conf/schema.xml
src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
Author: snagel
Date: Wed May 8 22:04:04 2013
New Revision: 1480484
URL: http://svn.apache.org/r1480484
Log:
NUTCH-956 solrindex issues
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/schema-solr4.xml
nutch/branches/2.x/conf/schema.xml
nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1480484&r1=1480483&r2=1480484&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed May 8 22:04:04 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Release 2.2 - Current Development
+* NUTCH-956 solrindex issues: add field tld to Solr schema (Alexis via lewismc, snagel)
+
* NUTCH-1277 Fix [fallthrough] javac warnings (tejasp)
* NUTCH-1514 Phase out the deprecated configuration properties (if possible) (tejasp)
Modified: nutch/branches/2.x/conf/schema-solr4.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema-solr4.xml?rev=1480484&r1=1480483&r2=1480484&view=diff
==============================================================================
--- nutch/branches/2.x/conf/schema-solr4.xml (original)
+++ nutch/branches/2.x/conf/schema-solr4.xml Wed May 8 22:04:04 2013
@@ -346,6 +346,9 @@
<!-- fields for creativecommons plugin -->
<field name="cc" type="string" stored="true" indexed="true" multiValued="true"/>
+
+ <!-- fields for tld plugin -->
+ <field name="tld" type="string" stored="false" indexed="false"/>
</fields>
<uniqueKey>id</uniqueKey>
<defaultSearchField>text</defaultSearchField>
Modified: nutch/branches/2.x/conf/schema.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema.xml?rev=1480484&r1=1480483&r2=1480484&view=diff
==============================================================================
--- nutch/branches/2.x/conf/schema.xml (original)
+++ nutch/branches/2.x/conf/schema.xml Wed May 8 22:04:04 2013
@@ -114,6 +114,9 @@
<!-- fields for creativecommons plugin -->
<field name="cc" type="string" stored="true" indexed="true"
multiValued="true"/>
+
+ <!-- fields for tld plugin -->
+ <field name="tld" type="string" stored="false" indexed="false"/>
</fields>
<uniqueKey>id</uniqueKey>
<defaultSearchField>content</defaultSearchField>
Modified: nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1480484&r1=1480483&r2=1480484&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Wed May 8 22:04:04 2013
@@ -44,10 +44,12 @@ import org.slf4j.LoggerFactory;
/**
* Add (or reset) a few metaData properties as respective fields (if they are
- * available), so that they can be displayed by more.jsp (called by search.jsp).
+ * available), so that they can be accurately used within the search index.
*
- * content-type is indexed to support query by type: last-modifed is indexed to
- * support query by date:
+ * 'lastModified' is indexed to support query by date, 'contentLength' obtains content length from the HTTP
+ * header, 'type' field is indexed to support query by type and finally the 'title' field is an attempt
+ * to reset the title if a content-disposition hint exists. The logic is that such a presence is indicative
+ * that the content provider wants the filename therein to be used as the title.
*
* Still need to make content-length searchable!
*
@@ -171,7 +173,9 @@ public class MoreIndexingFilter implemen
*/
private NutchDocument addType(NutchDocument doc, WebPage page, String url) {
String mimeType = null;
- Utf8 contentType = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_TYPE));
+ Utf8 contentType = page.getContentType();
+ if (contentType == null)
+ contentType = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_TYPE));
if (contentType == null) {
// Note by Jerome Charron on 20050415:
// Content Type not solved by a previous plugin
@@ -194,13 +198,11 @@ public class MoreIndexingFilter implemen
return doc;
}
- //String scontentType = mimeType.getName();
-
doc.add("type", mimeType);
// Check if we need to split the content type in sub parts
- if ( null != contentType && conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
- String[] parts = getParts(contentType.toString());
+ if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
+ String[] parts = getParts(mimeType);
for(String part: parts) {
doc.add("type", part);