You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2013/05/09 00:04:53 UTC

svn commit: r1480485 - in /nutch/trunk: CHANGES.txt conf/schema-solr4.xml conf/schema.xml src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

Author: snagel
Date: Wed May  8 22:04:53 2013
New Revision: 1480485

URL: http://svn.apache.org/r1480485
Log:
NUTCH-956 solrindex issues

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/schema-solr4.xml
    nutch/trunk/conf/schema.xml
    nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1480485&r1=1480484&r2=1480485&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed May  8 22:04:53 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk): Current Development
 
+* NUTCH-956 solrindex issues: add field tld to Solr schema (Alexis via lewismc, snagel)
+
 * NUTCH-1277 Fix [fallthrough] javac warnings (tejasp)
 
 * NUTCH-1514 Phase out the deprecated configuration properties (if possible) (tejasp)

Modified: nutch/trunk/conf/schema-solr4.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/schema-solr4.xml?rev=1480485&r1=1480484&r2=1480485&view=diff
==============================================================================
--- nutch/trunk/conf/schema-solr4.xml (original)
+++ nutch/trunk/conf/schema-solr4.xml Wed May  8 22:04:53 2013
@@ -345,6 +345,9 @@
 
     <!-- fields for creativecommons plugin -->
     <field name="cc" type="string" stored="true" indexed="true" multiValued="true"/>
+
+    <!-- fields for tld plugin -->    
+    <field name="tld" type="string" stored="false" indexed="false"/>
  </fields>
  <uniqueKey>id</uniqueKey>
  <defaultSearchField>text</defaultSearchField>

Modified: nutch/trunk/conf/schema.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/schema.xml?rev=1480485&r1=1480484&r2=1480485&view=diff
==============================================================================
--- nutch/trunk/conf/schema.xml (original)
+++ nutch/trunk/conf/schema.xml Wed May  8 22:04:53 2013
@@ -114,6 +114,9 @@
         <!-- fields for creativecommons plugin -->
         <field name="cc" type="string" stored="true" indexed="true"
             multiValued="true"/>
+            
+        <!-- fields for tld plugin -->    
+        <field name="tld" type="string" stored="false" indexed="false"/>
     </fields>
     <uniqueKey>id</uniqueKey>
     <defaultSearchField>content</defaultSearchField>

Modified: nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1480485&r1=1480484&r2=1480485&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Wed May  8 22:04:53 2013
@@ -52,12 +52,13 @@ import org.apache.commons.lang.StringUti
 import org.apache.commons.lang.time.DateUtils;
 
 /**
- * Add (or reset) a few metaData properties as respective fields
- * (if they are available), so that they can be displayed by more.jsp
- * (called by search.jsp).
- *
- * content-type is indexed to support query by type:
- * last-modifed is indexed to support query by date:
+ * Add (or reset) a few metaData properties as respective fields (if they are
+ * available), so that they can be accurately used within the search index.
+ * 
+ * 'lastModified' is indexed to support query by date, 'contentLength' obtains content length from the HTTP
+ * header, 'type' field is indexed to support query by type and finally the 'title' field is an attempt 
+ * to reset the title if a content-disposition hint exists. The logic is that such a presence is indicative 
+ * that the content provider wants the filename therein to be used as the title.
  *
  * Still need to make content-length searchable!
  *