You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2010/10/27 12:25:49 UTC

svn commit: r1027889 - in /nutch/branches/branch-1.3: conf/nutch-default.xml src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java

Author: markus
Date: Wed Oct 27 10:25:49 2010
New Revision: 1027889

URL: http://svn.apache.org/viewvc?rev=1027889&view=rev
Log:
Ported NUTCH-901 (making the index-more plug-in configurable) to branch 1.3

Modified:
    nutch/branches/branch-1.3/conf/nutch-default.xml
    nutch/branches/branch-1.3/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
    nutch/branches/branch-1.3/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java

Modified: nutch/branches/branch-1.3/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/conf/nutch-default.xml?rev=1027889&r1=1027888&r2=1027889&view=diff
==============================================================================
--- nutch/branches/branch-1.3/conf/nutch-default.xml (original)
+++ nutch/branches/branch-1.3/conf/nutch-default.xml Wed Oct 27 10:25:49 2010
@@ -751,6 +751,17 @@
   </description>
 </property>
 
+<!-- moreindexingfilter plugin properties -->
+
+<property>
+  <name>moreIndexingFilter.indexMimeTypeParts</name>
+  <value>true</value>
+  <description>Determines whether the index-more plugin splits the mime-type
+  into sub parts; this requires the type field to be multi-valued. Set to true for
+  backward compatibility. If set to false, the mime-type is not split.
+  </description>
+</property>
+
 <!-- indexingfilter plugin properties -->
 
 <property>

Modified: nutch/branches/branch-1.3/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1027889&r1=1027888&r2=1027889&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ nutch/branches/branch-1.3/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Wed Oct 27 10:25:49 2010
@@ -224,10 +224,13 @@ public class MoreIndexingFilter implemen
     
     doc.add("type", contentType);
 
-    String[] parts = getParts(contentType);
+    // Check if we need to split the content type in sub parts
+    if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
+      String[] parts = getParts(contentType.toString());
 
-    for(String part: parts) {
-      doc.add("type", part);
+      for(String part: parts) {
+        doc.add("type", part);
+      }
     }
     
     // leave this for future improvement

Modified: nutch/branches/branch-1.3/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java?rev=1027889&r1=1027888&r2=1027889&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java (original)
+++ nutch/branches/branch-1.3/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java Wed Oct 27 10:25:49 2010
@@ -43,7 +43,29 @@ public class TestMoreIndexingFilter exte
   public void testGetParts() {
     String[] parts = MoreIndexingFilter.getParts("text/html");
     assertParts(parts, 2, "text", "html");
+  }
 
+  /**
+   * @since NUTCH-901
+   */
+  public void testNoParts(){
+     Configuration conf = NutchConfiguration.create();
+     conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
+     MoreIndexingFilter filter = new MoreIndexingFilter();
+     filter.setConf(conf);
+     assertNotNull(filter);
+     NutchDocument doc = new NutchDocument();
+     try{
+       filter.filter(doc, "http://nutch.apache.org/index.html", new WebPage());
+     }
+     catch(Exception e){
+       e.printStackTrace();
+       fail(e.getMessage());
+     }
+     assertNotNull(doc);
+     assertTrue(doc.getFieldNames().contains("type"));
+     assertEquals(1, doc.getFieldValues("type").size());
+     assertEquals("text/html", doc.getFieldValue("type"));     
   }
 
   private void assertParts(String[] parts, int count, String... expected) {