You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2010/10/27 12:25:49 UTC
svn commit: r1027889 - in /nutch/branches/branch-1.3: conf/nutch-default.xml
src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
Author: markus
Date: Wed Oct 27 10:25:49 2010
New Revision: 1027889
URL: http://svn.apache.org/viewvc?rev=1027889&view=rev
Log:
Ported NUTCH-901 (Making index-more plug-in configurable) to 1.3
Modified:
nutch/branches/branch-1.3/conf/nutch-default.xml
nutch/branches/branch-1.3/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
nutch/branches/branch-1.3/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
Modified: nutch/branches/branch-1.3/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/conf/nutch-default.xml?rev=1027889&r1=1027888&r2=1027889&view=diff
==============================================================================
--- nutch/branches/branch-1.3/conf/nutch-default.xml (original)
+++ nutch/branches/branch-1.3/conf/nutch-default.xml Wed Oct 27 10:25:49 2010
@@ -751,6 +751,17 @@
</description>
</property>
+<!-- moreindexingfilter plugin properties -->
+
+<property>
+ <name>moreIndexingFilter.indexMimeTypeParts</name>
+ <value>true</value>
+ <description>Determines whether the index-more plugin will split the mime-type
+ into sub-parts; this requires the type field to be multi-valued. Set to true for backward
+ compatibility. Setting it to false will not split the mime-type.
+ </description>
+</property>
+
<!-- indexingfilter plugin properties -->
<property>
Modified: nutch/branches/branch-1.3/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1027889&r1=1027888&r2=1027889&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ nutch/branches/branch-1.3/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Wed Oct 27 10:25:49 2010
@@ -224,10 +224,13 @@ public class MoreIndexingFilter implemen
doc.add("type", contentType);
- String[] parts = getParts(contentType);
+ // Check if we need to split the content type into sub-parts
+ if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
+ String[] parts = getParts(contentType.toString());
- for(String part: parts) {
- doc.add("type", part);
+ for(String part: parts) {
+ doc.add("type", part);
+ }
}
// leave this for future improvement
Modified: nutch/branches/branch-1.3/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java?rev=1027889&r1=1027888&r2=1027889&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java (original)
+++ nutch/branches/branch-1.3/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java Wed Oct 27 10:25:49 2010
@@ -43,7 +43,29 @@ public class TestMoreIndexingFilter exte
public void testGetParts() {
String[] parts = MoreIndexingFilter.getParts("text/html");
assertParts(parts, 2, "text", "html");
+ }
+ /**
+ * @since NUTCH-901
+ */
+ public void testNoParts(){
+ Configuration conf = NutchConfiguration.create();
+ conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
+ MoreIndexingFilter filter = new MoreIndexingFilter();
+ filter.setConf(conf);
+ assertNotNull(filter);
+ NutchDocument doc = new NutchDocument();
+ try{
+ filter.filter(doc, "http://nutch.apache.org/index.html", new WebPage());
+ }
+ catch(Exception e){
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ assertNotNull(doc);
+ assertTrue(doc.getFieldNames().contains("type"));
+ assertEquals(1, doc.getFieldValues("type").size());
+ assertEquals("text/html", doc.getFieldValue("type"));
}
private void assertParts(String[] parts, int count, String... expected) {