You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2015/01/07 23:25:19 UTC

svn commit: r1650181 - in /nutch/trunk: CHANGES.txt src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java

Author: snagel
Date: Wed Jan  7 22:25:18 2015
New Revision: 1650181

URL: http://svn.apache.org/r1650181
Log:
NUTCH-1140 index-more plugin, resetTitle creates multiple values in title field

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
    nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1650181&r1=1650180&r2=1650181&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jan  7 22:25:18 2015
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development 1.10-SNAPSHOT
 
+* NUTCH-1140 index-more plugin, resetTitle creates multiple values in title field (Joe Liedtke, kaveh minooie via snagel)
+
 * NUTCH-1904 Schema for Solr4 doesn't include _version_ field (mattmann)
 
 * NUTCH-1897 Easier debugging of plugin XML errors (markus)

Modified: nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1650181&r1=1650180&r2=1650181&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Wed Jan  7 22:25:18 2015
@@ -289,7 +289,7 @@ public class MoreIndexingFilter implemen
 
   private NutchDocument resetTitle(NutchDocument doc, ParseData data, String url) {
     String contentDisposition = data.getMeta(Metadata.CONTENT_DISPOSITION);
-    if (contentDisposition == null)
+    if (contentDisposition == null || doc.getFieldValue("title") != null)
       return doc;
 
     for (int i=0; i<patterns.length; i++) {

Modified: nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java?rev=1650181&r1=1650180&r2=1650181&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java Wed Jan  7 22:25:18 2015
@@ -82,11 +82,21 @@ public class TestMoreIndexingFilter {
     MoreIndexingFilter filter = new MoreIndexingFilter();
     filter.setConf(conf);
 
-    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
-      new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
-        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
+    Text url = new Text("http://www.example.com/");
+    ParseImpl parseImpl = new ParseImpl("text", new ParseData(
+        new ParseStatus(), "title", new Outlink[0], metadata));
+
+    NutchDocument doc = new NutchDocument();
+    doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
 
     Assert.assertEquals("content-disposition not detected", "filename.ext", doc.getFieldValue("title"));
+    
+    /* NUTCH-1140: do not add second title to avoid a multi-valued title field */
+    doc = new NutchDocument();
+    doc.add("title", "title");
+    doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
+    Assert.assertEquals("do not add second title by content-disposition",
+        "title", doc.getFieldValue("title"));
   }
 
   private void assertParts(String[] parts, int count, String... expected) {