You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/06/25 16:49:05 UTC

svn commit: r1353585 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java

Author: markus
Date: Mon Jun 25 14:49:03 2012
New Revision: 1353585

URL: http://svn.apache.org/viewvc?rev=1353585&view=rev
Log:
NUTCH-1407 BasicIndexingFilter to optionally add domain field

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1353585&r1=1353584&r2=1353585&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Jun 25 14:49:03 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk) Current Development:
 
+* NUTCH-1407 BasicIndexingFilter to optionally add domain field (markus)
+
 * NUTCH-1408 RobotRulesParser main doesn't take URL's (markus)
 
 * NUTCH-1400 Remove developer -core option for bin/nutch (jnioche)

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1353585&r1=1353584&r2=1353585&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Mon Jun 25 14:49:03 2012
@@ -900,6 +900,12 @@
 </property>
 
 <property>
+  <name>indexer.add.domain</name>
+  <value>false</value>
+  <description>Whether to add the domain field to a NutchDocument.</description>
+</property>
+
+<property>
   <name>indexer.skip.notmodified</name>
   <value>false</value>
   <description>Whether the indexer will skip records with a db_notmodified status.

Modified: nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1353585&r1=1353584&r2=1353585&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Mon Jun 25 14:49:03 2012
@@ -26,6 +26,7 @@ import org.apache.nutch.parse.Parse;
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.IndexingException;
 import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.util.URLUtil;
 import org.apache.hadoop.io.Text;
 
 import org.apache.nutch.crawl.CrawlDatum;
@@ -43,6 +44,7 @@ public class BasicIndexingFilter impleme
 
   private int MAX_TITLE_LENGTH;
   private int MAX_CONTENT_LENGTH;
+  private boolean addDomain = false;
   private Configuration conf;
 
   public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
@@ -60,6 +62,11 @@ public class BasicIndexingFilter impleme
       } else {
         u = new URL(urlString);
       }
+      
+      if (addDomain) {
+        doc.add("domain", URLUtil.getDomainName(u));
+      }
+      
       host = u.getHost();
     } catch (MalformedURLException e) {
       throw new IndexingException(e);
@@ -104,6 +111,7 @@ public class BasicIndexingFilter impleme
   public void setConf(Configuration conf) {
     this.conf = conf;
     this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
+    this.addDomain = conf.getBoolean("indexer.add.domain", false);
     this.MAX_CONTENT_LENGTH = conf.getInt("indexer.max.content.length", -1);
   }