You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/06/25 16:49:05 UTC
svn commit: r1353585 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml
src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
Author: markus
Date: Mon Jun 25 14:49:03 2012
New Revision: 1353585
URL: http://svn.apache.org/viewvc?rev=1353585&view=rev
Log:
NUTCH-1407 BasicIndexingFilter to optionally add domain field
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1353585&r1=1353584&r2=1353585&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Jun 25 14:49:03 2012
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk) Current Development:
+* NUTCH-1407 BasicIndexingFilter to optionally add domain field (markus)
+
* NUTCH-1408 RobotRulesParser main doesn't take URL's (markus)
* NUTCH-1400 Remove developer -core option for bin/nutch (jnioche)
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1353585&r1=1353584&r2=1353585&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Mon Jun 25 14:49:03 2012
@@ -900,6 +900,12 @@
</property>
<property>
+ <name>indexer.add.domain</name>
+ <value>false</value>
+ <description>Whether to add the domain field to a NutchDocument.</description>
+</property>
+
+<property>
<name>indexer.skip.notmodified</name>
<value>false</value>
<description>Whether the indexer will skip records with a db_notmodified status.
Modified: nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1353585&r1=1353584&r2=1353585&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Mon Jun 25 14:49:03 2012
@@ -26,6 +26,7 @@ import org.apache.nutch.parse.Parse;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.util.URLUtil;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
@@ -43,6 +44,7 @@ public class BasicIndexingFilter impleme
private int MAX_TITLE_LENGTH;
private int MAX_CONTENT_LENGTH;
+ private boolean addDomain = false;
private Configuration conf;
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
@@ -60,6 +62,11 @@ public class BasicIndexingFilter impleme
} else {
u = new URL(urlString);
}
+
+ if (addDomain) {
+ doc.add("domain", URLUtil.getDomainName(u));
+ }
+
host = u.getHost();
} catch (MalformedURLException e) {
throw new IndexingException(e);
@@ -104,6 +111,7 @@ public class BasicIndexingFilter impleme
public void setConf(Configuration conf) {
this.conf = conf;
this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
+ this.addDomain = conf.getBoolean("indexer.add.domain", false);
this.MAX_CONTENT_LENGTH = conf.getInt("indexer.max.content.length", -1);
}