You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/06/12 13:26:28 UTC
svn commit: r1349262 - in /nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/indexer/IndexerMapReduce.java
src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
src/java/org/apache/nutch/net/URLNormalizers.java
Author: markus
Date: Tue Jun 12 11:26:28 2012
New Revision: 1349262
URL: http://svn.apache.org/viewvc?rev=1349262&view=rev
Log:
NUTCH-1300 Indexer to filter and normalize URLs
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1349262&r1=1349261&r2=1349262&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jun 12 11:26:28 2012
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk) Current Development:
+* NUTCH-1300 Indexer to filter and normalize URLs (markus)
+
* NUTCH-1330 WebGraph OutlinkDB to preserve back up (markus)
* NUTCH-1319 HostNormalizer plugin (markus)
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1349262&r1=1349261&r2=1349262&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Tue Jun 12 11:26:28 2012
@@ -41,6 +41,8 @@ import org.apache.nutch.crawl.LinkDb;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
@@ -56,11 +58,21 @@ implements Mapper<Text, Writable, Text,
public static final String INDEXER_DELETE = "indexer.delete";
public static final String INDEXER_SKIP_NOTMODIFIED = "indexer.skip.notmodified";
+ /** Boolean job property: when true, URLs are passed through the URL filters while indexing. */
+ public static final String URL_FILTERING = "indexer.url.filters";
+ /** Boolean job property: when true, URLs are passed through the URL normalizers while indexing. */
+ public static final String URL_NORMALIZING = "indexer.url.normalizers";
private boolean skip = false;
private boolean delete = false;
private IndexingFilters filters;
private ScoringFilters scfilters;
+
+ // whether URL normalizing and/or filtering is enabled; both are read from the job in configure()
+ private boolean normalize = false;
+ private boolean filter = false;
+
+ // URL normalizers and filters; each is only instantiated in configure() when its flag is set
+ private URLNormalizers urlNormalizers;
+ private URLFilters urlFilters;
public void configure(JobConf job) {
setConf(job);
@@ -68,10 +80,80 @@ implements Mapper<Text, Writable, Text,
this.scfilters = new ScoringFilters(getConf());
this.delete = job.getBoolean(INDEXER_DELETE, false);
this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false);
+
+ // URL normalizing and filtering are opt-in; both default to off
+ normalize = job.getBoolean(URL_NORMALIZING, false);
+ filter = job.getBoolean(URL_FILTERING, false);
+
+ if (normalize) {
+ // build the normalizers for the same scope that normalizeUrl() passes to
+ // normalize(), so the construction scope and the invocation scope agree
+ urlNormalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_INDEXER);
+ }
+
+ if (filter) {
+ urlFilters = new URLFilters(getConf());
+ }
+ }
+
+  /**
+   * Normalizes the given url with the URL normalizers (indexer scope) and
+   * trims surrounding whitespace from the result.
+   *
+   * @param url The url to normalize.
+   *
+   * @return The url unchanged when normalizing is disabled; otherwise the
+   *         normalized and trimmed url, or null when the url could not be
+   *         normalized and has to be skipped.
+   */
+  private String normalizeUrl(String url) {
+    if (!normalize) {
+      return url;
+    }
+
+    try {
+      String normalized = urlNormalizers.normalize(url,
+          URLNormalizers.SCOPE_INDEXER);
+      if (normalized == null) {
+        // handle a null result explicitly instead of letting trim() raise a
+        // NullPointerException that would be logged as the reason
+        LOG.warn("Skipping " + url + ": rejected by URL normalizers");
+        return null;
+      }
+      return normalized.trim();
+    } catch (Exception e) {
+      // best effort: a url that fails to normalize is dropped, not fatal
+      LOG.warn("Skipping " + url + ":" + e);
+      return null;
+    }
+  }
+
+  /**
+   * Passes the given url through the URL filters.
+   *
+   * @param url The url to filter.
+   *
+   * @return The url unchanged when filtering is disabled; otherwise whatever
+   *         the URL filters return for it, or null when filtering failed
+   *         with an exception.
+   */
+  private String filterUrl(String url) {
+    if (!filter) {
+      return url;
+    }
+
+    try {
+      return urlFilters.filter(url);
+    } catch (Exception e) {
+      // a failing filter still drops the url, but no longer silently
+      LOG.warn("Skipping " + url + ":" + e);
+      return null;
+    }
}
public void map(Text key, Writable value,
OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException {
+
+    // normalize first, then filter; a null result means the record is dropped
+    final String processedUrl = filterUrl(normalizeUrl(key.toString()));
+    if (processedUrl == null) {
+      return;
+    }
+
+    // emit the record under the (possibly rewritten) url
+    key.set(processedUrl);
output.collect(key, new NutchWritable(value));
}
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java?rev=1349262&r1=1349261&r2=1349262&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java Tue Jun 12 11:26:28 2012
@@ -72,6 +72,13 @@ public class SolrIndexer extends Configu
public void indexSolr(String solrUrl, Path crawlDb, Path linkDb,
List<Path> segments, boolean noCommit, boolean deleteGone, String solrParams) throws IOException {
+ // Delegates to the nine-argument overload with URL filtering and URL normalizing both disabled.
+ indexSolr(solrUrl, crawlDb, linkDb, segments, noCommit, deleteGone, solrParams, false, false);
+ }
+
+ public void indexSolr(String solrUrl, Path crawlDb, Path linkDb,
+ List<Path> segments, boolean noCommit, boolean deleteGone, String solrParams,
+ boolean filter, boolean normalize) throws IOException {
+
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
LOG.info("SolrIndexer: starting at " + sdf.format(start));
@@ -79,14 +86,16 @@ public class SolrIndexer extends Configu
final JobConf job = new NutchJob(getConf());
job.setJobName("index-solr " + solrUrl);
- if (deleteGone) {
- LOG.info("SolrIndexer: deleting gone documents");
- }
-
+ LOG.info("SolrIndexer: deleting gone documents: " + deleteGone);
+ LOG.info("SolrIndexer: URL filtering: " + filter);
+ LOG.info("SolrIndexer: URL normalizing: " + normalize);
+
IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);
job.set(SolrConstants.SERVER_URL, solrUrl);
job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
+ job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
+ job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
if (solrParams != null) {
job.set(SolrConstants.PARAMS, solrParams);
}
@@ -118,7 +127,7 @@ public class SolrIndexer extends Configu
public int run(String[] args) throws Exception {
if (args.length < 3) {
- System.err.println("Usage: SolrIndexer <solr url> <crawldb> [-linkdb <linkdb>] [-params k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone]");
+ System.err.println("Usage: SolrIndexer <solr url> <crawldb> [-linkdb <linkdb>] [-params k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone] [-filter] [-normalize]");
return -1;
}
@@ -130,6 +139,8 @@ public class SolrIndexer extends Configu
boolean noCommit = false;
boolean deleteGone = false;
+ boolean filter = false;
+ boolean normalize = false;
for (int i = 2; i < args.length; i++) {
if (args[i].equals("-linkdb")) {
@@ -148,6 +159,10 @@ public class SolrIndexer extends Configu
noCommit = true;
} else if (args[i].equals("-deleteGone")) {
deleteGone = true;
+ } else if (args[i].equals("-filter")) {
+ filter = true;
+ } else if (args[i].equals("-normalize")) {
+ normalize = true;
} else if (args[i].equals("-params")) {
params = args[++i];
} else {
@@ -156,7 +171,7 @@ public class SolrIndexer extends Configu
}
try {
- indexSolr(args[0], crawlDb, linkDb, segments, noCommit, deleteGone, params);
+ indexSolr(args[0], crawlDb, linkDb, segments, noCommit, deleteGone, params, filter, normalize);
return 0;
} catch (final Exception e) {
LOG.error("SolrIndexer: " + StringUtils.stringifyException(e));
Modified: nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java?rev=1349262&r1=1349261&r2=1349262&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java Tue Jun 12 11:26:28 2012
@@ -95,7 +95,8 @@ public final class URLNormalizers {
public static final String SCOPE_INJECT = "inject";
/** Scope used when constructing new {@link org.apache.nutch.parse.Outlink} instances. */
public static final String SCOPE_OUTLINK = "outlink";
-
+ /** Scope used when indexing URLs. */
+ public static final String SCOPE_INDEXER = "indexer";
public static final Logger LOG = LoggerFactory.getLogger(URLNormalizers.class);