You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:49:00 UTC
[44/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build
for nutch-core and nutch-plugins
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingFiltersChecker.java b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
new file mode 100644
index 0000000..2e1b9c2
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
@@ -0,0 +1,371 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.PrintWriter;
+import java.net.ServerSocket;
+import java.net.Socket;
+import java.net.InetSocketAddress;
+import java.nio.charset.Charset;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.crawl.SignatureFactory;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseSegment;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.StringUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Reads and parses a URL and runs the indexing filters on it. Displays the
+ * fields obtained and the first 100 characters of each value.
+ *
+ * Tested with e.g. ./nutch org.apache.nutch.indexer.IndexingFiltersChecker
+ * http://www.lemonde.fr
+ *
+ * @author Julien Nioche
+ **/
+
+public class IndexingFiltersChecker extends Configured implements Tool {
+
+ // URL normalizers applied to the URL before fetching; stays null unless
+ // -normalize is given on the command line
+ protected URLNormalizers normalizers = null;
+ // when true, field values are printed in full instead of being cut at 100
+ // characters
+ protected boolean dumpText = false;
+ // when true, fetch() follows protocol-level redirects (at most 3)
+ protected boolean followRedirects = false;
+ // listen mode only: keep serving requests on a client connection instead of
+ // closing it after the first one
+ protected boolean keepClientCnxOpen = false;
+ // used to simulate the metadata propagated from injection
+ protected HashMap<String, String> metadata = new HashMap<String, String>();
+ // TCP port for listen mode; -1 means "check the URL from the command line"
+ protected int tcpPort = -1;
+
+ public static final Logger LOG = LoggerFactory
+ .getLogger(IndexingFiltersChecker.class);
+
+ /** Creates a checker with default settings; options are set in run(String[]). */
+ public IndexingFiltersChecker() {
+
+ }
+
+ /**
+  * Parses the command line, then either checks the URL given as the last
+  * argument or, with {@code -listen}, serves requests on a TCP port.
+  *
+  * @param args
+  *          options, followed by the URL to check unless {@code -listen} is
+  *          used
+  * @return -1 on a usage error; otherwise the value returned by
+  *         {@link #fetch(String, StringBuilder)}, or 0 if the listener
+  *         returns
+  * @throws Exception
+  *           if fetching, parsing or indexing the URL fails
+  */
+ public int run(String[] args) throws Exception {
+   String url = null;
+   String usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] [-dumpText] [-md key=value] [-listen <port>] [-keepClientCnxOpen]";
+
+   if (args.length == 0) {
+     System.err.println(usage);
+     return -1;
+   }
+
+   for (int i = 0; i < args.length; i++) {
+     String arg = args[i];
+
+     // -listen and -md consume the next argument; fail cleanly if it is
+     // missing instead of indexing past the end of the array
+     boolean takesValue = arg.equals("-listen") || arg.equals("-md");
+     if (takesValue && i == args.length - 1) {
+       System.err.println("Missing value for option " + arg);
+       System.err.println(usage);
+       return -1;
+     }
+
+     if (arg.equals("-normalize")) {
+       normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
+     } else if (arg.equals("-listen")) {
+       try {
+         tcpPort = Integer.parseInt(args[++i]);
+       } catch (NumberFormatException e) {
+         System.err.println("Invalid port: " + args[i]);
+         System.err.println(usage);
+         return -1;
+       }
+     } else if (arg.equals("-followRedirects")) {
+       followRedirects = true;
+     } else if (arg.equals("-keepClientCnxOpen")) {
+       keepClientCnxOpen = true;
+     } else if (arg.equals("-dumpText")) {
+       dumpText = true;
+     } else if (arg.equals("-md")) {
+       // key=value; a key without '=' is stored with a null value
+       String pair = args[++i];
+       int firstEquals = pair.indexOf("=");
+       if (firstEquals != -1) {
+         metadata.put(pair.substring(0, firstEquals),
+             pair.substring(firstEquals + 1));
+       } else {
+         metadata.put(pair, null);
+       }
+     } else if (i != args.length - 1) {
+       // a non-option argument is only accepted as the trailing URL
+       System.err.println(usage);
+       return -1;
+     } else {
+       url = arg;
+     }
+   }
+
+   // In listening mode?
+   if (tcpPort == -1) {
+     if (url == null) {
+       // only options were given, so there is nothing to fetch
+       System.err.println("No URL given");
+       System.err.println(usage);
+       return -1;
+     }
+     // No, just fetch and display
+     StringBuilder output = new StringBuilder();
+     int ret = fetch(url, output);
+     System.out.println(output);
+     return ret;
+   }
+
+   // Listen on socket and start workers on incoming requests
+   listen();
+   return 0;
+ }
+
+ /**
+  * Binds a server socket to {@link #tcpPort} and hands every accepted
+  * connection to a new {@code Worker} thread. The accept loop never ends
+  * normally; the JVM is terminated with status -1 if the port cannot be
+  * bound or if accepting a connection fails.
+  *
+  * @throws Exception
+  *           declared by the signature; the failures handled here are logged
+  *           and end the JVM instead of being thrown
+  */
+ protected void listen() throws Exception {
+   ServerSocket server = null;
+
+   try {
+     server = new ServerSocket();
+     server.bind(new InetSocketAddress(tcpPort));
+     LOG.info(server.toString());
+   } catch (Exception e) {
+     // pass the exception along so the reason (port in use, permission
+     // denied, ...) ends up in the log
+     LOG.error("Could not listen on port " + tcpPort, e);
+     System.exit(-1);
+   }
+
+   while (true) {
+     try {
+       Worker worker = new Worker(server.accept());
+       new Thread(worker).start();
+     } catch (Exception e) {
+       LOG.error("Accept failed: " + tcpPort, e);
+       System.exit(-1);
+     }
+   }
+ }
+
+ /**
+  * Serves one client connection. Every line received is treated as a URL,
+  * checked with fetch(), and the resulting report is written back to the
+  * client as UTF-8. The socket is closed when the worker finishes.
+  */
+ private class Worker implements Runnable {
+   private Socket client;
+   // created once per connection so that input already buffered by the
+   // reader is not lost between two requests
+   private BufferedReader in;
+
+   Worker(Socket client) {
+     this.client = client;
+     LOG.info(client.toString());
+   }
+
+   /**
+    * Handles a single request, or with -keepClientCnxOpen keeps handling
+    * requests until the client disconnects or the socket fails.
+    */
+   public void run() {
+     try {
+       if (keepClientCnxOpen) {
+         while (readWrite()) {
+           continue; // keep connection open until the client closes it
+         }
+       } else {
+         readWrite();
+       }
+     } finally {
+       try { // close ourselves
+         client.close();
+       } catch (Exception e) {
+         LOG.error(e.toString());
+       }
+     }
+   }
+
+   /**
+    * Reads one URL from the client, checks it and writes the report back.
+    *
+    * @return true if the connection can serve another request; false once
+    *         the client has closed the stream or the socket failed
+    */
+   protected boolean readWrite() {
+     String line;
+     try {
+       if (in == null) {
+         // decode with the same charset that is used for the reply
+         in = new BufferedReader(new InputStreamReader(
+             client.getInputStream(), Charset.forName("UTF-8")));
+       }
+       line = in.readLine();
+     } catch (Exception e) {
+       // a broken connection ends this worker only, not the whole server
+       LOG.error("Read/Write failed: " + e);
+       return false;
+     }
+
+     if (line == null) {
+       // end of stream: the client closed the connection
+       return false;
+     }
+
+     StringBuilder output = new StringBuilder();
+     try {
+       fetch(line, output);
+     } catch (Exception e) {
+       // a URL that cannot be checked is logged, nothing is sent back, and
+       // the connection stays usable for the next request
+       LOG.error("Read/Write failed: " + e);
+       return true;
+     }
+
+     try {
+       client.getOutputStream().write(
+           output.toString().getBytes(Charset.forName("UTF-8")));
+     } catch (Exception e) {
+       LOG.error("Read/Write failed: " + e);
+       return false;
+     }
+     return true;
+   }
+ }
+
+
+ /**
+  * Fetches and parses a URL, runs the scoring and indexing filters on the
+  * result and appends a report of the resulting document fields to
+  * {@code output}. If the configuration property {@code doIndex} is true,
+  * the document is also sent to the configured index writers.
+  *
+  * @param url
+  *          URL to check; normalized first when normalizers are configured
+  * @param output
+  *          buffer that receives the field report or a failure message
+  * @return -1 if the content type is unknown or the parse result holds no
+  *         parse for the URL; 0 otherwise, including failed fetches and
+  *         discarded documents, which are only reported in {@code output}
+  * @throws Exception
+  *           if fetching, parsing or writing to the index fails
+  */
+ protected int fetch(String url, StringBuilder output) throws Exception {
+   if (normalizers != null) {
+     url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
+   }
+
+   LOG.info("fetching: " + url);
+
+   CrawlDatum datum = new CrawlDatum();
+
+   // copy the -md metadata into the datum; a key given without a value is
+   // stored with an empty string
+   for (Map.Entry<String, String> md : metadata.entrySet()) {
+     String value = md.getValue();
+     if (value == null) {
+       value = "";
+     }
+     datum.getMetaData().put(new Text(md.getKey()), new Text(value));
+   }
+
+   IndexingFilters indexers = new IndexingFilters(getConf());
+
+   int maxRedirects = 3;
+
+   ProtocolOutput protocolOutput = getProtocolOutput(url, datum);
+   Text turl = new Text(url);
+
+   // Following redirects and not reached maxRedirects?
+   while (!protocolOutput.getStatus().isSuccess() && followRedirects
+       && protocolOutput.getStatus().isRedirect() && maxRedirects != 0) {
+     // the first status argument is used as the redirect target
+     String[] stuff = protocolOutput.getStatus().getArgs();
+     url = stuff[0];
+
+     if (normalizers != null) {
+       url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
+     }
+
+     turl.set(url);
+
+     // try again
+     protocolOutput = getProtocolOutput(url, datum);
+     maxRedirects--;
+   }
+
+   if (!protocolOutput.getStatus().isSuccess()) {
+     output.append("Fetch failed with protocol status: "
+         + protocolOutput.getStatus() + "\n");
+     return 0;
+   }
+
+   Content content = protocolOutput.getContent();
+
+   if (content == null) {
+     output.append("No content for " + url + "\n");
+     return 0;
+   }
+
+   String contentType = content.getContentType();
+
+   if (contentType == null) {
+     // report the reason instead of failing without any output
+     output.append("No content type for " + url + "\n");
+     return -1;
+   }
+
+   // store the guessed content type in the crawldatum
+   datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE),
+       new Text(contentType));
+
+   if (ParseSegment.isTruncated(content)) {
+     LOG.warn("Content is truncated, parse may fail!");
+   }
+
+   ScoringFilters scfilters = new ScoringFilters(getConf());
+   // call the scoring filters
+   try {
+     scfilters.passScoreBeforeParsing(turl, datum, content);
+   } catch (Exception e) {
+     LOG.warn("Couldn't pass score, url {} ({})", url, e);
+   }
+
+   LOG.info("parsing: {}", url);
+   LOG.info("contentType: {}", contentType);
+
+   ParseResult parseResult = new ParseUtil(getConf()).parse(content);
+
+   NutchDocument doc = new NutchDocument();
+   doc.add("id", url);
+   Text urlText = new Text(url);
+
+   // no link db is consulted here, so the filters get no inlinks
+   Inlinks inlinks = null;
+   Parse parse = parseResult.get(urlText);
+   if (parse == null) {
+     LOG.error("Failed to get parse from parse result");
+     LOG.error("Available parses in parse result (by URL key):");
+     for (Map.Entry<Text, Parse> entry : parseResult) {
+       LOG.error(" " + entry.getKey());
+     }
+     LOG.error("Parse result does not contain a parse for URL to be checked:");
+     LOG.error(" " + urlText);
+     return -1;
+   }
+
+   byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
+       content, parse);
+   parse.getData().getContentMeta()
+       .set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
+   String digest = parse.getData().getContentMeta().get(Nutch.SIGNATURE_KEY);
+   doc.add("digest", digest);
+   datum.setSignature(signature);
+
+   // call the scoring filters
+   try {
+     scfilters.passScoreAfterParsing(turl, content, parseResult.get(turl));
+   } catch (Exception e) {
+     LOG.warn("Couldn't pass score, url {} ({})", turl, e);
+   }
+
+   try {
+     doc = indexers.filter(doc, parse, urlText, datum, inlinks);
+   } catch (IndexingException e) {
+     // doc keeps its pre-filter content, which is reported below
+     LOG.error("Indexing filters failed for " + url, e);
+   }
+
+   if (doc == null) {
+     output.append("Document discarded by indexing filter\n");
+     return 0;
+   }
+
+   for (String fname : doc.getFieldNames()) {
+     List<Object> values = doc.getField(fname).getValues();
+     if (values != null) {
+       for (Object value : values) {
+         String str = value.toString();
+         // show at most 100 characters per value unless -dumpText is set
+         int minText = dumpText ? str.length() : Math.min(100, str.length());
+         output.append(fname + " :\t" + str.substring(0, minText) + "\n");
+       }
+     }
+   }
+
+   output.append("\n"); // For readability if keepClientCnxOpen
+
+   if (getConf().getBoolean("doIndex", false)) {
+     IndexWriters writers = new IndexWriters(getConf());
+     writers.open(new JobConf(getConf()), "IndexingFilterChecker");
+     try {
+       writers.write(doc);
+     } finally {
+       // release the writers even when the write fails
+       writers.close();
+     }
+   }
+
+   return 0;
+ }
+
+ /**
+  * Fetches a URL with the protocol implementation that the
+  * {@link ProtocolFactory} selects for it.
+  *
+  * @param url
+  *          URL to fetch
+  * @param datum
+  *          crawl datum handed to the protocol along with the URL
+  * @return the protocol output produced for the URL
+  * @throws Exception
+  *           if the protocol lookup or the fetch fails
+  */
+ protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum) throws Exception {
+   Protocol protocol = new ProtocolFactory(getConf()).getProtocol(url);
+   return protocol.getProtocolOutput(new Text(url), datum);
+ }
+
+ /**
+  * Command-line entry point: runs the checker through {@link ToolRunner}
+  * with a fresh Nutch configuration and exits with its return code.
+  */
+ public static void main(String[] args) throws Exception {
+   System.exit(ToolRunner.run(NutchConfiguration.create(),
+       new IndexingFiltersChecker(), args));
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingJob.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingJob.java b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingJob.java
new file mode 100644
index 0000000..342ea4a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingJob.java
@@ -0,0 +1,358 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import java.io.File;
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Random;
+
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.segment.SegmentChecker;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Counters.Counter;
+import org.apache.hadoop.mapred.RunningJob;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.util.HadoopFSUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
+import org.apache.nutch.util.TimingUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Generic indexer which relies on the plugins implementing IndexWriter
+ **/
+
+public class IndexingJob extends NutchTool implements Tool {
+
+ // NOTE(review): public and non-final, so other classes are able to reassign
+ // this logger
+ public static Logger LOG = LoggerFactory.getLogger(IndexingJob.class);
+
+ /** Creates a job without a configuration (null is passed to the superclass). */
+ public IndexingJob() {
+ super(null);
+ }
+
+ /**
+  * Creates a job with the given configuration.
+  *
+  * @param conf
+  *          configuration handed to the superclass constructor
+  */
+ public IndexingJob(Configuration conf) {
+ super(conf);
+ }
+
+ /**
+  * Indexes the segments without deleting gone documents and without extra
+  * index writer parameters.
+  */
+ public void index(Path crawlDb, Path linkDb, List<Path> segments,
+     boolean noCommit) throws IOException {
+   final boolean deleteGone = false;
+   final String params = null;
+   index(crawlDb, linkDb, segments, noCommit, deleteGone, params);
+ }
+
+ /** Indexes the segments without extra index writer parameters. */
+ public void index(Path crawlDb, Path linkDb, List<Path> segments,
+     boolean noCommit, boolean deleteGone) throws IOException {
+   final String params = null;
+   index(crawlDb, linkDb, segments, noCommit, deleteGone, params);
+ }
+
+ /** Indexes the segments with URL filtering and URL normalizing disabled. */
+ public void index(Path crawlDb, Path linkDb, List<Path> segments,
+     boolean noCommit, boolean deleteGone, String params) throws IOException {
+   final boolean filter = false;
+   final boolean normalize = false;
+   index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
+       normalize);
+ }
+
+ /**
+  * Indexes the segments without adding binary content.
+  *
+  * @param filter
+  *          whether URL filtering is enabled for the job
+  * @param normalize
+  *          whether URL normalizing is enabled for the job
+  * @throws IOException
+  *           if the indexing job fails
+  */
+ public void index(Path crawlDb, Path linkDb, List<Path> segments,
+     boolean noCommit, boolean deleteGone, String params, boolean filter,
+     boolean normalize) throws IOException {
+   // hand filter and normalize through (they used to be replaced by false)
+   // and call the full implementation directly so the flags cannot be lost
+   // in an intermediate overload
+   index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
+       normalize, false, false);
+ }
+
+ /**
+  * Indexes the segments; binary content, when added, is not Base64 encoded.
+  *
+  * @param filter
+  *          whether URL filtering is enabled for the job
+  * @param normalize
+  *          whether URL normalizing is enabled for the job
+  * @param addBinaryContent
+  *          whether binary content is added to the index
+  * @throws IOException
+  *           if the indexing job fails
+  */
+ public void index(Path crawlDb, Path linkDb, List<Path> segments,
+     boolean noCommit, boolean deleteGone, String params,
+     boolean filter, boolean normalize, boolean addBinaryContent) throws IOException {
+   // pass the caller's flags on; all three used to be replaced by false
+   index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
+       normalize, addBinaryContent, false);
+ }
+
+ /**
+  * Configures and runs the "Indexer" job over the given crawl db, link db
+  * and segments, commits through the index writers unless told not to, and
+  * logs the job's "IndexerStatus" counters. The temporary job output
+  * directory is deleted afterwards, whether or not the job succeeded.
+  *
+  * @param crawlDb
+  *          crawl db passed to the job setup
+  * @param linkDb
+  *          link db passed to the job setup
+  * @param segments
+  *          segments passed to the job setup
+  * @param noCommit
+  *          if true, the index writers are not opened and committed
+  * @param deleteGone
+  *          value stored under IndexerMapReduce.INDEXER_DELETE
+  * @param params
+  *          value stored under IndexerMapReduce.INDEXER_PARAMS, ignored when
+  *          null
+  * @param filter
+  *          value stored under IndexerMapReduce.URL_FILTERING
+  * @param normalize
+  *          value stored under IndexerMapReduce.URL_NORMALIZING
+  * @param addBinaryContent
+  *          passed to the job setup
+  * @param base64
+  *          value stored under IndexerMapReduce.INDEXER_BINARY_AS_BASE64
+  * @throws IOException
+  *           if the job or the cleanup fails
+  */
+ public void index(Path crawlDb, Path linkDb, List<Path> segments,
+     boolean noCommit, boolean deleteGone, String params,
+     boolean filter, boolean normalize, boolean addBinaryContent,
+     boolean base64) throws IOException {
+   SimpleDateFormat timestampFormat = new SimpleDateFormat(
+       "yyyy-MM-dd HH:mm:ss");
+   long startedAt = System.currentTimeMillis();
+   LOG.info("Indexer: starting at {}", timestampFormat.format(startedAt));
+
+   final JobConf job = new NutchJob(getConf());
+   job.setJobName("Indexer");
+
+   LOG.info("Indexer: deleting gone documents: {}", deleteGone);
+   LOG.info("Indexer: URL filtering: {}", filter);
+   LOG.info("Indexer: URL normalizing: {}", normalize);
+   if (addBinaryContent) {
+     LOG.info(base64 ? "Indexer: adding binary content as Base64"
+         : "Indexer: adding binary content");
+   }
+   IndexWriters writers = new IndexWriters(getConf());
+   LOG.info(writers.describe());
+
+   IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job,
+       addBinaryContent);
+
+   // NOW PASSED ON THE COMMAND LINE AS A HADOOP PARAM
+   // job.set(SolrConstants.SERVER_URL, solrUrl);
+
+   job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
+   job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
+   job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
+   job.setBoolean(IndexerMapReduce.INDEXER_BINARY_AS_BASE64, base64);
+
+   if (params != null) {
+     job.set(IndexerMapReduce.INDEXER_PARAMS, params);
+   }
+
+   job.setReduceSpeculativeExecution(false);
+
+   // throw-away output directory with a time stamp and random suffix
+   String tmpName = "tmp_" + System.currentTimeMillis() + "-"
+       + new Random().nextInt();
+   final Path tmpOutput = new Path(tmpName);
+
+   FileOutputFormat.setOutputPath(job, tmpOutput);
+   try {
+     RunningJob finishedJob = JobClient.runJob(job);
+     // do the commits once and for all the reducers in one go
+     if (!noCommit) {
+       writers.open(job, "commit");
+       writers.commit();
+     }
+     LOG.info("Indexer: number of documents indexed, deleted, or skipped:");
+     for (Counter status : finishedJob.getCounters().getGroup("IndexerStatus")) {
+       String paddedCount = String.format(Locale.ROOT, "%6d",
+           status.getValue());
+       LOG.info("Indexer: {} {}", paddedCount, status.getName());
+     }
+     long finishedAt = System.currentTimeMillis();
+     LOG.info("Indexer: finished at " + timestampFormat.format(finishedAt) + ", elapsed: "
+         + TimingUtil.elapsedTime(startedAt, finishedAt));
+   } finally {
+     FileSystem.get(job).delete(tmpOutput, true);
+   }
+ }
+
+ /**
+  * Command-line entry point of the indexer. The first argument is the crawl
+  * db; every following argument is either one of the options listed in the
+  * usage message or a segment path. Segments are only used when
+  * {@code SegmentChecker.isIndexable} accepts them.
+  *
+  * @param args
+  *          command-line arguments, at least two
+  * @return 0 if indexing succeeded, -1 on a usage error or if indexing
+  *         failed
+  * @throws Exception
+  *           if a segment directory cannot be inspected
+  */
+ public int run(String[] args) throws Exception {
+   String usage = "Usage: Indexer <crawldb> [-linkdb <linkdb>] [-params k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone] [-filter] [-normalize] [-addBinaryContent] [-base64]";
+   if (args.length < 2) {
+     System.err.println(usage);
+     IndexWriters writers = new IndexWriters(getConf());
+     System.err.println(writers.describe());
+     return -1;
+   }
+
+   final Path crawlDb = new Path(args[0]);
+   Path linkDb = null;
+
+   final List<Path> segments = new ArrayList<Path>();
+   String params = null;
+
+   boolean noCommit = false;
+   boolean deleteGone = false;
+   boolean filter = false;
+   boolean normalize = false;
+   boolean addBinaryContent = false;
+   boolean base64 = false;
+
+   for (int i = 1; i < args.length; i++) {
+     String arg = args[i];
+
+     // these options consume the next argument; report a usage error when
+     // it is missing instead of indexing past the end of the array
+     boolean takesValue = arg.equals("-linkdb") || arg.equals("-dir")
+         || arg.equals("-params");
+     if (takesValue && i == args.length - 1) {
+       System.err.println("Missing value for option " + arg);
+       System.err.println(usage);
+       return -1;
+     }
+
+     if (arg.equals("-linkdb")) {
+       linkDb = new Path(args[++i]);
+     } else if (arg.equals("-dir")) {
+       // add every indexable sub-directory of the given directory
+       Path dir = new Path(args[++i]);
+       FileSystem fs = dir.getFileSystem(getConf());
+       FileStatus[] fstats = fs.listStatus(dir,
+           HadoopFSUtil.getPassDirectoriesFilter(fs));
+       Path[] files = HadoopFSUtil.getPaths(fstats);
+       for (Path p : files) {
+         if (SegmentChecker.isIndexable(p, fs)) {
+           segments.add(p);
+         }
+       }
+     } else if (arg.equals("-noCommit")) {
+       noCommit = true;
+     } else if (arg.equals("-deleteGone")) {
+       deleteGone = true;
+     } else if (arg.equals("-filter")) {
+       filter = true;
+     } else if (arg.equals("-normalize")) {
+       normalize = true;
+     } else if (arg.equals("-addBinaryContent")) {
+       addBinaryContent = true;
+     } else if (arg.equals("-base64")) {
+       base64 = true;
+     } else if (arg.equals("-params")) {
+       params = args[++i];
+     } else {
+       // anything else is taken as the path of a single segment
+       Path segment = new Path(arg);
+       FileSystem fs = segment.getFileSystem(getConf());
+       if (SegmentChecker.isIndexable(segment, fs)) {
+         segments.add(segment);
+       }
+     }
+   }
+
+   try {
+     index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
+         normalize, addBinaryContent, base64);
+     return 0;
+   } catch (final Exception e) {
+     LOG.error("Indexer: {}", StringUtils.stringifyException(e));
+     return -1;
+   }
+ }
+
+ /**
+  * Command-line entry point: runs the indexing job through
+  * {@link ToolRunner} with a fresh Nutch configuration and exits with its
+  * return code.
+  */
+ public static void main(String[] args) throws Exception {
+   System.exit(ToolRunner.run(NutchConfiguration.create(),
+       new IndexingJob(), args));
+ }
+
+
+ //Used for REST API
+ @Override
+ public Map<String, Object> run(Map<String, Object> args, String crawlId) throws Exception {
+ boolean noCommit = false;
+ boolean deleteGone = false;
+ boolean filter = false;
+ boolean normalize = false;
+ boolean isSegment = false;
+ String params= null;
+ Configuration conf = getConf();
+
+ Path crawlDb;
+ if(args.containsKey(Nutch.ARG_CRAWLDB)) {
+ Object crawldbPath = args.get(Nutch.ARG_CRAWLDB);
+ if(crawldbPath instanceof Path) {
+ crawlDb = (Path) crawldbPath;
+ }
+ else {
+ crawlDb = new Path(crawldbPath.toString());
+ }
+ }
+ else {
+ crawlDb = new Path(crawlId+"/crawldb");
+ }
+
+ Path linkdb = null;
+ List<Path> segments = new ArrayList<Path>();
+
+ if(args.containsKey(Nutch.ARG_LINKDB)){
+ if(args.containsKey(Nutch.ARG_LINKDB)) {
+ Object path = args.get(Nutch.ARG_LINKDB);
+ if(path instanceof Path) {
+ linkdb = (Path) path;
+ }
+ else {
+ linkdb = new Path(path.toString());
+ }
+ }
+ else {
+ linkdb = new Path(crawlId+"/linkdb");
+ }
+ }
+
+ if(args.containsKey(Nutch.ARG_SEGMENTDIR)){
+ isSegment = true;
+ Path segmentsDir;
+ Object segDir = args.get(Nutch.ARG_SEGMENTDIR);
+ if(segDir instanceof Path) {
+ segmentsDir = (Path) segDir;
+ }
+ else {
+ segmentsDir = new Path(segDir.toString());
+ }
+ FileSystem fs = segmentsDir.getFileSystem(getConf());
+ FileStatus[] fstats = fs.listStatus(segmentsDir,
+ HadoopFSUtil.getPassDirectoriesFilter(fs));
+ Path[] files = HadoopFSUtil.getPaths(fstats);
+ for (Path p : files) {
+ if (SegmentChecker.isIndexable(p,fs)) {
+ segments.add(p);
+ }
+ }
+ }
+
+ if(args.containsKey(Nutch.ARG_SEGMENT)){
+ isSegment = true;
+ Object seg = args.get(Nutch.ARG_SEGMENT);
+ ArrayList<String> segmentList = new ArrayList<String>();
+ if(seg instanceof ArrayList) {
+ segmentList = (ArrayList<String>)seg;
+ }
+ for(String segment: segmentList) {
+ segments.add(new Path(segment));
+ }
+ }
+
+ if(!isSegment){
+ String segment_dir = crawlId+"/segments";
+ File segmentsDir = new File(segment_dir);
+ File[] segmentsList = segmentsDir.listFiles();
+ Arrays.sort(segmentsList, new Comparator<File>(){
+ @Override
+ public int compare(File f1, File f2) {
+ if(f1.lastModified()>f2.lastModified())
+ return -1;
+ else
+ return 0;
+ }
+ });
+ Path segment = new Path(segmentsList[0].getPath());
+ segments.add(segment);
+ }
+
+ if(args.containsKey("noCommit")){
+ noCommit = true;
+ }
+ if(args.containsKey("deleteGone")){
+ deleteGone = true;
+ }
+ if(args.containsKey("normalize")){
+ normalize = true;
+ }
+ if(args.containsKey("filter")){
+ filter = true;
+ }
+ if(args.containsKey("params")){
+ params = (String)args.get("params");
+ }
+ setConf(conf);
+ index(crawlDb, linkdb, segments, noCommit, deleteGone, params, filter,
+ normalize);
+ Map<String, Object> results = new HashMap<String, Object>();
+ results.put(Nutch.VAL_RESULT, 0);
+ return results;
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/NutchDocument.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/NutchDocument.java b/nutch-core/src/main/java/org/apache/nutch/indexer/NutchDocument.java
new file mode 100644
index 0000000..efdde02
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/NutchDocument.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.VersionMismatchException;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableUtils;
+import org.apache.nutch.metadata.Metadata;
+
+/** A {@link NutchDocument} is the unit of indexing. */
+public class NutchDocument implements Writable,
+ Iterable<Entry<String, NutchField>> {
+
+ public static final byte VERSION = 2;
+
+ private Map<String, NutchField> fields;
+
+ private Metadata documentMeta;
+
+ private float weight;
+
+ public NutchDocument() {
+ fields = new HashMap<String, NutchField>();
+ documentMeta = new Metadata();
+ weight = 1.0f;
+ }
+
+ public void add(String name, Object value) {
+ NutchField field = fields.get(name);
+ if (field == null) {
+ field = new NutchField(value);
+ fields.put(name, field);
+ } else {
+ field.add(value);
+ }
+ }
+
+ public Object getFieldValue(String name) {
+ NutchField field = fields.get(name);
+ if (field == null) {
+ return null;
+ }
+ if (field.getValues().size() == 0) {
+ return null;
+ }
+ return field.getValues().get(0);
+ }
+
+ public NutchField getField(String name) {
+ return fields.get(name);
+ }
+
+ public NutchField removeField(String name) {
+ return fields.remove(name);
+ }
+
+ public Collection<String> getFieldNames() {
+ return fields.keySet();
+ }
+
+ /** Iterate over all fields. */
+ public Iterator<Entry<String, NutchField>> iterator() {
+ return fields.entrySet().iterator();
+ }
+
+ public float getWeight() {
+ return weight;
+ }
+
+ public void setWeight(float weight) {
+ this.weight = weight;
+ }
+
+ public Metadata getDocumentMeta() {
+ return documentMeta;
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ fields.clear();
+ byte version = in.readByte();
+ if (version != VERSION) {
+ throw new VersionMismatchException(VERSION, version);
+ }
+ int size = WritableUtils.readVInt(in);
+ for (int i = 0; i < size; i++) {
+ String name = Text.readString(in);
+ NutchField field = new NutchField();
+ field.readFields(in);
+ fields.put(name, field);
+ }
+ weight = in.readFloat();
+ documentMeta.readFields(in);
+ }
+
+ public void write(DataOutput out) throws IOException {
+ out.writeByte(VERSION);
+ WritableUtils.writeVInt(out, fields.size());
+ for (Map.Entry<String, NutchField> entry : fields.entrySet()) {
+ Text.writeString(out, entry.getKey());
+ NutchField field = entry.getValue();
+ field.write(out);
+ }
+ out.writeFloat(weight);
+ documentMeta.write(out);
+ }
+
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("doc {\n");
+ for (Map.Entry<String, NutchField> entry : fields.entrySet()) {
+ sb.append("\t");
+ sb.append(entry.getKey());
+ sb.append(":\t");
+ sb.append(entry.getValue());
+ sb.append("\n");
+ }
+ sb.append("}\n");
+ return sb.toString();
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/NutchField.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/NutchField.java b/nutch-core/src/main/java/org/apache/nutch/indexer/NutchField.java
new file mode 100644
index 0000000..33911e1
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/NutchField.java
@@ -0,0 +1,137 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Date;
+import java.util.List;
+
+import org.apache.hadoop.io.*;
+
+/**
+ * This class represents a multi-valued field with a weight. Values are
+ * arbitrary objects.
+ */
+public class NutchField implements Writable, Cloneable {
+  // Type tags written in front of every value by write() and matched by
+  // readFields(); both methods must agree on them.
+  private static final String TYPE_STRING = "java.lang.String";
+  private static final String TYPE_BOOLEAN = "java.lang.Boolean";
+  private static final String TYPE_INTEGER = "java.lang.Integer";
+  private static final String TYPE_FLOAT = "java.lang.Float";
+  private static final String TYPE_LONG = "java.lang.Long";
+  private static final String TYPE_DATE = "java.util.Date";
+
+  private float weight;
+  private List<Object> values = new ArrayList<Object>();
+
+  /** Creates an empty field; the weight keeps the float default of 0. */
+  public NutchField() {
+  }
+
+  /** Creates a field with the given value(s) and a weight of 1.0. */
+  public NutchField(Object value) {
+    this(value, 1.0f);
+  }
+
+  /**
+   * Creates a field with the given weight.
+   *
+   * @param value
+   *          a single value, or a Collection whose elements all become
+   *          values of this field
+   * @param weight
+   *          weight of the field
+   */
+  public NutchField(Object value, float weight) {
+    this.weight = weight;
+    if (value instanceof Collection) {
+      values.addAll((Collection<?>) value);
+    } else {
+      values.add(value);
+    }
+  }
+
+  /** Appends one value; a Collection is added as a single value here. */
+  public void add(Object value) {
+    values.add(value);
+  }
+
+  public float getWeight() {
+    return weight;
+  }
+
+  public void setWeight(float weight) {
+    this.weight = weight;
+  }
+
+  /** Returns the live list of values, not a copy. */
+  public List<Object> getValues() {
+    return values;
+  }
+
+  /** Removes all values and sets the weight back to 1.0. */
+  public void reset() {
+    weight = 1.0f;
+    values.clear();
+  }
+
+  /**
+   * Returns a copy with the same weight and its own list of values. The
+   * values themselves are not copied.
+   */
+  @Override
+  public Object clone() throws CloneNotSupportedException {
+    // The class now implements Cloneable, so super.clone() no longer throws;
+    // it also copies the weight.
+    NutchField result = (NutchField) super.clone();
+    // give the copy its own list so that add()/reset() on one instance do
+    // not change the other
+    result.values = new ArrayList<Object>(values);
+    return result;
+  }
+
+  /**
+   * Reads weight, value count and the tagged values. Entries with an
+   * unknown type tag carry no payload and are skipped.
+   */
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    weight = in.readFloat();
+    int count = in.readInt();
+    values = new ArrayList<Object>();
+    for (int i = 0; i < count; i++) {
+      String type = Text.readString(in);
+
+      if (type.equals(TYPE_STRING)) {
+        values.add(Text.readString(in));
+      } else if (type.equals(TYPE_BOOLEAN)) {
+        values.add(in.readBoolean());
+      } else if (type.equals(TYPE_INTEGER)) {
+        values.add(in.readInt());
+      } else if (type.equals(TYPE_FLOAT)) {
+        values.add(in.readFloat());
+      } else if (type.equals(TYPE_LONG)) {
+        values.add(in.readLong());
+      } else if (type.equals(TYPE_DATE)) {
+        values.add(new Date(in.readLong()));
+      }
+    }
+  }
+
+  /**
+   * Writes weight, value count, and for every value a type tag followed by
+   * its payload. Values of an unsupported type are written as a tag without
+   * payload, so they are dropped when the field is read back.
+   */
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeFloat(weight);
+    out.writeInt(values.size());
+    for (Object value : values) {
+      if (value instanceof Boolean) {
+        Text.writeString(out, TYPE_BOOLEAN);
+        out.writeBoolean((Boolean) value);
+      } else if (value instanceof Integer) {
+        Text.writeString(out, TYPE_INTEGER);
+        out.writeInt((Integer) value);
+      } else if (value instanceof Long) {
+        Text.writeString(out, TYPE_LONG);
+        out.writeLong((Long) value);
+      } else if (value instanceof Float) {
+        Text.writeString(out, TYPE_FLOAT);
+        out.writeFloat((Float) value);
+      } else if (value instanceof String) {
+        Text.writeString(out, TYPE_STRING);
+        Text.writeString(out, (String) value);
+      } else if (value instanceof Date) {
+        // Always tag as java.util.Date: a subclass such as
+        // java.sql.Timestamp would otherwise be tagged with its own class
+        // name, readFields() would not consume the long written below, and
+        // the rest of the stream would be misread.
+        Text.writeString(out, TYPE_DATE);
+        out.writeLong(((Date) value).getTime());
+      } else {
+        // unsupported type: tag only, no payload
+        Text.writeString(out, value.getClass().getName());
+      }
+    }
+  }
+
+  /** Returns the string form of the list of values. */
+  public String toString() {
+    return values.toString();
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/NutchIndexAction.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/NutchIndexAction.java b/nutch-core/src/main/java/org/apache/nutch/indexer/NutchIndexAction.java
new file mode 100644
index 0000000..b2517c3
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/NutchIndexAction.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+
+import org.apache.nutch.indexer.NutchDocument;
+
+/**
+ * The unit of indexing: a {@link NutchDocument} together with the action
+ * ({@link #ADD}, {@link #DELETE} or {@link #UPDATE}) recorded for it.
+ */
+public class NutchIndexAction implements Writable {
+
+  /** Action code for an add. */
+  public static final byte ADD = 0;
+  /** Action code for a delete. */
+  public static final byte DELETE = 1;
+  /** Action code for an update. */
+  public static final byte UPDATE = 2;
+
+  /** The document this action refers to. */
+  public NutchDocument doc;
+  /** One of the action codes declared above; {@link #ADD} unless set. */
+  public byte action = ADD;
+
+  /** Creates an action without a document; the action code is {@link #ADD}. */
+  protected NutchIndexAction() {
+  }
+
+  /**
+   * Creates an action for the given document.
+   *
+   * @param doc
+   *          the document
+   * @param action
+   *          the action code
+   */
+  public NutchIndexAction(NutchDocument doc, byte action) {
+    this.action = action;
+    this.doc = doc;
+  }
+
+  /**
+   * Reads the action code as a single byte, then a fresh document.
+   */
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    this.action = in.readByte();
+    this.doc = new NutchDocument();
+    this.doc.readFields(in);
+  }
+
+  /**
+   * Writes the action code followed by the document.
+   */
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.write(this.action);
+    this.doc.write(out);
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/package.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/package.html b/nutch-core/src/main/java/org/apache/nutch/indexer/package.html
new file mode 100644
index 0000000..825eaae
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/package.html
@@ -0,0 +1,10 @@
+<html>
+<body>
+Index content, configure and run indexing and cleaning jobs to
+add, update, and delete documents from an index. Two tasks are
+delegated to plugins:
+<ul>
+<li>indexing filters fill index fields of each document</li>
+<li>index writer plugins send documents to index back-ends (Solr, etc.).</li>
+</ul>
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/CreativeCommons.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/CreativeCommons.java b/nutch-core/src/main/java/org/apache/nutch/metadata/CreativeCommons.java
new file mode 100644
index 0000000..f9c425b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/CreativeCommons.java
@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+/**
+ * Names of the Creative Commons metadata properties.
+ *
+ * @see <a href="http://www.creativecommons.org/">creativecommons.org</a>
+ *
+ * @author Chris Mattmann
+ * @author Jérôme Charron
+ */
+public interface CreativeCommons {
+
+  // Interface fields are implicitly public, static and final.
+
+  /** Property name {@code "License-Url"}. */
+  String LICENSE_URL = "License-Url";
+
+  /** Property name {@code "License-Location"}. */
+  String LICENSE_LOCATION = "License-Location";
+
+  /** Property name {@code "Work-Type"}. */
+  String WORK_TYPE = "Work-Type";
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/DublinCore.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/DublinCore.java b/nutch-core/src/main/java/org/apache/nutch/metadata/DublinCore.java
new file mode 100644
index 0000000..9724d80
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/DublinCore.java
@@ -0,0 +1,161 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+/**
+ * Names of the Dublin Core metadata elements.
+ *
+ * @see <a href="http://dublincore.org">dublincore.org</a>
+ *
+ * @author Chris Mattmann
+ * @author Jérôme Charron
+ */
+public interface DublinCore {
+
+  // Interface fields are implicitly public, static and final.
+
+  /**
+   * The media type or dimensions (for example size or duration) of the
+   * resource. Format may be used to determine the software, hardware or other
+   * equipment needed to display or operate the resource. Recommended best
+   * practice is to pick a value from a controlled vocabulary, such as the list
+   * of Internet Media Types [MIME] defining computer media formats.
+   */
+  String FORMAT = "format";
+
+  /**
+   * Identifies the resource. Recommended best practice is a string or number
+   * conforming to a formal identification system, for example the Uniform
+   * Resource Identifier (URI) (including the Uniform Resource Locator (URL)),
+   * the Digital Object Identifier (DOI) or the International Standard Book
+   * Number (ISBN).
+   */
+  String IDENTIFIER = "identifier";
+
+  /**
+   * The date on which the resource was changed.
+   */
+  String MODIFIED = "modified";
+
+  /**
+   * An entity responsible for making contributions to the content of the
+   * resource, such as a person, an organisation or a service. Typically the
+   * name of the Contributor is used to indicate the entity.
+   */
+  String CONTRIBUTOR = "contributor";
+
+  /**
+   * The extent or scope of the content of the resource: typically a spatial
+   * location (a place name or geographic coordinates), a temporal period (a
+   * period label, date, or date range) or a jurisdiction (such as a named
+   * administrative entity). Recommended best practice is to pick a value from
+   * a controlled vocabulary (for example, the Thesaurus of Geographic Names
+   * [TGN]) and, where appropriate, to prefer named places or time periods over
+   * numeric identifiers such as sets of coordinates or date ranges.
+   */
+  String COVERAGE = "coverage";
+
+  /**
+   * An entity primarily responsible for making the content of the resource,
+   * such as a person, an organisation or a service. Typically the name of the
+   * Creator is used to indicate the entity.
+   */
+  String CREATOR = "creator";
+
+  /**
+   * A date associated with an event in the life cycle of the resource,
+   * typically its creation or availability. Recommended best practice for
+   * encoding the value is defined in a profile of ISO 8601 [W3CDTF] and
+   * follows the YYYY-MM-DD format.
+   */
+  String DATE = "date";
+
+  /**
+   * An account of the content of the resource. This may include, but is not
+   * limited to, an abstract, a table of contents, a reference to a graphical
+   * representation of the content or a free-text account of the content.
+   */
+  String DESCRIPTION = "description";
+
+  /**
+   * A language of the intellectual content of the resource. Recommended best
+   * practice is to use RFC 3066 [RFC3066], which, in conjunction with ISO 639
+   * [ISO639], defines two- and three-letter primary language tags with
+   * optional subtags. Examples: "en" or "eng" for English, "akk" for Akkadian,
+   * and "en-GB" for English used in the United Kingdom.
+   */
+  String LANGUAGE = "language";
+
+  /**
+   * An entity responsible for making the resource available, such as a person,
+   * an organisation or a service. Typically the name of the Publisher is used
+   * to indicate the entity.
+   */
+  String PUBLISHER = "publisher";
+
+  /**
+   * A reference to a related resource. Recommended best practice is to
+   * reference the resource by a string or number conforming to a formal
+   * identification system.
+   */
+  String RELATION = "relation";
+
+  /**
+   * Information about rights held in and over the resource. Typically this is
+   * a rights management statement for the resource, or a reference to a
+   * service providing such information. Rights information often encompasses
+   * Intellectual Property Rights (IPR), Copyright, and various Property
+   * Rights. If the Rights element is absent, no assumptions can be made about
+   * the status of these and other rights with respect to the resource.
+   */
+  String RIGHTS = "rights";
+
+  /**
+   * A reference to a resource from which the present resource is derived, in
+   * whole or in part. Recommended best practice is to reference the resource
+   * by a string or number conforming to a formal identification system.
+   */
+  String SOURCE = "source";
+
+  /**
+   * The topic of the content of the resource, typically expressed as keywords,
+   * key phrases or classification codes. Recommended best practice is to pick
+   * a value from a controlled vocabulary or formal classification scheme.
+   */
+  String SUBJECT = "subject";
+
+  /**
+   * A name given to the resource, typically the name by which it is formally
+   * known.
+   */
+  String TITLE = "title";
+
+  /**
+   * The nature or genre of the content of the resource. Type includes terms
+   * describing general categories, functions, genres, or aggregation levels
+   * for content. Recommended best practice is to pick a value from a
+   * controlled vocabulary (for example, the DCMI Type Vocabulary [DCMITYPE]).
+   * To describe the physical or digital manifestation of the resource, use the
+   * Format element instead.
+   */
+  String TYPE = "type";
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/Feed.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/Feed.java b/nutch-core/src/main/java/org/apache/nutch/metadata/Feed.java
new file mode 100644
index 0000000..2697da6
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/Feed.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.metadata;
+
+/**
+ * Names of the feed properties extracted by the ROME library.
+ *
+ * @author mattmann
+ * @author dogacan
+ */
+public interface Feed {
+
+  // Interface fields are implicitly public, static and final.
+
+  /** Property name {@code "author"}. */
+  String FEED_AUTHOR = "author";
+
+  /** Property name {@code "tag"}. */
+  String FEED_TAGS = "tag";
+
+  /** Property name {@code "published"}. */
+  String FEED_PUBLISHED = "published";
+
+  /** Property name {@code "updated"}. */
+  String FEED_UPDATED = "updated";
+
+  /** Property name {@code "feed"}. */
+  String FEED = "feed";
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/HttpHeaders.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/HttpHeaders.java b/nutch-core/src/main/java/org/apache/nutch/metadata/HttpHeaders.java
new file mode 100644
index 0000000..78b8797
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/HttpHeaders.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+import org.apache.hadoop.io.Text;
+
+/**
+ * Names of HTTP headers.
+ *
+ * @see <a href="http://rfc-ref.org/RFC-TEXTS/2616/">Hypertext Transfer Protocol
+ *      -- HTTP/1.1 (RFC 2616)</a>
+ */
+public interface HttpHeaders {
+
+  // Interface fields are implicitly public, static and final.
+
+  /** Header name {@code "Transfer-Encoding"}. */
+  String TRANSFER_ENCODING = "Transfer-Encoding";
+
+  /** Header name {@code "Content-Encoding"}. */
+  String CONTENT_ENCODING = "Content-Encoding";
+
+  /** Header name {@code "Content-Language"}. */
+  String CONTENT_LANGUAGE = "Content-Language";
+
+  /** Header name {@code "Content-Length"}. */
+  String CONTENT_LENGTH = "Content-Length";
+
+  /** Header name {@code "Content-Location"}. */
+  String CONTENT_LOCATION = "Content-Location";
+
+  /** Header name {@code "Content-Disposition"}. */
+  String CONTENT_DISPOSITION = "Content-Disposition";
+
+  /** Header name {@code "Content-MD5"}. */
+  String CONTENT_MD5 = "Content-MD5";
+
+  /** Header name {@code "Content-Type"}. */
+  String CONTENT_TYPE = "Content-Type";
+
+  /** {@link Text} form of {@link #CONTENT_TYPE}. */
+  Text WRITABLE_CONTENT_TYPE = new Text(CONTENT_TYPE);
+
+  /** Header name {@code "Last-Modified"}. */
+  String LAST_MODIFIED = "Last-Modified";
+
+  /** Header name {@code "Location"}. */
+  String LOCATION = "Location";
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/MetaWrapper.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/MetaWrapper.java b/nutch-core/src/main/java/org/apache/nutch/metadata/MetaWrapper.java
new file mode 100644
index 0000000..a43fa9d
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/MetaWrapper.java
@@ -0,0 +1,120 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.metadata;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Writable;
+import org.apache.nutch.crawl.NutchWritable;
+
+/**
+ * A simple decorator that attaches {@link Metadata} to any Writable that can
+ * be serialized by {@code NutchWritable}. It is useful when data has to be
+ * enriched temporarily during processing and the extra metadata does not need
+ * to be stored permanently once the job is done.
+ *
+ * @author Andrzej Bialecki
+ */
+public class MetaWrapper extends NutchWritable {
+  private Metadata metadata;
+
+  /** Creates a wrapper with empty metadata and no wrapped instance. */
+  public MetaWrapper() {
+    this.metadata = new Metadata();
+  }
+
+  /**
+   * Wraps {@code instance} with empty metadata.
+   *
+   * @param instance
+   *          the Writable to wrap
+   * @param conf
+   *          the configuration handed to {@code setConf}
+   */
+  public MetaWrapper(Writable instance, Configuration conf) {
+    this(null, instance, conf);
+  }
+
+  /**
+   * Wraps {@code instance} with the given metadata.
+   *
+   * @param metadata
+   *          the metadata to attach; {@code null} is replaced by an empty
+   *          {@link Metadata}
+   * @param instance
+   *          the Writable to wrap
+   * @param conf
+   *          the configuration handed to {@code setConf}
+   */
+  public MetaWrapper(Metadata metadata, Writable instance, Configuration conf) {
+    super(instance);
+    this.metadata = (metadata != null) ? metadata : new Metadata();
+    setConf(conf);
+  }
+
+  /**
+   * Returns the attached metadata object itself, not a copy.
+   */
+  public Metadata getMetadata() {
+    return this.metadata;
+  }
+
+  /**
+   * Adds a metadata value; delegates to {@link Metadata#add(String, String)}.
+   *
+   * @param name
+   *          metadata name
+   * @param value
+   *          metadata value
+   */
+  public void addMeta(String name, String value) {
+    this.metadata.add(name, value);
+  }
+
+  /**
+   * Sets a metadata value; delegates to {@link Metadata#set(String, String)}.
+   *
+   * @param name
+   *          metadata name
+   * @param value
+   *          metadata value
+   */
+  public void setMeta(String name, String value) {
+    this.metadata.set(name, value);
+  }
+
+  /**
+   * Looks up a metadata value; delegates to {@link Metadata#get(String)}.
+   *
+   * @param name
+   *          metadata name
+   * @return metadata value
+   */
+  public String getMeta(String name) {
+    return this.metadata.get(name);
+  }
+
+  /**
+   * Looks up all values of a metadata name; delegates to
+   * {@link Metadata#getValues(String)}.
+   *
+   * @param name
+   *          metadata name
+   * @return multiple values
+   */
+  public String[] getMetaValues(String name) {
+    return this.metadata.getValues(name);
+  }
+
+  /**
+   * Reads the wrapped instance first, then replaces the metadata with a fresh
+   * {@link Metadata} read from the same input.
+   */
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    super.readFields(in);
+    this.metadata = new Metadata();
+    this.metadata.readFields(in);
+  }
+
+  /**
+   * Writes the wrapped instance first, then the metadata.
+   */
+  @Override
+  public void write(DataOutput out) throws IOException {
+    super.write(out);
+    this.metadata.write(out);
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/Metadata.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/Metadata.java b/nutch-core/src/main/java/org/apache/nutch/metadata/Metadata.java
new file mode 100644
index 0000000..8a57ee3
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/Metadata.java
@@ -0,0 +1,280 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Properties;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * A multi-valued metadata container: each name maps to an array of string
+ * values, kept in the order in which they were added.
+ */
+public class Metadata implements Writable, CreativeCommons, DublinCore,
+    HttpHeaders, Nutch, Feed {
+
+  /**
+   * A map of all metadata attributes.
+   */
+  private Map<String, String[]> metadata = null;
+
+  /**
+   * Constructs a new, empty metadata.
+   */
+  public Metadata() {
+    metadata = new HashMap<String, String[]>();
+  }
+
+  /**
+   * Returns true if named value is multivalued.
+   *
+   * @param name
+   *          name of metadata
+   * @return true if the named value is multivalued, false if it has a single
+   *         value or is not present
+   */
+  public boolean isMultiValued(final String name) {
+    String[] values = metadata.get(name);
+    return values != null && values.length > 1;
+  }
+
+  /**
+   * Returns an array of the names contained in the metadata.
+   *
+   * @return Metadata names
+   */
+  public String[] names() {
+    return metadata.keySet().toArray(new String[metadata.keySet().size()]);
+  }
+
+  /**
+   * Get the value associated to a metadata name. If many values are associated
+   * to the specified name, then the first one is returned.
+   *
+   * @param name
+   *          of the metadata.
+   * @return the value associated to the specified metadata name, or
+   *         {@code null} if the name is not present.
+   */
+  public String get(final String name) {
+    String[] values = metadata.get(name);
+    if (values == null) {
+      return null;
+    } else {
+      return values[0];
+    }
+  }
+
+  /**
+   * Get the values associated to a metadata name.
+   *
+   * @param name
+   *          of the metadata.
+   * @return the values associated to a metadata name; an empty array if the
+   *         name is not present.
+   */
+  public String[] getValues(final String name) {
+    return _getValues(name);
+  }
+
+  /**
+   * Non-overridable lookup used internally: returns the stored value array for
+   * {@code name}, or an empty array if there is none.
+   */
+  private String[] _getValues(final String name) {
+    String[] values = metadata.get(name);
+    if (values == null) {
+      values = new String[0];
+    }
+    return values;
+  }
+
+  /**
+   * Add a metadata name/value mapping. Add the specified value to the list of
+   * values associated to the specified metadata name.
+   *
+   * @param name
+   *          the metadata name.
+   * @param value
+   *          the metadata value.
+   */
+  public void add(final String name, final String value) {
+    String[] values = metadata.get(name);
+    if (values == null) {
+      set(name, value);
+    } else {
+      String[] newValues = new String[values.length + 1];
+      System.arraycopy(values, 0, newValues, 0, values.length);
+      newValues[newValues.length - 1] = value;
+      metadata.put(name, newValues);
+    }
+  }
+
+  /**
+   * Add all name/value mappings (merge two metadata mappings). If a name
+   * already exists in current metadata the values are added to existing values.
+   *
+   * @param metadata
+   *          other Metadata to be merged
+   */
+  public void addAll(Metadata metadata) {
+    for (String name : metadata.names()) {
+      String[] addValues = metadata.getValues(name);
+      if (addValues == null) {
+        continue;
+      }
+      String[] oldValues = this.metadata.get(name);
+      if (oldValues == null) {
+        // Store a copy so the two containers never share a value array.
+        this.metadata.put(name, addValues.clone());
+      } else {
+        String[] newValues = new String[oldValues.length + addValues.length];
+        System.arraycopy(oldValues, 0, newValues, 0, oldValues.length);
+        System.arraycopy(addValues, 0, newValues, oldValues.length,
+            addValues.length);
+        this.metadata.put(name, newValues);
+      }
+    }
+  }
+
+  /**
+   * Copy all key-value pairs from properties. Each property replaces whatever
+   * values were stored under the same name; the map is updated directly, not
+   * through {@link #set(String, String)}.
+   *
+   * @param properties
+   *          properties to copy from
+   */
+  public void setAll(Properties properties) {
+    Enumeration<?> names = properties.propertyNames();
+    while (names.hasMoreElements()) {
+      String name = (String) names.nextElement();
+      metadata.put(name, new String[] { properties.getProperty(name) });
+    }
+  }
+
+  /**
+   * Set metadata name/value. Associate the specified value to the specified
+   * metadata name. If some previous values were associated to this name, they
+   * are removed.
+   *
+   * @param name
+   *          the metadata name.
+   * @param value
+   *          the metadata value.
+   */
+  public void set(String name, String value) {
+    metadata.put(name, new String[] { value });
+  }
+
+  /**
+   * Remove a metadata and all its associated values.
+   *
+   * @param name
+   *          metadata name to remove
+   */
+  public void remove(String name) {
+    metadata.remove(name);
+  }
+
+  /**
+   * Returns the number of metadata names in this metadata.
+   *
+   * @return number of metadata names
+   */
+  public int size() {
+    return metadata.size();
+  }
+
+  /** Remove all mappings from metadata. */
+  public void clear() {
+    metadata.clear();
+  }
+
+  /**
+   * Two metadata objects are equal when they hold the same number of names and
+   * every name of this object maps to the same values, in the same order, in
+   * the other object. Null values are compared safely.
+   *
+   * @param o
+   *          the object to compare with
+   * @return true if {@code o} is a Metadata with equal content
+   */
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) {
+      return true;
+    }
+    // instanceof also rejects null, so no cast can fail below.
+    if (!(o instanceof Metadata)) {
+      return false;
+    }
+    Metadata other = (Metadata) o;
+
+    if (other.size() != size()) {
+      return false;
+    }
+
+    for (String name : names()) {
+      // Arrays.equals checks the length and compares elements null-safely.
+      if (!java.util.Arrays.equals(other._getValues(name), _getValues(name))) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Hash code consistent with {@link #equals(Object)}: the sum, over all
+   * names, of the name's hash combined with the hash of its value array. A sum
+   * is used so that the map's iteration order does not matter.
+   *
+   * @return the hash code of the content
+   */
+  @Override
+  public int hashCode() {
+    int hash = 0;
+    for (Map.Entry<String, String[]> entry : metadata.entrySet()) {
+      String name = entry.getKey();
+      int nameHash = (name == null) ? 0 : name.hashCode();
+      hash += nameHash ^ java.util.Arrays.hashCode(entry.getValue());
+    }
+    return hash;
+  }
+
+  /**
+   * Returns every value as {@code name=value} followed by a space.
+   */
+  @Override
+  public String toString() {
+    StringBuilder buf = new StringBuilder();
+    for (String name : names()) {
+      for (String value : _getValues(name)) {
+        buf.append(name).append("=").append(value).append(" ");
+      }
+    }
+    return buf.toString();
+  }
+
+  /**
+   * Writes the number of names, then for each name: the name, the number of
+   * its non-null values, and those values. Null values are not written.
+   *
+   * @param out
+   *          the output to write to
+   * @throws IOException
+   *           if writing fails
+   */
+  @Override
+  public final void write(DataOutput out) throws IOException {
+    out.writeInt(size());
+    for (String name : names()) {
+      Text.writeString(out, name);
+      String[] values = _getValues(name);
+      // The count must match what is written below, so nulls are excluded.
+      int cnt = 0;
+      for (String value : values) {
+        if (value != null) {
+          cnt++;
+        }
+      }
+      out.writeInt(cnt);
+      for (String value : values) {
+        if (value != null) {
+          Text.writeString(out, value);
+        }
+      }
+    }
+  }
+
+  /**
+   * Reads names and values in the layout produced by
+   * {@link #write(DataOutput)}. Every value is passed to
+   * {@link #add(String, String)}, so content already held by this object is
+   * kept and extended; call {@link #clear()} first when reusing an instance.
+   *
+   * @param in
+   *          the input to read from
+   * @throws IOException
+   *           if reading fails
+   */
+  @Override
+  public final void readFields(DataInput in) throws IOException {
+    int keySize = in.readInt();
+    for (int i = 0; i < keySize; i++) {
+      String key = Text.readString(in);
+      int valueSize = in.readInt();
+      for (int j = 0; j < valueSize; j++) {
+        add(key, Text.readString(in));
+      }
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/Nutch.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/Nutch.java b/nutch-core/src/main/java/org/apache/nutch/metadata/Nutch.java
new file mode 100644
index 0000000..de80399
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/Nutch.java
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+import org.apache.hadoop.io.Text;
+
+/**
+ * Nutch internal metadata constants.
+ *
+ * @author Chris Mattmann
+ * @author Jérôme Charron
+ */
+public interface Nutch {
+
+  // Interface fields are implicitly public, static and final.
+
+  String ORIGINAL_CHAR_ENCODING = "OriginalCharEncoding";
+
+  String CHAR_ENCODING_FOR_CONVERSION = "CharEncodingForConversion";
+
+  String SIGNATURE_KEY = "nutch.content.digest";
+
+  String SEGMENT_NAME_KEY = "nutch.segment.name";
+
+  String SCORE_KEY = "nutch.crawl.score";
+
+  String GENERATE_TIME_KEY = "_ngt_";
+
+  /** {@link Text} form of {@link #GENERATE_TIME_KEY}. */
+  Text WRITABLE_GENERATE_TIME_KEY = new Text(GENERATE_TIME_KEY);
+
+  Text PROTOCOL_STATUS_CODE_KEY = new Text("nutch.protocol.code");
+
+  String PROTO_STATUS_KEY = "_pst_";
+
+  /** {@link Text} form of {@link #PROTO_STATUS_KEY}. */
+  Text WRITABLE_PROTO_STATUS_KEY = new Text(PROTO_STATUS_KEY);
+
+  String FETCH_TIME_KEY = "_ftk_";
+
+  String FETCH_STATUS_KEY = "_fst_";
+
+  /**
+   * Sites may request that search engines don't provide access to cached
+   * documents.
+   */
+  String CACHING_FORBIDDEN_KEY = "caching.forbidden";
+
+  /** Show both original forbidden content and summaries (default). */
+  String CACHING_FORBIDDEN_NONE = "none";
+
+  /** Don't show either original forbidden content or summaries. */
+  String CACHING_FORBIDDEN_ALL = "all";
+
+  /** Don't show original forbidden content, but show summaries. */
+  String CACHING_FORBIDDEN_CONTENT = "content";
+
+  String REPR_URL_KEY = "_repr_";
+
+  /** {@link Text} form of {@link #REPR_URL_KEY}. */
+  Text WRITABLE_REPR_URL_KEY = new Text(REPR_URL_KEY);
+
+  /** Used by AdaptiveFetchSchedule to maintain custom fetch interval. */
+  String FIXED_INTERVAL_KEY = "fixedInterval";
+
+  /** {@link Text} form of {@link #FIXED_INTERVAL_KEY}. */
+  Text WRITABLE_FIXED_INTERVAL_KEY = new Text(FIXED_INTERVAL_KEY);
+
+  /** For progress of job. Used by the Nutch REST service. */
+  String STAT_PROGRESS = "progress";
+
+  /** Used by the Nutch REST service. */
+  String CRAWL_ID_KEY = "storage.crawl.id";
+
+  /** Argument key for the location of the seed url dir, for the REST endpoints. */
+  String ARG_SEEDDIR = "url_dir";
+
+  /** Argument key for the location of the crawldb, for the REST endpoints. */
+  String ARG_CRAWLDB = "crawldb";
+
+  /** Argument key for the location of the linkdb, for the REST endpoints. */
+  String ARG_LINKDB = "linkdb";
+
+  /** Name of the key used in the Result Map sent back by the REST endpoint. */
+  String VAL_RESULT = "result";
+
+  /**
+   * Argument key for the location of a directory of segments, for the REST
+   * endpoints. Similar to the -dir command in the bin/nutch script.
+   */
+  String ARG_SEGMENTDIR = "segment_dir";
+
+  /** Argument key for the location of an individual segment, for the REST endpoints. */
+  String ARG_SEGMENT = "segment";
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/SpellCheckedMetadata.java b/nutch-core/src/main/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
new file mode 100644
index 0000000..164ca1d
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.lang.StringUtils;
+
+/**
+ * A decorator to {@link Metadata} that adds spellchecking capabilities to
+ * property names. The spelling vocabulary currently consists of the public
+ * static final String constants exposed by {@link HttpHeaders}.
+ */
+public class SpellCheckedMetadata extends Metadata {
+
+  /**
+   * Threshold divider.
+   *
+   * <code>threshold = searched.length() / THRESHOLD_DIVIDER;</code>
+   */
+  private static final int THRESHOLD_DIVIDER = 3;
+
+  /**
+   * Normalized name to well-known name mapping. Filled once by the static
+   * initializer below and only read afterwards.
+   */
+  private static final Map<String, String> NAMES_IDX = new HashMap<String, String>();
+
+  /**
+   * Array holding the keys of {@link #NAMES_IDX}, used for the fuzzy lookup.
+   */
+  private static final String[] NORMALIZED_NAMES;
+
+  static {
+
+    // Types whose public static final String constants make up the
+    // spelling vocabulary.
+    Class<?>[] spellthese = { HttpHeaders.class };
+
+    for (Class<?> spellCheckedNames : spellthese) {
+      for (Field field : spellCheckedNames.getFields()) {
+        int mods = field.getModifiers();
+        if (Modifier.isFinal(mods) && Modifier.isPublic(mods)
+            && Modifier.isStatic(mods) && field.getType().equals(String.class)) {
+          try {
+            String val = (String) field.get(null);
+            // A null constant cannot be normalized; leave it out of the index.
+            if (val != null) {
+              NAMES_IDX.put(normalize(val), val);
+            }
+          } catch (IllegalAccessException ignored) {
+            // Best effort: an unreadable constant is simply not part of the
+            // vocabulary.
+          }
+        }
+      }
+    }
+    NORMALIZED_NAMES = NAMES_IDX.keySet().toArray(new String[NAMES_IDX.size()]);
+  }
+
+  /**
+   * Normalizes a String by keeping only its letters, converted to lower case.
+   *
+   * @param str
+   *          the string to normalize
+   * @return normalized String
+   */
+  private static String normalize(final String str) {
+    StringBuilder buf = new StringBuilder(str.length());
+    for (int i = 0; i < str.length(); i++) {
+      char c = str.charAt(i);
+      if (Character.isLetter(c)) {
+        buf.append(Character.toLowerCase(c));
+      }
+    }
+    return buf.toString();
+  }
+
+  /**
+   * Get the normalized name of a metadata attribute name. This method tries to
+   * find a well-known metadata name (one of the names collected from
+   * {@link HttpHeaders}) that matches the specified name. The matching is
+   * error tolerant: an exact match on the normalized form is tried first, then
+   * the first vocabulary entry whose Levenshtein distance to the normalized
+   * name is strictly below <code>length / THRESHOLD_DIVIDER</code> is used.
+   * For instance, provided <code>Content-Type</code> is part of the
+   * vocabulary,
+   * <ul>
+   * <li>content-type gives Content-Type</li>
+   * <li>CoNtEntType gives Content-Type</li>
+   * <li>ConTnTtYpe gives Content-Type</li>
+   * </ul>
+   * If no matching with a well-known metadata name is found, then the original
+   * name is returned.
+   *
+   * @param name
+   *          Name to normalize
+   * @return normalized name
+   */
+  public static String getNormalizedName(final String name) {
+    String searched = normalize(name);
+    String value = NAMES_IDX.get(searched);
+    if (value != null) {
+      return value;
+    }
+
+    int threshold = searched.length() / THRESHOLD_DIVIDER;
+    for (String candidate : NORMALIZED_NAMES) {
+      if (StringUtils.getLevenshteinDistance(searched, candidate) < threshold) {
+        return NAMES_IDX.get(candidate);
+      }
+    }
+    return name;
+  }
+
+  /** Removes the property stored under the spell-checked form of the name. */
+  @Override
+  public void remove(final String name) {
+    super.remove(getNormalizedName(name));
+  }
+
+  /** Adds a value under the spell-checked form of the name. */
+  @Override
+  public void add(final String name, final String value) {
+    super.add(getNormalizedName(name), value);
+  }
+
+  /** Returns the values stored under the spell-checked form of the name. */
+  @Override
+  public String[] getValues(final String name) {
+    return super.getValues(getNormalizedName(name));
+  }
+
+  /** Returns the value stored under the spell-checked form of the name. */
+  @Override
+  public String get(final String name) {
+    return super.get(getNormalizedName(name));
+  }
+
+  /** Sets a value under the spell-checked form of the name. */
+  @Override
+  public void set(final String name, final String value) {
+    super.set(getNormalizedName(name), value);
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/package.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/package.html b/nutch-core/src/main/java/org/apache/nutch/metadata/package.html
new file mode 100644
index 0000000..53281bb
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/package.html
@@ -0,0 +1,6 @@
+<html>
+<body>
+A Multi-valued Metadata container, and set
+of constant fields for Nutch Metadata.
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilter.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilter.java b/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilter.java
new file mode 100644
index 0000000..8de5800
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilter.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+//Hadoop
+import org.apache.hadoop.conf.Configurable;
+// Nutch
+import org.apache.nutch.plugin.Pluggable;
+
+/**
+ * Extension point for plugins that grant exemptions to external domain
+ * resources, overriding <code>db.ignore.external.links</code>. This helps when
+ * a crawl is focused on one domain while resources such as images are hosted
+ * on a CDN.
+ */
+public interface URLExemptionFilter extends Pluggable, Configurable {
+
+  /** The name of the extension point. */
+  String X_POINT_ID = URLExemptionFilter.class.getName();
+
+  /**
+   * Decides whether <code>toUrl</code> is exempted while external links are
+   * being ignored.
+   *
+   * @param fromUrl
+   *          the source url which generated the outlink
+   * @param toUrl
+   *          the destination url which needs to be checked for exemption
+   * @return <code>true</code> when <code>toUrl</code> is exempted from the
+   *         ignore-external-links rule
+   */
+  boolean filter(String fromUrl, String toUrl);
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilters.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilters.java b/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilters.java
new file mode 100644
index 0000000..d362f2e
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilters.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.plugin.PluginRuntimeException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Creates and caches {@link URLExemptionFilter} implementing plugins. */
+public class URLExemptionFilters {
+
+ private static final Logger LOG = LoggerFactory.getLogger(URLExemptionFilters.class);
+
+ private URLExemptionFilter[] filters;
+
+ public URLExemptionFilters(Configuration conf) {
+ Extension[] extensions = PluginRepository.get(conf)
+ .getExtensionPoint(URLExemptionFilter.X_POINT_ID).getExtensions();
+ filters = new URLExemptionFilter[extensions.length];
+ for (int i = 0; i < extensions.length; i++) {
+ try {
+ filters[i] = (URLExemptionFilter) extensions[i].getExtensionInstance();
+ } catch (PluginRuntimeException e) {
+ throw new IllegalStateException(e);
+ }
+ }
+ LOG.info("Found {} extensions at point:'{}'", filters.length,
+ URLExemptionFilter.X_POINT_ID);
+ }
+
+
+ /** Run all defined filters. Assume logical AND. */
+ public boolean isExempted(String fromUrl, String toUrl) {
+ if (filters.length < 1) {
+ //at least one filter should be on
+ return false;
+ }
+ //validate from, to and filters
+ boolean exempted = fromUrl != null && toUrl != null;
+ //An URL is exempted when all the filters accept it to pass through
+ for (int i = 0; i < this.filters.length && exempted; i++) {
+ exempted = this.filters[i].filter(fromUrl, toUrl);
+ }
+ return exempted;
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLFilter.java b/nutch-core/src/main/java/org/apache/nutch/net/URLFilter.java
new file mode 100644
index 0000000..01efbcd
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLFilter.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configurable;
+
+// Nutch imports
+import org.apache.nutch.plugin.Pluggable;
+
+/**
+ * Interface used to limit which URLs enter Nutch. Used by the injector and the
+ * db updater.
+ */
+public interface URLFilter extends Pluggable, Configurable {
+
+  /** The name of the extension point. */
+  String X_POINT_ID = URLFilter.class.getName();
+
+  /**
+   * Filters (and possibly transforms) a URL: an implementation can pass the
+   * original URL through or "delete" the URL by returning <code>null</code>.
+   *
+   * @param urlString
+   *          the URL to filter
+   * @return the URL to keep, or <code>null</code> if the URL is rejected
+   */
+  String filter(String urlString);
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLFilterChecker.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLFilterChecker.java b/nutch-core/src/main/java/org/apache/nutch/net/URLFilterChecker.java
new file mode 100644
index 0000000..89a3d00
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLFilterChecker.java
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.util.NutchConfiguration;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+
+/**
+ * Command-line tool that checks one given filter or all filters. URLs are
+ * read from STDIN, one per line; each is echoed to STDOUT prefixed with
+ * <code>+</code> (accepted, showing the filter output) or <code>-</code>
+ * (rejected, showing the input line).
+ *
+ * @author John Xing
+ */
+public class URLFilterChecker {
+
+  private final Configuration conf;
+
+  public URLFilterChecker(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /**
+   * Runs the URLs read from STDIN through the single filter whose class name
+   * equals <code>filterName</code>.
+   *
+   * @param filterName
+   *          fully qualified class name of the filter to check
+   * @throws RuntimeException
+   *           if the extension point or the named filter cannot be found
+   */
+  private void checkOne(String filterName) throws Exception {
+    ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
+        URLFilter.X_POINT_ID);
+
+    if (point == null)
+      throw new RuntimeException(URLFilter.X_POINT_ID + " not found.");
+
+    // Instantiate extensions in order until one matches the requested class.
+    URLFilter filter = null;
+    for (Extension extension : point.getExtensions()) {
+      URLFilter candidate = (URLFilter) extension.getExtensionInstance();
+      if (candidate.getClass().getName().equals(filterName)) {
+        filter = candidate;
+        break;
+      }
+    }
+
+    if (filter == null)
+      throw new RuntimeException("Filter " + filterName + " not found.");
+
+    System.out.println("Checking URLFilter " + filterName);
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    while ((line = in.readLine()) != null) {
+      report(line, filter.filter(line));
+    }
+  }
+
+  /**
+   * Runs the URLs read from STDIN through the combination of all available
+   * filters.
+   */
+  private void checkAll() throws Exception {
+    System.out.println("Checking combination of all URLFilters available");
+
+    // The filter chain does not depend on the input line, so build it once
+    // instead of once per URL.
+    URLFilters filters = new URLFilters(this.conf);
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    while ((line = in.readLine()) != null) {
+      report(line, filters.filter(line));
+    }
+  }
+
+  /**
+   * Prints the outcome for one URL: <code>+</code> followed by the filter
+   * output when it is non-null, otherwise <code>-</code> followed by the
+   * original input line.
+   */
+  private static void report(String line, String out) {
+    if (out != null) {
+      System.out.println("+" + out);
+    } else {
+      System.out.println("-" + line);
+    }
+  }
+
+  /**
+   * Entry point. With <code>-filterName name</code> only that filter is
+   * checked; any other first argument (including <code>-allCombined</code>)
+   * checks the combination of all filters. Without arguments the usage is
+   * printed and the JVM exits with status -1.
+   */
+  public static void main(String[] args) throws Exception {
+
+    String usage = "Usage: URLFilterChecker (-filterName filterName | -allCombined) \n"
+        + "Tool takes a list of URLs, one per line, passed via STDIN.\n";
+
+    if (args.length == 0) {
+      System.err.println(usage);
+      System.exit(-1);
+    }
+
+    String filterName = null;
+    if (args[0].equals("-filterName")) {
+      if (args.length != 2) {
+        System.err.println(usage);
+        System.exit(-1);
+      }
+      filterName = args[1];
+    }
+
+    URLFilterChecker checker = new URLFilterChecker(NutchConfiguration.create());
+    if (filterName != null) {
+      checker.checkOne(filterName);
+    } else {
+      checker.checkAll();
+    }
+
+    System.exit(0);
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLFilterException.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLFilterException.java b/nutch-core/src/main/java/org/apache/nutch/net/URLFilterException.java
new file mode 100644
index 0000000..b367b56
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLFilterException.java
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+/**
+ * Checked exception for errors raised while filtering URLs. It adds nothing
+ * to {@link Exception} beyond its own type; every constructor delegates to
+ * the matching superclass constructor.
+ */
+@SuppressWarnings("serial")
+public class URLFilterException extends Exception {
+
+  /** Creates an exception with neither a detail message nor a cause. */
+  public URLFilterException() {
+  }
+
+  /**
+   * Creates an exception that wraps a cause.
+   *
+   * @param cause
+   *          the underlying cause
+   */
+  public URLFilterException(Throwable cause) {
+    super(cause);
+  }
+
+  /**
+   * Creates an exception with a detail message.
+   *
+   * @param message
+   *          the detail message
+   */
+  public URLFilterException(String message) {
+    super(message);
+  }
+
+  /**
+   * Creates an exception with a detail message and a cause.
+   *
+   * @param message
+   *          the detail message
+   * @param cause
+   *          the underlying cause
+   */
+  public URLFilterException(String message, Throwable cause) {
+    super(message, cause);
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLFilters.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLFilters.java b/nutch-core/src/main/java/org/apache/nutch/net/URLFilters.java
new file mode 100644
index 0000000..3deccca
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLFilters.java
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.plugin.PluginRepository;
+
+/** Creates and caches {@link URLFilter} implementing plugins. */
+public class URLFilters {
+
+ public static final String URLFILTER_ORDER = "urlfilter.order";
+ private URLFilter[] filters;
+
+ public URLFilters(Configuration conf) {
+ this.filters = (URLFilter[]) PluginRepository.get(conf).getOrderedPlugins(
+ URLFilter.class, URLFilter.X_POINT_ID, URLFILTER_ORDER);
+ }
+
+ /** Run all defined filters. Assume logical AND. */
+ public String filter(String urlString) throws URLFilterException {
+ for (int i = 0; i < this.filters.length; i++) {
+ if (urlString == null)
+ return null;
+ urlString = this.filters[i].filter(urlString);
+
+ }
+ return urlString;
+ }
+}