You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:49:00 UTC

[44/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingFiltersChecker.java b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
new file mode 100644
index 0000000..2e1b9c2
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
@@ -0,0 +1,371 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.PrintWriter;
+import java.net.ServerSocket;
+import java.net.Socket;
+import java.net.InetSocketAddress;
+import java.nio.charset.Charset;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.crawl.SignatureFactory;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseSegment;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.StringUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Reads and parses a URL and runs the indexers on it. Displays the fields
+ * obtained and the first 100 characters of their values.
+ * 
+ * Tested with e.g. ./nutch org.apache.nutch.indexer.IndexingFiltersChecker
+ * http://www.lemonde.fr
+ * 
+ * @author Julien Nioche
+ **/
+
+public class IndexingFiltersChecker extends Configured implements Tool {
+
+  protected URLNormalizers normalizers = null;
+  protected boolean dumpText = false;
+  protected boolean followRedirects = false;
+  protected boolean keepClientCnxOpen = false;
+  // used to simulate the metadata propagated from injection
+  protected HashMap<String, String> metadata = new HashMap<String, String>();
+  protected int tcpPort = -1;
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(IndexingFiltersChecker.class);
+
+  /**
+   * Creates a checker with default settings; the options are filled in from
+   * the command line by {@link #run(String[])}.
+   */
+  public IndexingFiltersChecker() {
+
+  }
+
+  /**
+   * Parses the command line, then either checks a single URL and prints the
+   * report to standard output, or (with {@code -listen}) serves requests over
+   * TCP.
+   *
+   * @param args
+   *          command-line options; unless {@code -listen} is given the last
+   *          argument must be the URL to check
+   * @return the result of {@link #fetch(String, StringBuilder)} in single-URL
+   *         mode, -1 if the arguments are invalid
+   * @throws Exception
+   *           if fetching, parsing or filtering fails
+   */
+  public int run(String[] args) throws Exception {
+    String url = null;
+    String usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] [-dumpText] [-md key=value] [-listen <port>] [-keepClientCnxOpen]";
+
+    if (args.length == 0) {
+      System.err.println(usage);
+      return -1;
+    }
+
+    for (int i = 0; i < args.length; i++) {
+      // options taking a value need one more argument after them
+      boolean hasValue = i + 1 < args.length;
+
+      if (args[i].equals("-normalize")) {
+        normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
+      } else if (args[i].equals("-listen")) {
+        if (!hasValue) {
+          System.err.println(usage);
+          return -1;
+        }
+        tcpPort = Integer.parseInt(args[++i]);
+      } else if (args[i].equals("-followRedirects")) {
+        followRedirects = true;
+      } else if (args[i].equals("-keepClientCnxOpen")) {
+        keepClientCnxOpen = true;
+      } else if (args[i].equals("-dumpText")) {
+        dumpText = true;
+      } else if (args[i].equals("-md")) {
+        if (!hasValue) {
+          System.err.println(usage);
+          return -1;
+        }
+        String k = null, v = null;
+        String nextOne = args[++i];
+        int firstEquals = nextOne.indexOf("=");
+        if (firstEquals != -1) {
+          k = nextOne.substring(0, firstEquals);
+          v = nextOne.substring(firstEquals + 1);
+        } else {
+          // key without value; fetch() substitutes an empty string
+          k = nextOne;
+        }
+        metadata.put(k, v);
+      } else if (i != args.length - 1) {
+        // only the last argument may be a non-option (the URL)
+        System.err.println(usage);
+        return -1;
+      } else {
+        url = args[i];
+      }
+    }
+
+    // In listening mode?
+    if (tcpPort == -1) {
+      // No, just fetch and display; this requires a URL
+      if (url == null) {
+        System.err.println(usage);
+        return -1;
+      }
+      StringBuilder output = new StringBuilder();
+      int ret = fetch(url, output);
+      System.out.println(output);
+      return ret;
+    } else {
+      // Listen on socket and start workers on incoming requests
+      listen();
+    }
+
+    return 0;
+  }
+  
+  /**
+   * Binds a server socket to {@code tcpPort} and accepts connections forever,
+   * handing every accepted client to a {@code Worker} running in its own
+   * thread. The JVM is terminated with exit code -1 if the port cannot be
+   * bound or accepting a connection fails.
+   *
+   * @throws Exception
+   *           declared for subclasses; failures in this implementation end
+   *           the JVM instead
+   */
+  protected void listen() throws Exception {
+    ServerSocket server = null;
+
+    try {
+      server = new ServerSocket();
+      server.bind(new InetSocketAddress(tcpPort));
+      LOG.info(server.toString());
+    } catch (Exception e) {
+      // log the cause as well, otherwise e.g. "address already in use" is lost
+      LOG.error("Could not listen on port " + tcpPort, e);
+      System.exit(-1);
+    }
+
+    while (true) {
+      try {
+        Worker worker = new Worker(server.accept());
+        Thread thread = new Thread(worker);
+        thread.start();
+      } catch (Exception e) {
+        LOG.error("Accept failed: " + tcpPort, e);
+        System.exit(-1);
+      }
+    }
+  }
+  
+  /**
+   * Serves one client connection: every line received is treated as a URL,
+   * checked with {@link #fetch(String, StringBuilder)}, and the report is
+   * written back UTF-8 encoded. With {@code keepClientCnxOpen} requests are
+   * served until the client disconnects, otherwise exactly one request is
+   * served. The socket is always closed when the worker ends.
+   */
+  private class Worker implements Runnable {
+    private final Socket client;
+
+    Worker(Socket client) {
+      this.client = client;
+      LOG.info(client.toString());
+    }
+
+    public void run() {
+      try {
+        // One reader for the whole connection: a BufferedReader reads ahead,
+        // so re-creating it per request could drop already buffered lines.
+        BufferedReader in = new BufferedReader(new InputStreamReader(
+            client.getInputStream(), Charset.forName("UTF-8")));
+
+        if (keepClientCnxOpen) {
+          // keep connection open until the client closes it
+          while (readWrite(in)) {
+            // next request
+          }
+        } else {
+          readWrite(in);
+        }
+      } catch (Exception e) {
+        // a broken client connection must not take the whole server down
+        LOG.error("in or out failed", e);
+      } finally {
+        try { // close ourselves
+          client.close();
+        } catch (Exception e) {
+          LOG.error(e.toString());
+        }
+      }
+    }
+
+    /**
+     * Reads one URL from the client, checks it and writes the report back.
+     *
+     * @param in
+     *          reader on the client's input stream
+     * @return false if the connection is finished (end of stream or I/O
+     *         failure), true if another request may follow
+     */
+    protected boolean readWrite(BufferedReader in) {
+      String line;
+      try {
+        line = in.readLine();
+      } catch (Exception e) {
+        LOG.error("Read/Write failed: " + e);
+        return false;
+      }
+
+      if (line == null) {
+        // end of stream: the client has closed the connection
+        return false;
+      }
+
+      StringBuilder output = new StringBuilder();
+      try {
+        fetch(line, output);
+      } catch (Exception e) {
+        // a failing check only affects this request, nothing is written back
+        LOG.error("Read/Write failed: " + e);
+        return true;
+      }
+
+      try {
+        client.getOutputStream().write(
+            output.toString().getBytes(Charset.forName("UTF-8")));
+      } catch (Exception e) {
+        LOG.error("Read/Write failed: " + e);
+        return false;
+      }
+      return true;
+    }
+  }
+    
+  
+  /**
+   * Fetches and parses a URL, runs the scoring and indexing filters on the
+   * result and appends the fields of the resulting document to
+   * {@code output}. Field values are cut to 100 characters unless
+   * {@code dumpText} is set. If the configuration property {@code doIndex} is
+   * true the document is also sent to the configured index writers.
+   *
+   * @param url
+   *          URL to check; normalized first if normalizers are configured
+   * @param output
+   *          buffer receiving the report
+   * @return -1 if the content type is unknown or the parse result holds no
+   *         parse for the URL, 0 otherwise (including failed fetches and
+   *         discarded documents, which are reported in {@code output})
+   * @throws Exception
+   *           if fetching, parsing or index writing fails
+   */
+  protected int fetch(String url, StringBuilder output) throws Exception {
+    if (normalizers != null) {
+      url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
+    }
+
+    LOG.info("fetching: " + url);
+
+    CrawlDatum datum = new CrawlDatum();
+
+    // simulate the metadata propagated from injection
+    for (Map.Entry<String, String> entry : metadata.entrySet()) {
+      String value = entry.getValue();
+      if (value == null)
+        value = "";
+      datum.getMetaData().put(new Text(entry.getKey()), new Text(value));
+    }
+
+    IndexingFilters indexers = new IndexingFilters(getConf());
+
+    int maxRedirects = 3;
+
+    ProtocolOutput protocolOutput = getProtocolOutput(url, datum);
+    Text turl = new Text(url);
+
+    // Following redirects and not reached maxRedirects?
+    while (!protocolOutput.getStatus().isSuccess() && followRedirects
+        && protocolOutput.getStatus().isRedirect() && maxRedirects != 0) {
+      String[] stuff = protocolOutput.getStatus().getArgs();
+      url = stuff[0];
+
+      if (normalizers != null) {
+        url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
+      }
+
+      turl.set(url);
+
+      // try again
+      protocolOutput = getProtocolOutput(url, datum);
+      maxRedirects--;
+    }
+
+    if (!protocolOutput.getStatus().isSuccess()) {
+      output.append("Fetch failed with protocol status: "
+          + protocolOutput.getStatus() + "\n");
+      return 0;
+    }
+
+    Content content = protocolOutput.getContent();
+
+    if (content == null) {
+      output.append("No content for " + url + "\n");
+      return 0;
+    }
+
+    String contentType = content.getContentType();
+
+    if (contentType == null) {
+      // do not fail silently, the caller only sees the return code
+      LOG.error("Failed to determine content type!");
+      return -1;
+    }
+
+    // store the guessed content type in the crawldatum
+    datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE),
+        new Text(contentType));
+
+    if (ParseSegment.isTruncated(content)) {
+      LOG.warn("Content is truncated, parse may fail!");
+    }
+
+    ScoringFilters scfilters = new ScoringFilters(getConf());
+    // call the scoring filters
+    try {
+      scfilters.passScoreBeforeParsing(turl, datum, content);
+    } catch (Exception e) {
+      LOG.warn("Couldn't pass score, url {} ({})", url, e);
+    }
+
+    LOG.info("parsing: {}", url);
+    LOG.info("contentType: {}", contentType);
+
+    ParseResult parseResult = new ParseUtil(getConf()).parse(content);
+
+    NutchDocument doc = new NutchDocument();
+    doc.add("id", url);
+    Text urlText = new Text(url);
+
+    Inlinks inlinks = null;
+    Parse parse = parseResult.get(urlText);
+    if (parse == null) {
+      LOG.error("Failed to get parse from parse result");
+      LOG.error("Available parses in parse result (by URL key):");
+      for (Map.Entry<Text, Parse> entry : parseResult) {
+        LOG.error("  " + entry.getKey());
+      }
+      LOG.error("Parse result does not contain a parse for URL to be checked:");
+      LOG.error("  " + urlText);
+      return -1;
+    }
+
+    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content,
+        parse);
+    parse.getData().getContentMeta()
+        .set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
+    String digest = parse.getData().getContentMeta().get(Nutch.SIGNATURE_KEY);
+    doc.add("digest", digest);
+    datum.setSignature(signature);
+
+    // call the scoring filters
+    try {
+      scfilters.passScoreAfterParsing(turl, content, parseResult.get(turl));
+    } catch (Exception e) {
+      LOG.warn("Couldn't pass score, url {} ({})", turl, e);
+    }
+
+    try {
+      doc = indexers.filter(doc, parse, urlText, datum, inlinks);
+    } catch (IndexingException e) {
+      // report through the logger; the unfiltered document is still shown
+      LOG.error("Indexing filters failed for url " + url, e);
+    }
+
+    if (doc == null) {
+      output.append("Document discarded by indexing filter\n");
+      return 0;
+    }
+
+    for (String fname : doc.getFieldNames()) {
+      List<Object> values = doc.getField(fname).getValues();
+      if (values != null) {
+        for (Object value : values) {
+          String str = value.toString();
+          int minText = dumpText ? str.length() : Math.min(100, str.length());
+          output.append(fname + " :\t" + str.substring(0, minText) + "\n");
+        }
+      }
+    }
+
+    output.append("\n"); // For readability if keepClientCnxOpen
+
+    // doc cannot be null here, discarded documents returned above
+    if (getConf().getBoolean("doIndex", false)) {
+      IndexWriters writers = new IndexWriters(getConf());
+      writers.open(new JobConf(getConf()), "IndexingFilterChecker");
+      try {
+        writers.write(doc);
+      } finally {
+        // release the writers even if writing the document fails
+        writers.close();
+      }
+    }
+
+    return 0;
+  }
+  
+  /**
+   * Fetches the given URL with the protocol implementation that the
+   * {@link ProtocolFactory} selects for it.
+   *
+   * @param url
+   *          URL to fetch
+   * @param datum
+   *          crawl datum passed along to the protocol
+   * @return the output returned by the protocol
+   * @throws Exception
+   *           if no protocol can be obtained or the fetch fails
+   */
+  protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum) throws Exception {
+    Protocol fetcher = new ProtocolFactory(getConf()).getProtocol(url);
+    return fetcher.getProtocolOutput(new Text(url), datum);
+  }
+
+  /**
+   * Command-line entry point: runs the checker through {@link ToolRunner} and
+   * exits the JVM with the returned code.
+   */
+  public static void main(String[] args) throws Exception {
+    System.exit(ToolRunner.run(NutchConfiguration.create(),
+        new IndexingFiltersChecker(), args));
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingJob.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingJob.java b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingJob.java
new file mode 100644
index 0000000..342ea4a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/IndexingJob.java
@@ -0,0 +1,358 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import java.io.File;
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Random;
+
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.segment.SegmentChecker;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.FileOutputFormat;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Counters.Counter;
+import org.apache.hadoop.mapred.RunningJob;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.util.HadoopFSUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+import org.apache.nutch.util.NutchTool;
+import org.apache.nutch.util.TimingUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Generic indexer which relies on the plugins implementing IndexWriter
+ **/
+
+public class IndexingJob extends NutchTool implements Tool {
+
+  public static Logger LOG = LoggerFactory.getLogger(IndexingJob.class);
+
+  /** Creates an indexing job, passing {@code null} as configuration to the superclass. */
+  public IndexingJob() {
+    super(null);
+  }
+
+  /**
+   * Creates an indexing job with the given configuration.
+   *
+   * @param conf
+   *          configuration handed to the superclass constructor
+   */
+  public IndexingJob(Configuration conf) {
+    super(conf);
+  }
+
+  /**
+   * Indexes the segments without deleting gone documents, without extra
+   * parameters, URL filtering, URL normalizing or binary content.
+   */
+  public void index(Path crawlDb, Path linkDb, List<Path> segments,
+      boolean noCommit) throws IOException {
+    index(crawlDb, linkDb, segments, noCommit, false, null);
+  }
+
+  /**
+   * Indexes the segments without extra parameters, URL filtering, URL
+   * normalizing or binary content.
+   */
+  public void index(Path crawlDb, Path linkDb, List<Path> segments,
+      boolean noCommit, boolean deleteGone) throws IOException {
+    index(crawlDb, linkDb, segments, noCommit, deleteGone, null);
+  }
+
+  /**
+   * Indexes the segments without URL filtering, URL normalizing or binary
+   * content.
+   */
+  public void index(Path crawlDb, Path linkDb, List<Path> segments,
+      boolean noCommit, boolean deleteGone, String params) throws IOException {
+    index(crawlDb, linkDb, segments, noCommit, deleteGone, params, false, false);
+  }
+
+  /**
+   * Indexes the segments without adding binary content. The {@code filter}
+   * and {@code normalize} flags are forwarded as given.
+   */
+  public void index(Path crawlDb, Path linkDb, List<Path> segments,
+      boolean noCommit, boolean deleteGone, String params, boolean filter,
+      boolean normalize) throws IOException {
+    // forward the caller's flags; hard-coded false silently disabled them
+    index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
+        normalize, false);
+  }
+
+  /**
+   * Indexes the segments; binary content, if added, is not Base64 encoded.
+   * The {@code filter}, {@code normalize} and {@code addBinaryContent} flags
+   * are forwarded as given.
+   */
+  public void index(Path crawlDb, Path linkDb, List<Path> segments,
+      boolean noCommit, boolean deleteGone, String params,
+      boolean filter, boolean normalize, boolean addBinaryContent) throws IOException {
+    // forward the caller's flags; hard-coded false silently disabled them
+    index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
+        normalize, addBinaryContent, false);
+  }
+
+  /**
+   * Runs the indexing MapReduce job over the given segments and, unless
+   * {@code noCommit} is set, commits through the index writers afterwards.
+   * The temporary job output directory is deleted in any case.
+   *
+   * @param crawlDb
+   *          crawl db passed to the job setup
+   * @param linkDb
+   *          link db passed to the job setup
+   * @param segments
+   *          segments to index
+   * @param noCommit
+   *          if true, no commit is sent once the job has finished
+   * @param deleteGone
+   *          value of the job's {@code INDEXER_DELETE} setting
+   * @param params
+   *          value of the job's {@code INDEXER_PARAMS} setting, not set if
+   *          null
+   * @param filter
+   *          value of the job's {@code URL_FILTERING} setting
+   * @param normalize
+   *          value of the job's {@code URL_NORMALIZING} setting
+   * @param addBinaryContent
+   *          passed to the job setup
+   * @param base64
+   *          value of the job's {@code INDEXER_BINARY_AS_BASE64} setting
+   * @throws IOException
+   *           if the job or the commit fails
+   */
+  public void index(Path crawlDb, Path linkDb, List<Path> segments,
+      boolean noCommit, boolean deleteGone, String params,
+      boolean filter, boolean normalize, boolean addBinaryContent,
+      boolean base64) throws IOException {
+
+    final SimpleDateFormat timestamp = new SimpleDateFormat(
+        "yyyy-MM-dd HH:mm:ss");
+    final long startTime = System.currentTimeMillis();
+    LOG.info("Indexer: starting at {}", timestamp.format(startTime));
+
+    final JobConf job = new NutchJob(getConf());
+    job.setJobName("Indexer");
+
+    LOG.info("Indexer: deleting gone documents: {}", deleteGone);
+    LOG.info("Indexer: URL filtering: {}", filter);
+    LOG.info("Indexer: URL normalizing: {}", normalize);
+    if (addBinaryContent) {
+      LOG.info(base64 ? "Indexer: adding binary content as Base64"
+          : "Indexer: adding binary content");
+    }
+    final IndexWriters writers = new IndexWriters(getConf());
+    LOG.info(writers.describe());
+
+    IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job, addBinaryContent);
+
+    // the index server URL is no longer set here, it is passed on the
+    // command line as a Hadoop parameter
+
+    job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
+    job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
+    job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
+    job.setBoolean(IndexerMapReduce.INDEXER_BINARY_AS_BASE64, base64);
+    if (params != null) {
+      job.set(IndexerMapReduce.INDEXER_PARAMS, params);
+    }
+    job.setReduceSpeculativeExecution(false);
+
+    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-"
+        + new Random().nextInt());
+    FileOutputFormat.setOutputPath(job, tmp);
+
+    try {
+      final RunningJob finishedJob = JobClient.runJob(job);
+      if (!noCommit) {
+        // one commit for all reducers, once the whole job is done
+        writers.open(job, "commit");
+        writers.commit();
+      }
+      LOG.info("Indexer: number of documents indexed, deleted, or skipped:");
+      for (Counter status : finishedJob.getCounters().getGroup("IndexerStatus")) {
+        String count = String.format(Locale.ROOT, "%6d", status.getValue());
+        LOG.info("Indexer: {}  {}", count, status.getName());
+      }
+      final long endTime = System.currentTimeMillis();
+      LOG.info("Indexer: finished at {}, elapsed: {}",
+          timestamp.format(endTime), TimingUtil.elapsedTime(startTime, endTime));
+    } finally {
+      FileSystem.get(job).delete(tmp, true);
+    }
+  }
+
+  /**
+   * Command-line driver: the first argument is the crawl db, the remaining
+   * ones are options and segment paths. Segments are only indexed if
+   * {@link SegmentChecker#isIndexable} accepts them.
+   *
+   * @param args
+   *          command-line arguments
+   * @return 0 on success, -1 on wrong usage or if indexing throws
+   * @throws Exception
+   *           if a segment path cannot be inspected
+   */
+  public int run(String[] args) throws Exception {
+    if (args.length < 2) {
+      System.err
+          .println("Usage: Indexer <crawldb> [-linkdb <linkdb>] [-params k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone] [-filter] [-normalize] [-addBinaryContent] [-base64]");
+      System.err.println(new IndexWriters(getConf()).describe());
+      return -1;
+    }
+
+    final Path crawlDb = new Path(args[0]);
+    final List<Path> segments = new ArrayList<Path>();
+    Path linkDb = null;
+    String params = null;
+
+    boolean noCommit = false;
+    boolean deleteGone = false;
+    boolean filter = false;
+    boolean normalize = false;
+    boolean addBinaryContent = false;
+    boolean base64 = false;
+
+    for (int i = 1; i < args.length; i++) {
+      final String arg = args[i];
+      if (arg.equals("-linkdb")) {
+        linkDb = new Path(args[++i]);
+      } else if (arg.equals("-dir")) {
+        // every indexable sub-directory of <segments> is a segment
+        final Path segmentsDir = new Path(args[++i]);
+        final FileSystem dirFs = segmentsDir.getFileSystem(getConf());
+        final FileStatus[] children = dirFs.listStatus(segmentsDir,
+            HadoopFSUtil.getPassDirectoriesFilter(dirFs));
+        for (Path candidate : HadoopFSUtil.getPaths(children)) {
+          if (SegmentChecker.isIndexable(candidate, dirFs)) {
+            segments.add(candidate);
+          }
+        }
+      } else if (arg.equals("-noCommit")) {
+        noCommit = true;
+      } else if (arg.equals("-deleteGone")) {
+        deleteGone = true;
+      } else if (arg.equals("-filter")) {
+        filter = true;
+      } else if (arg.equals("-normalize")) {
+        normalize = true;
+      } else if (arg.equals("-addBinaryContent")) {
+        addBinaryContent = true;
+      } else if (arg.equals("-base64")) {
+        base64 = true;
+      } else if (arg.equals("-params")) {
+        params = args[++i];
+      } else {
+        // anything else is taken as the path of a single segment
+        final Path segment = new Path(arg);
+        final FileSystem segmentFs = segment.getFileSystem(getConf());
+        if (SegmentChecker.isIndexable(segment, segmentFs)) {
+          segments.add(segment);
+        }
+      }
+    }
+
+    try {
+      index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
+          normalize, addBinaryContent, base64);
+    } catch (final Exception e) {
+      LOG.error("Indexer: {}", StringUtils.stringifyException(e));
+      return -1;
+    }
+    return 0;
+  }
+
+  /**
+   * Command-line entry point: runs the indexing job through
+   * {@link ToolRunner} and exits the JVM with the returned code.
+   */
+  public static void main(String[] args) throws Exception {
+    System.exit(ToolRunner.run(NutchConfiguration.create(), new IndexingJob(),
+        args));
+  }
+
+
+  //Used for REST API
+  @Override
+  public Map<String, Object> run(Map<String, Object> args, String crawlId) throws Exception {
+    boolean noCommit = false;
+    boolean deleteGone = false; 
+    boolean filter = false;
+    boolean normalize = false;
+    boolean isSegment = false;
+    String params= null;
+    Configuration conf = getConf();
+
+    Path crawlDb;
+    if(args.containsKey(Nutch.ARG_CRAWLDB)) {
+      Object crawldbPath = args.get(Nutch.ARG_CRAWLDB);
+      if(crawldbPath instanceof Path) {
+        crawlDb = (Path) crawldbPath;
+      }
+      else {
+        crawlDb = new Path(crawldbPath.toString());
+      }
+    }
+    else {
+      crawlDb = new Path(crawlId+"/crawldb");
+    }
+
+    Path linkdb = null;
+    List<Path> segments = new ArrayList<Path>();
+
+    if(args.containsKey(Nutch.ARG_LINKDB)){
+      if(args.containsKey(Nutch.ARG_LINKDB)) {
+        Object path = args.get(Nutch.ARG_LINKDB);
+        if(path instanceof Path) {
+          linkdb = (Path) path;
+        }
+        else {
+          linkdb = new Path(path.toString());
+        }
+      }
+      else {
+        linkdb = new Path(crawlId+"/linkdb");
+      }
+    }
+
+    if(args.containsKey(Nutch.ARG_SEGMENTDIR)){
+      isSegment = true;
+      Path segmentsDir;
+      Object segDir = args.get(Nutch.ARG_SEGMENTDIR);
+      if(segDir instanceof Path) {
+        segmentsDir = (Path) segDir;
+      }
+      else {
+        segmentsDir = new Path(segDir.toString());
+      }
+      FileSystem fs = segmentsDir.getFileSystem(getConf());
+      FileStatus[] fstats = fs.listStatus(segmentsDir,
+          HadoopFSUtil.getPassDirectoriesFilter(fs));
+      Path[] files = HadoopFSUtil.getPaths(fstats);
+      for (Path p : files) {
+        if (SegmentChecker.isIndexable(p,fs)) {
+          segments.add(p);
+        }
+      }     
+    }
+
+    if(args.containsKey(Nutch.ARG_SEGMENT)){
+      isSegment = true;
+      Object seg = args.get(Nutch.ARG_SEGMENT);
+      ArrayList<String> segmentList = new ArrayList<String>();
+      if(seg instanceof ArrayList) {
+        segmentList = (ArrayList<String>)seg;
+      }
+      for(String segment: segmentList) {
+        segments.add(new Path(segment));
+      }
+    }
+
+    if(!isSegment){
+      String segment_dir = crawlId+"/segments";
+      File segmentsDir = new File(segment_dir);
+      File[] segmentsList = segmentsDir.listFiles();  
+      Arrays.sort(segmentsList, new Comparator<File>(){
+        @Override
+        public int compare(File f1, File f2) {
+          if(f1.lastModified()>f2.lastModified())
+            return -1;
+          else
+            return 0;
+        }      
+      });
+      Path segment = new Path(segmentsList[0].getPath());
+      segments.add(segment);
+    }
+
+    if(args.containsKey("noCommit")){
+      noCommit = true;
+    }
+    if(args.containsKey("deleteGone")){
+      deleteGone = true;
+    }
+    if(args.containsKey("normalize")){
+      normalize = true;
+    }
+    if(args.containsKey("filter")){
+      filter = true;
+    }
+    if(args.containsKey("params")){
+      params = (String)args.get("params");
+    }
+    setConf(conf);
+    index(crawlDb, linkdb, segments, noCommit, deleteGone, params, filter,
+        normalize);
+    Map<String, Object> results = new HashMap<String, Object>();
+    results.put(Nutch.VAL_RESULT, 0);
+    return results;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/NutchDocument.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/NutchDocument.java b/nutch-core/src/main/java/org/apache/nutch/indexer/NutchDocument.java
new file mode 100644
index 0000000..efdde02
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/NutchDocument.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.VersionMismatchException;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableUtils;
+import org.apache.nutch.metadata.Metadata;
+
+/** A {@link NutchDocument} is the unit of indexing. */
+public class NutchDocument implements Writable,
+    Iterable<Entry<String, NutchField>> {
+
+  public static final byte VERSION = 2;
+
+  private Map<String, NutchField> fields;
+
+  private Metadata documentMeta;
+
+  private float weight;
+
+  public NutchDocument() {
+    fields = new HashMap<String, NutchField>();
+    documentMeta = new Metadata();
+    weight = 1.0f;
+  }
+
+  public void add(String name, Object value) {
+    NutchField field = fields.get(name);
+    if (field == null) {
+      field = new NutchField(value);
+      fields.put(name, field);
+    } else {
+      field.add(value);
+    }
+  }
+
+  public Object getFieldValue(String name) {
+    NutchField field = fields.get(name);
+    if (field == null) {
+      return null;
+    }
+    if (field.getValues().size() == 0) {
+      return null;
+    }
+    return field.getValues().get(0);
+  }
+
+  public NutchField getField(String name) {
+    return fields.get(name);
+  }
+
+  public NutchField removeField(String name) {
+    return fields.remove(name);
+  }
+
+  public Collection<String> getFieldNames() {
+    return fields.keySet();
+  }
+
+  /** Iterate over all fields. */
+  public Iterator<Entry<String, NutchField>> iterator() {
+    return fields.entrySet().iterator();
+  }
+
+  public float getWeight() {
+    return weight;
+  }
+
+  public void setWeight(float weight) {
+    this.weight = weight;
+  }
+
+  public Metadata getDocumentMeta() {
+    return documentMeta;
+  }
+
+  public void readFields(DataInput in) throws IOException {
+    fields.clear();
+    byte version = in.readByte();
+    if (version != VERSION) {
+      throw new VersionMismatchException(VERSION, version);
+    }
+    int size = WritableUtils.readVInt(in);
+    for (int i = 0; i < size; i++) {
+      String name = Text.readString(in);
+      NutchField field = new NutchField();
+      field.readFields(in);
+      fields.put(name, field);
+    }
+    weight = in.readFloat();
+    documentMeta.readFields(in);
+  }
+
+  public void write(DataOutput out) throws IOException {
+    out.writeByte(VERSION);
+    WritableUtils.writeVInt(out, fields.size());
+    for (Map.Entry<String, NutchField> entry : fields.entrySet()) {
+      Text.writeString(out, entry.getKey());
+      NutchField field = entry.getValue();
+      field.write(out);
+    }
+    out.writeFloat(weight);
+    documentMeta.write(out);
+  }
+
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    sb.append("doc {\n");
+    for (Map.Entry<String, NutchField> entry : fields.entrySet()) {
+      sb.append("\t");
+      sb.append(entry.getKey());
+      sb.append(":\t");
+      sb.append(entry.getValue());
+      sb.append("\n");
+    }
+    sb.append("}\n");
+    return sb.toString();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/NutchField.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/NutchField.java b/nutch-core/src/main/java/org/apache/nutch/indexer/NutchField.java
new file mode 100644
index 0000000..33911e1
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/NutchField.java
@@ -0,0 +1,137 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Date;
+import java.util.List;
+
+import org.apache.hadoop.io.*;
+
+/**
+ * This class represents a multi-valued field with a weight. Values are
+ * arbitrary objects.
+ */
+public class NutchField implements Writable, Cloneable {
+
+  // Type tags written in front of every serialized value. They equal the
+  // class names of the supported types, so the wire format is unchanged.
+  private static final String TYPE_STRING = "java.lang.String";
+  private static final String TYPE_BOOLEAN = "java.lang.Boolean";
+  private static final String TYPE_INTEGER = "java.lang.Integer";
+  private static final String TYPE_FLOAT = "java.lang.Float";
+  private static final String TYPE_LONG = "java.lang.Long";
+  private static final String TYPE_DATE = "java.util.Date";
+
+  private float weight;
+  private List<Object> values = new ArrayList<Object>();
+
+  /**
+   * Creates a field without values. The weight is not initialised here and
+   * therefore keeps the Java default of {@code 0.0f}.
+   */
+  public NutchField() {
+  }
+
+  /**
+   * Creates a field with weight {@code 1.0f}.
+   *
+   * @param value
+   *          initial value; a {@link Collection} contributes all its elements
+   */
+  public NutchField(Object value) {
+    this(value, 1.0f);
+  }
+
+  /**
+   * Creates a field with the given weight.
+   *
+   * @param value
+   *          initial value; a {@link Collection} contributes all its elements,
+   *          anything else is stored as a single value
+   * @param weight
+   *          weight of the field
+   */
+  public NutchField(Object value, float weight) {
+    this.weight = weight;
+    if (value instanceof Collection) {
+      values.addAll((Collection<?>) value);
+    } else {
+      values.add(value);
+    }
+  }
+
+  /**
+   * Appends a value as-is. Unlike the constructors, a {@link Collection} is
+   * stored as one value and not flattened.
+   *
+   * @param value
+   *          value to append
+   */
+  public void add(Object value) {
+    values.add(value);
+  }
+
+  /** @return the weight of this field */
+  public float getWeight() {
+    return weight;
+  }
+
+  /**
+   * @param weight
+   *          new weight of this field
+   */
+  public void setWeight(float weight) {
+    this.weight = weight;
+  }
+
+  /** @return the internal, mutable list of values (not a copy) */
+  public List<Object> getValues() {
+    return values;
+  }
+
+  /** Sets the weight back to {@code 1.0f} and removes all values. */
+  public void reset() {
+    weight = 1.0f;
+    values.clear();
+  }
+
+  /**
+   * Returns a copy of this field with the same weight and its own value list,
+   * so adding to or clearing one field does not affect the other. The value
+   * objects themselves are shared.
+   *
+   * @return the copied {@code NutchField}
+   * @throws CloneNotSupportedException
+   *           declared for compatibility with existing callers
+   */
+  @Override
+  public Object clone() throws CloneNotSupportedException {
+    // The class implements Cloneable; without it super.clone() always throws.
+    NutchField result = (NutchField) super.clone();
+    // super.clone() copied the weight but only the list reference.
+    result.values = new ArrayList<Object>(values);
+
+    return result;
+  }
+
+  /**
+   * Replaces weight and values with the content read from the stream. Each
+   * value is preceded by a type tag. An entry with an unknown tag carries no
+   * payload (see {@link #write(DataOutput)}) and is dropped.
+   *
+   * @param in
+   *          source to read the serialized field from
+   * @throws IOException
+   *           if reading from the stream fails
+   */
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    weight = in.readFloat();
+    int count = in.readInt();
+    values = new ArrayList<Object>();
+    for (int i = 0; i < count; i++) {
+      String type = Text.readString(in);
+
+      if (type.equals(TYPE_STRING)) {
+        values.add(Text.readString(in));
+      } else if (type.equals(TYPE_BOOLEAN)) {
+        values.add(in.readBoolean());
+      } else if (type.equals(TYPE_INTEGER)) {
+        values.add(in.readInt());
+      } else if (type.equals(TYPE_FLOAT)) {
+        values.add(in.readFloat());
+      } else if (type.equals(TYPE_LONG)) {
+        values.add(in.readLong());
+      } else if (type.equals(TYPE_DATE)) {
+        values.add(new Date(in.readLong()));
+      }
+    }
+  }
+
+  /**
+   * Writes the weight, the number of values and then every value as a type
+   * tag followed by its payload. Supported types are String, Boolean,
+   * Integer, Long, Float and Date. A value of any other type is written as
+   * its class name only, without payload.
+   *
+   * @param out
+   *          sink receiving the serialized field
+   * @throws IOException
+   *           if writing to the stream fails
+   */
+  @Override
+  public void write(DataOutput out) throws IOException {
+    out.writeFloat(weight);
+    out.writeInt(values.size());
+    for (Object value : values) {
+      // The tag is chosen together with the payload so both always agree.
+      if (value instanceof Boolean) {
+        Text.writeString(out, TYPE_BOOLEAN);
+        out.writeBoolean((Boolean) value);
+      } else if (value instanceof Integer) {
+        Text.writeString(out, TYPE_INTEGER);
+        out.writeInt((Integer) value);
+      } else if (value instanceof Long) {
+        Text.writeString(out, TYPE_LONG);
+        out.writeLong((Long) value);
+      } else if (value instanceof Float) {
+        Text.writeString(out, TYPE_FLOAT);
+        out.writeFloat((Float) value);
+      } else if (value instanceof String) {
+        Text.writeString(out, TYPE_STRING);
+        Text.writeString(out, (String) value);
+      } else if (value instanceof Date) {
+        // Subclasses such as java.sql.Timestamp are tagged as java.util.Date.
+        // Tagging them with their own class name would leave the 8 payload
+        // bytes unread by readFields() and corrupt the rest of the stream.
+        Text.writeString(out, TYPE_DATE);
+        out.writeLong(((Date) value).getTime());
+      } else {
+        // Unsupported type: tag only, no payload, as before.
+        Text.writeString(out, value.getClass().getName());
+      }
+    }
+  }
+
+  /** @return the string form of the value list */
+  @Override
+  public String toString() {
+    return values.toString();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/NutchIndexAction.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/NutchIndexAction.java b/nutch-core/src/main/java/org/apache/nutch/indexer/NutchIndexAction.java
new file mode 100644
index 0000000..b2517c3
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/NutchIndexAction.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+
+import org.apache.nutch.indexer.NutchDocument;
+
+/**
+ * A {@link NutchIndexAction} is the new unit of indexing holding the document
+ * and action information.
+ */
+public class NutchIndexAction implements Writable {
+
+  /** Action code {@code 0}. */
+  public static final byte ADD = 0;
+  /** Action code {@code 1}. */
+  public static final byte DELETE = 1;
+  /** Action code {@code 2}. */
+  public static final byte UPDATE = 2;
+
+  /** Document this action refers to; {@code null} until set or read. */
+  public NutchDocument doc = null;
+  /** Action code, {@link #ADD} unless set otherwise. */
+  public byte action = ADD;
+
+  /** Creates an action without document, to be filled by {@link #readFields}. */
+  protected NutchIndexAction() {
+  }
+
+  /**
+   * Creates an action for the given document.
+   *
+   * @param doc
+   *          document the action refers to
+   * @param action
+   *          action code
+   */
+  public NutchIndexAction(NutchDocument doc, byte action) {
+    this.doc = doc;
+    this.action = action;
+  }
+
+  /**
+   * Reads the action byte followed by a freshly created document.
+   *
+   * @param in
+   *          source to read from
+   * @throws IOException
+   *           if reading from the stream fails
+   */
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    this.action = in.readByte();
+    this.doc = new NutchDocument();
+    this.doc.readFields(in);
+  }
+
+  /**
+   * Writes the action byte followed by the document.
+   *
+   * @param out
+   *          sink to write to
+   * @throws IOException
+   *           if writing to the stream fails
+   */
+  @Override
+  public void write(DataOutput out) throws IOException {
+    // writeByte() emits the same single byte and mirrors readByte() above.
+    out.writeByte(this.action);
+    this.doc.write(out);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/package.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/package.html b/nutch-core/src/main/java/org/apache/nutch/indexer/package.html
new file mode 100644
index 0000000..825eaae
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/indexer/package.html
@@ -0,0 +1,10 @@
+<html>
+<body>
+Index content, configure and run indexing and cleaning jobs to 
+add, update, and delete documents from an index. Two tasks are
+delegated to plugins:
+<ul>
+<li>indexing filters fill the index fields of each document</li>
+<li>index writer plugins send documents to index back-ends (Solr, etc.).</li></ul>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/CreativeCommons.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/CreativeCommons.java b/nutch-core/src/main/java/org/apache/nutch/metadata/CreativeCommons.java
new file mode 100644
index 0000000..f9c425b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/CreativeCommons.java
@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+/**
+ * A collection of Creative Commons property names.
+ * 
+ * @see <a href="http://www.creativecommons.org/">creativecommons.org</a>
+ * 
+ * @author Chris Mattmann
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public interface CreativeCommons {
+
+  // Interface fields are implicitly public, static and final.
+
+  /** Property name {@code "License-Url"}. */
+  String LICENSE_URL = "License-Url";
+
+  /** Property name {@code "License-Location"}. */
+  String LICENSE_LOCATION = "License-Location";
+
+  /** Property name {@code "Work-Type"}. */
+  String WORK_TYPE = "Work-Type";
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/DublinCore.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/DublinCore.java b/nutch-core/src/main/java/org/apache/nutch/metadata/DublinCore.java
new file mode 100644
index 0000000..9724d80
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/DublinCore.java
@@ -0,0 +1,161 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+/**
+ * A collection of Dublin Core metadata names.
+ * 
+ * @see <a href="http://dublincore.org">dublincore.org</a>
+ * 
+ * @author Chris Mattmann
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public interface DublinCore {
+
+  // Interface fields are implicitly public, static and final.
+
+  /**
+   * Media type or dimensions of the resource. Format may be used to work out
+   * which software, hardware or other equipment is needed to display or
+   * operate the resource; size and duration are examples of dimensions. Best
+   * practice is to pick the value from a controlled vocabulary, for example
+   * the list of Internet Media Types [MIME] defining computer media formats.
+   */
+  String FORMAT = "format";
+
+  /**
+   * Identification of the resource. Best practice is a string or number that
+   * conforms to a formal identification system such as the Uniform Resource
+   * Identifier (URI, including the Uniform Resource Locator (URL)), the
+   * Digital Object Identifier (DOI) or the International Standard Book Number
+   * (ISBN).
+   */
+  String IDENTIFIER = "identifier";
+
+  /**
+   * Date on which the resource was changed.
+   */
+  String MODIFIED = "modified";
+
+  /**
+   * An entity that contributed to the content of the resource, for example a
+   * person, an organisation or a service. Typically the name of the
+   * Contributor is used to indicate the entity.
+   */
+  String CONTRIBUTOR = "contributor";
+
+  /**
+   * Extent or scope of the content of the resource. Coverage typically covers
+   * spatial location (a place name or geographic coordinates), temporal
+   * period (a period label, date, or date range) or jurisdiction (such as a
+   * named administrative entity). Best practice is to pick the value from a
+   * controlled vocabulary (for example, the Thesaurus of Geographic Names
+   * [TGN]) and, where appropriate, to prefer named places or time periods
+   * over numeric identifiers such as sets of coordinates or date ranges.
+   */
+  String COVERAGE = "coverage";
+
+  /**
+   * The entity primarily responsible for making the content of the resource,
+   * for example a person, an organisation or a service. Typically the name of
+   * the Creator is used to indicate the entity.
+   */
+  String CREATOR = "creator";
+
+  /**
+   * A date tied to an event in the life cycle of the resource, typically its
+   * creation or availability. Best practice for the encoding is the profile
+   * of ISO 8601 [W3CDTF], which follows the YYYY-MM-DD format.
+   */
+  String DATE = "date";
+
+  /**
+   * An account of the content of the resource. This may include, but is not
+   * limited to, an abstract, a table of contents, a reference to a graphical
+   * representation of the content or a free-text account of it.
+   */
+  String DESCRIPTION = "description";
+
+  /**
+   * A language of the intellectual content of the resource. Best practice is
+   * RFC 3066 [RFC3066] which, together with ISO 639 [ISO639], defines two- and
+   * three-letter primary language tags with optional subtags, for example
+   * "en" or "eng" for English, "akk" for Akkadian, and "en-GB" for English
+   * used in the United Kingdom.
+   */
+  String LANGUAGE = "language";
+
+  /**
+   * An entity responsible for making the resource available, for example a
+   * person, an organisation or a service. Typically the name of the Publisher
+   * is used to indicate the entity.
+   */
+  String PUBLISHER = "publisher";
+
+  /**
+   * A reference to a related resource. Best practice is a string or number
+   * that conforms to a formal identification system.
+   */
+  String RELATION = "relation";
+
+  /**
+   * Information about rights held in and over the resource. Typically this is
+   * a rights management statement for the resource or a reference to a
+   * service providing one. It often covers Intellectual Property Rights
+   * (IPR), Copyright, and various Property Rights. When the Rights element is
+   * absent, nothing can be assumed about the status of these and other rights
+   * with respect to the resource.
+   */
+  String RIGHTS = "rights";
+
+  /**
+   * A reference to a resource from which the present resource is derived, in
+   * whole or in part. Best practice is a string or number that conforms to a
+   * formal identification system.
+   */
+  String SOURCE = "source";
+
+  /**
+   * The topic of the content of the resource, typically expressed as
+   * keywords, key phrases or classification codes. Best practice is to pick
+   * the value from a controlled vocabulary or formal classification scheme.
+   */
+  String SUBJECT = "subject";
+
+  /**
+   * A name given to the resource, typically the name by which it is formally
+   * known.
+   */
+  String TITLE = "title";
+
+  /**
+   * The nature or genre of the content of the resource. Type covers terms
+   * describing general categories, functions, genres, or aggregation levels
+   * for content. Best practice is to pick the value from a controlled
+   * vocabulary (for example, the DCMI Type Vocabulary [DCMITYPE]). The
+   * physical or digital manifestation of the resource is described with the
+   * Format element instead.
+   */
+  String TYPE = "type";
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/Feed.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/Feed.java b/nutch-core/src/main/java/org/apache/nutch/metadata/Feed.java
new file mode 100644
index 0000000..2697da6
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/Feed.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.metadata;
+
+/**
+ * A collection of Feed property names extracted by the ROME library.
+ * 
+ * 
+ * @author mattmann
+ * @author dogacan
+ */
+public interface Feed {
+
+  // Interface fields are implicitly public, static and final.
+
+  /** Property name {@code "author"}. */
+  String FEED_AUTHOR = "author";
+
+  /** Property name {@code "tag"}. */
+  String FEED_TAGS = "tag";
+
+  /** Property name {@code "published"}. */
+  String FEED_PUBLISHED = "published";
+
+  /** Property name {@code "updated"}. */
+  String FEED_UPDATED = "updated";
+
+  /** Property name {@code "feed"}. */
+  String FEED = "feed";
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/HttpHeaders.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/HttpHeaders.java b/nutch-core/src/main/java/org/apache/nutch/metadata/HttpHeaders.java
new file mode 100644
index 0000000..78b8797
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/HttpHeaders.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+import org.apache.hadoop.io.Text;
+
+/**
+ * A collection of HTTP header names.
+ * 
+ * @see <a href="http://rfc-ref.org/RFC-TEXTS/2616/">Hypertext Transfer Protocol
+ *      -- HTTP/1.1 (RFC 2616)</a>
+ */
+public interface HttpHeaders {
+
+  // Interface fields are implicitly public, static and final.
+
+  /** Header name {@code Transfer-Encoding}. */
+  String TRANSFER_ENCODING = "Transfer-Encoding";
+
+  /** Header name {@code Content-Encoding}. */
+  String CONTENT_ENCODING = "Content-Encoding";
+
+  /** Header name {@code Content-Language}. */
+  String CONTENT_LANGUAGE = "Content-Language";
+
+  /** Header name {@code Content-Length}. */
+  String CONTENT_LENGTH = "Content-Length";
+
+  /** Header name {@code Content-Location}. */
+  String CONTENT_LOCATION = "Content-Location";
+
+  /** Header name {@code Content-Disposition}. */
+  String CONTENT_DISPOSITION = "Content-Disposition";
+
+  /** Header name {@code Content-MD5}. */
+  String CONTENT_MD5 = "Content-MD5";
+
+  /** Header name {@code Content-Type}. */
+  String CONTENT_TYPE = "Content-Type";
+
+  /** {@link #CONTENT_TYPE} wrapped in a {@link Text}. */
+  Text WRITABLE_CONTENT_TYPE = new Text(CONTENT_TYPE);
+
+  /** Header name {@code Last-Modified}. */
+  String LAST_MODIFIED = "Last-Modified";
+
+  /** Header name {@code Location}. */
+  String LOCATION = "Location";
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/MetaWrapper.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/MetaWrapper.java b/nutch-core/src/main/java/org/apache/nutch/metadata/MetaWrapper.java
new file mode 100644
index 0000000..a43fa9d
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/MetaWrapper.java
@@ -0,0 +1,120 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.metadata;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Writable;
+import org.apache.nutch.crawl.NutchWritable;
+
+/**
+ * This is a simple decorator that adds metadata to any Writable-s that can be
+ * serialized by <tt>NutchWritable</tt>. This is useful when data needs to be
+ * temporarily enriched during processing, but this temporary metadata doesn't
+ * need to be permanently stored after the job is done.
+ * 
+ * @author Andrzej Bialecki
+ */
+public class MetaWrapper extends NutchWritable {
+  private Metadata metadata;
+
+  /**
+   * Creates a wrapper through the superclass no-argument constructor, with
+   * empty metadata.
+   */
+  public MetaWrapper() {
+    this.metadata = new Metadata();
+  }
+
+  /**
+   * Wraps an instance with empty metadata.
+   *
+   * @param instance
+   *          the Writable handed to the superclass constructor
+   * @param conf
+   *          configuration passed to {@code setConf}
+   */
+  public MetaWrapper(Writable instance, Configuration conf) {
+    super(instance);
+    this.metadata = new Metadata();
+    setConf(conf);
+  }
+
+  /**
+   * Wraps an instance together with existing metadata.
+   *
+   * @param metadata
+   *          metadata to attach; {@code null} is replaced by an empty
+   *          {@link Metadata}
+   * @param instance
+   *          the Writable handed to the superclass constructor
+   * @param conf
+   *          configuration passed to {@code setConf}
+   */
+  public MetaWrapper(Metadata metadata, Writable instance, Configuration conf) {
+    super(instance);
+    this.metadata = (metadata != null) ? metadata : new Metadata();
+    setConf(conf);
+  }
+
+  /**
+   * Returns the attached metadata.
+   *
+   * @return the metadata object held by this wrapper (not a copy)
+   */
+  public Metadata getMetadata() {
+    return this.metadata;
+  }
+
+  /**
+   * Appends a value to a metadata name. Delegates to
+   * {@link Metadata#add(String, String)}.
+   * 
+   * @param name
+   *          metadata name
+   * @param value
+   *          metadata value
+   */
+  public void addMeta(String name, String value) {
+    this.metadata.add(name, value);
+  }
+
+  /**
+   * Sets the value of a metadata name. Delegates to
+   * {@link Metadata#set(String, String)}.
+   * 
+   * @param name
+   *          metadata name
+   * @param value
+   *          metadata value
+   */
+  public void setMeta(String name, String value) {
+    this.metadata.set(name, value);
+  }
+
+  /**
+   * Looks up a metadata value. Delegates to {@link Metadata#get(String)}.
+   * 
+   * @param name
+   *          metadata name
+   * @return metadata value
+   */
+  public String getMeta(String name) {
+    return this.metadata.get(name);
+  }
+
+  /**
+   * Looks up all values of a metadata name. Delegates to
+   * {@link Metadata#getValues(String)}.
+   * 
+   * @param name
+   *          metadata name
+   * @return multiple values
+   */
+  public String[] getMetaValues(String name) {
+    return this.metadata.getValues(name);
+  }
+
+  /**
+   * Reads the superclass state followed by a fresh {@link Metadata}.
+   *
+   * @param in
+   *          source to read from
+   * @throws IOException
+   *           if reading from the stream fails
+   */
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    super.readFields(in);
+    this.metadata = new Metadata();
+    this.metadata.readFields(in);
+  }
+
+  /**
+   * Writes the superclass state followed by the metadata.
+   *
+   * @param out
+   *          sink to write to
+   * @throws IOException
+   *           if writing to the stream fails
+   */
+  @Override
+  public void write(DataOutput out) throws IOException {
+    super.write(out);
+    this.metadata.write(out);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/Metadata.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/Metadata.java b/nutch-core/src/main/java/org/apache/nutch/metadata/Metadata.java
new file mode 100644
index 0000000..8a57ee3
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/Metadata.java
@@ -0,0 +1,280 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Properties;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * A multi-valued metadata container.
+ */
+public class Metadata implements Writable, CreativeCommons, DublinCore,
+    HttpHeaders, Nutch, Feed {
+
+  /**
+   * A map of all metadata attributes.
+   */
+  private final Map<String, String[]> metadata = new HashMap<String, String[]>();
+
+  /**
+   * Constructs a new, empty metadata.
+   */
+  public Metadata() {
+  }
+
+  /**
+   * Returns true if named value is multivalued.
+   * 
+   * @param name
+   *          name of metadata
+   * @return true if named value is multivalued, false if single value or null
+   */
+  public boolean isMultiValued(final String name) {
+    String[] values = metadata.get(name);
+    return values != null && values.length > 1;
+  }
+
+  /**
+   * Returns an array of the names contained in the metadata.
+   * 
+   * @return Metadata names
+   */
+  public String[] names() {
+    return metadata.keySet().toArray(new String[metadata.size()]);
+  }
+
+  /**
+   * Get the value associated to a metadata name. If many values are associated
+   * to the specified name, then the first one is returned.
+   * 
+   * @param name
+   *          of the metadata.
+   * @return the value associated to the specified metadata name, or
+   *         <code>null</code> if the name is unknown.
+   */
+  public String get(final String name) {
+    String[] values = metadata.get(name);
+    if (values == null) {
+      return null;
+    } else {
+      return values[0];
+    }
+  }
+
+  /**
+   * Get the values associated to a metadata name.
+   * 
+   * @param name
+   *          of the metadata.
+   * @return the values associated to a metadata name, an empty array if the
+   *         name is unknown.
+   */
+  public String[] getValues(final String name) {
+    return _getValues(name);
+  }
+
+  /** Like {@link #getValues(String)}, but cannot be overridden. */
+  private String[] _getValues(final String name) {
+    String[] values = metadata.get(name);
+    if (values == null) {
+      values = new String[0];
+    }
+    return values;
+  }
+
+  /**
+   * Add a metadata name/value mapping. Add the specified value to the list of
+   * values associated to the specified metadata name.
+   * 
+   * @param name
+   *          the metadata name.
+   * @param value
+   *          the metadata value.
+   */
+  public void add(final String name, final String value) {
+    String[] values = metadata.get(name);
+    if (values == null) {
+      set(name, value);
+    } else {
+      String[] newValues = new String[values.length + 1];
+      System.arraycopy(values, 0, newValues, 0, values.length);
+      newValues[newValues.length - 1] = value;
+      metadata.put(name, newValues);
+    }
+  }
+
+  /**
+   * Add all name/value mappings (merge two metadata mappings). If a name
+   * already exists in current metadata the values are added to existing values.
+   *
+   * @param metadata
+   *          other Metadata to be merged
+   */
+  public void addAll(Metadata metadata) {
+    for (String name : metadata.names()) {
+      String[] addValues = metadata.getValues(name);
+      if (addValues == null) {
+        continue;
+      }
+      String[] oldValues = this.metadata.get(name);
+      if (oldValues == null) {
+        this.metadata.put(name, addValues);
+      } else {
+        String[] newValues = new String[oldValues.length + addValues.length];
+        System.arraycopy(oldValues, 0, newValues, 0, oldValues.length);
+        System.arraycopy(addValues, 0, newValues, oldValues.length,
+            addValues.length);
+        this.metadata.put(name, newValues);
+      }
+    }
+  }
+
+  /**
+   * Copy All key-value pairs from properties.
+   * 
+   * @param properties
+   *          properties to copy from
+   */
+  public void setAll(Properties properties) {
+    Enumeration<?> names = properties.propertyNames();
+    while (names.hasMoreElements()) {
+      String name = (String) names.nextElement();
+      metadata.put(name, new String[] { properties.getProperty(name) });
+    }
+  }
+
+  /**
+   * Set metadata name/value. Associate the specified value to the specified
+   * metadata name. If some previous values were associated to this name, they
+   * are removed.
+   * 
+   * @param name
+   *          the metadata name.
+   * @param value
+   *          the metadata value.
+   */
+  public void set(String name, String value) {
+    metadata.put(name, new String[] { value });
+  }
+
+  /**
+   * Remove a metadata and all its associated values.
+   * 
+   * @param name
+   *          metadata name to remove
+   */
+  public void remove(String name) {
+    metadata.remove(name);
+  }
+
+  /**
+   * Returns the number of metadata names in this metadata.
+   * 
+   * @return number of metadata names
+   */
+  public int size() {
+    return metadata.size();
+  }
+
+  /** Remove all mappings from metadata. */
+  public void clear() {
+    metadata.clear();
+  }
+
+  /**
+   * Two metadata are equal if they hold the same number of names and every
+   * name of this metadata maps to the same values, in the same order, in the
+   * other one. <code>null</code> values are compared safely.
+   *
+   * @param o
+   *          object to compare with
+   * @return true if <code>o</code> is a Metadata with equal content
+   */
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (!(o instanceof Metadata)) {
+      return false;
+    }
+    Metadata other = (Metadata) o;
+    if (other.size() != size()) {
+      return false;
+    }
+    for (String name : names()) {
+      // Arrays.equals checks length and compares elements null-safely; values
+      // may be null, e.g. after set(name, null).
+      if (!java.util.Arrays.equals(other._getValues(name), _getValues(name))) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Hash code consistent with {@link #equals(Object)}: it is built from every
+   * name and its values and does not depend on the iteration order of the
+   * names.
+   *
+   * @return hash code of the metadata content
+   */
+  @Override
+  public int hashCode() {
+    int hash = 0;
+    for (Map.Entry<String, String[]> entry : metadata.entrySet()) {
+      String name = entry.getKey();
+      int nameHash = (name == null) ? 0 : name.hashCode();
+      hash += nameHash ^ java.util.Arrays.hashCode(entry.getValue());
+    }
+    return hash;
+  }
+
+  /**
+   * Lists all mappings as <code>name=value</code> pairs, each followed by a
+   * space.
+   *
+   * @return text form of the metadata
+   */
+  @Override
+  public String toString() {
+    StringBuilder buf = new StringBuilder();
+    for (String name : names()) {
+      for (String value : _getValues(name)) {
+        buf.append(name).append("=").append(value).append(" ");
+      }
+    }
+    return buf.toString();
+  }
+
+  /**
+   * Writes the number of names, then for every name the name itself, the
+   * number of its non-null values and these values. <code>null</code> values
+   * are skipped.
+   *
+   * @param out
+   *          sink to write to
+   * @throws IOException
+   *           if writing to the stream fails
+   */
+  @Override
+  public final void write(DataOutput out) throws IOException {
+    out.writeInt(size());
+    for (String name : names()) {
+      Text.writeString(out, name);
+      String[] values = _getValues(name);
+      int cnt = 0;
+      for (String value : values) {
+        if (value != null) {
+          cnt++;
+        }
+      }
+      out.writeInt(cnt);
+      for (String value : values) {
+        if (value != null) {
+          Text.writeString(out, value);
+        }
+      }
+    }
+  }
+
+  /**
+   * Reads mappings in the format produced by {@link #write(DataOutput)} and
+   * adds them with {@link #add(String, String)}. Existing mappings are kept,
+   * so a caller that reuses an instance has to call {@link #clear()} first.
+   *
+   * @param in
+   *          source to read from
+   * @throws IOException
+   *           if reading from the stream fails
+   */
+  @Override
+  public final void readFields(DataInput in) throws IOException {
+    int keySize = in.readInt();
+    for (int i = 0; i < keySize; i++) {
+      String key = Text.readString(in);
+      int valueSize = in.readInt();
+      for (int j = 0; j < valueSize; j++) {
+        add(key, Text.readString(in));
+      }
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/Nutch.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/Nutch.java b/nutch-core/src/main/java/org/apache/nutch/metadata/Nutch.java
new file mode 100644
index 0000000..de80399
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/Nutch.java
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+import org.apache.hadoop.io.Text;
+
+/**
+ * A collection of Nutch internal metadata constants.
+ * <p>
+ * All members are interface fields and therefore implicitly
+ * <code>public static final</code>; the modifiers are not repeated on each
+ * declaration.
+ *
+ * @author Chris Mattmann
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public interface Nutch {
+
+  String ORIGINAL_CHAR_ENCODING = "OriginalCharEncoding";
+
+  String CHAR_ENCODING_FOR_CONVERSION = "CharEncodingForConversion";
+
+  String SIGNATURE_KEY = "nutch.content.digest";
+
+  String SEGMENT_NAME_KEY = "nutch.segment.name";
+
+  String SCORE_KEY = "nutch.crawl.score";
+
+  String GENERATE_TIME_KEY = "_ngt_";
+
+  /** {@link #GENERATE_TIME_KEY} wrapped in a Hadoop {@link Text}. */
+  Text WRITABLE_GENERATE_TIME_KEY = new Text(GENERATE_TIME_KEY);
+
+  Text PROTOCOL_STATUS_CODE_KEY = new Text("nutch.protocol.code");
+
+  String PROTO_STATUS_KEY = "_pst_";
+
+  /** {@link #PROTO_STATUS_KEY} wrapped in a Hadoop {@link Text}. */
+  Text WRITABLE_PROTO_STATUS_KEY = new Text(PROTO_STATUS_KEY);
+
+  String FETCH_TIME_KEY = "_ftk_";
+
+  String FETCH_STATUS_KEY = "_fst_";
+
+  /**
+   * Sites may request that search engines don't provide access to cached
+   * documents.
+   */
+  String CACHING_FORBIDDEN_KEY = "caching.forbidden";
+
+  /** Show both original forbidden content and summaries (default). */
+  String CACHING_FORBIDDEN_NONE = "none";
+
+  /** Don't show either original forbidden content or summaries. */
+  String CACHING_FORBIDDEN_ALL = "all";
+
+  /** Don't show original forbidden content, but show summaries. */
+  String CACHING_FORBIDDEN_CONTENT = "content";
+
+  String REPR_URL_KEY = "_repr_";
+
+  /** {@link #REPR_URL_KEY} wrapped in a Hadoop {@link Text}. */
+  Text WRITABLE_REPR_URL_KEY = new Text(REPR_URL_KEY);
+
+  /** Used by AdaptiveFetchSchedule to maintain custom fetch interval. */
+  String FIXED_INTERVAL_KEY = "fixedInterval";
+
+  /** {@link #FIXED_INTERVAL_KEY} wrapped in a Hadoop {@link Text}. */
+  Text WRITABLE_FIXED_INTERVAL_KEY = new Text(FIXED_INTERVAL_KEY);
+
+  /** For progress of job. Used by the Nutch REST service. */
+  String STAT_PROGRESS = "progress";
+
+  /** Used by the Nutch REST service. */
+  String CRAWL_ID_KEY = "storage.crawl.id";
+
+  /** Argument key to specify location of the seed url dir for the REST endpoints. */
+  String ARG_SEEDDIR = "url_dir";
+
+  /** Argument key to specify the location of crawldb for the REST endpoints. */
+  String ARG_CRAWLDB = "crawldb";
+
+  /** Argument key to specify the location of linkdb for the REST endpoints. */
+  String ARG_LINKDB = "linkdb";
+
+  /** Name of the key used in the Result Map sent back by the REST endpoint. */
+  String VAL_RESULT = "result";
+
+  /**
+   * Argument key to specify the location of a directory of segments for the
+   * REST endpoints. Similar to the -dir command in the bin/nutch script.
+   */
+  String ARG_SEGMENTDIR = "segment_dir";
+
+  /** Argument key to specify the location of individual segment for the REST endpoints. */
+  String ARG_SEGMENT = "segment";
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/SpellCheckedMetadata.java b/nutch-core/src/main/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
new file mode 100644
index 0000000..164ca1d
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.lang.StringUtils;
+
+/**
+ * A decorator to Metadata that adds spellchecking capabilities to property
+ * names. Currently used spelling vocabulary contains just the httpheaders from
+ * {@link HttpHeaders} class.
+ * 
+ */
+public class SpellCheckedMetadata extends Metadata {
+
+  /**
+   * Treshold divider.
+   * 
+   * <code>threshold = searched.length() / TRESHOLD_DIVIDER;</code>
+   */
+  private static final int TRESHOLD_DIVIDER = 3;
+
+  /**
+   * Normalized name to name mapping.
+   */
+  private final static Map<String, String> NAMES_IDX = new HashMap<String, String>();
+
+  /**
+   * Array holding map keys.
+   */
+  private static String[] normalized = null;
+
+  static {
+
+    // Uses following array to fill the metanames index and the
+    // metanames list.
+    Class<?>[] spellthese = { HttpHeaders.class };
+
+    for (Class<?> spellCheckedNames : spellthese) {
+      for (Field field : spellCheckedNames.getFields()) {
+        int mods = field.getModifiers();
+        if (Modifier.isFinal(mods) && Modifier.isPublic(mods)
+            && Modifier.isStatic(mods) && field.getType().equals(String.class)) {
+          try {
+            String val = (String) field.get(null);
+            NAMES_IDX.put(normalize(val), val);
+          } catch (Exception e) {
+            // Simply ignore...
+          }
+        }
+      }
+    }
+    normalized = NAMES_IDX.keySet().toArray(new String[NAMES_IDX.size()]);
+  }
+
+  /**
+   * Normalizes String.
+   * 
+   * @param str
+   *          the string to normalize
+   * @return normalized String
+   */
+  private static String normalize(final String str) {
+    char c;
+    StringBuffer buf = new StringBuffer();
+    for (int i = 0; i < str.length(); i++) {
+      c = str.charAt(i);
+      if (Character.isLetter(c)) {
+        buf.append(Character.toLowerCase(c));
+      }
+    }
+    return buf.toString();
+  }
+
+  /**
+   * Get the normalized name of metadata attribute name. This method tries to
+   * find a well-known metadata name (one of the metadata names defined in this
+   * class) that matches the specified name. The matching is error tolerent. For
+   * instance,
+   * <ul>
+   * <li>content-type gives Content-Type</li>
+   * <li>CoNtEntType gives Content-Type</li>
+   * <li>ConTnTtYpe gives Content-Type</li>
+   * </ul>
+   * If no matching with a well-known metadata name is found, then the original
+   * name is returned.
+   * 
+   * @param name
+   *          Name to normalize
+   * @return normalized name
+   */
+  public static String getNormalizedName(final String name) {
+    String searched = normalize(name);
+    String value = NAMES_IDX.get(searched);
+
+    if ((value == null) && (normalized != null)) {
+      int threshold = searched.length() / TRESHOLD_DIVIDER;
+      for (int i = 0; i < normalized.length && value == null; i++) {
+        if (StringUtils.getLevenshteinDistance(searched, normalized[i]) < threshold) {
+          value = NAMES_IDX.get(normalized[i]);
+        }
+      }
+    }
+    return (value != null) ? value : name;
+  }
+
+  @Override
+  public void remove(final String name) {
+    super.remove(getNormalizedName(name));
+  }
+
+  @Override
+  public void add(final String name, final String value) {
+    super.add(getNormalizedName(name), value);
+  }
+
+  @Override
+  public String[] getValues(final String name) {
+    return super.getValues(getNormalizedName(name));
+  }
+
+  @Override
+  public String get(final String name) {
+    return super.get(getNormalizedName(name));
+  }
+
+  @Override
+  public void set(final String name, final String value) {
+    super.set(getNormalizedName(name), value);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/package.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/package.html b/nutch-core/src/main/java/org/apache/nutch/metadata/package.html
new file mode 100644
index 0000000..53281bb
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/metadata/package.html
@@ -0,0 +1,6 @@
+<html>
+<body>
+A multi-valued Metadata container, and a set
+of constant fields for Nutch Metadata.
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilter.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilter.java b/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilter.java
new file mode 100644
index 0000000..8de5800
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilter.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+//Hadoop
+import org.apache.hadoop.conf.Configurable;
+// Nutch
+import org.apache.nutch.plugin.Pluggable;
+
+/**
+ * Extension point that lets plugins exempt resources on external domains,
+ * overriding <code>db.ignore.external.links</code>. This is useful when the
+ * crawl is focused on one domain but resources such as images are hosted on
+ * a CDN.
+ */
+public interface URLExemptionFilter extends Pluggable, Configurable {
+
+  /** The name of the extension point. */
+  String X_POINT_ID = URLExemptionFilter.class.getName();
+
+  /**
+   * Checks if toUrl is exempted when ignoring of external links is enabled.
+   *
+   * @param fromUrl
+   *          the source url which generated the outlink
+   * @param toUrl
+   *          the destination url which needs to be checked for exemption
+   * @return true when toUrl is exempted from dbIgnore
+   */
+  boolean filter(String fromUrl, String toUrl);
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilters.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilters.java b/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilters.java
new file mode 100644
index 0000000..d362f2e
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilters.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.plugin.PluginRuntimeException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Creates and caches {@link URLExemptionFilter} implementing plugins. */
+public class URLExemptionFilters {
+
+  private static final Logger LOG = LoggerFactory.getLogger(URLExemptionFilters.class);
+
+  private URLExemptionFilter[] filters;
+
+  public URLExemptionFilters(Configuration conf) {
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint(URLExemptionFilter.X_POINT_ID).getExtensions();
+    filters = new URLExemptionFilter[extensions.length];
+    for (int i = 0; i < extensions.length; i++) {
+      try {
+        filters[i] = (URLExemptionFilter) extensions[i].getExtensionInstance();
+      } catch (PluginRuntimeException e) {
+        throw new IllegalStateException(e);
+      }
+    }
+    LOG.info("Found {} extensions at point:'{}'", filters.length,
+        URLExemptionFilter.X_POINT_ID);
+  }
+
+
+  /** Run all defined filters. Assume logical AND. */
+  public boolean isExempted(String fromUrl, String toUrl) {
+    if (filters.length < 1) {
+      //at least one filter should be on
+      return false;
+    }
+    //validate from, to and filters
+    boolean exempted = fromUrl != null && toUrl != null;
+    //An URL is exempted when all the filters accept it to pass through
+    for (int i = 0; i < this.filters.length && exempted; i++) {
+      exempted = this.filters[i].filter(fromUrl, toUrl);
+    }
+    return exempted;
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLFilter.java b/nutch-core/src/main/java/org/apache/nutch/net/URLFilter.java
new file mode 100644
index 0000000..01efbcd
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLFilter.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configurable;
+
+// Nutch imports
+import org.apache.nutch.plugin.Pluggable;
+
+/**
+ * Interface used to limit which URLs enter Nutch. Used by the injector and the
+ * db updater.
+ */
+public interface URLFilter extends Pluggable, Configurable {
+
+  /** The name of the extension point. */
+  String X_POINT_ID = URLFilter.class.getName();
+
+  /**
+   * Interface for a filter that transforms a URL: it can pass the original URL
+   * through or "delete" the URL by returning null.
+   *
+   * @param urlString
+   *          the URL to filter
+   * @return the URL to keep, or <code>null</code> to "delete" it
+   */
+  String filter(String urlString);
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLFilterChecker.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLFilterChecker.java b/nutch-core/src/main/java/org/apache/nutch/net/URLFilterChecker.java
new file mode 100644
index 0000000..89a3d00
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLFilterChecker.java
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.util.NutchConfiguration;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+
+/**
+ * Checks one given filter or all filters.
+ * <p>
+ * URLs are read from standard input, one per line. Each line is echoed with a
+ * leading "+" followed by the filter output when the filter returns a
+ * non-null result, or with a leading "-" followed by the input line when the
+ * filter returns null.
+ *
+ * @author John Xing
+ */
+public class URLFilterChecker {
+
+  private final Configuration conf;
+
+  /**
+   * @param conf
+   *          configuration used to look up the filter plugins
+   */
+  public URLFilterChecker(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /**
+   * Runs the single filter whose implementation class name equals
+   * <code>filterName</code> against every line of standard input.
+   *
+   * @param filterName
+   *          fully qualified class name of the filter to check
+   * @throws RuntimeException
+   *           if the URLFilter extension point or the named filter is not
+   *           found
+   */
+  private void checkOne(String filterName) throws Exception {
+    ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
+        URLFilter.X_POINT_ID);
+
+    if (point == null) {
+      throw new RuntimeException(URLFilter.X_POINT_ID + " not found.");
+    }
+
+    // Instantiate extensions one by one until the requested class shows up.
+    URLFilter filter = null;
+    for (Extension extension : point.getExtensions()) {
+      URLFilter candidate = (URLFilter) extension.getExtensionInstance();
+      if (candidate.getClass().getName().equals(filterName)) {
+        filter = candidate;
+        break;
+      }
+    }
+
+    if (filter == null) {
+      throw new RuntimeException("Filter " + filterName + " not found.");
+    }
+
+    // jerome : should we keep this behavior?
+    // if (LogFormatter.hasLoggedSevere())
+    // throw new RuntimeException("Severe error encountered.");
+
+    System.out.println("Checking URLFilter " + filterName);
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    while ((line = in.readLine()) != null) {
+      report(line, filter.filter(line));
+    }
+  }
+
+  /**
+   * Runs the combination of all available filters against every line of
+   * standard input.
+   */
+  private void checkAll() throws Exception {
+    System.out.println("Checking combination of all URLFilters available");
+
+    // The filter chain depends only on the configuration, so build it once
+    // rather than once per input line.
+    URLFilters filters = new URLFilters(this.conf);
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    while ((line = in.readLine()) != null) {
+      report(line, filters.filter(line));
+    }
+  }
+
+  /**
+   * Prints the verdict for one input line to standard output.
+   *
+   * @param line
+   *          the URL as read from standard input
+   * @param filtered
+   *          the filter result for that URL; null means the URL was rejected
+   */
+  private static void report(String line, String filtered) {
+    if (filtered != null) {
+      System.out.print("+");
+      System.out.println(filtered);
+    } else {
+      System.out.print("-");
+      System.out.println(line);
+    }
+  }
+
+  /**
+   * Command-line entry point. With <code>-filterName name</code> the named
+   * filter is checked; any other first argument runs all filters combined.
+   * Prints the usage and exits with -1 when no argument is given or when
+   * <code>-filterName</code> is not followed by exactly one value.
+   */
+  public static void main(String[] args) throws Exception {
+
+    String usage = "Usage: URLFilterChecker (-filterName filterName | -allCombined) \n"
+        + "Tool takes a list of URLs, one per line, passed via STDIN.\n";
+
+    if (args.length == 0) {
+      System.err.println(usage);
+      System.exit(-1);
+    }
+
+    String filterName = null;
+    if (args[0].equals("-filterName")) {
+      if (args.length != 2) {
+        System.err.println(usage);
+        System.exit(-1);
+      }
+      filterName = args[1];
+    }
+
+    URLFilterChecker checker = new URLFilterChecker(NutchConfiguration.create());
+    if (filterName != null) {
+      checker.checkOne(filterName);
+    } else {
+      checker.checkAll();
+    }
+
+    System.exit(0);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLFilterException.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLFilterException.java b/nutch-core/src/main/java/org/apache/nutch/net/URLFilterException.java
new file mode 100644
index 0000000..b367b56
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLFilterException.java
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+/**
+ * Checked exception for URL filtering; declared by
+ * {@link URLFilters#filter(String)}.
+ */
+@SuppressWarnings("serial")
+public class URLFilterException extends Exception {
+
+  /** Creates an exception with neither a detail message nor a cause. */
+  public URLFilterException() {
+    // The superclass no-argument constructor is invoked implicitly.
+  }
+
+  /**
+   * @param message
+   *          the detail message
+   */
+  public URLFilterException(String message) {
+    super(message);
+  }
+
+  /**
+   * @param cause
+   *          the underlying cause
+   */
+  public URLFilterException(Throwable cause) {
+    super(cause);
+  }
+
+  /**
+   * @param message
+   *          the detail message
+   * @param cause
+   *          the underlying cause
+   */
+  public URLFilterException(String message, Throwable cause) {
+    super(message, cause);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLFilters.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLFilters.java b/nutch-core/src/main/java/org/apache/nutch/net/URLFilters.java
new file mode 100644
index 0000000..3deccca
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLFilters.java
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.plugin.PluginRepository;
+
+/** Creates and caches {@link URLFilter} implementing plugins. */
+public class URLFilters {
+
+  public static final String URLFILTER_ORDER = "urlfilter.order";
+  private URLFilter[] filters;
+
+  public URLFilters(Configuration conf) {
+    this.filters = (URLFilter[]) PluginRepository.get(conf).getOrderedPlugins(
+        URLFilter.class, URLFilter.X_POINT_ID, URLFILTER_ORDER);
+  }
+
+  /** Run all defined filters. Assume logical AND. */
+  public String filter(String urlString) throws URLFilterException {
+    for (int i = 0; i < this.filters.length; i++) {
+      if (urlString == null)
+        return null;
+      urlString = this.filters[i].filter(urlString);
+
+    }
+    return urlString;
+  }
+}