Posted to commits@nutch.apache.org by jn...@apache.org on 2015/08/24 19:57:29 UTC

svn commit: r1697466 - in /nutch/trunk: ./ conf/ ivy/ src/bin/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/plugin/ src/java/org/apache...

Author: jnioche
Date: Mon Aug 24 17:57:28 2015
New Revision: 1697466

URL: http://svn.apache.org/r1697466
Log:
NUTCH-2049 Upgrade Trunk to Hadoop > 2.4 stable

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/build.xml
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/ivy/ivy.xml
    nutch/trunk/src/bin/crawl
    nutch/trunk/src/bin/nutch
    nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
    nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
    nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java
    nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java
    nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
    nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
    nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
    nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
    nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
    nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java
    nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java
    nutch/trunk/src/java/org/apache/nutch/segment/ContentAsTextInputFormat.java
    nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
    nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
    nutch/trunk/src/java/org/apache/nutch/service/JobManager.java
    nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
    nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
    nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java
    nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java
    nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java
    nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java
    nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
    nutch/trunk/src/test/crawl-tests.xml
    nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
    nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
    nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
    nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
    nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java
    nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java
    nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
    nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java
    nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java
    nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java
    nutch/trunk/src/test/org/apache/nutch/tools/proxy/SegmentHandler.java

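Most of the Java changes below follow a single recurring pattern: the Hadoop 1.x writer and reader constructors that took an explicit FileSystem plus a String path are replaced by the Option-based builders introduced with the Hadoop 2.x API. A minimal, self-contained sketch of the MapFile.Writer case (the path and the Text value type are illustrative, not taken from this commit):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.MapFile;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.hadoop.io.Text;

    public class MapFileWriterSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path dir = new Path("/tmp/mapfile-demo"); // hypothetical output directory

        // Hadoop 1.x style, removed throughout this commit:
        //   new MapFile.Writer(conf, fs, dir.toString(), Text.class, Text.class);

        // Hadoop 2.x style: the key class is a MapFile option, the value class
        // a SequenceFile option; both go to the same varargs constructor.
        MapFile.Writer writer = new MapFile.Writer(conf, dir,
            MapFile.Writer.keyClass(Text.class),
            SequenceFile.Writer.valueClass(Text.class));
        writer.append(new Text("key"), new Text("value"));
        writer.close();
      }
    }
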
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Aug 24 17:57:28 2015
@@ -2,6 +2,8 @@ Nutch Change Log
   
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-2049 Upgrade to Hadoop 2.4 (lewismc)
+
 * NUTCH-1486 Upgrade to Solr 4.10.2 (lewismc, markus)
 
 * NUTCH-2048 parse-tika: fix dependencies in plugin.xml (Michael Joyce via snagel)

Modified: nutch/trunk/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Mon Aug 24 17:57:28 2015
@@ -427,9 +427,6 @@
     <copy todir="${test.build.data}">
       <fileset dir="src/testresources" includes="**/*"/>
     </copy>
-    
-    <copy file="${test.src.dir}/nutch-site.xml"
-          todir="${test.build.classes}"/>
 
     <copy file="${test.src.dir}/log4j.properties"
           todir="${test.build.classes}"/>

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Mon Aug 24 17:57:28 2015
@@ -1543,6 +1543,16 @@ CAUTION: Set the parser.timeout to -1 or
   </description>
 </property>
 
+<property>
+  <name>io.serializations</name>
+  <value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.JavaSerialization</value>
+  <!-- org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,
+  org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,
+  org.apache.hadoop.io.serializer.avro.AvroGenericSerialization, -->
+  <description>A list of serialization classes that can be used for
+  obtaining serializers and deserializers.</description>
+</property>
+
 <!-- linkrank scoring properties -->
 
 <property>

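Hadoop's stock io.serializations value enables WritableSerialization (plus the Avro serializers kept above as a comment) but not JavaSerialization, so the list is pinned explicitly here, presumably because some of the rewritten jobs serialize plain Java objects. A sketch of the programmatic equivalent, assuming nothing beyond a plain Configuration:

    import org.apache.hadoop.conf.Configuration;

    public class SerializationsSketch {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Same list as the property above, set in code.
        conf.setStrings("io.serializations",
            "org.apache.hadoop.io.serializer.WritableSerialization",
            "org.apache.hadoop.io.serializer.JavaSerialization");
        System.out.println(conf.get("io.serializations"));
      }
    }
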
Modified: nutch/trunk/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Mon Aug 24 17:57:28 2015
@@ -43,10 +43,11 @@
 		<dependency org="commons-collections" name="commons-collections" rev="3.1" conf="*->default" />
 		<dependency org="commons-httpclient" name="commons-httpclient" rev="3.1" conf="*->master" />
 		<dependency org="commons-codec" name="commons-codec" rev="1.3" conf="*->default" />
-        <dependency org="org.apache.commons" name="commons-compress" rev="1.9" conf="*->default" />	
+        <dependency org="org.apache.commons" name="commons-compress" rev="1.9" conf="*->default" />
         <dependency org="org.apache.commons" name="commons-jexl" rev="2.1.1" />
-
-		<dependency org="org.apache.hadoop" name="hadoop-core" rev="1.2.0" conf="*->default">
+            
+        <!-- Hadoop Dependencies -->
+		<dependency org="org.apache.hadoop" name="hadoop-common" rev="2.4.0" conf="*->default">
 			<exclude org="hsqldb" name="hsqldb" />
 			<exclude org="net.sf.kosmosfs" name="kfs" />
 			<exclude org="net.java.dev.jets3t" name="jets3t" />
@@ -54,6 +55,10 @@
 			<exclude org="org.mortbay.jetty" name="jsp-*" />
 			<exclude org="ant" name="ant" />
 		</dependency>
+        <dependency org="org.apache.hadoop" name="hadoop-hdfs" rev="2.4.0" conf="*->default"/>
+        <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-core" rev="2.4.0" conf="*->default"/>
+        <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="2.4.0" conf="*->default"/>
+        <!-- End of Hadoop Dependencies -->
 
 		<dependency org="org.apache.tika" name="tika-core" rev="1.8" />
 		<dependency org="com.ibm.icu" name="icu4j" rev="55.1" />
@@ -77,17 +82,18 @@
               
 		<!--artifacts needed for testing -->
 		<dependency org="junit" name="junit" rev="4.11" conf="test->default" />
-		<dependency org="org.apache.hadoop" name="hadoop-test" rev="1.2.0" conf="test->default" />
+		<!--dependency org="org.apache.hadoop" name="hadoop-test" rev="1.2.0" conf="test->default" /-->
 		<dependency org="org.mortbay.jetty" name="jetty-client" rev="6.1.22" conf="test->default" />
 		<dependency org="org.mortbay.jetty" name="jetty" rev="6.1.22" conf="test->default" />
 		<dependency org="org.mortbay.jetty" name="jetty-util" rev="6.1.22" conf="test->default" />
 		<!-- end of test artifacts -->
 
 		<!--global exclusion -->
 		<exclude module="jmxtools" />
 		<exclude module="jms" />
 		<exclude module="jmxri" />
         <exclude org="com.thoughtworks.xstream"/>
+        <exclude org="org.apache.mrunit"/>
 
 	</dependencies>
 

Modified: nutch/trunk/src/bin/crawl
URL: http://svn.apache.org/viewvc/nutch/trunk/src/bin/crawl?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/bin/crawl (original)
+++ nutch/trunk/src/bin/crawl Mon Aug 24 17:57:28 2015
@@ -140,7 +140,7 @@ commonOptions="-D mapred.reduce.tasks=$n
  # check that hadoop can be found on the path
 if [ $mode = "distributed" ]; then
  if [ $(which hadoop | wc -l ) -eq 0 ]; then
-    echo "Can't find Hadoop executable. Add HADOOP_HOME/bin to the path or run in local mode."
+    echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
     exit -1;
  fi
 fi

Modified: nutch/trunk/src/bin/nutch
URL: http://svn.apache.org/viewvc/nutch/trunk/src/bin/nutch?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/bin/nutch (original)
+++ nutch/trunk/src/bin/nutch Mon Aug 24 17:57:28 2015
@@ -290,7 +290,7 @@ if $local; then
 else
  # check that hadoop can be found on the path
  if [ $(which hadoop | wc -l ) -eq 0 ]; then
-    echo "Can't find Hadoop executable. Add HADOOP_HOME/bin to the path or run in local mode."
+    echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
     exit -1;
  fi
 fi

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java Mon Aug 24 17:57:28 2015
@@ -70,9 +70,9 @@ public class AdaptiveFetchSchedule exten
 
   protected float DEC_RATE;
 
-  private int MAX_INTERVAL;
+  private float MAX_INTERVAL;
 
-  private int MIN_INTERVAL;
+  private float MIN_INTERVAL;
 
   private boolean SYNC_DELTA;
 
@@ -84,9 +84,9 @@ public class AdaptiveFetchSchedule exten
       return;
     INC_RATE = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f);
     DEC_RATE = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f);
-    MIN_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.min_interval", 60);
-    MAX_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.max_interval",
-        SECONDS_PER_DAY * 365); // 1 year
+    MIN_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.min_interval", (float) 60.0);
+    MAX_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.max_interval",
+        (float) SECONDS_PER_DAY * 365); // 1 year
     SYNC_DELTA = conf.getBoolean("db.fetch.schedule.adaptive.sync_delta", true);
     SYNC_DELTA_RATE = conf.getFloat(
         "db.fetch.schedule.adaptive.sync_delta_rate", 0.2f);

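Widening MIN_INTERVAL and MAX_INTERVAL from int to float lets db.fetch.schedule.adaptive.min_interval and max_interval accept fractional values from configuration; the (float) casts are equivalent to the more usual f-suffixed literals. A one-method sketch of the idiomatic form (class name illustrative):

    import org.apache.hadoop.conf.Configuration;

    public class IntervalSketch {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Same defaults as the diff, written with float literals.
        float minInterval = conf.getFloat("db.fetch.schedule.adaptive.min_interval", 60.0f);
        float maxInterval = conf.getFloat("db.fetch.schedule.adaptive.max_interval",
            86400.0f * 365); // SECONDS_PER_DAY * 365, i.e. one year
        System.out.println(minInterval + " .. " + maxInterval);
      }
    }
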
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Mon Aug 24 17:57:28 2015
@@ -33,14 +33,6 @@ import java.util.regex.Pattern;
 import java.util.TreeMap;
 
 
-
-
-
-
-
-
-
-
 // Commons Logging imports
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -75,9 +67,7 @@ import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.NutchTool;
 import org.apache.nutch.util.StringUtil;
-
 import org.apache.commons.jexl2.Expression;
 import org.apache.commons.jexl2.JexlEngine;
 
@@ -778,6 +768,7 @@ public class CrawlDbReader extends Confi
           String[] st = k.split(" ");
           int code = Integer.parseInt(st[1]);
           if (st.length > 2){
+            @SuppressWarnings("unchecked")
             Map<String, Object> individualStatusInfo = (Map<String, Object>) statusMap.get(String.valueOf(code));
             Map<String, String> hostValues;
             if(individualStatusInfo.containsKey("hostValues")){

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Mon Aug 24 17:57:28 2015
@@ -23,7 +23,6 @@ import java.util.Iterator;
 import java.util.Map;
 import java.util.Random;
 
-import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.BytesWritable;

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java Mon Aug 24 17:57:28 2015
@@ -18,7 +18,6 @@
 package org.apache.nutch.crawl;
 
 import java.io.BufferedReader;
-import java.io.FileReader;
 import java.io.IOException;
 import java.io.Reader;
 import java.util.HashMap;
@@ -26,7 +25,6 @@ import java.util.HashMap;
 import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.*;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.HttpHeaders;
 import org.apache.nutch.util.MimeUtil;

Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java Mon Aug 24 17:57:28 2015
@@ -23,12 +23,12 @@ import org.apache.nutch.crawl.CrawlDatum
 import org.apache.nutch.crawl.NutchWritable;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-
 import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.MapFile.Writer.Option;
+import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.SequenceFile.CompressionType;
-
 import org.apache.hadoop.mapred.FileOutputFormat;
 import org.apache.hadoop.mapred.InvalidJobConfException;
 import org.apache.hadoop.mapred.OutputFormat;
@@ -37,7 +37,6 @@ import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.hadoop.util.Progressable;
-
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseOutputFormat;
 import org.apache.nutch.protocol.Content;
@@ -68,8 +67,13 @@ public class FetcherOutputFormat impleme
     final CompressionType compType = SequenceFileOutputFormat
         .getOutputCompressionType(job);
 
-    final MapFile.Writer fetchOut = new MapFile.Writer(job, fs,
-        fetch.toString(), Text.class, CrawlDatum.class, compType, progress);
+    Option fKeyClassOpt = MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option fValClassOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option fProgressOpt = SequenceFile.Writer.progressable(progress);
+    org.apache.hadoop.io.SequenceFile.Writer.Option fCompOpt = SequenceFile.Writer.compression(compType);
+    
+    final MapFile.Writer fetchOut = new MapFile.Writer(job,
+        fetch, fKeyClassOpt, fValClassOpt, fCompOpt, fProgressOpt);
 
     return new RecordWriter<Text, NutchWritable>() {
       private MapFile.Writer contentOut;
@@ -77,8 +81,12 @@ public class FetcherOutputFormat impleme
 
       {
         if (Fetcher.isStoringContent(job)) {
-          contentOut = new MapFile.Writer(job, fs, content.toString(),
-              Text.class, Content.class, compType, progress);
+          Option cKeyClassOpt = MapFile.Writer.keyClass(Text.class);
+          org.apache.hadoop.io.SequenceFile.Writer.Option cValClassOpt = SequenceFile.Writer.valueClass(Content.class);
+          org.apache.hadoop.io.SequenceFile.Writer.Option cProgressOpt = SequenceFile.Writer.progressable(progress);
+          org.apache.hadoop.io.SequenceFile.Writer.Option cCompOpt = SequenceFile.Writer.compression(compType);
+          contentOut = new MapFile.Writer(job, content,
+              cKeyClassOpt, cValClassOpt, cCompOpt, cProgressOpt);
         }
 
         if (Fetcher.isParsing(job)) {

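In the builder API the key-class option comes from MapFile.Writer while the value, compression, and progress options come from SequenceFile.Writer; MapFile.Writer.Option extends SequenceFile.Writer.Option, which is why both kinds can be mixed in one varargs call, as above. A compressed-writer sketch along the same lines (the path and Text value class are illustrative stand-ins for the crawl_fetch output and CrawlDatum):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.MapFile;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.hadoop.io.SequenceFile.CompressionType;
    import org.apache.hadoop.io.Text;

    public class FetchOutSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path fetch = new Path("/tmp/segment/crawl_fetch/part-00000"); // hypothetical
        MapFile.Writer out = new MapFile.Writer(conf, fetch,
            MapFile.Writer.keyClass(Text.class),            // MapFile.Writer.Option
            SequenceFile.Writer.valueClass(Text.class),     // SequenceFile.Writer.Option
            SequenceFile.Writer.compression(CompressionType.BLOCK));
        out.close();
      }
    }
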
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Mon Aug 24 17:57:28 2015
@@ -22,7 +22,6 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 
-import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.JobConf;

Modified: nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java Mon Aug 24 17:57:28 2015
@@ -142,113 +142,4 @@ public class OutlinkExtractor {
     return retval;
   }
 
-  /**
-   * Extracts outlinks from a plain text. <br />
-   * This Method takes the Jakarta Regexp API.
-   * 
-   * @param plainText
-   * 
-   * @return Array of <code>Outlink</code> s within found in plainText
-   * @deprecated only for tests
-   */
-  @Deprecated
-  private Outlink[] getOutlinksJakartaRegexpImpl(final String plainText) {
-
-    throw new UnsupportedOperationException(
-        "Implementation commented out. Please uncomment to use it.");
-
-    // final List outlinks = new ArrayList();
-    // String url;
-    // Outlink link;
-    //
-    // RE re = new RE(URL_PATTERN);
-    //
-    // int pos = 0;
-    //
-    // while (re.match(plainText, pos)) {
-    //
-    // url = re.getParen(0);
-    //
-    // if (LOG.isTraceEnabled()) {
-    // LOG.trace("Extracted url: " + url);
-    // }
-    //
-    // try {
-    //
-    // link = new Outlink(url, null);
-    // outlinks.add(link);
-    //
-    // } catch (MalformedURLException ex) {
-    // // if it is a malformed URL we just throw it away and continue with
-    // // extraction.
-    // if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); }
-    // }
-    //
-    // pos = re.getParenEnd(0);
-    // }
-    //
-    // final Outlink[] retval;
-    //
-    // if (pos > 0) {
-    // retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
-    // } else {
-    // retval = new Outlink[0];
-    // }
-    //
-    // return retval;
-
-  }
-
-  /**
-   * Extracts outlinks from a plain text. </p> This Method takes the JDK5 Regexp
-   * API.
-   * 
-   * @param plainText
-   * 
-   * @return Array of <code>Outlink</code> s within found in plainText
-   * @deprecated only for tests
-   */
-  @Deprecated
-  private Outlink[] getOutlinksJDK5Impl(final String plainText) {
-
-    throw new UnsupportedOperationException(
-        "Implementation commented out. Please uncomment to use it.");
-
-    // final List outlinks = new ArrayList();
-    // String url;
-    // Outlink link;
-    //
-    // final Pattern urlPattern = Pattern.compile(URL_PATTERN);
-    // final RE re = new RE(urlPattern);
-    //
-    // int pos = 0;
-    //
-    // while (re.match(plainText, pos)) {
-    //
-    // url = re.getParen(0);
-    //
-    // try {
-    //
-    // link = new Outlink(url, null);
-    // outlinks.add(link);
-    // } catch (MalformedURLException ex) {
-    // // if it is a malformed URL we just throw it away and continue with
-    // // extraction.
-    // if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); }
-    // }
-    //
-    // pos = re.getParenEnd(0);
-    // }
-    //
-    // final Outlink[] retval;
-    //
-    // if (pos > 0) {
-    // retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
-    // } else {
-    // retval = new Outlink[0];
-    // }
-    //
-    // return retval;
-  }
-
 }

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Mon Aug 24 17:57:28 2015
@@ -20,13 +20,13 @@ package org.apache.nutch.parse;
 // Commons Logging imports
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
-import org.apache.hadoop.conf.*;
 import org.apache.hadoop.io.*;
+import org.apache.hadoop.io.MapFile.Writer.Option;
 import org.apache.hadoop.io.SequenceFile.CompressionType;
+import org.apache.hadoop.io.SequenceFile.Metadata;
+import org.apache.hadoop.io.compress.DefaultCodec;
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.fetcher.Fetcher;
 import org.apache.nutch.scoring.ScoringFilterException;
@@ -119,14 +119,33 @@ public class ParseOutputFormat implement
     final String[] parseMDtoCrawlDB = job.get("db.parsemeta.to.crawldb", "")
         .split(" *, *");
 
-    final MapFile.Writer textOut = new MapFile.Writer(job, fs, text.toString(),
-        Text.class, ParseText.class, CompressionType.RECORD, progress);
-
-    final MapFile.Writer dataOut = new MapFile.Writer(job, fs, data.toString(),
-        Text.class, ParseData.class, compType, progress);
-
-    final SequenceFile.Writer crawlOut = SequenceFile.createWriter(fs, job,
-        crawl, Text.class, CrawlDatum.class, compType, progress);
+    // textOut Options
+    Option tKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option tValClassOpt = SequenceFile.Writer.valueClass(ParseText.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option tProgressOpt = SequenceFile.Writer.progressable(progress);
+    org.apache.hadoop.io.SequenceFile.Writer.Option tCompOpt = SequenceFile.Writer.compression(CompressionType.RECORD);
+    
+    final MapFile.Writer textOut = new MapFile.Writer(job, text,
+        tKeyClassOpt, tValClassOpt, tCompOpt, tProgressOpt);
+    
+    // dataOut Options
+    Option dKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option dValClassOpt = SequenceFile.Writer.valueClass(ParseData.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option dProgressOpt = SequenceFile.Writer.progressable(progress);
+    org.apache.hadoop.io.SequenceFile.Writer.Option dCompOpt = SequenceFile.Writer.compression(compType);
+
+    final MapFile.Writer dataOut = new MapFile.Writer(job, data,
+        dKeyClassOpt, dValClassOpt, dCompOpt, dProgressOpt);
+    
+    final SequenceFile.Writer crawlOut = SequenceFile.createWriter(job, SequenceFile.Writer.file(crawl),
+        SequenceFile.Writer.keyClass(Text.class),
+        SequenceFile.Writer.valueClass(CrawlDatum.class),
+        SequenceFile.Writer.bufferSize(fs.getConf().getInt("io.file.buffer.size",4096)),
+        SequenceFile.Writer.replication(fs.getDefaultReplication(crawl)),
+        SequenceFile.Writer.blockSize(1073741824),
+        SequenceFile.Writer.compression(compType, new DefaultCodec()),
+        SequenceFile.Writer.progressable(progress),
+        SequenceFile.Writer.metadata(new Metadata())); 
 
     return new RecordWriter<Text, Parse>() {
 

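The new SequenceFile.createWriter(Configuration, Option...) call above spells out buffer size, replication, block size, and metadata that the old FileSystem-based helper filled in implicitly; only file, keyClass, valueClass, and compression appear strictly necessary, since omitted options fall back to defaults. A trimmed sketch under that assumption (path and value class illustrative):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.hadoop.io.SequenceFile.CompressionType;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.compress.DefaultCodec;

    public class CrawlOutSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path crawl = new Path("/tmp/segment/crawl_parse"); // hypothetical
        SequenceFile.Writer crawlOut = SequenceFile.createWriter(conf,
            SequenceFile.Writer.file(crawl),
            SequenceFile.Writer.keyClass(Text.class),
            SequenceFile.Writer.valueClass(Text.class),
            SequenceFile.Writer.compression(CompressionType.RECORD, new DefaultCodec()));
        crawlOut.append(new Text("http://example.org/"), new Text("datum"));
        crawlOut.close();
      }
    }
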
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Mon Aug 24 17:57:28 2015
@@ -33,7 +33,6 @@ import org.apache.nutch.net.protocols.Re
 import org.apache.nutch.protocol.*;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
-import org.apache.hadoop.fs.FileSystem;
 import org.apache.nutch.util.*;
 import org.apache.hadoop.fs.Path;
 

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Mon Aug 24 17:57:28 2015
@@ -34,7 +34,6 @@ import org.apache.nutch.protocol.Content
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.protocol.ProtocolOutput;
-import org.apache.nutch.protocol.ProtocolStatus;
 import org.apache.nutch.scoring.ScoringFilters;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.URLUtil;

Modified: nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java Mon Aug 24 17:57:28 2015
@@ -153,7 +153,7 @@ public class Extension {
     synchronized (getId()) {
       try {
         PluginRepository pluginRepository = PluginRepository.get(conf);
-        Class extensionClazz = pluginRepository.getCachedClass(fDescriptor,
+        Class<?> extensionClazz = pluginRepository.getCachedClass(fDescriptor,
             getClazz());
         // lazy loading of Plugin in case there is no instance of the plugin
         // already.

Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java Mon Aug 24 17:57:28 2015
@@ -21,7 +21,6 @@ import java.io.IOException;
 import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.net.URL;
-import java.net.URI;
 import java.net.URLDecoder;
 import java.util.HashMap;
 import java.util.Map;

Modified: nutch/trunk/src/java/org/apache/nutch/segment/ContentAsTextInputFormat.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/ContentAsTextInputFormat.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/segment/ContentAsTextInputFormat.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/segment/ContentAsTextInputFormat.java Mon Aug 24 17:57:28 2015
@@ -65,7 +65,6 @@ public class ContentAsTextInputFormat ex
 
       // convert the content object to text
       Text tKey = key;
-      Text tValue = value;
       if (!sequenceFileRecordReader.next(innerKey, innerValue)) {
         return false;
       }

Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Mon Aug 24 17:57:28 2015
@@ -34,7 +34,10 @@ import org.apache.hadoop.io.MapFile;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.MapFile.Writer.Option;
 import org.apache.hadoop.io.SequenceFile.CompressionType;
+import org.apache.hadoop.io.SequenceFile.Metadata;
+import org.apache.hadoop.io.compress.DefaultCodec;
 import org.apache.hadoop.mapred.FileInputFormat;
 import org.apache.hadoop.mapred.FileOutputFormat;
 import org.apache.hadoop.mapred.FileSplit;
@@ -160,8 +163,7 @@ public class SegmentMerger extends Confi
         throw new RuntimeException("Cannot identify segment:", e);
       }
 
-      SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(job),
-          fSplit.getPath(), job);
+      SequenceFile.Reader reader = new SequenceFile.Reader(job, SequenceFile.Reader.file(fSplit.getPath()));
 
       final Writable w;
       try {
@@ -284,9 +286,17 @@
             wname = new Path(new Path(new Path(out, segmentName + "-" + slice),
                 dirName), name);
           }
-          res = SequenceFile.createWriter(fs, job, wname, Text.class,
-              CrawlDatum.class,
-              SequenceFileOutputFormat.getOutputCompressionType(job), progress);
+          
+          res = SequenceFile.createWriter(job, SequenceFile.Writer.file(wname),
+              SequenceFile.Writer.keyClass(Text.class),
+              SequenceFile.Writer.valueClass(CrawlDatum.class),
+              SequenceFile.Writer.bufferSize(fs.getConf().getInt("io.file.buffer.size",4096)),
+              SequenceFile.Writer.replication(fs.getDefaultReplication(wname)),
+              SequenceFile.Writer.blockSize(1073741824),
+              SequenceFile.Writer.compression(SequenceFileOutputFormat.getOutputCompressionType(job), new DefaultCodec()),
+              SequenceFile.Writer.progressable(progress),
+              SequenceFile.Writer.metadata(new Metadata())); 
+          
           sliceWriters.put(slice + dirName, res);
           return res;
         }
@@ -314,8 +333,14 @@ public class SegmentMerger extends Confi
           if (clazz.isAssignableFrom(ParseText.class)) {
             compType = CompressionType.RECORD;
           }
-          res = new MapFile.Writer(job, fs, wname.toString(), Text.class,
-              clazz, compType, progress);
+          
+          Option rKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
+          org.apache.hadoop.io.SequenceFile.Writer.Option rValClassOpt = SequenceFile.Writer.valueClass(clazz);
+          org.apache.hadoop.io.SequenceFile.Writer.Option rProgressOpt = SequenceFile.Writer.progressable(progress);
+          org.apache.hadoop.io.SequenceFile.Writer.Option rCompOpt = SequenceFile.Writer.compression(compType);
+          
+          res = new MapFile.Writer(job, wname, rKeyClassOpt,
+              rValClassOpt, rCompOpt, rProgressOpt);
           sliceWriters.put(slice + dirName, res);
           return res;
         }

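The reader side follows the same shape: the deprecated SequenceFile.Reader(FileSystem, Path, Configuration) constructor becomes Reader(Configuration, Reader.Option...), with the filesystem resolved from the path. A minimal sketch (path and Text value type illustrative):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.hadoop.io.Text;

    public class ReaderSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path data = new Path("/tmp/segment/crawl_parse"); // hypothetical
        SequenceFile.Reader reader =
            new SequenceFile.Reader(conf, SequenceFile.Reader.file(data));
        Text key = new Text();
        Text value = new Text();
        while (reader.next(key, value)) {
          System.out.println(key + "\t" + value);
        }
        reader.close();
      }
    }
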
Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Mon Aug 24 17:57:28 2015
@@ -523,7 +523,7 @@ public class SegmentReader extends Confi
     
     if (fe) {
       Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
-      if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDir()) {
+      if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDirectory()) {
         cnt = 0L;
         long start = Long.MAX_VALUE;
         long end = Long.MIN_VALUE;
@@ -548,7 +548,7 @@ public class SegmentReader extends Confi
     
     if (pd) {
       Path parseDir = new Path(segment, ParseData.DIR_NAME);
-      if (fs.exists(parseDir) && fs.getFileStatus(parseDir).isDir()) {
+      if (fs.exists(parseDir) && fs.getFileStatus(parseDir).isDirectory()) {
         cnt = 0L;
         long errors = 0L;
         ParseData value = new ParseData();

Modified: nutch/trunk/src/java/org/apache/nutch/service/JobManager.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/JobManager.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/JobManager.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/service/JobManager.java Mon Aug 24 17:57:28 2015
@@ -18,8 +18,6 @@
 package org.apache.nutch.service;
 
 import java.util.Collection;
-import java.util.Map;
-
 import org.apache.nutch.service.model.request.JobConfig;
 import org.apache.nutch.service.model.response.JobInfo;
 import org.apache.nutch.service.model.response.JobInfo.State;

Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Mon Aug 24 17:57:28 2015
@@ -380,7 +380,7 @@ public class CommonCrawlDataDumper {
 					LOG.warn("Skipping segment: [" + segmentContentPath	+ "]: no data directory present");
 					continue;
 				}
-				SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, nutchConfig);
+				SequenceFile.Reader reader = new SequenceFile.Reader(nutchConfig, SequenceFile.Reader.file(file));
 
 				if (!new File(file.toString()).exists()) {
 					LOG.warn("Skipping segment: [" + segmentContentPath	+ "]: no data directory present");

Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Mon Aug 24 17:57:28 2015
@@ -167,7 +167,7 @@ public class FileDumper {
               + "]: no data directory present");
           continue;
         }
-        SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
+        SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(file));
 
         Writable key = (Writable) reader.getKeyClass().newInstance();
         Content content = null;
@@ -209,7 +209,7 @@ public class FileDumper {
           }
 
           if (filter) {
-	    if (!mimeTypeStats) {
+            if (!mimeTypeStats) {
               String md5Ofurl = DumpFileUtil.getUrlMD5(url);
               String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl);
   

Modified: nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java Mon Aug 24 17:57:28 2015
@@ -43,7 +43,7 @@ public class HadoopFSUtil {
     return new PathFilter() {
       public boolean accept(final Path path) {
         try {
-          return fs.getFileStatus(path).isDir();
+          return fs.getFileStatus(path).isDirectory();
         } catch (IOException ioe) {
           return false;
         }

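FileStatus.isDir() is deprecated in Hadoop 2 in favour of the symmetric isDirectory()/isFile() pair; the PathFilter above keeps its behaviour under the new name. A compact sketch of the same check (path illustrative):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class IsDirSketch {
      public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        FileStatus status = fs.getFileStatus(new Path("/tmp")); // hypothetical path
        System.out.println(status.isDirectory() ? "directory" : "file");
      }
    }
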
Modified: nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java Mon Aug 24 17:57:28 2015
@@ -48,7 +48,7 @@ public class LockUtil {
     if (fs.exists(lockFile)) {
       if (!accept)
         throw new IOException("lock file " + lockFile + " already exists.");
-      if (fs.getFileStatus(lockFile).isDir())
+      if (fs.getFileStatus(lockFile).isDirectory())
         throw new IOException("lock file " + lockFile
             + " already exists and is a directory.");
       // do nothing - the file already exists.
@@ -76,7 +76,7 @@ public class LockUtil {
       throws IOException {
     if (!fs.exists(lockFile))
       return false;
-    if (fs.getFileStatus(lockFile).isDir())
+    if (fs.getFileStatus(lockFile).isDirectory())
       throw new IOException("lock file " + lockFile
           + " exists but is a directory!");
     return fs.delete(lockFile, false);

Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java Mon Aug 24 17:57:28 2015
@@ -20,8 +20,6 @@ package org.apache.nutch.util.domain;
 import java.io.IOException;
 import java.net.URL;
 import java.text.SimpleDateFormat;
-import java.util.Iterator;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -103,7 +101,7 @@ public class DomainStatistics extends Co
     conf.setInt("domain.statistics.mode", mode);
     conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
 
-    Job job = new Job(conf, jobName);
+    Job job = Job.getInstance(conf, jobName);
     job.setJarByClass(DomainStatistics.class);
 
     String[] inputDirsSpecs = inputDir.split(",");

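new Job(conf, name) is deprecated in the MapReduce 2 API; Job.getInstance(conf, name) is the supported factory and, per its javadoc, copies the passed Configuration rather than mutating it in place. A sketch (job name and jar class illustrative):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.mapreduce.Job;

    public class JobSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "domain-statistics-sketch"); // hypothetical name
        job.setJarByClass(JobSketch.class);
        System.out.println(job.getJobName());
      }
    }
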
Modified: nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java (original)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java Mon Aug 24 17:57:28 2015
@@ -29,8 +29,10 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.Metadata;
 import org.apache.hadoop.io.SequenceFile.Writer;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.DefaultCodec;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
@@ -46,6 +48,7 @@ import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.Vector.Element;
 import org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles;
 import org.apache.mahout.vectorizer.TFIDF;
+
 import com.google.common.collect.ConcurrentHashMultiset;
 import com.google.common.collect.Multiset;
 
@@ -190,8 +193,16 @@ public class NaiveBayesClassifier {
       throws IOException {
     Configuration configuration = new Configuration();
     FileSystem fs = FileSystem.get(configuration);
-    Writer writer = new SequenceFile.Writer(fs, configuration, new Path(
-        outputDirName + "/chunk-0"), Text.class, Text.class);
+    SequenceFile.Writer writer = SequenceFile.createWriter(configuration, 
+        Writer.file(new Path(outputDirName + "/chunk-0")),
+        Writer.keyClass(Text.class),
+        Writer.valueClass(Text.class),
+        Writer.bufferSize(fs.getConf().getInt("io.file.buffer.size",4096)),
+        Writer.replication(fs.getDefaultReplication(new Path(outputDirName + "/chunk-0"))),
+        Writer.blockSize(1073741824),
+        Writer.compression(SequenceFile.CompressionType.BLOCK, new DefaultCodec()),
+        Writer.progressable(null),
+        Writer.metadata(new Metadata())); 
     BufferedReader reader = null;
     reader = new BufferedReader(
         configuration.getConfResourceAsReader(inputFileName));

Modified: nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java (original)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java Mon Aug 24 17:57:28 2015
@@ -27,10 +27,7 @@ import org.apache.nutch.parse.HTMLMetaTa
 import org.apache.nutch.parse.HtmlParseFilter;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.parse.ParseText;
 import org.apache.nutch.protocol.Content;
 
 import java.io.Reader;

Modified: nutch/trunk/src/test/crawl-tests.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/crawl-tests.xml?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/crawl-tests.xml (original)
+++ nutch/trunk/src/test/crawl-tests.xml Mon Aug 24 17:57:28 2015
@@ -48,5 +48,15 @@
   </description>                                                                                                                                             
 </property>
 
+<property>
+  <name>io.serializations</name>
+  <value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.JavaSerialization</value>
+  <!-- org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,
+  org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,
+  org.apache.hadoop.io.serializer.avro.AvroGenericSerialization, -->
+  <description>A list of serialization classes that can be used for
+  obtaining serializers and deserializers.</description>
+</property>
+
 </configuration>
 

Modified: nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java Mon Aug 24 17:57:28 2015
@@ -29,8 +29,9 @@ import org.apache.hadoop.fs.FSDataOutput
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.MapFile.Writer.Option;
 import org.apache.hadoop.io.Text;
-
 import org.mortbay.jetty.Server;
 import org.mortbay.jetty.bio.SocketConnector;
 import org.mortbay.jetty.handler.ContextHandler;
@@ -56,8 +57,10 @@ public class CrawlDBTestUtil {
       Path crawldb, List<URLCrawlDatum> init) throws Exception {
     LOG.trace("* creating crawldb: " + crawldb);
     Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
-    MapFile.Writer writer = new MapFile.Writer(conf, fs, new Path(dir,
-        "part-00000").toString(), Text.class, CrawlDatum.class);
+    Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
+    MapFile.Writer writer = new MapFile.Writer(conf, new Path(dir,
+        "part-00000"), wKeyOpt, wValueOpt);
     Iterator<URLCrawlDatum> it = init.iterator();
     while (it.hasNext()) {
       URLCrawlDatum row = it.next();

Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java Mon Aug 24 17:57:28 2015
@@ -24,6 +24,7 @@ import org.apache.hadoop.conf.Configurat
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.Reader.Option;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.*;
 import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
@@ -124,7 +125,8 @@ public class TestCrawlDbFilter {
   private ArrayList<URLCrawlDatum> readContents(Path fetchlist)
       throws IOException {
     // verify results
-    SequenceFile.Reader reader = new SequenceFile.Reader(fs, fetchlist, conf);
+    Option fFile = SequenceFile.Reader.file(fetchlist);
+    SequenceFile.Reader reader = new SequenceFile.Reader(conf, fFile);
 
     ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>();
 

Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java Mon Aug 24 17:57:28 2015
@@ -26,7 +26,9 @@ import org.apache.hadoop.conf.Configurat
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.MapFile.Writer.Option;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.nutch.util.NutchConfiguration;
 import org.junit.After;
@@ -96,6 +98,17 @@ public class TestCrawlDbMerger {
     }
   }
 
+  /**
+   * Creates two sample {@link org.apache.nutch.crawl.CrawlDb}s populated
+   * with {@link org.apache.hadoop.io.Text} keys (URLs) and
+   * {@link org.apache.nutch.crawl.CrawlDatum} values (record data), then
+   * merges them via the {@link org.apache.nutch.crawl.CrawlDbMerger} tool.
+   * The merged CrawlDb is written to an arbitrary output location and read
+   * back with the {@link org.apache.nutch.crawl.CrawlDbReader} tool.
+   * Assertions compare the expected (URL, CrawlDatum) pairs with the
+   * actual results of the merge.
+   * @throws Exception
+   */
   @Test
   public void testMerge() throws Exception {
     Path crawldb1 = new Path(testDir, "crawldb1");
@@ -131,8 +144,12 @@ public class TestCrawlDbMerger {
       TreeSet<String> init, CrawlDatum cd) throws Exception {
     LOG.fine("* creating crawldb: " + crawldb);
     Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
-    MapFile.Writer writer = new MapFile.Writer(config, fs, new Path(dir,
-        "part-00000").toString(), Text.class, CrawlDatum.class);
+    
+    Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
+    
+    MapFile.Writer writer = new MapFile.Writer(config, new Path(dir,
+        "part-00000"), wKeyOpt, wValueOpt);
     Iterator<String> it = init.iterator();
     while (it.hasNext()) {
       String key = it.next();

Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Mon Aug 24 17:57:28 2015
@@ -26,6 +26,7 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.SequenceFile.Reader.Option;
 import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
 import org.junit.After;
 import org.junit.Assert;
@@ -292,7 +293,8 @@ public class TestGenerator {
   private ArrayList<URLCrawlDatum> readContents(Path fetchlist)
       throws IOException {
     // verify results
-    SequenceFile.Reader reader = new SequenceFile.Reader(fs, fetchlist, conf);
+    Option rFile = SequenceFile.Reader.file(fetchlist);
+    SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile);
 
     ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>();
 

Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java Mon Aug 24 17:57:28 2015
@@ -28,6 +28,7 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.SequenceFile.Reader.Option;
 import org.junit.After;
 import org.junit.Assert;
 import org.junit.Before;
@@ -141,8 +142,9 @@ public class TestInjector {
     Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME
         + "/part-00000/data");
     System.out.println("reading:" + dbfile);
+    Option rFile = SequenceFile.Reader.file(dbfile);
     @SuppressWarnings("resource")
-    SequenceFile.Reader reader = new SequenceFile.Reader(fs, dbfile, conf);
+    SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile);
     ArrayList<String> read = new ArrayList<String>();
 
     READ: do {
@@ -160,8 +162,9 @@ public class TestInjector {
     Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME
         + "/part-00000/data");
     System.out.println("reading:" + dbfile);
+    Option rFile = SequenceFile.Reader.file(dbfile);
     @SuppressWarnings("resource")
-    SequenceFile.Reader reader = new SequenceFile.Reader(fs, dbfile, conf);
+    SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile);
     HashMap<String, CrawlDatum> read = new HashMap<String, CrawlDatum>();
 
     READ: do {

Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java Mon Aug 24 17:57:28 2015
@@ -27,7 +27,9 @@ import org.apache.hadoop.conf.Configurat
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.MapFile.Writer.Option;
 import org.apache.nutch.util.NutchConfiguration;
 import org.junit.After;
 import org.junit.Assert;
@@ -137,8 +139,11 @@ public class TestLinkDbMerger {
       TreeMap<String, String[]> init) throws Exception {
     LOG.fine("* creating linkdb: " + linkdb);
     Path dir = new Path(linkdb, LinkDb.CURRENT_NAME);
-    MapFile.Writer writer = new MapFile.Writer(config, fs, new Path(dir,
-        "part-00000").toString(), Text.class, Inlinks.class);
+    
+    Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(Inlinks.class);
+    MapFile.Writer writer = new MapFile.Writer(config, new Path(dir,
+        "part-00000"), wKeyOpt, wValueOpt);
     Iterator<String> it = init.keySet().iterator();
     while (it.hasNext()) {
       String key = it.next();

Modified: nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Mon Aug 24 17:57:28 2015
@@ -122,7 +122,7 @@ public class TestFetcher {
     Path content = new Path(new Path(generatedSegment[0], Content.DIR_NAME),
         "part-00000/data");
     @SuppressWarnings("resource")
-    SequenceFile.Reader reader = new SequenceFile.Reader(fs, content, conf);
+    SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(content));
 
     ArrayList<String> handledurls = new ArrayList<String>();
 
@@ -154,7 +154,7 @@ public class TestFetcher {
     // verify parse data
     Path parseData = new Path(
         new Path(generatedSegment[0], ParseData.DIR_NAME), "part-00000/data");
-    reader = new SequenceFile.Reader(fs, parseData, conf);
+    reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(parseData));
 
     READ_PARSE_DATA: do {
       Text key = new Text();

Modified: nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java Mon Aug 24 17:57:28 2015
@@ -60,7 +60,7 @@ public class TestURLNormalizers {
           "http://www.example.org//path/to//somewhere.html",
           URLNormalizers.SCOPE_DEFAULT);
       Assert.assertEquals(normalizedHost,
-          "http://example.org/path/to/somewhere.html");
+          "http://www.example.org/path/to/somewhere.html");
     } catch (MalformedURLException mue) {
       Assert.fail(mue.toString());
     }

Modified: nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java Mon Aug 24 17:57:28 2015
@@ -23,6 +23,8 @@ import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.MapFile.Writer.Option;
+import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.MapFileOutputFormat;
 import org.apache.nutch.parse.ParseText;
@@ -56,8 +58,9 @@ public class TestSegmentMerger {
     DecimalFormat df = new DecimalFormat("0000000");
     Text k = new Text();
     Path ptPath = new Path(new Path(seg1, ParseText.DIR_NAME), "part-00000");
-    MapFile.Writer w = new MapFile.Writer(conf, fs, ptPath.toString(),
-        Text.class, ParseText.class);
+    Option kOpt = MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option vOpt = SequenceFile.Writer.valueClass(ParseText.class);
+    MapFile.Writer w = new MapFile.Writer(conf, ptPath, kOpt, vOpt);
     long curSize = 0;
     countSeg1 = 0;
     FileStatus fileStatus = fs.getFileStatus(ptPath);
@@ -73,8 +76,9 @@ public class TestSegmentMerger {
     System.err.println(" - done: " + countSeg1 + " records.");
     System.err.println("Creating large segment 2...");
     ptPath = new Path(new Path(seg2, ParseText.DIR_NAME), "part-00000");
-    w = new MapFile.Writer(conf, fs, ptPath.toString(), Text.class,
-        ParseText.class);
+    Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(ParseText.class);
+    w = new MapFile.Writer(conf, ptPath, wKeyOpt, wValueOpt);
     curSize = 0;
     countSeg2 = 0;
     while (curSize < blkSize * 2) {

Modified: nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java Mon Aug 24 17:57:28 2015
@@ -24,6 +24,8 @@ import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.MapFile.Writer.Option;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.MapFileOutputFormat;
 import org.apache.nutch.crawl.CrawlDatum;
@@ -381,8 +383,9 @@ public class TestSegmentMergerCrawlDatum
         new Path(segment, CrawlDatum.FETCH_DIR_NAME), "part-00000");
 
     // Get a writer for map files containing <Text,CrawlDatum> pairs
-    MapFile.Writer writer = new MapFile.Writer(conf, fs,
-        crawlFetchPath.toString(), Text.class, CrawlDatum.class);
+    Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
+    MapFile.Writer writer = new MapFile.Writer(conf, crawlFetchPath, wKeyOpt, wValueOpt);
 
     // Whether we're handling a redirect now
     // first add the linked datum

Modified: nutch/trunk/src/test/org/apache/nutch/tools/proxy/SegmentHandler.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/tools/proxy/SegmentHandler.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/tools/proxy/SegmentHandler.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/tools/proxy/SegmentHandler.java Mon Aug 24 17:57:28 2015
@@ -145,7 +145,7 @@ public class SegmentHandler extends Abst
 
       MapFile.Reader[] parts = new MapFile.Reader[names.length];
       for (int i = 0; i < names.length; i++) {
-        parts[i] = new MapFile.Reader(fs, names[i].toString(), conf);
+        parts[i] = new MapFile.Reader(names[i], conf);
       }
       return parts;
     }
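
MapFile.Reader follows suit: the old (FileSystem, String, Configuration) constructor is replaced by (Path, Configuration, Option...), again resolving the filesystem from the path. A closing sketch, reading back the hypothetical directory written in the first example:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.MapFile;
    import org.apache.hadoop.io.Text;

    public class MapFileReaderSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        MapFile.Reader reader =
            new MapFile.Reader(new Path("/tmp/mapfile-demo"), conf); // hypothetical dir
        Text value = new Text();
        reader.get(new Text("key"), value);
        System.out.println(value);
        reader.close();
      }
    }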