You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2015/08/24 19:57:29 UTC
svn commit: r1697466 - in /nutch/trunk: ./ conf/ ivy/ src/bin/
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/
src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/
src/java/org/apache/nutch/plugin/ src/java/org/apache...
Author: jnioche
Date: Mon Aug 24 17:57:28 2015
New Revision: 1697466
URL: http://svn.apache.org/r1697466
Log:
NUTCH-2049 Upgrade Trunk to Hadoop > 2.4 stable
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/build.xml
nutch/trunk/conf/nutch-default.xml
nutch/trunk/ivy/ivy.xml
nutch/trunk/src/bin/crawl
nutch/trunk/src/bin/nutch
nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java
nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java
nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java
nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java
nutch/trunk/src/java/org/apache/nutch/segment/ContentAsTextInputFormat.java
nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
nutch/trunk/src/java/org/apache/nutch/service/JobManager.java
nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java
nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java
nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java
nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java
nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
nutch/trunk/src/test/crawl-tests.xml
nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java
nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java
nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java
nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java
nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java
nutch/trunk/src/test/org/apache/nutch/tools/proxy/SegmentHandler.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Aug 24 17:57:28 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-2049 Upgrade to Hadoop 2.4 (lewismc)
+
* NUTCH-1486 Upgrade to Solr 4.10.2 (lewismc, markus)
* NUTCH-2048 parse-tika: fix dependencies in plugin.xml (Michael Joyce via snagel)
Modified: nutch/trunk/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Mon Aug 24 17:57:28 2015
@@ -427,9 +427,6 @@
<copy todir="${test.build.data}">
<fileset dir="src/testresources" includes="**/*"/>
</copy>
-
- <copy file="${test.src.dir}/nutch-site.xml"
- todir="${test.build.classes}"/>
<copy file="${test.src.dir}/log4j.properties"
todir="${test.build.classes}"/>
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Mon Aug 24 17:57:28 2015
@@ -1543,6 +1543,16 @@ CAUTION: Set the parser.timeout to -1 or
</description>
</property>
+<property>
+ <name>io.serializations</name>
+ <value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.JavaSerialization</value>
+ <!-- org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,
+ org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,
+ org.apache.hadoop.io.serializer.avro.AvroGenericSerialization, -->
+ <description>A list of serialization classes that can be used for
+ obtaining serializers and deserializers.</description>
+</property>
+
<!-- linkrank scoring properties -->
<property>
Modified: nutch/trunk/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Mon Aug 24 17:57:28 2015
@@ -43,10 +43,11 @@
<dependency org="commons-collections" name="commons-collections" rev="3.1" conf="*->default" />
<dependency org="commons-httpclient" name="commons-httpclient" rev="3.1" conf="*->master" />
<dependency org="commons-codec" name="commons-codec" rev="1.3" conf="*->default" />
- <dependency org="org.apache.commons" name="commons-compress" rev="1.9" conf="*->default" />
+ <dependency org="org.apache.commons" name="commons-compress" rev="1.9" conf="*->default" />
<dependency org="org.apache.commons" name="commons-jexl" rev="2.1.1" />
-
- <dependency org="org.apache.hadoop" name="hadoop-core" rev="1.2.0" conf="*->default">
+
+ <!-- Hadoop Dependencies -->
+ <dependency org="org.apache.hadoop" name="hadoop-common" rev="2.4.0" conf="*->default">
<exclude org="hsqldb" name="hsqldb" />
<exclude org="net.sf.kosmosfs" name="kfs" />
<exclude org="net.java.dev.jets3t" name="jets3t" />
@@ -54,6 +55,10 @@
<exclude org="org.mortbay.jetty" name="jsp-*" />
<exclude org="ant" name="ant" />
</dependency>
+ <dependency org="org.apache.hadoop" name="hadoop-hdfs" rev="2.4.0" conf="*->default"/>
+ <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-core" rev="2.4.0" conf="*->default"/>
+ <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="2.4.0" conf="*->default"/>
+ <!-- End of Hadoop Dependencies -->
<dependency org="org.apache.tika" name="tika-core" rev="1.8" />
<dependency org="com.ibm.icu" name="icu4j" rev="55.1" />
@@ -77,17 +82,25 @@
<!--artifacts needed for testing -->
<dependency org="junit" name="junit" rev="4.11" conf="test->default" />
- <dependency org="org.apache.hadoop" name="hadoop-test" rev="1.2.0" conf="test->default" />
+ <!--dependency org="org.apache.hadoop" name="hadoop-test" rev="1.2.0" conf="test->default" /-->
<dependency org="org.mortbay.jetty" name="jetty-client" rev="6.1.22" conf="test->default" />
<dependency org="org.mortbay.jetty" name="jetty" rev="6.1.22" conf="test->default" />
<dependency org="org.mortbay.jetty" name="jetty-util" rev="6.1.22" conf="test->default" />
<!-- end of test artifacts -->
+
+ <dependency org="org.mortbay.jetty" name="jetty-client" rev="6.1.22" conf="test->default" />
+
+ <dependency org="org.mortbay.jetty" name="jetty" rev="6.1.22" conf="test->default" />
+ <dependency org="org.mortbay.jetty" name="jetty-util" rev="6.1.22" conf="test->default" />
+
<!--global exclusion -->
<exclude module="jmxtools" />
<exclude module="jms" />
<exclude module="jmxri" />
<exclude org="com.thoughtworks.xstream"/>
+ <exclude org="org.apache.mrunit"/>
+ <exclude org="com.thoughtworks.xstream"/>
</dependencies>
Modified: nutch/trunk/src/bin/crawl
URL: http://svn.apache.org/viewvc/nutch/trunk/src/bin/crawl?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/bin/crawl (original)
+++ nutch/trunk/src/bin/crawl Mon Aug 24 17:57:28 2015
@@ -140,7 +140,7 @@ commonOptions="-D mapred.reduce.tasks=$n
# check that hadoop can be found on the path
if [ $mode = "distributed" ]; then
if [ $(which hadoop | wc -l ) -eq 0 ]; then
- echo "Can't find Hadoop executable. Add HADOOP_HOME/bin to the path or run in local mode."
+ echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
exit -1;
fi
fi
Modified: nutch/trunk/src/bin/nutch
URL: http://svn.apache.org/viewvc/nutch/trunk/src/bin/nutch?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/bin/nutch (original)
+++ nutch/trunk/src/bin/nutch Mon Aug 24 17:57:28 2015
@@ -290,7 +290,7 @@ if $local; then
else
# check that hadoop can be found on the path
if [ $(which hadoop | wc -l ) -eq 0 ]; then
- echo "Can't find Hadoop executable. Add HADOOP_HOME/bin to the path or run in local mode."
+ echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
exit -1;
fi
fi
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java Mon Aug 24 17:57:28 2015
@@ -70,9 +70,9 @@ public class AdaptiveFetchSchedule exten
protected float DEC_RATE;
- private int MAX_INTERVAL;
+ private float MAX_INTERVAL;
- private int MIN_INTERVAL;
+ private float MIN_INTERVAL;
private boolean SYNC_DELTA;
@@ -84,9 +84,9 @@ public class AdaptiveFetchSchedule exten
return;
INC_RATE = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f);
DEC_RATE = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f);
- MIN_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.min_interval", 60);
- MAX_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.max_interval",
- SECONDS_PER_DAY * 365); // 1 year
+ MIN_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.min_interval", (float) 60.0);
+ MAX_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.max_interval",
+ (float) SECONDS_PER_DAY * 365); // 1 year
SYNC_DELTA = conf.getBoolean("db.fetch.schedule.adaptive.sync_delta", true);
SYNC_DELTA_RATE = conf.getFloat(
"db.fetch.schedule.adaptive.sync_delta_rate", 0.2f);
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Mon Aug 24 17:57:28 2015
@@ -33,14 +33,6 @@ import java.util.regex.Pattern;
import java.util.TreeMap;
-
-
-
-
-
-
-
-
// Commons Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -75,9 +67,7 @@ import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.StringUtil;
-
import org.apache.commons.jexl2.Expression;
import org.apache.commons.jexl2.JexlEngine;
@@ -778,6 +768,7 @@ public class CrawlDbReader extends Confi
String[] st = k.split(" ");
int code = Integer.parseInt(st[1]);
if (st.length > 2){
+ @SuppressWarnings("unchecked")
Map<String, Object> individualStatusInfo = (Map<String, Object>) statusMap.get(String.valueOf(code));
Map<String, String> hostValues;
if(individualStatusInfo.containsKey("hostValues")){
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Mon Aug 24 17:57:28 2015
@@ -23,7 +23,6 @@ import java.util.Iterator;
import java.util.Map;
import java.util.Random;
-import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java Mon Aug 24 17:57:28 2015
@@ -18,7 +18,6 @@
package org.apache.nutch.crawl;
import java.io.BufferedReader;
-import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
@@ -26,7 +25,6 @@ import java.util.HashMap;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.*;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.HttpHeaders;
import org.apache.nutch.util.MimeUtil;
Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java Mon Aug 24 17:57:28 2015
@@ -23,12 +23,12 @@ import org.apache.nutch.crawl.CrawlDatum
import org.apache.nutch.crawl.NutchWritable;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-
import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.MapFile.Writer.Option;
+import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile.CompressionType;
-
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InvalidJobConfException;
import org.apache.hadoop.mapred.OutputFormat;
@@ -37,7 +37,6 @@ import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Progressable;
-
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseOutputFormat;
import org.apache.nutch.protocol.Content;
@@ -68,8 +67,13 @@ public class FetcherOutputFormat impleme
final CompressionType compType = SequenceFileOutputFormat
.getOutputCompressionType(job);
- final MapFile.Writer fetchOut = new MapFile.Writer(job, fs,
- fetch.toString(), Text.class, CrawlDatum.class, compType, progress);
+ Option fKeyClassOpt = MapFile.Writer.keyClass(Text.class);
+ org.apache.hadoop.io.SequenceFile.Writer.Option fValClassOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
+ org.apache.hadoop.io.SequenceFile.Writer.Option fProgressOpt = SequenceFile.Writer.progressable(progress);
+ org.apache.hadoop.io.SequenceFile.Writer.Option fCompOpt = SequenceFile.Writer.compression(compType);
+
+ final MapFile.Writer fetchOut = new MapFile.Writer(job,
+ fetch, fKeyClassOpt, fValClassOpt, fCompOpt, fProgressOpt);
return new RecordWriter<Text, NutchWritable>() {
private MapFile.Writer contentOut;
@@ -77,8 +81,12 @@ public class FetcherOutputFormat impleme
{
if (Fetcher.isStoringContent(job)) {
- contentOut = new MapFile.Writer(job, fs, content.toString(),
- Text.class, Content.class, compType, progress);
+ Option cKeyClassOpt = MapFile.Writer.keyClass(Text.class);
+ org.apache.hadoop.io.SequenceFile.Writer.Option cValClassOpt = SequenceFile.Writer.valueClass(Content.class);
+ org.apache.hadoop.io.SequenceFile.Writer.Option cProgressOpt = SequenceFile.Writer.progressable(progress);
+ org.apache.hadoop.io.SequenceFile.Writer.Option cCompOpt = SequenceFile.Writer.compression(compType);
+ contentOut = new MapFile.Writer(job, content,
+ cKeyClassOpt, cValClassOpt, cCompOpt, cProgressOpt);
}
if (Fetcher.isParsing(job)) {
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Mon Aug 24 17:57:28 2015
@@ -22,7 +22,6 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
-import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
Modified: nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java Mon Aug 24 17:57:28 2015
@@ -142,113 +142,4 @@ public class OutlinkExtractor {
return retval;
}
- /**
- * Extracts outlinks from a plain text. <br />
- * This Method takes the Jakarta Regexp API.
- *
- * @param plainText
- *
- * @return Array of <code>Outlink</code> s within found in plainText
- * @deprecated only for tests
- */
- @Deprecated
- private Outlink[] getOutlinksJakartaRegexpImpl(final String plainText) {
-
- throw new UnsupportedOperationException(
- "Implementation commented out. Please uncomment to use it.");
-
- // final List outlinks = new ArrayList();
- // String url;
- // Outlink link;
- //
- // RE re = new RE(URL_PATTERN);
- //
- // int pos = 0;
- //
- // while (re.match(plainText, pos)) {
- //
- // url = re.getParen(0);
- //
- // if (LOG.isTraceEnabled()) {
- // LOG.trace("Extracted url: " + url);
- // }
- //
- // try {
- //
- // link = new Outlink(url, null);
- // outlinks.add(link);
- //
- // } catch (MalformedURLException ex) {
- // // if it is a malformed URL we just throw it away and continue with
- // // extraction.
- // if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); }
- // }
- //
- // pos = re.getParenEnd(0);
- // }
- //
- // final Outlink[] retval;
- //
- // if (pos > 0) {
- // retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
- // } else {
- // retval = new Outlink[0];
- // }
- //
- // return retval;
-
- }
-
- /**
- * Extracts outlinks from a plain text. </p> This Method takes the JDK5 Regexp
- * API.
- *
- * @param plainText
- *
- * @return Array of <code>Outlink</code> s within found in plainText
- * @deprecated only for tests
- */
- @Deprecated
- private Outlink[] getOutlinksJDK5Impl(final String plainText) {
-
- throw new UnsupportedOperationException(
- "Implementation commented out. Please uncomment to use it.");
-
- // final List outlinks = new ArrayList();
- // String url;
- // Outlink link;
- //
- // final Pattern urlPattern = Pattern.compile(URL_PATTERN);
- // final RE re = new RE(urlPattern);
- //
- // int pos = 0;
- //
- // while (re.match(plainText, pos)) {
- //
- // url = re.getParen(0);
- //
- // try {
- //
- // link = new Outlink(url, null);
- // outlinks.add(link);
- // } catch (MalformedURLException ex) {
- // // if it is a malformed URL we just throw it away and continue with
- // // extraction.
- // if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); }
- // }
- //
- // pos = re.getParenEnd(0);
- // }
- //
- // final Outlink[] retval;
- //
- // if (pos > 0) {
- // retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
- // } else {
- // retval = new Outlink[0];
- // }
- //
- // return retval;
- }
-
}
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Mon Aug 24 17:57:28 2015
@@ -20,13 +20,13 @@ package org.apache.nutch.parse;
// Commons Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
-import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
+import org.apache.hadoop.io.MapFile.Writer.Option;
import org.apache.hadoop.io.SequenceFile.CompressionType;
+import org.apache.hadoop.io.SequenceFile.Metadata;
+import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.fetcher.Fetcher;
import org.apache.nutch.scoring.ScoringFilterException;
@@ -119,14 +119,33 @@ public class ParseOutputFormat implement
final String[] parseMDtoCrawlDB = job.get("db.parsemeta.to.crawldb", "")
.split(" *, *");
- final MapFile.Writer textOut = new MapFile.Writer(job, fs, text.toString(),
- Text.class, ParseText.class, CompressionType.RECORD, progress);
-
- final MapFile.Writer dataOut = new MapFile.Writer(job, fs, data.toString(),
- Text.class, ParseData.class, compType, progress);
-
- final SequenceFile.Writer crawlOut = SequenceFile.createWriter(fs, job,
- crawl, Text.class, CrawlDatum.class, compType, progress);
+ // textOut Options
+ Option tKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
+ org.apache.hadoop.io.SequenceFile.Writer.Option tValClassOpt = SequenceFile.Writer.valueClass(ParseText.class);
+ org.apache.hadoop.io.SequenceFile.Writer.Option tProgressOpt = SequenceFile.Writer.progressable(progress);
+ org.apache.hadoop.io.SequenceFile.Writer.Option tCompOpt = SequenceFile.Writer.compression(CompressionType.RECORD);
+
+ final MapFile.Writer textOut = new MapFile.Writer(job, text,
+ tKeyClassOpt, tValClassOpt, tCompOpt, tProgressOpt);
+
+ // dataOut Options
+ Option dKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
+ org.apache.hadoop.io.SequenceFile.Writer.Option dValClassOpt = SequenceFile.Writer.valueClass(ParseData.class);
+ org.apache.hadoop.io.SequenceFile.Writer.Option dProgressOpt = SequenceFile.Writer.progressable(progress);
+ org.apache.hadoop.io.SequenceFile.Writer.Option dCompOpt = SequenceFile.Writer.compression(compType);
+
+ final MapFile.Writer dataOut = new MapFile.Writer(job, data,
+ dKeyClassOpt, dValClassOpt, dCompOpt, dProgressOpt);
+
+ final SequenceFile.Writer crawlOut = SequenceFile.createWriter(job, SequenceFile.Writer.file(crawl),
+ SequenceFile.Writer.keyClass(Text.class),
+ SequenceFile.Writer.valueClass(CrawlDatum.class),
+ SequenceFile.Writer.bufferSize(fs.getConf().getInt("io.file.buffer.size",4096)),
+ SequenceFile.Writer.replication(fs.getDefaultReplication(crawl)),
+ SequenceFile.Writer.blockSize(1073741824),
+ SequenceFile.Writer.compression(compType, new DefaultCodec()),
+ SequenceFile.Writer.progressable(progress),
+ SequenceFile.Writer.metadata(new Metadata()));
return new RecordWriter<Text, Parse>() {
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Mon Aug 24 17:57:28 2015
@@ -33,7 +33,6 @@ import org.apache.nutch.net.protocols.Re
import org.apache.nutch.protocol.*;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
-import org.apache.hadoop.fs.FileSystem;
import org.apache.nutch.util.*;
import org.apache.hadoop.fs.Path;
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Mon Aug 24 17:57:28 2015
@@ -34,7 +34,6 @@ import org.apache.nutch.protocol.Content
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolOutput;
-import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.URLUtil;
Modified: nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java Mon Aug 24 17:57:28 2015
@@ -153,7 +153,7 @@ public class Extension {
synchronized (getId()) {
try {
PluginRepository pluginRepository = PluginRepository.get(conf);
- Class extensionClazz = pluginRepository.getCachedClass(fDescriptor,
+ Class<?> extensionClazz = pluginRepository.getCachedClass(fDescriptor,
getClazz());
// lazy loading of Plugin in case there is no instance of the plugin
// already.
Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java Mon Aug 24 17:57:28 2015
@@ -21,7 +21,6 @@ import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
-import java.net.URI;
import java.net.URLDecoder;
import java.util.HashMap;
import java.util.Map;
Modified: nutch/trunk/src/java/org/apache/nutch/segment/ContentAsTextInputFormat.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/ContentAsTextInputFormat.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/segment/ContentAsTextInputFormat.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/segment/ContentAsTextInputFormat.java Mon Aug 24 17:57:28 2015
@@ -65,7 +65,6 @@ public class ContentAsTextInputFormat ex
// convert the content object to text
Text tKey = key;
- Text tValue = value;
if (!sequenceFileRecordReader.next(innerKey, innerValue)) {
return false;
}
Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Mon Aug 24 17:57:28 2015
@@ -34,7 +34,10 @@ import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.MapFile.Writer.Option;
import org.apache.hadoop.io.SequenceFile.CompressionType;
+import org.apache.hadoop.io.SequenceFile.Metadata;
+import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
@@ -160,8 +163,7 @@ public class SegmentMerger extends Confi
throw new RuntimeException("Cannot identify segment:", e);
}
- SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(job),
- fSplit.getPath(), job);
+ SequenceFile.Reader reader = new SequenceFile.Reader(job, SequenceFile.Reader.file(fSplit.getPath()));
final Writable w;
try {
@@ -284,9 +286,26 @@ public class SegmentMerger extends Confi
wname = new Path(new Path(new Path(out, segmentName + "-" + slice),
dirName), name);
}
- res = SequenceFile.createWriter(fs, job, wname, Text.class,
- CrawlDatum.class,
- SequenceFileOutputFormat.getOutputCompressionType(job), progress);
+
+// Option rKeyClassOpt = MapFile.Writer.keyClass(Text.class);
+// org.apache.hadoop.io.SequenceFile.Writer.Option rValClassOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
+// Option rProgressOpt = (Option) SequenceFile.Writer.progressable(progress);
+// Option rCompOpt = (Option) SequenceFile.Writer.compression(SequenceFileOutputFormat.getOutputCompressionType(job));
+// Option rFileOpt = (Option) SequenceFile.Writer.file(wname);
+
+ //res = SequenceFile.createWriter(job, rFileOpt, rKeyClassOpt,
+ // rValClassOpt, rCompOpt, rProgressOpt);
+
+ res = SequenceFile.createWriter(job, SequenceFile.Writer.file(wname),
+ SequenceFile.Writer.keyClass(Text.class),
+ SequenceFile.Writer.valueClass(CrawlDatum.class),
+ SequenceFile.Writer.bufferSize(fs.getConf().getInt("io.file.buffer.size",4096)),
+ SequenceFile.Writer.replication(fs.getDefaultReplication(wname)),
+ SequenceFile.Writer.blockSize(1073741824),
+ SequenceFile.Writer.compression(SequenceFileOutputFormat.getOutputCompressionType(job), new DefaultCodec()),
+ SequenceFile.Writer.progressable(progress),
+ SequenceFile.Writer.metadata(new Metadata()));
+
sliceWriters.put(slice + dirName, res);
return res;
}
@@ -314,8 +333,14 @@ public class SegmentMerger extends Confi
if (clazz.isAssignableFrom(ParseText.class)) {
compType = CompressionType.RECORD;
}
- res = new MapFile.Writer(job, fs, wname.toString(), Text.class,
- clazz, compType, progress);
+
+ Option rKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
+ org.apache.hadoop.io.SequenceFile.Writer.Option rValClassOpt = SequenceFile.Writer.valueClass(clazz);
+ org.apache.hadoop.io.SequenceFile.Writer.Option rProgressOpt = SequenceFile.Writer.progressable(progress);
+ org.apache.hadoop.io.SequenceFile.Writer.Option rCompOpt = SequenceFile.Writer.compression(compType);
+
+ res = new MapFile.Writer(job, wname, rKeyClassOpt,
+ rValClassOpt, rCompOpt, rProgressOpt);
sliceWriters.put(slice + dirName, res);
return res;
}
Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Mon Aug 24 17:57:28 2015
@@ -523,7 +523,7 @@ public class SegmentReader extends Confi
if (fe) {
Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME);
- if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDir()) {
+ if (fs.exists(fetchDir) && fs.getFileStatus(fetchDir).isDirectory()) {
cnt = 0L;
long start = Long.MAX_VALUE;
long end = Long.MIN_VALUE;
@@ -548,7 +548,7 @@ public class SegmentReader extends Confi
if (pd) {
Path parseDir = new Path(segment, ParseData.DIR_NAME);
- if (fs.exists(parseDir) && fs.getFileStatus(parseDir).isDir()) {
+ if (fs.exists(parseDir) && fs.getFileStatus(parseDir).isDirectory()) {
cnt = 0L;
long errors = 0L;
ParseData value = new ParseData();
Modified: nutch/trunk/src/java/org/apache/nutch/service/JobManager.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/service/JobManager.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/service/JobManager.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/service/JobManager.java Mon Aug 24 17:57:28 2015
@@ -18,8 +18,6 @@
package org.apache.nutch.service;
import java.util.Collection;
-import java.util.Map;
-
import org.apache.nutch.service.model.request.JobConfig;
import org.apache.nutch.service.model.response.JobInfo;
import org.apache.nutch.service.model.response.JobInfo.State;
Modified: nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java Mon Aug 24 17:57:28 2015
@@ -380,7 +380,7 @@ public class CommonCrawlDataDumper {
LOG.warn("Skipping segment: [" + segmentContentPath + "]: no data directory present");
continue;
}
- SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, nutchConfig);
+ SequenceFile.Reader reader = new SequenceFile.Reader(nutchConfig, SequenceFile.Reader.file(file));
if (!new File(file.toString()).exists()) {
LOG.warn("Skipping segment: [" + segmentContentPath + "]: no data directory present");
Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Mon Aug 24 17:57:28 2015
@@ -167,7 +167,7 @@ public class FileDumper {
+ "]: no data directory present");
continue;
}
- SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
+ SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(file));
Writable key = (Writable) reader.getKeyClass().newInstance();
Content content = null;
@@ -209,7 +209,7 @@ public class FileDumper {
}
if (filter) {
- if (!mimeTypeStats) {
+ if (!mimeTypeStats) {
String md5Ofurl = DumpFileUtil.getUrlMD5(url);
String fullDir = DumpFileUtil.createTwoLevelsDirectory(outputDir.getAbsolutePath(), md5Ofurl);
Modified: nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java Mon Aug 24 17:57:28 2015
@@ -43,7 +43,7 @@ public class HadoopFSUtil {
return new PathFilter() {
public boolean accept(final Path path) {
try {
- return fs.getFileStatus(path).isDir();
+ return fs.getFileStatus(path).isDirectory();
} catch (IOException ioe) {
return false;
}
Modified: nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java Mon Aug 24 17:57:28 2015
@@ -48,7 +48,7 @@ public class LockUtil {
if (fs.exists(lockFile)) {
if (!accept)
throw new IOException("lock file " + lockFile + " already exists.");
- if (fs.getFileStatus(lockFile).isDir())
+ if (fs.getFileStatus(lockFile).isDirectory())
throw new IOException("lock file " + lockFile
+ " already exists and is a directory.");
// do nothing - the file already exists.
@@ -76,7 +76,7 @@ public class LockUtil {
throws IOException {
if (!fs.exists(lockFile))
return false;
- if (fs.getFileStatus(lockFile).isDir())
+ if (fs.getFileStatus(lockFile).isDirectory())
throw new IOException("lock file " + lockFile
+ " exists but is a directory!");
return fs.delete(lockFile, false);
Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java Mon Aug 24 17:57:28 2015
@@ -20,8 +20,6 @@ package org.apache.nutch.util.domain;
import java.io.IOException;
import java.net.URL;
import java.text.SimpleDateFormat;
-import java.util.Iterator;
-
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@@ -103,7 +101,7 @@ public class DomainStatistics extends Co
conf.setInt("domain.statistics.mode", mode);
conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
- Job job = new Job(conf, jobName);
+ Job job = Job.getInstance(conf, jobName);
job.setJarByClass(DomainStatistics.class);
String[] inputDirsSpecs = inputDir.split(",");
Modified: nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java (original)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java Mon Aug 24 17:57:28 2015
@@ -29,8 +29,10 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.Metadata;
import org.apache.hadoop.io.SequenceFile.Writer;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
@@ -46,6 +48,7 @@ import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;
import org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles;
import org.apache.mahout.vectorizer.TFIDF;
+
import com.google.common.collect.ConcurrentHashMultiset;
import com.google.common.collect.Multiset;
@@ -190,8 +193,16 @@ public class NaiveBayesClassifier {
throws IOException {
Configuration configuration = new Configuration();
FileSystem fs = FileSystem.get(configuration);
- Writer writer = new SequenceFile.Writer(fs, configuration, new Path(
- outputDirName + "/chunk-0"), Text.class, Text.class);
+ SequenceFile.Writer writer = SequenceFile.createWriter(configuration,
+ Writer.file(new Path(outputDirName + "/chunk-0")),
+ Writer.keyClass(Text.class),
+ Writer.valueClass(Text.class),
+ Writer.bufferSize(fs.getConf().getInt("io.file.buffer.size",4096)),
+ Writer.replication(fs.getDefaultReplication(new Path(outputDirName + "/chunk-0"))),
+ Writer.blockSize(1073741824),
+ Writer.compression(SequenceFile.CompressionType.BLOCK, new DefaultCodec()),
+ Writer.progressable(null),
+ Writer.metadata(new Metadata()));
BufferedReader reader = null;
reader = new BufferedReader(
configuration.getConfResourceAsReader(inputFileName));
Modified: nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java (original)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java Mon Aug 24 17:57:28 2015
@@ -27,10 +27,7 @@ import org.apache.nutch.parse.HTMLMetaTa
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.parse.ParseText;
import org.apache.nutch.protocol.Content;
import java.io.Reader;
Modified: nutch/trunk/src/test/crawl-tests.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/crawl-tests.xml?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/crawl-tests.xml (original)
+++ nutch/trunk/src/test/crawl-tests.xml Mon Aug 24 17:57:28 2015
@@ -48,5 +48,15 @@
</description>
</property>
+<property>
+ <name>io.serializations</name>
+ <value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.JavaSerialization</value>
+ <!-- org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,
+ org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,
+ org.apache.hadoop.io.serializer.avro.AvroGenericSerialization, -->
+ <description>A list of serialization classes that can be used for
+ obtaining serializers and deserializers.</description>
+</property>
+
</configuration>
Modified: nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java Mon Aug 24 17:57:28 2015
@@ -29,8 +29,9 @@ import org.apache.hadoop.fs.FSDataOutput
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.MapFile.Writer.Option;
import org.apache.hadoop.io.Text;
-
import org.mortbay.jetty.Server;
import org.mortbay.jetty.bio.SocketConnector;
import org.mortbay.jetty.handler.ContextHandler;
@@ -56,8 +57,10 @@ public class CrawlDBTestUtil {
Path crawldb, List<URLCrawlDatum> init) throws Exception {
LOG.trace("* creating crawldb: " + crawldb);
Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
- MapFile.Writer writer = new MapFile.Writer(conf, fs, new Path(dir,
- "part-00000").toString(), Text.class, CrawlDatum.class);
+ Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
+ org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
+ MapFile.Writer writer = new MapFile.Writer(conf, new Path(dir,
+ "part-00000"), wKeyOpt, wValueOpt);
Iterator<URLCrawlDatum> it = init.iterator();
while (it.hasNext()) {
URLCrawlDatum row = it.next();
Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java Mon Aug 24 17:57:28 2015
@@ -24,6 +24,7 @@ import org.apache.hadoop.conf.Configurat
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.Reader.Option;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
@@ -124,7 +125,8 @@ public class TestCrawlDbFilter {
private ArrayList<URLCrawlDatum> readContents(Path fetchlist)
throws IOException {
// verify results
- SequenceFile.Reader reader = new SequenceFile.Reader(fs, fetchlist, conf);
+ Option fFile = SequenceFile.Reader.file(fetchlist);
+ SequenceFile.Reader reader = new SequenceFile.Reader(conf, fFile);
ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>();
Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java Mon Aug 24 17:57:28 2015
@@ -26,7 +26,9 @@ import org.apache.hadoop.conf.Configurat
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.MapFile.Writer.Option;
import org.apache.hadoop.mapred.JobConf;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.After;
@@ -96,6 +98,17 @@ public class TestCrawlDbMerger {
}
}
+ /**
+ * Creates two sample {@link org.apache.nutch.crawl.CrawlDb}s, populating
+ * each with {@link org.apache.hadoop.io.Text} keys (e.g. URLs) and
+ * {@link org.apache.nutch.crawl.CrawlDatum} values (e.g. record data).
+ * It then merges the two CrawlDbs using the {@link org.apache.nutch.crawl.CrawlDbMerger}
+ * tool. The merged CrawlDb is written to an arbitrary output location and the results
+ * are read back using the {@link org.apache.nutch.crawl.CrawlDbReader} tool.
+ * The test assertions compare the expected CrawlDb (URL, CrawlDatum) entries
+ * with the actual results of the merge process.
+ * @throws Exception if any step of the test fails unexpectedly
+ */
@Test
public void testMerge() throws Exception {
Path crawldb1 = new Path(testDir, "crawldb1");
@@ -131,8 +144,12 @@ public class TestCrawlDbMerger {
TreeSet<String> init, CrawlDatum cd) throws Exception {
LOG.fine("* creating crawldb: " + crawldb);
Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
- MapFile.Writer writer = new MapFile.Writer(config, fs, new Path(dir,
- "part-00000").toString(), Text.class, CrawlDatum.class);
+
+ Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
+ org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
+
+ MapFile.Writer writer = new MapFile.Writer(config, new Path(dir,
+ "part-00000"), wKeyOpt, wValueOpt);
Iterator<String> it = init.iterator();
while (it.hasNext()) {
String key = it.next();
Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Mon Aug 24 17:57:28 2015
@@ -26,6 +26,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.SequenceFile.Reader.Option;
import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
import org.junit.After;
import org.junit.Assert;
@@ -292,7 +293,8 @@ public class TestGenerator {
private ArrayList<URLCrawlDatum> readContents(Path fetchlist)
throws IOException {
// verify results
- SequenceFile.Reader reader = new SequenceFile.Reader(fs, fetchlist, conf);
+ Option rFile = SequenceFile.Reader.file(fetchlist);
+ SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile);
ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>();
Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java Mon Aug 24 17:57:28 2015
@@ -28,6 +28,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.SequenceFile.Reader.Option;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
@@ -141,8 +142,9 @@ public class TestInjector {
Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME
+ "/part-00000/data");
System.out.println("reading:" + dbfile);
+ Option rFile = SequenceFile.Reader.file(dbfile);
@SuppressWarnings("resource")
- SequenceFile.Reader reader = new SequenceFile.Reader(fs, dbfile, conf);
+ SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile);
ArrayList<String> read = new ArrayList<String>();
READ: do {
@@ -160,8 +162,9 @@ public class TestInjector {
Path dbfile = new Path(crawldbPath, CrawlDb.CURRENT_NAME
+ "/part-00000/data");
System.out.println("reading:" + dbfile);
+ Option rFile = SequenceFile.Reader.file(dbfile);
@SuppressWarnings("resource")
- SequenceFile.Reader reader = new SequenceFile.Reader(fs, dbfile, conf);
+ SequenceFile.Reader reader = new SequenceFile.Reader(conf, rFile);
HashMap<String, CrawlDatum> read = new HashMap<String, CrawlDatum>();
READ: do {
Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java Mon Aug 24 17:57:28 2015
@@ -27,7 +27,9 @@ import org.apache.hadoop.conf.Configurat
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.MapFile.Writer.Option;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.After;
import org.junit.Assert;
@@ -137,8 +139,11 @@ public class TestLinkDbMerger {
TreeMap<String, String[]> init) throws Exception {
LOG.fine("* creating linkdb: " + linkdb);
Path dir = new Path(linkdb, LinkDb.CURRENT_NAME);
- MapFile.Writer writer = new MapFile.Writer(config, fs, new Path(dir,
- "part-00000").toString(), Text.class, Inlinks.class);
+
+ Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
+ org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(Inlinks.class);
+ MapFile.Writer writer = new MapFile.Writer(config, new Path(dir,
+ "part-00000"), wKeyOpt, wValueOpt);
Iterator<String> it = init.keySet().iterator();
while (it.hasNext()) {
String key = it.next();
Modified: nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Mon Aug 24 17:57:28 2015
@@ -122,7 +122,7 @@ public class TestFetcher {
Path content = new Path(new Path(generatedSegment[0], Content.DIR_NAME),
"part-00000/data");
@SuppressWarnings("resource")
- SequenceFile.Reader reader = new SequenceFile.Reader(fs, content, conf);
+ SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(content));
ArrayList<String> handledurls = new ArrayList<String>();
@@ -154,7 +154,7 @@ public class TestFetcher {
// verify parse data
Path parseData = new Path(
new Path(generatedSegment[0], ParseData.DIR_NAME), "part-00000/data");
- reader = new SequenceFile.Reader(fs, parseData, conf);
+ reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(parseData));
READ_PARSE_DATA: do {
Text key = new Text();
Modified: nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/net/TestURLNormalizers.java Mon Aug 24 17:57:28 2015
@@ -60,7 +60,7 @@ public class TestURLNormalizers {
"http://www.example.org//path/to//somewhere.html",
URLNormalizers.SCOPE_DEFAULT);
Assert.assertEquals(normalizedHost,
- "http://example.org/path/to/somewhere.html");
+ "http://www.example.org/path/to/somewhere.html");
} catch (MalformedURLException mue) {
Assert.fail(mue.toString());
}
Modified: nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java Mon Aug 24 17:57:28 2015
@@ -23,6 +23,8 @@ import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.MapFile.Writer.Option;
+import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.nutch.parse.ParseText;
@@ -56,8 +58,9 @@ public class TestSegmentMerger {
DecimalFormat df = new DecimalFormat("0000000");
Text k = new Text();
Path ptPath = new Path(new Path(seg1, ParseText.DIR_NAME), "part-00000");
- MapFile.Writer w = new MapFile.Writer(conf, fs, ptPath.toString(),
- Text.class, ParseText.class);
+ Option kOpt = MapFile.Writer.keyClass(Text.class);
+ org.apache.hadoop.io.SequenceFile.Writer.Option vOpt = SequenceFile.Writer.valueClass(ParseText.class);
+ MapFile.Writer w = new MapFile.Writer(conf, ptPath, kOpt, vOpt);
long curSize = 0;
countSeg1 = 0;
FileStatus fileStatus = fs.getFileStatus(ptPath);
@@ -73,8 +76,9 @@ public class TestSegmentMerger {
System.err.println(" - done: " + countSeg1 + " records.");
System.err.println("Creating large segment 2...");
ptPath = new Path(new Path(seg2, ParseText.DIR_NAME), "part-00000");
- w = new MapFile.Writer(conf, fs, ptPath.toString(), Text.class,
- ParseText.class);
+ Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
+ org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(ParseText.class);
+ w = new MapFile.Writer(conf, ptPath, wKeyOpt, wValueOpt);
curSize = 0;
countSeg2 = 0;
while (curSize < blkSize * 2) {
Modified: nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMergerCrawlDatums.java Mon Aug 24 17:57:28 2015
@@ -24,6 +24,8 @@ import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.MapFile.Writer.Option;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.nutch.crawl.CrawlDatum;
@@ -381,8 +383,9 @@ public class TestSegmentMergerCrawlDatum
new Path(segment, CrawlDatum.FETCH_DIR_NAME), "part-00000");
// Get a writer for map files containing <Text,CrawlDatum> pairs
- MapFile.Writer writer = new MapFile.Writer(conf, fs,
- crawlFetchPath.toString(), Text.class, CrawlDatum.class);
+ Option wKeyOpt = MapFile.Writer.keyClass(Text.class);
+ org.apache.hadoop.io.SequenceFile.Writer.Option wValueOpt = SequenceFile.Writer.valueClass(CrawlDatum.class);
+ MapFile.Writer writer = new MapFile.Writer(conf, crawlFetchPath, wKeyOpt, wValueOpt);
// Whether we're handling a redirect now
// first add the linked datum
Modified: nutch/trunk/src/test/org/apache/nutch/tools/proxy/SegmentHandler.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/tools/proxy/SegmentHandler.java?rev=1697466&r1=1697465&r2=1697466&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/tools/proxy/SegmentHandler.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/tools/proxy/SegmentHandler.java Mon Aug 24 17:57:28 2015
@@ -145,7 +145,7 @@ public class SegmentHandler extends Abst
MapFile.Reader[] parts = new MapFile.Reader[names.length];
for (int i = 0; i < names.length; i++) {
- parts[i] = new MapFile.Reader(fs, names[i].toString(), conf);
+ parts[i] = new MapFile.Reader(names[i], conf);
}
return parts;
}