You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2006/02/04 01:39:32 UTC
svn commit: r374796 [1/5] - in /lucene/nutch/trunk: bin/ conf/ lib/
lib/jetty-ext/ src/java/org/apache/nutch/analysis/
src/java/org/apache/nutch/clustering/ src/java/org/apache/nutch/crawl/
src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/f...
Author: cutting
Date: Fri Feb 3 16:38:32 2006
New Revision: 374796
URL: http://svn.apache.org/viewcvs?rev=374796&view=rev
Log:
NUTCH-193: MapReduce and NDFS code moved to new project, Hadoop. See bug report for details.
Added:
lucene/nutch/trunk/conf/hadoop-default.xml
lucene/nutch/trunk/conf/mapred-default.xml.template
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar (with props)
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/FsDirectory.java
- copied, changed from r374762, lucene/nutch/trunk/src/java/org/apache/nutch/indexer/NdfsDirectory.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java
lucene/nutch/trunk/src/test/org/apache/nutch/util/WritableTestUtils.java
Removed:
lucene/nutch/trunk/bin/nutch-daemon.sh
lucene/nutch/trunk/bin/nutch-daemons.sh
lucene/nutch/trunk/bin/slaves.sh
lucene/nutch/trunk/bin/start-all.sh
lucene/nutch/trunk/bin/stop-all.sh
lucene/nutch/trunk/lib/jetty-5.1.4.LICENSE.txt
lucene/nutch/trunk/lib/jetty-5.1.4.jar
lucene/nutch/trunk/lib/jetty-ext/
lucene/nutch/trunk/src/java/org/apache/nutch/fs/
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/NdfsDirectory.java
lucene/nutch/trunk/src/java/org/apache/nutch/io/
lucene/nutch/trunk/src/java/org/apache/nutch/ipc/
lucene/nutch/trunk/src/java/org/apache/nutch/mapred/
lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/
lucene/nutch/trunk/src/java/org/apache/nutch/util/Daemon.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/LogFormatter.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConf.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfigurable.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfigured.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/Progress.java
lucene/nutch/trunk/src/test/org/apache/nutch/fs/
lucene/nutch/trunk/src/test/org/apache/nutch/io/
lucene/nutch/trunk/src/test/org/apache/nutch/ipc/
lucene/nutch/trunk/src/test/org/apache/nutch/mapred/
lucene/nutch/trunk/src/test/org/apache/nutch/ndfs/
lucene/nutch/trunk/src/webapps/
Modified:
lucene/nutch/trunk/bin/nutch
lucene/nutch/trunk/conf/crawl-tool.xml
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/conf/nutch-site.xml.template
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java
lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilter.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java
lucene/nutch/trunk/src/java/org/apache/nutch/net/BasicUrlNormalizer.java
lucene/nutch/trunk/src/java/org/apache/nutch/net/RegexUrlNormalizer.java
lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilter.java
lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilterChecker.java
lucene/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java
lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlNormalizer.java
lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlNormalizerFactory.java
lucene/nutch/trunk/src/java/org/apache/nutch/ontology/OntologyFactory.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParsePluginsReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseUtil.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/Parser.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/Plugin.java
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginDescriptor.java
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ContentProperties.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolFactory.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FieldQueryFilter.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Hit.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitDetails.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Query.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilter.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/QueryFilters.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summarizer.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/servlet/Cached.java
lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java
lucene/nutch/trunk/src/java/org/apache/nutch/tools/PruneIndexTool.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/ThreadPool.java
lucene/nutch/trunk/src/plugin/build.xml
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCDeleteUnlicensedTool.java
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCQueryFilter.java
lucene/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageQueryFilter.java
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/NGramProfile.java
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
lucene/nutch/trunk/src/plugin/languageidentifier/src/test/org/apache/nutch/analysis/lang/TestLanguageIdentifier.java
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OntologyImpl.java
lucene/nutch/trunk/src/plugin/ontology/src/java/org/apache/nutch/ontology/OwlParser.java
lucene/nutch/trunk/src/plugin/ontology/src/test/org/apache/nutch/ontology/TestOntology.java
lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
lucene/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java
lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/MSPowerPointParser.java
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PPTExtractor.java
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/PropertiesReaderListener.java
lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
lucene/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
lucene/nutch/trunk/src/plugin/parse-rtf/src/java/org/apache/nutch/parse/rtf/RTFParseFactory.java
lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
lucene/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java
lucene/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java
lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
lucene/nutch/trunk/src/plugin/query-more/src/java/org/apache/nutch/searcher/more/DateQueryFilter.java
lucene/nutch/trunk/src/plugin/query-more/src/java/org/apache/nutch/searcher/more/TypeQueryFilter.java
lucene/nutch/trunk/src/plugin/query-site/src/java/org/apache/nutch/searcher/site/SiteQueryFilter.java
lucene/nutch/trunk/src/plugin/query-url/src/java/org/apache/nutch/searcher/url/URLQueryFilter.java
lucene/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/net/PrefixURLFilter.java
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java
lucene/nutch/trunk/src/test/nutch-site.xml
lucene/nutch/trunk/src/test/org/apache/nutch/analysis/TestQueryParser.java
lucene/nutch/trunk/src/test/org/apache/nutch/net/TestBasicUrlNormalizer.java
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseText.java
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParserFactory.java
lucene/nutch/trunk/src/test/org/apache/nutch/plugin/SimpleTestPlugin.java
lucene/nutch/trunk/src/test/org/apache/nutch/plugin/TestPluginSystem.java
lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java
lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContentProperties.java
lucene/nutch/trunk/src/test/org/apache/nutch/searcher/TestHitDetails.java
lucene/nutch/trunk/src/test/org/apache/nutch/searcher/TestQuery.java
Modified: lucene/nutch/trunk/bin/nutch
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/bin/nutch?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/bin/nutch (original)
+++ lucene/nutch/trunk/bin/nutch Fri Feb 3 16:38:32 2006
@@ -43,12 +43,6 @@
echo " dedup remove duplicates from a set of segment indexes"
echo " plugin load a plugin and run one of its classes main()"
echo " server run a search server"
- echo " namenode run the NDFS namenode"
- echo " datanode run an NDFS datanode"
- echo " ndfs run an NDFS admin client"
- echo " jobtracker run the MapReduce job Tracker node"
- echo " tasktracker run a MapReduce task Tracker node"
- echo " job manipulate MapReduce jobs"
echo " or"
echo " CLASSNAME run the class named CLASSNAME"
echo "Most commands print help when invoked w/o parameters."
@@ -155,18 +149,6 @@
CLASS=org.apache.nutch.plugin.PluginRepository
elif [ "$COMMAND" = "server" ] ; then
CLASS='org.apache.nutch.searcher.DistributedSearch$Server'
-elif [ "$COMMAND" = "namenode" ] ; then
- CLASS='org.apache.nutch.ndfs.NameNode'
-elif [ "$COMMAND" = "datanode" ] ; then
- CLASS='org.apache.nutch.ndfs.DataNode'
-elif [ "$COMMAND" = "ndfs" ] ; then
- CLASS=org.apache.nutch.fs.NDFSShell
-elif [ "$COMMAND" = "jobtracker" ] ; then
- CLASS=org.apache.nutch.mapred.JobTracker
-elif [ "$COMMAND" = "tasktracker" ] ; then
- CLASS=org.apache.nutch.mapred.TaskTracker
-elif [ "$COMMAND" = "job" ] ; then
- CLASS=org.apache.nutch.mapred.JobClient
else
CLASS=$COMMAND
fi
Modified: lucene/nutch/trunk/conf/crawl-tool.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/crawl-tool.xml?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/crawl-tool.xml (original)
+++ lucene/nutch/trunk/conf/crawl-tool.xml Fri Feb 3 16:38:32 2006
@@ -1,5 +1,5 @@
<?xml version="1.0" ?>
-<?xml:stylesheet type="text/xsl" href="nutch-conf.xsl"?>
+<?xml:stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Overridden defaults for intranet use. -->
@@ -7,7 +7,7 @@
<!-- wish to modify from this file into nutch-site.xml and change them -->
<!-- there. If nutch-site.xml does not already exist, create it. -->
-<nutch-conf>
+<configuration>
<property>
<name>urlfilter.regex.file</name>
@@ -40,4 +40,4 @@
each.</description>
</property>
-</nutch-conf>
+</configuration>
Added: lucene/nutch/trunk/conf/hadoop-default.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/hadoop-default.xml?rev=374796&view=auto
==============================================================================
--- lucene/nutch/trunk/conf/hadoop-default.xml (added)
+++ lucene/nutch/trunk/conf/hadoop-default.xml Fri Feb 3 16:38:32 2006
@@ -0,0 +1,237 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Do not modify this file directly. Instead, copy entries that you -->
+<!-- wish to modify from this file into hadoop-site.xml and change them -->
+<!-- there. If hadoop-site.xml does not already exist, create it. -->
+
+<configuration>
+
+<!-- file properties -->
+
+<property>
+ <name>file.content.limit</name>
+ <value>65536</value>
+ <description>The length limit for downloaded content, in bytes.
+ If this value is larger than zero, content longer than it will be
+ truncated; otherwise (zero or negative), no truncation at all.
+ </description>
+</property>
+
+<property>
+ <name>file.content.ignored</name>
+ <value>true</value>
+ <description>If true, no file content will be saved during fetch.
+ And it is probably what we want to set most of the time, since file:// URLs
+ are meant to be local and we can always use them directly at parsing
+ and indexing stages. Otherwise file contents will be saved.
+ !! NOT IMPLEMENTED YET !!
+ </description>
+</property>
+
+<!-- i/o properties -->
+
+<property>
+ <name>io.sort.factor</name>
+ <value>10</value>
+ <description>The number of streams to merge at once while sorting
+ files. This determines the number of open file handles.</description>
+</property>
+
+<property>
+ <name>io.sort.mb</name>
+ <value>100</value>
+ <description>The total amount of buffer memory to use while sorting
+ files, in megabytes. By default, gives each merge stream 1MB, which
+ should minimize seeks.</description>
+</property>
+
+<property>
+ <name>io.file.buffer.size</name>
+ <value>4096</value>
+ <description>The size of buffer for use in sequence files.
+ The size of this buffer should probably be a multiple of hardware
+ page size (4096 on Intel x86), and it determines how much data is
+ buffered during read and write operations.</description>
+</property>
+
+<property>
+ <name>io.bytes.per.checksum</name>
+ <value>512</value>
+ <description>The number of bytes per checksum. Must not be larger than
+ io.file.buffer.size.</description>
+</property>
+
+<property>
+ <name>io.skip.checksum.errors</name>
+ <value>false</value>
+ <description>If true, when a checksum error is encountered while
+ reading a sequence file, entries are skipped, instead of throwing an
+ exception.</description>
+</property>
+
+<property>
+ <name>io.map.index.skip</name>
+ <value>0</value>
+ <description>Number of index entries to skip between each entry.
+ Zero by default. Setting this to values larger than zero can
+ facilitate opening large map files using less memory.</description>
+</property>
+
+<!-- file system properties -->
+
+<property>
+ <name>fs.default.name</name>
+ <value>local</value>
+ <description>The name of the default file system. Either the
+ literal string "local" or a host:port for DFS.</description>
+</property>
+
+<property>
+ <name>dfs.datanode.port</name>
+ <value>50010</value>
+ <description>The port number that the dfs datanode server uses as a starting
+ point to look for a free port to listen on.
+</description>
+</property>
+
+<property>
+ <name>dfs.name.dir</name>
+ <value>/tmp/hadoop/dfs/name</value>
+ <description>Determines where on the local filesystem the DFS name node
+ should store the name table.</description>
+</property>
+
+<property>
+ <name>dfs.data.dir</name>
+ <value>/tmp/hadoop/dfs/data</value>
+ <description>Determines where on the local filesystem an DFS data node
+ should store its blocks. If this is a comma- or space-delimited
+ list of directories, then data will be stored in all named
+ directories, typically on different devices.</description>
+</property>
+
+<property>
+ <name>dfs.replication</name>
+ <value>3</value>
+ <description>How many copies we try to have at all times. The actual
+ number of replications is at max the number of datanodes in the
+ cluster.</description>
+</property>
+
+<!-- map/reduce properties -->
+
+<property>
+ <name>mapred.job.tracker</name>
+ <value>local</value>
+ <description>The host and port that the MapReduce job tracker runs
+ at. If "local", then jobs are run in-process as a single map
+ and reduce task.
+ </description>
+</property>
+
+<property>
+ <name>mapred.job.tracker.info.port</name>
+ <value>50030</value>
+ <description>The port that the MapReduce job tracker info webserver runs at.
+ </description>
+</property>
+
+<property>
+ <name>mapred.task.tracker.output.port</name>
+ <value>50040</value>
+ <description>The port number that the MapReduce task tracker output server uses as a starting
+ point to look for a free port to listen on.
+ </description>
+</property>
+
+<property>
+ <name>mapred.task.tracker.report.port</name>
+ <value>50050</value>
+ <description>The port number that the MapReduce task tracker report server uses as a starting
+ point to look for a free port to listen on.
+ </description>
+</property>
+
+<property>
+ <name>mapred.local.dir</name>
+ <value>/tmp/hadoop/mapred/local</value>
+ <description>The local directory where MapReduce stores intermediate
+ data files. May be a space- or comma- separated list of
+ directories on different devices in order to spread disk i/o.
+ </description>
+</property>
+
+<property>
+ <name>mapred.system.dir</name>
+ <value>/tmp/hadoop/mapred/system</value>
+ <description>The shared directory where MapReduce stores control files.
+ </description>
+</property>
+
+<property>
+ <name>mapred.temp.dir</name>
+ <value>/tmp/hadoop/mapred/temp</value>
+ <description>A shared directory for temporary files.
+ </description>
+</property>
+
+<property>
+ <name>mapred.map.tasks</name>
+ <value>2</value>
+ <description>The default number of map tasks per job. Typically set
+ to a prime several times greater than number of available hosts.
+ Ignored when mapred.job.tracker is "local".
+ </description>
+</property>
+
+<property>
+ <name>mapred.reduce.tasks</name>
+ <value>1</value>
+ <description>The default number of reduce tasks per job. Typically set
+ to a prime close to the number of available hosts. Ignored when
+ mapred.job.tracker is "local".
+ </description>
+</property>
+
+<property>
+ <name>mapred.task.timeout</name>
+ <value>600000</value>
+ <description>The number of milliseconds before a task will be
+ terminated if it neither reads an input, writes an output, nor
+ updates its status string.
+ </description>
+</property>
+
+<property>
+ <name>mapred.tasktracker.tasks.maximum</name>
+ <value>2</value>
+ <description>The maximum number of tasks that will be run
+ simultaneously by a task tracker.
+ </description>
+</property>
+
+<property>
+ <name>mapred.child.heap.size</name>
+ <value>200m</value>
+ <description>The heap size (-Xmx) that will be used for task tracker
+ child processes.</description>
+</property>
+
+<property>
+ <name>mapred.combine.buffer.size</name>
+ <value>100000</value>
+ <description>The number of entries the combining collector caches before
+ combining them and writing to disk.</description>
+</property>
+
+
+<!-- ipc properties -->
+
+<property>
+ <name>ipc.client.timeout</name>
+ <value>60000</value>
+ <description>Defines the timeout for IPC calls in milliseconds.</description>
+</property>
+
+</configuration>
Added: lucene/nutch/trunk/conf/mapred-default.xml.template
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/mapred-default.xml.template?rev=374796&view=auto
==============================================================================
--- lucene/nutch/trunk/conf/mapred-default.xml.template (added)
+++ lucene/nutch/trunk/conf/mapred-default.xml.template Fri Feb 3 16:38:32 2006
@@ -0,0 +1,8 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?>
+
+<!-- Put mapred-specific property overrides in this file. -->
+
+<configuration>
+
+</configuration>
Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Fri Feb 3 16:38:32 2006
@@ -1,11 +1,11 @@
<?xml version="1.0"?>
-<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Do not modify this file directly. Instead, copy entries that you -->
<!-- wish to modify from this file into nutch-site.xml and change them -->
<!-- there. If nutch-site.xml does not already exist, create it. -->
-<nutch-conf>
+<configuration>
<!-- HTTP properties -->
@@ -110,28 +110,6 @@
trying to fetch a page.</description>
</property>
-<!-- FILE properties -->
-
-<property>
- <name>file.content.limit</name>
- <value>65536</value>
- <description>The length limit for downloaded content, in bytes.
- If this value is larger than zero, content longer than it will be
- truncated; otherwise (zero or negative), no truncation at all.
- </description>
-</property>
-
-<property>
- <name>file.content.ignored</name>
- <value>true</value>
- <description>If true, no file content will be saved during fetch.
- And it is probably what we want to set most of time, since file:// URLs
- are meant to be local and we can always use them directly at parsing
- and indexing stages. Otherwise file contents will be saved.
- !! NO IMPLEMENTED YET !!
- </description>
-</property>
-
<!-- FTP properties -->
<property>
@@ -338,202 +316,6 @@
<description>If true, fetcher will store content.</description>
</property>
-<!-- i/o properties -->
-
-<property>
- <name>io.sort.factor</name>
- <value>10</value>
- <description>The number of streams to merge at once while sorting
- files. This determines the number of open file handles.</description>
-</property>
-
-<property>
- <name>io.sort.mb</name>
- <value>100</value>
- <description>The total amount of buffer memory to use while sorting
- files, in megabytes. By default, gives each merge stream 1MB, which
- should minimize seeks.</description>
-</property>
-
-<property>
- <name>io.file.buffer.size</name>
- <value>4096</value>
- <description>The size of buffer for use in sequence files.
- The size of this buffer should probably be a multiple of hardware
- page size (4096 on Intel x86), and it determines how much data is
- buffered during read and write operations.</description>
-</property>
-
-<property>
- <name>io.bytes.per.checksum</name>
- <value>512</value>
- <description>The number of bytes per checksum. Must not be larger than
- io.file.buffer.size.</description>
-</property>
-
-<property>
- <name>io.skip.checksum.errors</name>
- <value>false</value>
- <description>If true, when a checksum error is encountered while
- reading a sequence file, entries are skipped, instead of throwing an
- exception.</description>
-</property>
-
-<property>
- <name>io.map.index.skip</name>
- <value>0</value>
- <description>Number of index entries to skip between each entry.
- Zero by default. Setting this to values larger than zero can
- facilitate opening large map files using less memory.</description>
-</property>
-
-<!-- file system properties -->
-
-<property>
- <name>fs.default.name</name>
- <value>local</value>
- <description>The name of the default file system. Either the
- literal string "local" or a host:port for NDFS.</description>
-</property>
-
-<property>
- <name>ndfs.datanode.port</name>
- <value>50010</value>
- <description>The port number that the ndfs datanode server uses as a starting
- point to look for a free port to listen on.
-</description>
-</property>
-
-<property>
- <name>ndfs.name.dir</name>
- <value>/tmp/nutch/ndfs/name</value>
- <description>Determines where on the local filesystem the NDFS name node
- should store the name table.</description>
-</property>
-
-<property>
- <name>ndfs.data.dir</name>
- <value>/tmp/nutch/ndfs/data</value>
- <description>Determines where on the local filesystem an NDFS data node
- should store its blocks. If this is a comma- or space-delimited
- list of directories, then data will be stored in all named
- directories, typically on different devices.</description>
-</property>
-
-<property>
- <name>ndfs.replication</name>
- <value>3</value>
- <description>How many copies we try to have at all times. The actual
- number of replications is at max the number of datanodes in the
- cluster.</description>
-</property>
-
-<!-- map/reduce properties -->
-
-<property>
- <name>mapred.job.tracker</name>
- <value>local</value>
- <description>The host and port that the MapReduce job tracker runs
- at. If "local", then jobs are run in-process as a single map
- and reduce task.
- </description>
-</property>
-
-<property>
- <name>mapred.job.tracker.info.port</name>
- <value>50030</value>
- <description>The port that the MapReduce job tracker info webserver runs at.
- </description>
-</property>
-
-<property>
- <name>mapred.task.tracker.output.port</name>
- <value>50040</value>
- <description>The port number that the MapReduce task tracker output server uses as a starting
- point to look for a free port to listen on.
- </description>
-</property>
-
-<property>
- <name>mapred.task.tracker.report.port</name>
- <value>50050</value>
- <description>The port number that the MapReduce task tracker report server uses as a starting
- point to look for a free port to listen on.
- </description>
-</property>
-
-<property>
- <name>mapred.local.dir</name>
- <value>/tmp/nutch/mapred/local</value>
- <description>The local directory where MapReduce stores intermediate
- data files. May be a space- or comma- separated list of
- directories on different devices in order to spread disk i/o.
- </description>
-</property>
-
-<property>
- <name>mapred.system.dir</name>
- <value>/tmp/nutch/mapred/system</value>
- <description>The shared directory where MapReduce stores control files.
- </description>
-</property>
-
-<property>
- <name>mapred.temp.dir</name>
- <value>/tmp/nutch/mapred/temp</value>
- <description>A shared directory for temporary files.
- </description>
-</property>
-
-<property>
- <name>mapred.map.tasks</name>
- <value>2</value>
- <description>The default number of map tasks per job. Typically set
- to a prime several times greater than number of available hosts.
- Ignored when mapred.job.tracker is "local".
- </description>
-</property>
-
-<property>
- <name>mapred.reduce.tasks</name>
- <value>1</value>
- <description>The default number of reduce tasks per job. Typically set
- to a prime close to the number of available hosts. Ignored when
- mapred.job.tracker is "local".
- </description>
-</property>
-
-<property>
- <name>mapred.task.timeout</name>
- <value>600000</value>
- <description>The number of milliseconds before a task will be
- terminated if it neither reads an input, writes an output, nor
- updates its status string.
- </description>
-</property>
-
-<property>
- <name>mapred.tasktracker.tasks.maximum</name>
- <value>2</value>
- <description>The maximum number of tasks that will be run
- simultaneously by a task tracker.
- </description>
-</property>
-
-<property>
- <name>mapred.child.heap.size</name>
- <value>200m</value>
- <description>The heap size (-Xmx) that will be used for task tracker
- child processes.</description>
-</property>
-
-<property>
- <name>mapred.combine.buffer.size</name>
- <value>100000</value>
- <description>The number of entries the combining collector caches before
- combining them and writing to disk.</description>
-</property>
-
<!-- indexer properties -->
<property>
@@ -727,14 +509,6 @@
</description>
</property>
-<!-- ipc properties -->
-
-<property>
- <name>ipc.client.timeout</name>
- <value>60000</value>
- <description>Defines the timeout for IPC calls in milliseconds.</description>
-</property>
-
<!-- plugin properties -->
<property>
@@ -949,4 +723,4 @@
</description>
</property>
-</nutch-conf>
+</configuration>
Modified: lucene/nutch/trunk/conf/nutch-site.xml.template
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-site.xml.template?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-site.xml.template (original)
+++ lucene/nutch/trunk/conf/nutch-site.xml.template Fri Feb 3 16:38:32 2006
@@ -1,8 +1,8 @@
<?xml version="1.0"?>
-<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
-<nutch-conf>
+<configuration>
-</nutch-conf>
+</configuration>
Added: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=374796&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/AnalyzerFactory.java Fri Feb 3 16:38:32 2006
@@ -22,8 +22,9 @@
import org.apache.nutch.plugin.Extension;
import org.apache.nutch.plugin.ExtensionPoint;
import org.apache.nutch.plugin.PluginRuntimeException;
-import org.apache.nutch.util.LogFormatter;
-import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.conf.Configuration;
/**
@@ -40,12 +41,12 @@
private NutchAnalyzer DEFAULT_ANALYZER;
private ExtensionPoint extensionPoint;
- private NutchConf nutchConf;
+ private Configuration conf;
- public AnalyzerFactory (NutchConf nutchConf) {
- DEFAULT_ANALYZER = new NutchDocumentAnalyzer(nutchConf);
- this.nutchConf = nutchConf;
- this.extensionPoint = nutchConf.getPluginRepository().getExtensionPoint(NutchAnalyzer.X_POINT_ID);
+ public AnalyzerFactory (Configuration conf) {
+ DEFAULT_ANALYZER = new NutchDocumentAnalyzer(conf);
+ this.conf = conf;
+ this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(NutchAnalyzer.X_POINT_ID);
if(this.extensionPoint == null) {
throw new RuntimeException("x point " + NutchAnalyzer.X_POINT_ID +
" not found.");
@@ -77,10 +78,10 @@
private Extension getExtension(String lang) {
- Extension extension = (Extension) this.nutchConf.getObject(lang);
+ Extension extension = (Extension) this.conf.getObject(lang);
if (extension == null) {
extension = findExtension(lang);
- this.nutchConf.setObject(lang, extension);
+ this.conf.setObject(lang, extension);
}
return extension;
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/CommonGrams.java Fri Feb 3 16:38:32 2006
@@ -24,8 +24,9 @@
import java.util.*;
import java.util.logging.Logger;
-import org.apache.nutch.util.*;
-
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.searcher.Query.*;
/** Construct n-grams for frequently occuring terms and phrases while indexing.
@@ -40,10 +41,10 @@
/**
* The constructor.
- * @param nutchConf
+ * @param conf
*/
- public CommonGrams(NutchConf nutchConf) {
- init(nutchConf);
+ public CommonGrams(Configuration conf) {
+ init(conf);
}
private static class Filter extends TokenFilter {
@@ -133,10 +134,10 @@
}
/** Construct using the provided config file. */
- private void init(NutchConf nutchConf) {
+ private void init(Configuration conf) {
try {
- Reader reader = nutchConf.getConfResourceAsReader
- (nutchConf.get("analysis.common.terms.file"));
+ Reader reader = conf.getConfResourceAsReader
+ (conf.get("analysis.common.terms.file"));
BufferedReader in = new BufferedReader(reader);
String line;
while ((line = in.readLine()) != null) {
@@ -236,7 +237,7 @@
text.append(' ');
}
TokenStream ts = new NutchDocumentTokenizer(new StringReader(text.toString()));
- CommonGrams commonGrams = new CommonGrams(new NutchConf());
+ CommonGrams commonGrams = new CommonGrams(NutchConfiguration.create());
ts = commonGrams.getFilter(ts, "url");
Token token;
while ((token = ts.next()) != null) {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysis.java Fri Feb 3 16:38:32 2006
@@ -5,8 +5,9 @@
import org.apache.nutch.searcher.QueryFilters;
import org.apache.nutch.searcher.Query.Clause;
import org.apache.nutch.searcher.Query.Clause;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
import org.apache.lucene.analysis.StopFilter;
+import org.apache.nutch.util.NutchConfiguration;
import java.io.*;
import java.util.*;
@@ -35,12 +36,12 @@
}
/** Construct a query parser for the text in a reader. */
- public static Query parseQuery(String queryString, NutchConf nutchConf) throws IOException {
+ public static Query parseQuery(String queryString, Configuration conf) throws IOException {
NutchAnalysis parser =
new NutchAnalysis(new FastCharStream(new StringReader(queryString)));
parser.queryString = queryString;
- parser.queryFilters = new QueryFilters(nutchConf);
- return parser.parse(nutchConf);
+ parser.queryFilters = new QueryFilters(conf);
+ return parser.parse(conf);
}
/** For debugging. */
@@ -49,13 +50,13 @@
while (true) {
System.out.print("Query: ");
String line = in.readLine();
- System.out.println(parseQuery(line, new NutchConf()));
+ System.out.println(parseQuery(line, NutchConfiguration.create()));
}
}
/** Parse a query. */
- final public Query parse(NutchConf nutchConf) throws ParseException {
- Query query = new Query(nutchConf);
+ final public Query parse(Configuration conf) throws ParseException {
+ Query query = new Query(conf);
ArrayList terms;
Token token;
String field;
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchAnalysisTokenManager.java Fri Feb 3 16:38:32 2006
@@ -4,7 +4,7 @@
import org.apache.nutch.searcher.QueryFilters;
import org.apache.nutch.searcher.Query.Clause;
import org.apache.nutch.searcher.Query.Clause;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
import org.apache.lucene.analysis.StopFilter;
import java.io.*;
import java.util.*;
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java Fri Feb 3 16:38:32 2006
@@ -24,7 +24,8 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.conf.Configuration;
/**
* The analyzer used for Nutch documents. Uses the JavaCC-defined lexical
@@ -44,13 +45,13 @@
public static final int INTER_ANCHOR_GAP = 4;
/** Analyzer used to analyze anchors. */
private static Analyzer ANCHOR_ANALYZER;
- private NutchConf nutchConf;
+ private Configuration conf;
/**
* @param conf
*/
- public NutchDocumentAnalyzer(NutchConf conf) {
- this.nutchConf = conf;
+ public NutchDocumentAnalyzer(Configuration conf) {
+ this.conf = conf;
CONTENT_ANALYZER = new ContentAnalyzer(conf);
ANCHOR_ANALYZER = new AnchorAnalyzer();
}
@@ -59,8 +60,8 @@
private static class ContentAnalyzer extends Analyzer {
private CommonGrams commonGrams;
- public ContentAnalyzer(NutchConf nutchConf) {
- this.commonGrams = new CommonGrams(nutchConf);
+ public ContentAnalyzer(Configuration conf) {
+ this.commonGrams = new CommonGrams(conf);
}
/** Constructs a {@link NutchDocumentTokenizer}. */
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClustererFactory.java Fri Feb 3 16:38:32 2006
@@ -17,9 +17,9 @@
package org.apache.nutch.clustering;
import org.apache.nutch.plugin.*;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
import java.util.logging.Logger;
-import org.apache.nutch.util.LogFormatter;
+import org.apache.hadoop.util.LogFormatter;
/**
* A factory for retrieving {@link OnlineClusterer} extensions.
@@ -33,9 +33,9 @@
private ExtensionPoint extensionPoint;
private String extensionName;
- public OnlineClustererFactory(NutchConf nutchConf) {
- this.extensionPoint = nutchConf.getPluginRepository().getExtensionPoint(OnlineClusterer.X_POINT_ID);
- this.extensionName = nutchConf.get("extension.clustering.extension-name");
+ public OnlineClustererFactory(Configuration conf) {
+ this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(OnlineClusterer.X_POINT_ID);
+ this.extensionName = conf.get("extension.clustering.extension-name");
}
/**
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Fri Feb 3 16:38:32 2006
@@ -22,13 +22,15 @@
import java.util.logging.*;
import org.apache.nutch.fetcher.Fetcher;
-import org.apache.nutch.fs.*;
-import org.apache.nutch.util.*;
-import org.apache.nutch.mapred.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.mapred.*;
import org.apache.nutch.parse.ParseSegment;
import org.apache.nutch.indexer.DeleteDuplicates;
import org.apache.nutch.indexer.IndexMerger;
import org.apache.nutch.indexer.Indexer;
+import org.apache.nutch.util.NutchConfiguration;
public class Crawl {
public static final Logger LOG =
@@ -48,13 +50,13 @@
return;
}
- NutchConf nutchConf = new NutchConf();
- nutchConf.addConfResource("crawl-tool.xml");
- JobConf conf = new JobConf(nutchConf);
+ Configuration conf = NutchConfiguration.create();
+ conf.addAppResource("crawl-tool.xml");
+ JobConf job = new JobConf(conf);
File rootUrlDir = null;
File dir = new File("crawl-" + getDate());
- int threads = conf.getInt("fetcher.threads.fetch", 10);
+ int threads = job.getInt("fetcher.threads.fetch", 10);
int depth = 5;
int topN = Integer.MAX_VALUE;
@@ -76,7 +78,7 @@
}
}
- NutchFileSystem fs = NutchFileSystem.get(conf);
+ FileSystem fs = FileSystem.get(job);
if (fs.exists(dir)) {
throw new RuntimeException(dir + " already exists.");
}
@@ -95,28 +97,28 @@
File indexes = new File(dir + "/indexes");
File index = new File(dir + "/index");
- File tmpDir = conf.getLocalFile("crawl", getDate());
+ File tmpDir = job.getLocalFile("crawl", getDate());
// initialize crawlDb
- new Injector(conf).inject(crawlDb, rootUrlDir);
+ new Injector(job).inject(crawlDb, rootUrlDir);
for (int i = 0; i < depth; i++) { // generate new segment
File segment =
- new Generator(conf).generate(crawlDb, segments, -1,
+ new Generator(job).generate(crawlDb, segments, -1,
topN, System.currentTimeMillis());
- new Fetcher(conf).fetch(segment, threads, Fetcher.isParsing(conf)); // fetch it
- if (!Fetcher.isParsing(conf)) {
- new ParseSegment(conf).parse(segment); // parse it, if needed
+ new Fetcher(job).fetch(segment, threads, Fetcher.isParsing(job)); // fetch it
+ if (!Fetcher.isParsing(job)) {
+ new ParseSegment(job).parse(segment); // parse it, if needed
}
- new CrawlDb(conf).update(crawlDb, segment); // update crawldb
+ new CrawlDb(job).update(crawlDb, segment); // update crawldb
}
- new LinkDb(conf).invert(linkDb, segments); // invert links
+ new LinkDb(job).invert(linkDb, segments); // invert links
// index, dedup & merge
- new Indexer(conf).index(indexes, crawlDb, linkDb, fs.listFiles(segments));
- new DeleteDuplicates(conf).dedup(new File[] { indexes });
- new IndexMerger(fs, fs.listFiles(indexes), index, tmpDir, nutchConf).merge();
+ new Indexer(job).index(indexes, crawlDb, linkDb, fs.listFiles(segments));
+ new DeleteDuplicates(job).dedup(new File[] { indexes });
+ new IndexMerger(fs, fs.listFiles(indexes), index, tmpDir, job).merge();
LOG.info("crawl finished: " + dir);
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Fri Feb 3 16:38:32 2006
@@ -19,7 +19,8 @@
import java.io.*;
import java.util.*;
-import org.apache.nutch.io.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.conf.*;
import org.apache.nutch.util.*;
/* The crawl state of a url. */
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Fri Feb 3 16:38:32 2006
@@ -20,20 +20,23 @@
import java.util.*;
import java.util.logging.*;
-import org.apache.nutch.io.*;
-import org.apache.nutch.fs.*;
-import org.apache.nutch.util.*;
-import org.apache.nutch.mapred.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.mapred.*;
+
+import org.apache.nutch.util.NutchConfiguration;
/** This class takes a flat file of URLs and adds them to the of pages to be
* crawled. Useful for bootstrapping the system. */
-public class CrawlDb extends NutchConfigured {
+public class CrawlDb extends Configured {
public static final Logger LOG =
LogFormatter.getLogger("org.apache.nutch.crawl.CrawlDb");
/** Construct an CrawlDb. */
- public CrawlDb(NutchConf conf) {
+ public CrawlDb(Configuration conf) {
super(conf);
}
@@ -53,7 +56,7 @@
LOG.info("CrawlDb update: done");
}
- public static JobConf createJob(NutchConf config, File crawlDb) {
+ public static JobConf createJob(Configuration config, File crawlDb) {
File newCrawlDb =
new File(crawlDb,
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
@@ -77,7 +80,7 @@
public static void install(JobConf job, File crawlDb) throws IOException {
File newCrawlDb = job.getOutputDir();
- NutchFileSystem fs = new JobClient(job).getFs();
+ FileSystem fs = new JobClient(job).getFs();
File old = new File(crawlDb, "old");
File current = new File(crawlDb, CrawlDatum.DB_DIR_NAME);
fs.delete(old);
@@ -87,7 +90,7 @@
}
public static void main(String[] args) throws Exception {
- CrawlDb crawlDb = new CrawlDb(new NutchConf());
+ CrawlDb crawlDb = new CrawlDb(NutchConfiguration.create());
if (args.length < 2) {
System.err.println("Usage: <crawldb> <segment>");
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Fri Feb 3 16:38:32 2006
@@ -22,27 +22,29 @@
import java.util.TreeMap;
import java.util.logging.Logger;
-import org.apache.nutch.fs.NutchFileSystem;
-import org.apache.nutch.io.LongWritable;
-import org.apache.nutch.io.MapFile;
-import org.apache.nutch.io.SequenceFile;
-import org.apache.nutch.io.UTF8;
-import org.apache.nutch.io.Writable;
-import org.apache.nutch.io.WritableComparable;
-import org.apache.nutch.mapred.JobClient;
-import org.apache.nutch.mapred.JobConf;
-import org.apache.nutch.mapred.MapFileOutputFormat;
-import org.apache.nutch.mapred.Mapper;
-import org.apache.nutch.mapred.OutputCollector;
-import org.apache.nutch.mapred.Reducer;
-import org.apache.nutch.mapred.Reporter;
-import org.apache.nutch.mapred.SequenceFileInputFormat;
-import org.apache.nutch.mapred.SequenceFileOutputFormat;
-import org.apache.nutch.mapred.TextOutputFormat;
-import org.apache.nutch.mapred.lib.HashPartitioner;
-import org.apache.nutch.util.LogFormatter;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.MapFile.Reader;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapFileOutputFormat;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.hadoop.mapred.TextOutputFormat;
+import org.apache.hadoop.mapred.lib.HashPartitioner;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
/**
* Read utility for the CrawlDB.
*
@@ -130,7 +132,7 @@
}
}
- public void processStatJob(String crawlDb, NutchConf config) throws IOException {
+ public void processStatJob(String crawlDb, Configuration config) throws IOException {
LOG.info("CrawlDb statistics start: " + crawlDb);
File tmpFolder = new File(crawlDb, "stat_tmp" + System.currentTimeMillis());
@@ -152,7 +154,7 @@
JobClient.runJob(job);
// reading the result
- NutchFileSystem fileSystem = NutchFileSystem.get(config);
+ FileSystem fileSystem = FileSystem.get(config);
SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(config, tmpFolder);
UTF8 key = new UTF8();
@@ -201,8 +203,8 @@
}
- public void readUrl(String crawlDb, String url, NutchConf config) throws IOException {
- NutchFileSystem fs = NutchFileSystem.get(config);
+ public void readUrl(String crawlDb, String url, Configuration config) throws IOException {
+ FileSystem fs = FileSystem.get(config);
UTF8 key = new UTF8(url);
CrawlDatum val = new CrawlDatum();
MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new File(crawlDb, CrawlDatum.DB_DIR_NAME), config);
@@ -215,7 +217,7 @@
}
}
- public void processDumpJob(String crawlDb, String output, NutchConf config) throws IOException {
+ public void processDumpJob(String crawlDb, String output, Configuration config) throws IOException {
LOG.info("CrawlDb dump: starting");
LOG.info("CrawlDb db: " + crawlDb);
@@ -249,7 +251,7 @@
}
String param = null;
String crawlDb = args[0];
- NutchConf conf = new NutchConf();
+ Configuration conf = NutchConfiguration.create();
for (int i = 1; i < args.length; i++) {
if (args[i].equals("-stats")) {
dbr.processStatJob(crawlDb, conf);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Fri Feb 3 16:38:32 2006
@@ -19,8 +19,8 @@
import java.util.Iterator;
import java.io.IOException;
-import org.apache.nutch.io.*;
-import org.apache.nutch.mapred.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.mapred.*;
/** Merge new page entries with existing entries. */
public class CrawlDbReducer implements Reducer {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Fri Feb 3 16:38:32 2006
@@ -22,13 +22,16 @@
import java.text.*;
import java.util.logging.*;
-import org.apache.nutch.io.*;
-import org.apache.nutch.util.*;
-import org.apache.nutch.mapred.*;
-import org.apache.nutch.mapred.lib.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.mapred.lib.*;
+
+import org.apache.nutch.util.NutchConfiguration;
/** Generates a subset of a crawl db to fetch. */
-public class Generator extends NutchConfigured {
+public class Generator extends Configured {
public static final Logger LOG =
LogFormatter.getLogger("org.apache.nutch.crawl.Generator");
@@ -141,7 +144,7 @@
}
/** Construct a generator. */
- public Generator(NutchConf conf) {
+ public Generator(Configuration conf) {
super(conf);
}
@@ -258,7 +261,7 @@
if (topN != Long.MAX_VALUE)
LOG.info("topN: " + topN);
- Generator gen = new Generator(new NutchConf());
+ Generator gen = new Generator(NutchConfiguration.create());
gen.generate(dbDir, segmentsDir, numFetchers, topN, curTime);
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Fri Feb 3 16:38:32 2006
@@ -20,15 +20,18 @@
import java.util.*;
import java.util.logging.*;
-import org.apache.nutch.io.*;
-import org.apache.nutch.fs.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.mapred.*;
+
import org.apache.nutch.net.*;
-import org.apache.nutch.util.*;
-import org.apache.nutch.mapred.*;
+import org.apache.nutch.util.NutchConfiguration;
/** This class takes a flat file of URLs and adds them to the of pages to be
* crawled. Useful for bootstrapping the system. */
-public class Injector extends NutchConfigured {
+public class Injector extends Configured {
public static final Logger LOG =
LogFormatter.getLogger("org.apache.nutch.crawl.Injector");
@@ -79,7 +82,7 @@
}
/** Construct an Injector. */
- public Injector(NutchConf conf) {
+ public Injector(Configuration conf) {
super(conf);
}
@@ -114,14 +117,14 @@
CrawlDb.install(mergeJob, crawlDb);
// clean up
- NutchFileSystem fs = new JobClient(getConf()).getFs();
+ FileSystem fs = new JobClient(getConf()).getFs();
fs.delete(tempDir);
LOG.info("Injector: done");
}
public static void main(String[] args) throws Exception {
- Injector injector = new Injector(new NutchConf());
+ Injector injector = new Injector(NutchConfiguration.create());
if (args.length < 2) {
System.err.println("Usage: Injector <crawldb> <url_dir>");
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java Fri Feb 3 16:38:32 2006
@@ -17,7 +17,7 @@
package org.apache.nutch.crawl;
import java.io.*;
-import org.apache.nutch.io.*;
+import org.apache.hadoop.io.*;
/* An incoming link to a page. */
public class Inlink implements Writable {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java Fri Feb 3 16:38:32 2006
@@ -20,7 +20,7 @@
import java.net.*;
import java.util.*;
-import org.apache.nutch.io.*;
+import org.apache.hadoop.io.*;
/** A list of {@link Inlink}s. */
public class Inlinks implements Writable {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Fri Feb 3 16:38:32 2006
@@ -21,14 +21,17 @@
import java.util.logging.*;
import java.net.*;
-import org.apache.nutch.io.*;
-import org.apache.nutch.fs.*;
-import org.apache.nutch.util.*;
-import org.apache.nutch.mapred.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.mapred.*;
+
import org.apache.nutch.parse.*;
+import org.apache.nutch.util.NutchConfiguration;
/** Maintains an inverted link map, listing incoming links for each url. */
-public class LinkDb extends NutchConfigured implements Mapper, Reducer {
+public class LinkDb extends Configured implements Mapper, Reducer {
public static final Logger LOG =
LogFormatter.getLogger("org.apache.nutch.crawl.LinkDb");
@@ -44,7 +47,7 @@
}
/** Construct an LinkDb. */
- public LinkDb(NutchConf conf) {
+ public LinkDb(Configuration conf) {
super(conf);
}
@@ -145,7 +148,7 @@
LOG.info("LinkDb: done");
}
- private static JobConf createJob(NutchConf config, File linkDb) {
+ private static JobConf createJob(Configuration config, File linkDb) {
File newLinkDb =
new File(linkDb,
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
@@ -171,7 +174,7 @@
public static void install(JobConf job, File linkDb) throws IOException {
File newLinkDb = job.getOutputDir();
- NutchFileSystem fs = new JobClient(job).getFs();
+ FileSystem fs = new JobClient(job).getFs();
File old = new File(linkDb, "old");
File current = new File(linkDb, CURRENT_NAME);
fs.delete(old);
@@ -181,7 +184,7 @@
}
public static void main(String[] args) throws Exception {
- LinkDb linkDb = new LinkDb(new NutchConf());
+ LinkDb linkDb = new LinkDb(NutchConfiguration.create());
if (args.length < 2) {
System.err.println("Usage: <linkdb> <segments>");
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Fri Feb 3 16:38:32 2006
@@ -19,12 +19,14 @@
import java.io.IOException;
import java.io.File;
-import org.apache.nutch.io.*;
-import org.apache.nutch.fs.*;
-import org.apache.nutch.mapred.*;
-import org.apache.nutch.mapred.lib.HashPartitioner;
-import org.apache.nutch.util.LogFormatter;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.mapred.lib.HashPartitioner;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.util.NutchConfiguration;
import java.util.logging.Logger;
@@ -34,15 +36,15 @@
private static final Partitioner PARTITIONER = new HashPartitioner();
- private NutchFileSystem fs;
+ private FileSystem fs;
private File directory;
private MapFile.Reader[] readers;
- private NutchConf nutchConf;
+ private Configuration conf;
- public LinkDbReader(NutchFileSystem fs, File directory, NutchConf nutchConf) {
+ public LinkDbReader(FileSystem fs, File directory, Configuration conf) {
this.fs = fs;
this.directory = directory;
- this.nutchConf = nutchConf;
+ this.conf = conf;
}
public String[] getAnchors(UTF8 url) throws IOException {
@@ -57,7 +59,7 @@
synchronized (this) {
if (readers == null) {
readers = MapFileOutputFormat.getReaders
- (fs, new File(directory, LinkDb.CURRENT_NAME), this.nutchConf);
+ (fs, new File(directory, LinkDb.CURRENT_NAME), this.conf);
}
}
@@ -65,7 +67,7 @@
(readers, PARTITIONER, url, new Inlinks());
}
- public static void processDumpJob(String linkdb, String output, NutchConf config) throws IOException {
+ public static void processDumpJob(String linkdb, String output, Configuration config) throws IOException {
LOG.info("LinkDb dump: starting");
LOG.info("LinkDb db: " + linkdb);
File outFolder = new File(output);
@@ -92,11 +94,11 @@
System.err.println("\t-url <url>\tprint information about <url> to System.out");
return;
}
- NutchConf nutchConf = new NutchConf();
+ Configuration conf = NutchConfiguration.create();
if (args[1].equals("-dump")) {
- LinkDbReader.processDumpJob(args[0], args[2], nutchConf);
+ LinkDbReader.processDumpJob(args[0], args[2], conf);
} else if (args[1].equals("-url")) {
- LinkDbReader dbr = new LinkDbReader(NutchFileSystem.get(new NutchConf()), new File(args[0]), nutchConf);
+ LinkDbReader dbr = new LinkDbReader(FileSystem.get(NutchConfiguration.create()), new File(args[0]), conf);
Inlinks links = dbr.getInlinks(new UTF8(args[2]));
if (links == null) {
System.out.println(" - no link information.");
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java Fri Feb 3 16:38:32 2006
@@ -16,7 +16,7 @@
package org.apache.nutch.crawl;
-import org.apache.nutch.io.MD5Hash;
+import org.apache.hadoop.io.MD5Hash;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.protocol.Content;
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java Fri Feb 3 16:38:32 2006
@@ -19,8 +19,8 @@
import java.net.URL;
import java.net.MalformedURLException;
-import org.apache.nutch.io.*;
-import org.apache.nutch.mapred.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.mapred.*;
/** Partition urls by hostname. */
public class PartitionUrlByHost implements Partitioner {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java Fri Feb 3 16:38:32 2006
@@ -18,19 +18,19 @@
import org.apache.nutch.parse.Parse;
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.NutchConfigurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configurable;
-public abstract class Signature implements NutchConfigurable {
- protected NutchConf conf;
+public abstract class Signature implements Configurable {
+ protected Configuration conf;
public abstract byte[] calculate(Content content, Parse parse);
- public NutchConf getConf() {
+ public Configuration getConf() {
return conf;
}
- public void setConf(NutchConf conf) {
+ public void setConf(Configuration conf) {
this.conf = conf;
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java Fri Feb 3 16:38:32 2006
@@ -18,13 +18,13 @@
import java.util.logging.Logger;
-import org.apache.nutch.util.LogFormatter;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.conf.Configuration;
/**
* Factory class, which instantiates a Signature implementation according to the
- * current NutchConf configuration. This newly created instance is cached in the
- * NutchConf instance, so that it could be later retrieved.
+ * current Configuration configuration. This newly created instance is cached in the
+ * Configuration instance, so that it could be later retrieved.
*
* @author Andrzej Bialecki <ab@getopt.org>
*/
@@ -35,7 +35,7 @@
private SignatureFactory() {} // no public ctor
/** Return the default Signature implementation. */
- public static Signature getSignature(NutchConf conf) {
+ public static Signature getSignature(Configuration conf) {
String clazz = conf.get("db.signature.class", MD5Signature.class.getName());
Signature impl = (Signature)conf.getObject(clazz);
if (impl == null) {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java Fri Feb 3 16:38:32 2006
@@ -26,12 +26,13 @@
import java.util.HashMap;
import java.util.Iterator;
-import org.apache.nutch.io.MD5Hash;
+import org.apache.hadoop.io.MD5Hash;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.StringUtil;
+import org.apache.nutch.util.NutchConfiguration;
/**
* <p>An implementation of a page signature. It calculates an MD5 hash
@@ -157,7 +158,7 @@
public static void main(String[] args) throws Exception {
TextProfileSignature sig = new TextProfileSignature();
- sig.setConf(new NutchConf());
+ sig.setConf(NutchConfiguration.create());
HashMap res = new HashMap();
File[] files = new File(args[0]).listFiles();
for (int i = 0; i < files.length; i++) {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Fri Feb 3 16:38:32 2006
@@ -19,20 +19,23 @@
import java.io.IOException;
import java.io.File;
-import org.apache.nutch.io.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.mapred.*;
+
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.SignatureFactory;
-import org.apache.nutch.fs.*;
import org.apache.nutch.net.*;
-import org.apache.nutch.util.*;
import org.apache.nutch.protocol.*;
import org.apache.nutch.parse.*;
-import org.apache.nutch.mapred.*;
+import org.apache.nutch.util.*;
import java.util.logging.*;
/** The fetcher. Most of the work is done by plugins. */
-public class Fetcher extends NutchConfigured implements MapRunnable {
+public class Fetcher extends Configured implements MapRunnable {
public static final Logger LOG =
LogFormatter.getLogger("org.apache.nutch.fetcher.Fetcher");
@@ -43,7 +46,7 @@
public static class InputFormat extends SequenceFileInputFormat {
/** Don't split inputs, to keep things polite. */
- public FileSplit[] getSplits(NutchFileSystem fs, JobConf job, int nSplits)
+ public FileSplit[] getSplits(FileSystem fs, JobConf job, int nSplits)
throws IOException {
File[] files = listFiles(fs, job);
FileSplit[] splits = new FileSplit[files.length];
@@ -73,18 +76,18 @@
private boolean parsing;
private class FetcherThread extends Thread {
- private NutchConf nutchConf;
+ private Configuration conf;
private URLFilters urlFilters;
private ParseUtil parseUtil;
private ProtocolFactory protocolFactory;
- public FetcherThread(NutchConf nutchConf) {
+ public FetcherThread(Configuration conf) {
this.setDaemon(true); // don't hang JVM on exit
this.setName("FetcherThread"); // use an informative name
- this.nutchConf = nutchConf;
- this.urlFilters = new URLFilters(nutchConf);
- this.parseUtil = new ParseUtil(nutchConf);
- this.protocolFactory = new ProtocolFactory(nutchConf);
+ this.conf = conf;
+ this.urlFilters = new URLFilters(conf);
+ this.parseUtil = new ParseUtil(conf);
+ this.protocolFactory = new ProtocolFactory(conf);
}
public void run() {
@@ -205,7 +208,7 @@
if (content == null) {
String url = key.toString();
- content = new Content(url, url, new byte[0], "", new ContentProperties(), this.nutchConf);
+ content = new Content(url, url, new byte[0], "", new ContentProperties(), this.conf);
}
content.getMetadata().setProperty // add segment to metadata
@@ -252,7 +255,7 @@
public Fetcher() { super(null); }
- public Fetcher(NutchConf conf) { super(conf); }
+ public Fetcher(Configuration conf) { super(conf); }
private synchronized void updateStatus(int bytesInPage) throws IOException {
pages++;
@@ -283,11 +286,11 @@
}
}
- public static boolean isParsing(NutchConf conf) {
+ public static boolean isParsing(Configuration conf) {
return conf.getBoolean("fetcher.parse", true);
}
- public static boolean isStoringContent(NutchConf conf) {
+ public static boolean isStoringContent(Configuration conf) {
return conf.getBoolean("fetcher.store.content", true);
}
@@ -370,7 +373,7 @@
File segment = new File(args[0]);
- NutchConf conf = new NutchConf();
+ Configuration conf = NutchConfiguration.create();
int threads = conf.getInt("fetcher.threads.fetch", 10);
boolean parsing = true;
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java Fri Feb 3 16:38:32 2006
@@ -18,19 +18,21 @@
import java.io.*;
-import org.apache.nutch.io.*;
+import org.apache.hadoop.io.*;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.parse.*;
-import org.apache.nutch.util.NutchConf;
-import org.apache.nutch.util.NutchConfigurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configurable;
/* An entry in the fetcher's output. */
-public final class FetcherOutput implements Writable, NutchConfigurable {
+public final class FetcherOutput implements Writable, Configurable {
private CrawlDatum crawlDatum;
private Content content;
private ParseImpl parse;
- private NutchConf nutchConf;
+ private Configuration conf;
+
+ static { WritableName.setName(FetcherOutput.class, "FetcherOutput"); }
public FetcherOutput() {}
@@ -44,7 +46,7 @@
public final void readFields(DataInput in) throws IOException {
this.crawlDatum = CrawlDatum.read(in);
this.content = in.readBoolean() ? Content.read(in) : null;
- this.parse = in.readBoolean() ? ParseImpl.read(in, this.nutchConf) : null;
+ this.parse = in.readBoolean() ? ParseImpl.read(in, this.conf) : null;
}
public final void write(DataOutput out) throws IOException {
@@ -80,12 +82,12 @@
return buffer.toString();
}
- public void setConf(NutchConf conf) {
- this.nutchConf = conf;
+ public void setConf(Configuration conf) {
+ this.conf = conf;
}
- public NutchConf getConf() {
- return this.nutchConf;
+ public Configuration getConf() {
+ return this.conf;
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java Fri Feb 3 16:38:32 2006
@@ -20,17 +20,17 @@
import java.io.File;
import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.fs.NutchFileSystem;
+import org.apache.hadoop.fs.FileSystem;
-import org.apache.nutch.io.MapFile;
-import org.apache.nutch.io.WritableComparable;
-import org.apache.nutch.io.Writable;
-import org.apache.nutch.io.UTF8;
+import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.UTF8;
-import org.apache.nutch.mapred.OutputFormat;
-import org.apache.nutch.mapred.RecordWriter;
-import org.apache.nutch.mapred.JobConf;
-import org.apache.nutch.mapred.Reporter;
+import org.apache.hadoop.mapred.OutputFormat;
+import org.apache.hadoop.mapred.RecordWriter;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Reporter;
import org.apache.nutch.parse.ParseOutputFormat;
import org.apache.nutch.protocol.Content;
@@ -38,7 +38,7 @@
/** Splits FetcherOutput entries into multiple map files. */
public class FetcherOutputFormat implements OutputFormat {
- public RecordWriter getRecordWriter(final NutchFileSystem fs,
+ public RecordWriter getRecordWriter(final FileSystem fs,
final JobConf job,
final String name) throws IOException {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?rev=374796&r1=374795&r2=374796&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Fri Feb 3 16:38:32 2006
@@ -20,10 +20,13 @@
import java.util.*;
import java.util.logging.*;
-import org.apache.nutch.io.*;
-import org.apache.nutch.fs.*;
-import org.apache.nutch.util.*;
-import org.apache.nutch.mapred.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.mapred.*;
+
+import org.apache.nutch.util.NutchConfiguration;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.document.Document;
@@ -32,7 +35,7 @@
* Deletes duplicate documents in a set of Lucene indexes.
* Duplicates have either the same contents (via MD5 hash) or the same URL.
******************************************************************/
-public class DeleteDuplicates extends NutchConfigured
+public class DeleteDuplicates extends Configured
implements Mapper, Reducer, OutputFormat {
private static final Logger LOG =
LogFormatter.getLogger("org.apache.nutch.indexer.DeleteDuplicates");
@@ -127,7 +130,7 @@
private static final long INDEX_LENGTH = Integer.MAX_VALUE;
/** Return each index as a split. */
- public FileSplit[] getSplits(NutchFileSystem fs, JobConf job,
+ public FileSplit[] getSplits(FileSystem fs, JobConf job,
int numSplits)
throws IOException {
File[] files = listFiles(fs, job);
@@ -139,7 +142,7 @@
}
/** Return each index as a split. */
- public RecordReader getRecordReader(final NutchFileSystem fs,
+ public RecordReader getRecordReader(final FileSystem fs,
final FileSplit split,
final JobConf job,
Reporter reporter) throws IOException {
@@ -148,7 +151,7 @@
return new RecordReader() {
private IndexReader indexReader =
- IndexReader.open(new NdfsDirectory(fs, split.getFile(), false, job));
+ IndexReader.open(new FsDirectory(fs, split.getFile(), false, job));
{ indexReader.undeleteAll(); }
@@ -227,16 +230,17 @@
}
}
- private NutchFileSystem fs;
+ private FileSystem fs;
private int ioFileBufferSize;
public DeleteDuplicates() { super(null); }
- public DeleteDuplicates(NutchConf conf) { super(conf); }
+ public DeleteDuplicates(Configuration conf) { super(conf); }
public void configure(JobConf job) {
+ setConf(job);
try {
- fs = NutchFileSystem.get(job);
+ fs = FileSystem.get(job);
this.ioFileBufferSize = job.getInt("io.file.buffer.size", 4096);
} catch (IOException e) {
throw new RuntimeException(e);
@@ -256,7 +260,7 @@
OutputCollector output, Reporter reporter)
throws IOException {
File index = new File(key.toString());
- IndexReader reader = IndexReader.open(new NdfsDirectory(fs, index, false, getConf()));
+ IndexReader reader = IndexReader.open(new FsDirectory(fs, index, false, getConf()));
try {
while (values.hasNext()) {
reader.delete(((IntWritable)values.next()).get());
@@ -267,7 +271,7 @@
}
/** Write nothing. */
- public RecordWriter getRecordWriter(final NutchFileSystem fs,
+ public RecordWriter getRecordWriter(final FileSystem fs,
final JobConf job,
final String name) throws IOException {
return new RecordWriter() {
@@ -334,7 +338,7 @@
}
public static void main(String[] args) throws Exception {
- DeleteDuplicates dedup = new DeleteDuplicates(new NutchConf());
+ DeleteDuplicates dedup = new DeleteDuplicates(NutchConfiguration.create());
if (args.length < 1) {
System.err.println("Usage: <indexes> ...");