You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by do...@apache.org on 2009/08/17 00:25:17 UTC
svn commit: r804789 [1/6] - in /lucene/nutch/branches/nutchbase: ./ bin/
conf/ lib/ src/java/org/apache/nutch/analysis/
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/
src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/ind...
Author: dogacan
Date: Sun Aug 16 22:25:12 2009
New Revision: 804789
URL: http://svn.apache.org/viewvc?rev=804789&view=rev
Log:
NUTCH-650 - Hbase integration
Added:
lucene/nutch/branches/nutchbase/conf/hbase-site.xml.template
lucene/nutch/branches/nutchbase/lib/hadoop-0.20.0-core.jar (with props)
lucene/nutch/branches/nutchbase/lib/hbase-0.20.0-r804408.jar (with props)
lucene/nutch/branches/nutchbase/lib/zookeeper-r785019-hbase-1329.jar (with props)
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDatumHbase.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/GeneratorMapper.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/GeneratorReducer.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/PartitionSelectorByHost.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/TableUpdateMapper.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/TableUpdateReducer.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/TableUpdater.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/fetcher/FetchEntry.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/fetcher/FetcherReducer.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/fetcher/PartitionUrlByHost.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/IndexerReducer.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/TableParser.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/plugin/TablePluggable.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/scoring/ScoreDatum.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/NutchJobConf.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/hbase/
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/hbase/ColumnData.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/hbase/ColumnDescriptor.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/hbase/HbaseColumn.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/hbase/RowMutation.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/hbase/TableRow.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/hbase/TableUtil.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/hbase/WebTableColumns.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/hbase/WebTableCreator.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/hbase/WebTableRow.java
lucene/nutch/branches/nutchbase/src/test/org/apache/nutch/util/TestTableUtil.java
Removed:
lucene/nutch/branches/nutchbase/lib/hadoop-0.19.1-core.jar
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDbReader.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/LinkDb.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/LinkDbFilter.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/LinkDbMerger.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/LinkDbReader.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/fetcher/OldFetcher.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParseImpl.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParseOutputFormat.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParseResult.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParseSegment.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParserChecker.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/searcher/DistributedSegmentBean.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/searcher/LinkDbInlinks.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/segment/SegmentMerger.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/tools/FreeGenerator.java
lucene/nutch/branches/nutchbase/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
lucene/nutch/branches/nutchbase/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java
Modified:
lucene/nutch/branches/nutchbase/CHANGES.txt
lucene/nutch/branches/nutchbase/bin/nutch
lucene/nutch/branches/nutchbase/build.xml
lucene/nutch/branches/nutchbase/conf/log4j.properties
lucene/nutch/branches/nutchbase/conf/nutch-default.xml
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDatum.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDb.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/FetchSchedule.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/Injector.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/MD5Signature.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/NutchWritable.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/Signature.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/SignatureComparator.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/SignatureFactory.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/TextProfileSignature.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/fetcher/FetcherOutput.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/IndexingFilter.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/IndexingFilters.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/NutchIndexWriter.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/HTMLMetaTags.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/HtmlParseFilter.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/HtmlParseFilters.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/Outlink.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/OutlinkExtractor.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/Parse.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParseData.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParsePluginList.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParsePluginsReader.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParseStatus.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParseText.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParseUtil.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/Parser.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParserFactory.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/plugin/Pluggable.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/protocol/Content.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/protocol/Protocol.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/protocol/ProtocolFactory.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/scoring/ScoringFilter.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/scoring/ScoringFilters.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/searcher/Hit.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/searcher/NutchBean.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/searcher/SegmentBean.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/searcher/SolrSearchBean.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/segment/SegmentReader.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/EncodingDetector.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/NutchJob.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/domain/DomainStatistics.java
lucene/nutch/branches/nutchbase/src/plugin/build.xml
lucene/nutch/branches/nutchbase/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
lucene/nutch/branches/nutchbase/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java
lucene/nutch/branches/nutchbase/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
lucene/nutch/branches/nutchbase/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
lucene/nutch/branches/nutchbase/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
lucene/nutch/branches/nutchbase/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
lucene/nutch/branches/nutchbase/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
lucene/nutch/branches/nutchbase/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
lucene/nutch/branches/nutchbase/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
lucene/nutch/branches/nutchbase/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java
lucene/nutch/branches/nutchbase/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
lucene/nutch/branches/nutchbase/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
lucene/nutch/branches/nutchbase/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
lucene/nutch/branches/nutchbase/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
Modified: lucene/nutch/branches/nutchbase/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/CHANGES.txt?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/CHANGES.txt (original)
+++ lucene/nutch/branches/nutchbase/CHANGES.txt Sun Aug 16 22:25:12 2009
@@ -5,6 +5,8 @@
1. NUTCH-735 - crawl-tool.xml must be read before nutch-site.xml when
invoked using crawl command (Susam Pal via dogacan)
+ 2. NUTCH-650 - Hbase Integration (dogacan)
+
Release 1.0 - 2009-03-23
1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab)
Modified: lucene/nutch/branches/nutchbase/bin/nutch
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/bin/nutch?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/bin/nutch (original)
+++ lucene/nutch/branches/nutchbase/bin/nutch Sun Aug 16 22:25:12 2009
@@ -32,21 +32,22 @@
if [ $# = 0 ]; then
echo "Usage: nutch [-core] COMMAND"
echo "where COMMAND is one of:"
- echo " crawl one-step crawler for intranets"
- echo " readdb read / dump crawl db"
- echo " convdb convert crawl db from pre-0.9 format"
- echo " mergedb merge crawldb-s, with optional filtering"
- echo " readlinkdb read / dump link db"
+# echo " crawl one-step crawler for intranets"
+# echo " readdb read / dump crawl db"
+# echo " convdb convert crawl db from pre-0.9 format"
+# echo " mergedb merge crawldb-s, with optional filtering"
+# echo " readlinkdb read / dump link db"
+ echo " createtable create a new webtable in hbase"
echo " inject inject new urls into the database"
echo " generate generate new segments to fetch from crawl db"
- echo " freegen generate new segments to fetch from text files"
- echo " fetch fetch a segment's pages"
- echo " parse parse a segment's pages"
- echo " readseg read / dump segment data"
- echo " mergesegs merge several segments, with optional filtering and slicing"
- echo " updatedb update crawl db from segments after fetching"
- echo " invertlinks create a linkdb from parsed segments"
- echo " mergelinkdb merge linkdb-s, with optional filtering"
+# echo " freegen generate new segments to fetch from text files"
+ echo " fetch fetch URLs marked during generate"
+ echo " parse parse URLs marked during fetch"
+# echo " readseg read / dump segment data"
+# echo " mergesegs merge several segments, with optional filtering and slicing"
+# echo " updatedb update crawl db from segments after fetching"
+# echo " invertlinks create a linkdb from parsed segments"
+# echo " mergelinkdb merge linkdb-s, with optional filtering"
echo " index run the indexer on parsed segments and linkdb"
echo " solrindex run the solr indexer on parsed segments and linkdb"
echo " merge merge several segment indexes"
@@ -186,7 +187,7 @@
NUTCH_LOG_DIR=`cygpath -p -w "$NUTCH_LOG_DIR"`
fi
-NUTCH_OPTS="$NUTCH_OPTS -Dhadoop.log.dir=$NUTCH_LOG_DIR"
+NUTCH_OPTS="$NUTCH_OPTS -Dhadoop.log.dir=$NUTCH_LOG_DIR -Djavax.xml.parsers.DocumentBuilderFactory=com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl"
NUTCH_OPTS="$NUTCH_OPTS -Dhadoop.log.file=$NUTCH_LOGFILE"
if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
@@ -196,39 +197,39 @@
# figure out which class to run
if [ "$COMMAND" = "crawl" ] ; then
CLASS=org.apache.nutch.crawl.Crawl
+elif [ "$COMMAND" = "createtable" ] ; then
+ CLASS=org.apache.nutch.util.hbase.WebTableCreator
elif [ "$COMMAND" = "inject" ] ; then
CLASS=org.apache.nutch.crawl.Injector
elif [ "$COMMAND" = "generate" ] ; then
CLASS=org.apache.nutch.crawl.Generator
-elif [ "$COMMAND" = "freegen" ] ; then
- CLASS=org.apache.nutch.tools.FreeGenerator
+#elif [ "$COMMAND" = "freegen" ] ; then
+# CLASS=org.apache.nutch.tools.FreeGenerator
elif [ "$COMMAND" = "fetch" ] ; then
CLASS=org.apache.nutch.fetcher.Fetcher
-elif [ "$COMMAND" = "fetch2" ] ; then
- CLASS=org.apache.nutch.fetcher.Fetcher2
elif [ "$COMMAND" = "parse" ] ; then
- CLASS=org.apache.nutch.parse.ParseSegment
-elif [ "$COMMAND" = "readdb" ] ; then
- CLASS=org.apache.nutch.crawl.CrawlDbReader
-elif [ "$COMMAND" = "convdb" ] ; then
- CLASS=org.apache.nutch.tools.compat.CrawlDbConverter
-elif [ "$COMMAND" = "mergedb" ] ; then
- CLASS=org.apache.nutch.crawl.CrawlDbMerger
-elif [ "$COMMAND" = "readlinkdb" ] ; then
- CLASS=org.apache.nutch.crawl.LinkDbReader
-elif [ "$COMMAND" = "readseg" ] ; then
- CLASS=org.apache.nutch.segment.SegmentReader
-elif [ "$COMMAND" = "segread" ] ; then
- echo "[DEPRECATED] Command 'segread' is deprecated, use 'readseg' instead."
- CLASS=org.apache.nutch.segment.SegmentReader
-elif [ "$COMMAND" = "mergesegs" ] ; then
- CLASS=org.apache.nutch.segment.SegmentMerger
-elif [ "$COMMAND" = "updatedb" ] ; then
- CLASS=org.apache.nutch.crawl.CrawlDb
-elif [ "$COMMAND" = "invertlinks" ] ; then
- CLASS=org.apache.nutch.crawl.LinkDb
-elif [ "$COMMAND" = "mergelinkdb" ] ; then
- CLASS=org.apache.nutch.crawl.LinkDbMerger
+ CLASS=org.apache.nutch.parse.TableParser
+#elif [ "$COMMAND" = "readdb" ] ; then
+# CLASS=org.apache.nutch.crawl.CrawlDbReader
+#elif [ "$COMMAND" = "convdb" ] ; then
+# CLASS=org.apache.nutch.tools.compat.CrawlDbConverter
+#elif [ "$COMMAND" = "mergedb" ] ; then
+# CLASS=org.apache.nutch.crawl.CrawlDbMerger
+#elif [ "$COMMAND" = "readlinkdb" ] ; then
+# CLASS=org.apache.nutch.crawl.LinkDbReader
+#elif [ "$COMMAND" = "readseg" ] ; then
+# CLASS=org.apache.nutch.segment.SegmentReader
+#elif [ "$COMMAND" = "segread" ] ; then
+# echo "[DEPRECATED] Command 'segread' is deprecated, use 'readseg' instead."
+# CLASS=org.apache.nutch.segment.SegmentReader
+#elif [ "$COMMAND" = "mergesegs" ] ; then
+# CLASS=org.apache.nutch.segment.SegmentMerger
+elif [ "$COMMAND" = "updatetable" ] ; then
+ CLASS=org.apache.nutch.crawl.TableUpdater
+#elif [ "$COMMAND" = "invertlinks" ] ; then
+# CLASS=org.apache.nutch.crawl.LinkDb
+#elif [ "$COMMAND" = "mergelinkdb" ] ; then
+# CLASS=org.apache.nutch.crawl.LinkDbMerger
elif [ "$COMMAND" = "index" ] ; then
CLASS=org.apache.nutch.indexer.Indexer
elif [ "$COMMAND" = "solrindex" ] ; then
Modified: lucene/nutch/branches/nutchbase/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/build.xml?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/build.xml (original)
+++ lucene/nutch/branches/nutchbase/build.xml Sun Aug 16 22:25:12 2009
@@ -99,6 +99,7 @@
encoding="${build.encoding}"
srcdir="${src.dir}"
includes="org/apache/nutch/**/*.java"
+ excludes="org/apache/nutch/scoring/webgraph/**/*.java,org/apache/nutch/tools/compat/**/*.java,org/apache/nutch/tools/arc/**/*.java,org/apache/nutch/indexer/field/**/*.java"
destdir="${build.classes}"
debug="${javac.debug}"
optimize="${javac.optimize}"
@@ -181,6 +182,8 @@
<include name="lucene*.jar"/>
<include name="taglibs-*.jar"/>
<include name="hadoop-*.jar"/>
+ <include name="hbase-*.jar"/>
+ <include name="zookeeper-*.jar"/>
<include name="dom4j-*.jar"/>
<include name="xerces-*.jar"/>
<include name="tika-*.jar"/>
@@ -215,7 +218,8 @@
<javac
encoding="${build.encoding}"
srcdir="${test.src.dir}"
- includes="org/apache/nutch/**/*.java"
+ includes="org/apache/nutch*/**/*.java"
+ excludes="org/apache/nutch/scoring/webgraph/**/*.java,org/apache/nutch/tools/compat/**/*.java,org/apache/nutch/tools/arc/**/*.java"
destdir="${test.build.classes}"
debug="${javac.debug}"
optimize="${javac.optimize}"
Added: lucene/nutch/branches/nutchbase/conf/hbase-site.xml.template
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/conf/hbase-site.xml.template?rev=804789&view=auto
==============================================================================
--- lucene/nutch/branches/nutchbase/conf/hbase-site.xml.template (added)
+++ lucene/nutch/branches/nutchbase/conf/hbase-site.xml.template Sun Aug 16 22:25:12 2009
@@ -0,0 +1,25 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+/**
+ * Copyright 2009 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+-->
+<configuration>
+</configuration>
Modified: lucene/nutch/branches/nutchbase/conf/log4j.properties
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/conf/log4j.properties?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/conf/log4j.properties (original)
+++ lucene/nutch/branches/nutchbase/conf/log4j.properties Sun Aug 16 22:25:12 2009
@@ -28,6 +28,7 @@
log4j.logger.org.apache.nutch=INFO
log4j.logger.org.apache.hadoop=WARN
+log4j.logger.org.apache.zookeeper=WARN
#
# Daily Rolling File Appender
Modified: lucene/nutch/branches/nutchbase/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/conf/nutch-default.xml?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/conf/nutch-default.xml (original)
+++ lucene/nutch/branches/nutchbase/conf/nutch-default.xml Sun Aug 16 22:25:12 2009
@@ -886,8 +886,8 @@
<property>
<name>plugin.includes</name>
- <value>protocol-http|urlfilter-regex|parse-(text|html|js)|index-(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
- <description>Regular expression naming plugin directory names to
+ <value>protocol-http|urlfilter-regex|parse-(html|js)|index-basic|urlnormalizer-(pass|regex|basic)|scoring-opic|query-(basic|site|url)|summary-basic</value>
+ <description>Regular expression naming plugin directory names to
include. Any plugin not matching this expression is excluded.
In any case you need at least include the nutch-extensionpoints plugin. By
default Nutch includes crawling just HTML and plain text via HTTP,
Added: lucene/nutch/branches/nutchbase/lib/hadoop-0.20.0-core.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/lib/hadoop-0.20.0-core.jar?rev=804789&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/branches/nutchbase/lib/hadoop-0.20.0-core.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/branches/nutchbase/lib/hbase-0.20.0-r804408.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/lib/hbase-0.20.0-r804408.jar?rev=804789&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/branches/nutchbase/lib/hbase-0.20.0-r804408.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/branches/nutchbase/lib/zookeeper-r785019-hbase-1329.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/lib/zookeeper-r785019-hbase-1329.jar?rev=804789&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/branches/nutchbase/lib/zookeeper-r785019-hbase-1329.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java (original)
+++ lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java Sun Aug 16 22:25:12 2009
@@ -19,12 +19,14 @@
// JDK imports
import java.io.Reader;
import java.io.IOException;
+import java.util.Collection;
// Lucene imports
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
+import org.apache.nutch.util.hbase.HbaseColumn;
import org.apache.hadoop.conf.Configuration;
/**
Modified: lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java (original)
+++ lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Sun Aug 16 22:25:12 2009
@@ -17,12 +17,17 @@
package org.apache.nutch.crawl;
+import java.util.HashSet;
+import java.util.Set;
+
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.FetchSchedule;
+import org.apache.nutch.util.hbase.HbaseColumn;
+import org.apache.nutch.util.hbase.WebTableColumns;
+import org.apache.nutch.util.hbase.WebTableRow;
/**
* This class provides common methods for implementations of
@@ -30,12 +35,22 @@
*
* @author Andrzej Bialecki
*/
-public abstract class AbstractFetchSchedule extends Configured implements FetchSchedule {
+public abstract class AbstractFetchSchedule
+extends Configured
+implements FetchSchedule {
private static final Log LOG = LogFactory.getLog(AbstractFetchSchedule.class);
protected int defaultInterval;
protected int maxInterval;
+ private static final Set<HbaseColumn> COLUMNS = new HashSet<HbaseColumn>();
+
+ static {
+ COLUMNS.add(new HbaseColumn(WebTableColumns.FETCH_TIME));
+ COLUMNS.add(new HbaseColumn(WebTableColumns.RETRIES));
+ COLUMNS.add(new HbaseColumn(WebTableColumns.FETCH_INTERVAL));
+ }
+
public AbstractFetchSchedule() {
super(null);
}
@@ -49,10 +64,14 @@
if (conf == null) return;
int oldDefaultInterval = conf.getInt("db.default.fetch.interval", 0);
defaultInterval = conf.getInt("db.fetch.interval.default", 0);
- if (oldDefaultInterval > 0 && defaultInterval == 0) defaultInterval = oldDefaultInterval * SECONDS_PER_DAY;
+ if (oldDefaultInterval > 0 && defaultInterval == 0) {
+ defaultInterval = oldDefaultInterval * FetchSchedule.SECONDS_PER_DAY;
+ }
int oldMaxInterval = conf.getInt("db.max.fetch.interval", 0);
maxInterval = conf.getInt("db.fetch.interval.max", 0 );
- if (oldMaxInterval > 0 && maxInterval == 0) maxInterval = oldMaxInterval * FetchSchedule.SECONDS_PER_DAY;
+ if (oldMaxInterval > 0 && maxInterval == 0) {
+ maxInterval = oldMaxInterval * FetchSchedule.SECONDS_PER_DAY;
+ }
LOG.info("defaultInterval=" + defaultInterval);
LOG.info("maxInterval=" + maxInterval);
}
@@ -64,13 +83,12 @@
* default <code>fetchInterval</code>.
*
* @param url URL of the page.
- * @param datum datum instance to be initialized (modified in place).
+ * @param row url's row
*/
- public CrawlDatum initializeSchedule(Text url, CrawlDatum datum) {
- datum.setFetchTime(System.currentTimeMillis());
- datum.setFetchInterval(defaultInterval);
- datum.setRetriesSinceFetch(0);
- return datum;
+ public void initializeSchedule(String url, WebTableRow row) {
+ row.setFetchTime(System.currentTimeMillis());
+ row.setFetchInterval(defaultInterval);
+ row.setRetriesSinceFetch(0);
}
/**
@@ -79,11 +97,10 @@
* retry counter - extending classes should call super.setFetchSchedule() to
* preserve this behavior.
*/
- public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
+ public void setFetchSchedule(String url, WebTableRow row,
long prevFetchTime, long prevModifiedTime,
long fetchTime, long modifiedTime, int state) {
- datum.setRetriesSinceFetch(0);
- return datum;
+ row.setRetriesSinceFetch(0);
}
/**
@@ -92,20 +109,20 @@
* and if it exceeds the <code>maxInterval</code> it calls
* {@link #forceRefetch(Text, CrawlDatum, boolean)}.
* @param url URL of the page
- * @param datum datum instance to be adjusted
+ * @param row url's row
* @return adjusted page information, including all original information.
* NOTE: this may be a different instance than {@param datum}, but
* implementations should make sure that it contains at least all
* information from {@param datum}.
*/
- public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum,
+ public void setPageGoneSchedule(String url, WebTableRow row,
long prevFetchTime, long prevModifiedTime, long fetchTime) {
// no page is truly GONE ... just increase the interval by 50%
// and try much later.
- datum.setFetchInterval(datum.getFetchInterval() * 1.5f);
- datum.setFetchTime(fetchTime + (long)datum.getFetchInterval() * 1000);
- if (maxInterval < datum.getFetchInterval()) forceRefetch(url, datum, false);
- return datum;
+ int newFetchInterval = (int) (row.getFetchInterval() * 1.5f);
+ row.setFetchInterval(newFetchInterval);
+ row.setFetchTime(fetchTime + newFetchInterval * 1000L);
+ if (maxInterval < newFetchInterval) forceRefetch(url, row, false);
}
/**
@@ -114,28 +131,24 @@
* sets the next fetch time 1 day in the future and increases
* the retry counter.
* @param url URL of the page
- * @param datum page information
+ * @param row url's row
* @param prevFetchTime previous fetch time
* @param prevModifiedTime previous modified time
* @param fetchTime current fetch time
- * @return adjusted page information, including all original information.
- * NOTE: this may be a different instance than {@param datum}, but
- * implementations should make sure that it contains at least all
- * information from {@param datum}.
*/
- public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum,
+ public void setPageRetrySchedule(String url, WebTableRow row,
long prevFetchTime, long prevModifiedTime, long fetchTime) {
- datum.setFetchTime(fetchTime + (long)SECONDS_PER_DAY);
- datum.setRetriesSinceFetch(datum.getRetriesSinceFetch() + 1);
- return datum;
+ row.setFetchTime(fetchTime + (long)FetchSchedule.SECONDS_PER_DAY);
+ int oldRetries = row.getRetriesSinceFetch();
+ row.setRetriesSinceFetch(oldRetries + 1);
}
/**
* This method return the last fetch time of the CrawlDatum
* @return the date as a long.
*/
- public long calculateLastFetchTime(CrawlDatum datum) {
- return datum.getFetchTime() - (long)datum.getFetchInterval() * 1000;
+ public long calculateLastFetchTime(WebTableRow row) {
+ return row.getFetchTime() - row.getFetchInterval() * 1000L;
}
/**
@@ -148,21 +161,22 @@
* check that fetchTime is not too remote (more than <code>maxInterval</code),
* in which case it lowers the interval and returns true.
* @param url URL of the page
- * @param datum datum instance
+ * @param row url's row
* @param curTime reference time (usually set to the time when the
* fetchlist generation process was started).
* @return true, if the page should be considered for inclusion in the current
* fetchlist, otherwise false.
*/
- public boolean shouldFetch(Text url, CrawlDatum datum, long curTime) {
+ public boolean shouldFetch(String url, WebTableRow row, long curTime) {
// pages are never truly GONE - we have to check them from time to time.
// pages with too long fetchInterval are adjusted so that they fit within
// maximum fetchInterval (segment retention period).
- if (datum.getFetchTime() - curTime > (long) maxInterval * 1000) {
- datum.setFetchInterval(maxInterval * 0.9f);
- datum.setFetchTime(curTime);
+ long fetchTime = row.getFetchTime();
+ if (fetchTime - curTime > maxInterval * 1000L) {
+ row.setFetchInterval(Math.round(maxInterval * 0.9f));
+ row.setFetchTime(curTime);
}
- if (datum.getFetchTime() > curTime) {
+ if (fetchTime > curTime) {
return false; // not time yet
}
return true;
@@ -172,21 +186,25 @@
* This method resets fetchTime, fetchInterval, modifiedTime,
* retriesSinceFetch and page signature, so that it forces refetching.
* @param url URL of the page
- * @param datum datum instance
+ * @param row url's row
* @param asap if true, force refetch as soon as possible - this sets
* the fetchTime to now. If false, force refetch whenever the next fetch
* time is set.
*/
- public CrawlDatum forceRefetch(Text url, CrawlDatum datum, boolean asap) {
+ public void forceRefetch(String url, WebTableRow row, boolean asap) {
// reduce fetchInterval so that it fits within the max value
- if (datum.getFetchInterval() > maxInterval)
- datum.setFetchInterval(maxInterval * 0.9f);
- datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
- datum.setRetriesSinceFetch(0);
- datum.setSignature(null);
- datum.setModifiedTime(0L);
- if (asap) datum.setFetchTime(System.currentTimeMillis());
- return datum;
+ if (row.getFetchInterval() > maxInterval)
+ row.setFetchInterval(Math.round(maxInterval * 0.9f));
+ row.setStatus(CrawlDatumHbase.STATUS_UNFETCHED);
+ row.setRetriesSinceFetch(0);
+ // TODO: row.setSignature(null) ??
+ row.setModifiedTime(0L);
+ if (asap) row.setFetchTime(System.currentTimeMillis());
+ }
+
+
+ public Set<HbaseColumn> getColumns() {
+ return COLUMNS;
}
}
Modified: lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java (original)
+++ lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java Sun Aug 16 22:25:12 2009
@@ -18,9 +18,8 @@
package org.apache.nutch.crawl;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.crawl.FetchSchedule;
+import org.apache.nutch.util.hbase.WebTableRow;
/**
* This class implements an adaptive re-fetch algorithm. This works as follows:
@@ -63,7 +62,7 @@
private boolean SYNC_DELTA;
- private double SYNC_DELTA_RATE;
+ private float SYNC_DELTA_RATE;
public void setConf(Configuration conf) {
super.setConf(conf);
@@ -71,20 +70,22 @@
INC_RATE = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f);
DEC_RATE = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f);
MIN_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.min_interval", 60);
- MAX_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.max_interval", SECONDS_PER_DAY * 365 ); // 1 year
+ MAX_INTERVAL =
+ conf.getInt("db.fetch.schedule.adaptive.max_interval",
+ FetchSchedule.SECONDS_PER_DAY * 365 ); // 1 year
SYNC_DELTA = conf.getBoolean("db.fetch.schedule.adaptive.sync_delta", true);
SYNC_DELTA_RATE = conf.getFloat("db.fetch.schedule.adaptive.sync_delta_rate", 0.2f);
}
@Override
- public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
+ public void setFetchSchedule(String url, WebTableRow row,
long prevFetchTime, long prevModifiedTime,
long fetchTime, long modifiedTime, int state) {
- super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
+ super.setFetchSchedule(url, row, prevFetchTime, prevModifiedTime,
fetchTime, modifiedTime, state);
long refTime = fetchTime;
if (modifiedTime <= 0) modifiedTime = fetchTime;
- float interval = datum.getFetchInterval();
+ int interval = row.getFetchInterval();
switch (state) {
case FetchSchedule.STATUS_MODIFIED:
interval *= (1.0f - DEC_RATE);
@@ -95,69 +96,19 @@
case FetchSchedule.STATUS_UNKNOWN:
break;
}
+ row.setFetchInterval(interval);
if (SYNC_DELTA) {
// try to synchronize with the time of change
- long delta = (fetchTime - modifiedTime) / 1000L;
+ // TODO: different from normal class (is delta in seconds)?
+ int delta = (int) ((fetchTime - modifiedTime) / 1000L) ;
if (delta > interval) interval = delta;
- refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000);
+ refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE);
}
- if (interval < MIN_INTERVAL) {
- interval = MIN_INTERVAL;
- } else if (interval > MAX_INTERVAL) {
- interval = MAX_INTERVAL;
- }
- datum.setFetchInterval(interval);
- datum.setFetchTime(refTime + Math.round(interval * 1000.0));
- datum.setModifiedTime(modifiedTime);
- return datum;
+ if (interval < MIN_INTERVAL) interval = MIN_INTERVAL;
+ if (interval > MAX_INTERVAL) interval = MAX_INTERVAL;
+ row.setFetchTime(refTime + interval * 1000L);
+ row.setModifiedTime(modifiedTime);
}
- public static void main(String[] args) throws Exception {
- FetchSchedule fs = new AdaptiveFetchSchedule();
- fs.setConf(NutchConfiguration.create());
- // we start the time at 0, for simplicity
- long curTime = 0;
- long delta = 1000L * 3600L * 24L; // 2 hours
- // we trigger the update of the page every 30 days
- long update = 1000L * 3600L * 24L * 30L; // 30 days
- boolean changed = true;
- long lastModified = 0;
- int miss = 0;
- int totalMiss = 0;
- int maxMiss = 0;
- int fetchCnt = 0;
- int changeCnt = 0;
- // initial fetchInterval is 10 days
- CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);
- p.setFetchTime(0);
- System.out.println(p);
- // let's move the timeline a couple of deltas
- for (int i = 0; i < 10000; i++) {
- if (lastModified + update < curTime) {
- //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime);
- changed = true;
- changeCnt++;
- lastModified = curTime;
- }
- System.out.println(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
- + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days" + "\t missed " + miss);
- if (p.getFetchTime() <= curTime) {
- fetchCnt++;
- fs.setFetchSchedule(new Text("http://www.example.com"), p,
- p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
- changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED);
- System.out.println("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
- + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days");
- if (!changed) miss++;
- if (miss > maxMiss) maxMiss = miss;
- changed = false;
- totalMiss += miss;
- miss = 0;
- }
- if (changed) miss++;
- curTime += delta;
- }
- System.out.println("Total missed: " + totalMiss + ", max miss: " + maxMiss);
- System.out.println("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
- }
+
}
Modified: lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDatum.java (original)
+++ lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDatum.java Sun Aug 16 22:25:12 2009
@@ -340,7 +340,7 @@
return (that.fetchInterval - this.fetchInterval) > 0 ? 1 : -1;
if (that.modifiedTime != this.modifiedTime)
return (that.modifiedTime - this.modifiedTime) > 0 ? 1 : -1;
- return SignatureComparator._compare(this, that);
+ return SignatureComparator.compare(this.signature, that.signature);
}
/** A Comparator optimized for CrawlDatum. */
@@ -427,7 +427,7 @@
(this.modifiedTime == other.modifiedTime) &&
(this.retries == other.retries) &&
(this.fetchInterval == other.fetchInterval) &&
- (SignatureComparator._compare(this.signature, other.signature) == 0) &&
+ (SignatureComparator.compare(this.signature, other.signature) == 0) &&
(this.score == other.score);
if (!res) return res;
return metadataEquals(other.metaData);
Added: lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDatumHbase.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDatumHbase.java?rev=804789&view=auto
==============================================================================
--- lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDatumHbase.java (added)
+++ lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDatumHbase.java Sun Aug 16 22:25:12 2009
@@ -0,0 +1,38 @@
+package org.apache.nutch.crawl;
+
+import java.util.HashMap;
+import java.util.Map;
+
+public class CrawlDatumHbase {
+ /** Page was not fetched yet. */
+ public static final byte STATUS_UNFETCHED = 0x01;
+ /** Page was successfully fetched. */
+ public static final byte STATUS_FETCHED = 0x02;
+ /** Page no longer exists. */
+ public static final byte STATUS_GONE = 0x03;
+ /** Page temporarily redirects to other page. */
+ public static final byte STATUS_REDIR_TEMP = 0x04;
+ /** Page permanently redirects to other page. */
+ public static final byte STATUS_REDIR_PERM = 0x05;
+ /** Fetching unsuccessful, needs to be retried (transient errors). */
+ public static final byte STATUS_RETRY = 0x22;
+ /** Fetching successful - page is not modified. */
+ public static final byte STATUS_NOTMODIFIED = 0x26;
+
+ private static final Map<Byte, String> NAMES = new HashMap<Byte, String>();
+
+ static {
+ NAMES.put(STATUS_UNFETCHED, "status_unfetched");
+ NAMES.put(STATUS_FETCHED, "status_fetched");
+ NAMES.put(STATUS_GONE, "status_gone");
+ NAMES.put(STATUS_REDIR_TEMP, "status_redir_temp");
+ NAMES.put(STATUS_REDIR_PERM, "status_redir_perm");
+ NAMES.put(STATUS_RETRY, "status_retry");
+ NAMES.put(STATUS_NOTMODIFIED, "status_notmodified");
+ }
+
+ public static String getName(byte status) {
+ return NAMES.get(status);
+ }
+
+}
Modified: lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDb.java Sun Aug 16 22:25:12 2009
@@ -17,180 +17,10 @@
package org.apache.nutch.crawl;
-import java.io.*;
-import java.util.*;
-
-// Commons Logging imports
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-import org.apache.hadoop.io.*;
-import org.apache.hadoop.fs.*;
-import org.apache.hadoop.conf.*;
-import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.*;
-
-import org.apache.nutch.util.HadoopFSUtil;
-import org.apache.nutch.util.LockUtil;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.NutchJob;
-
-/**
- * This class takes the output of the fetcher and updates the
- * crawldb accordingly.
- */
-public class CrawlDb extends Configured implements Tool {
- public static final Log LOG = LogFactory.getLog(CrawlDb.class);
-
+public interface CrawlDb {
public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed";
public static final String CURRENT_NAME = "current";
public static final String LOCK_NAME = ".locked";
-
- public CrawlDb() {}
-
- public CrawlDb(Configuration conf) {
- setConf(conf);
- }
-
- public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter) throws IOException {
- boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
- update(crawlDb, segments, normalize, filter, additionsAllowed, false);
- }
-
- public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter, boolean additionsAllowed, boolean force) throws IOException {
- FileSystem fs = FileSystem.get(getConf());
- Path lock = new Path(crawlDb, LOCK_NAME);
- LockUtil.createLockFile(fs, lock, force);
- if (LOG.isInfoEnabled()) {
- LOG.info("CrawlDb update: starting");
- LOG.info("CrawlDb update: db: " + crawlDb);
- LOG.info("CrawlDb update: segments: " + Arrays.asList(segments));
- LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
- LOG.info("CrawlDb update: URL normalizing: " + normalize);
- LOG.info("CrawlDb update: URL filtering: " + filter);
- }
-
- JobConf job = CrawlDb.createJob(getConf(), crawlDb);
- job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
- job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
- job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);
- for (int i = 0; i < segments.length; i++) {
- Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
- Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME);
- if (fs.exists(fetch) && fs.exists(parse)) {
- FileInputFormat.addInputPath(job, fetch);
- FileInputFormat.addInputPath(job, parse);
- } else {
- LOG.info(" - skipping invalid segment " + segments[i]);
- }
- }
-
- if (LOG.isInfoEnabled()) {
- LOG.info("CrawlDb update: Merging segment data into db.");
- }
- try {
- JobClient.runJob(job);
- } catch (IOException e) {
- LockUtil.removeLockFile(fs, lock);
- Path outPath = FileOutputFormat.getOutputPath(job);
- if (fs.exists(outPath) ) fs.delete(outPath, true);
- throw e;
- }
-
- CrawlDb.install(job, crawlDb);
- if (LOG.isInfoEnabled()) { LOG.info("CrawlDb update: done"); }
- }
-
- public static JobConf createJob(Configuration config, Path crawlDb)
- throws IOException {
- Path newCrawlDb =
- new Path(crawlDb,
- Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
-
- JobConf job = new NutchJob(config);
- job.setJobName("crawldb " + crawlDb);
-
-
- Path current = new Path(crawlDb, CURRENT_NAME);
- if (FileSystem.get(job).exists(current)) {
- FileInputFormat.addInputPath(job, current);
- }
- job.setInputFormat(SequenceFileInputFormat.class);
-
- job.setMapperClass(CrawlDbFilter.class);
- job.setReducerClass(CrawlDbReducer.class);
-
- FileOutputFormat.setOutputPath(job, newCrawlDb);
- job.setOutputFormat(MapFileOutputFormat.class);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(CrawlDatum.class);
-
- return job;
- }
-
- public static void install(JobConf job, Path crawlDb) throws IOException {
- Path newCrawlDb = FileOutputFormat.getOutputPath(job);
- FileSystem fs = new JobClient(job).getFs();
- Path old = new Path(crawlDb, "old");
- Path current = new Path(crawlDb, CURRENT_NAME);
- if (fs.exists(current)) {
- if (fs.exists(old)) fs.delete(old, true);
- fs.rename(current, old);
- }
- fs.mkdirs(crawlDb);
- fs.rename(newCrawlDb, current);
- if (fs.exists(old)) fs.delete(old, true);
- Path lock = new Path(crawlDb, LOCK_NAME);
- LockUtil.removeLockFile(fs, lock);
- }
-
- public static void main(String[] args) throws Exception {
- int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDb(), args);
- System.exit(res);
- }
-
- public int run(String[] args) throws Exception {
- if (args.length < 2) {
- System.err.println("Usage: CrawlDb <crawldb> (-dir <segments> | <seg1> <seg2> ...) [-force] [-normalize] [-filter] [-noAdditions]");
- System.err.println("\tcrawldb\tCrawlDb to update");
- System.err.println("\t-dir segments\tparent directory containing all segments to update from");
- System.err.println("\tseg1 seg2 ...\tlist of segment names to update from");
- System.err.println("\t-force\tforce update even if CrawlDb appears to be locked (CAUTION advised)");
- System.err.println("\t-normalize\tuse URLNormalizer on urls in CrawlDb and segment (usually not needed)");
- System.err.println("\t-filter\tuse URLFilters on urls in CrawlDb and segment");
- System.err.println("\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs");
- return -1;
- }
- boolean normalize = false;
- boolean filter = false;
- boolean force = false;
- final FileSystem fs = FileSystem.get(getConf());
- boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
- HashSet<Path> dirs = new HashSet<Path>();
- for (int i = 1; i < args.length; i++) {
- if (args[i].equals("-normalize")) {
- normalize = true;
- } else if (args[i].equals("-filter")) {
- filter = true;
- } else if (args[i].equals("-force")) {
- force = true;
- } else if (args[i].equals("-noAdditions")) {
- additionsAllowed = false;
- } else if (args[i].equals("-dir")) {
- FileStatus[] paths = fs.listStatus(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs));
- dirs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
- } else {
- dirs.add(new Path(args[i]));
- }
- }
- try {
- update(new Path(args[0]), dirs.toArray(new Path[dirs.size()]), normalize, filter, additionsAllowed, force);
- return 0;
- } catch (Exception e) {
- LOG.fatal("CrawlDb update: " + StringUtils.stringifyException(e));
- return -1;
- }
- }
}
Modified: lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java (original)
+++ lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java Sun Aug 16 22:25:12 2009
@@ -17,7 +17,7 @@
package org.apache.nutch.crawl;
-import org.apache.hadoop.io.Text;
+import org.apache.nutch.util.hbase.WebTableRow;
/**
* This class implements the default re-fetch schedule. That is, no matter
@@ -30,13 +30,13 @@
public class DefaultFetchSchedule extends AbstractFetchSchedule {
@Override
- public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
+ public void setFetchSchedule(String url, WebTableRow row,
long prevFetchTime, long prevModifiedTime,
long fetchTime, long modifiedTime, int state) {
- datum = super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
+ super.setFetchSchedule(url, row, prevFetchTime, prevModifiedTime,
fetchTime, modifiedTime, state);
- datum.setFetchTime(fetchTime + (long)datum.getFetchInterval() * 1000);
- datum.setModifiedTime(modifiedTime);
- return datum;
+ row.setFetchTime(fetchTime + row.getFetchInterval() * 1000L);
+ row.setModifiedTime(modifiedTime);
}
+
}
Modified: lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/FetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/FetchSchedule.java?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/FetchSchedule.java (original)
+++ lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/FetchSchedule.java Sun Aug 16 22:25:12 2009
@@ -17,8 +17,12 @@
package org.apache.nutch.crawl;
+import java.util.Set;
+
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.io.Text;
+import org.apache.nutch.util.hbase.HbaseColumn;
+import org.apache.nutch.util.hbase.WebTableRow;
/**
* This interface defines the contract for implementations that manipulate
@@ -27,15 +31,16 @@
* @author Andrzej Bialecki
*/
public interface FetchSchedule extends Configurable {
-
+
/** It is unknown whether page was changed since our last visit. */
public static final int STATUS_UNKNOWN = 0;
/** Page is known to have been modified since our last visit. */
public static final int STATUS_MODIFIED = 1;
/** Page is known to remain unmodified since our last visit. */
public static final int STATUS_NOTMODIFIED = 2;
-
+
public static final int SECONDS_PER_DAY = 3600 * 24;
+
/**
* Initialize fetch schedule related data. Implementations should at least
* set the <code>fetchTime</code> and <code>fetchInterval</code>. The default
@@ -43,14 +48,10 @@
* default <code>fetchInterval</code>.
*
* @param url URL of the page.
- * @param datum datum instance to be initialized.
- * @return adjusted page information, including all original information.
- * NOTE: this may be a different instance than {@param datum}, but
- * implementations should make sure that it contains at least all
- * information from {@param datum}.
+ * @param row url's row
*/
- public CrawlDatum initializeSchedule(Text url, CrawlDatum datum);
-
+ public void initializeSchedule(String url, WebTableRow row);
+
/**
* Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a
* successfully fetched page.
@@ -58,10 +59,9 @@
* schedules.
*
* @param url url of the page
- * @param datum page description to be adjusted. NOTE: this instance, passed by reference,
- * may be modified inside the method.
- * @param prevFetchTime previous value of fetch time, or 0 if not available
- * @param prevModifiedTime previous value of modifiedTime, or 0 if not available
+ * @param row url's row
+ * @param prevFetchTime previous value of fetch time, or -1 if not available
+ * @param prevModifiedTime previous value of modifiedTime, or -1 if not available
* @param fetchTime the latest time, when the page was recently re-fetched. Most FetchSchedule
* implementations should update the value in {@param datum} to something greater than this value.
* @param modifiedTime last time the content was modified. This information comes from
@@ -72,52 +72,41 @@
* This information may be obtained by comparing page signatures before and after fetching. If this
* is set to {@link #STATUS_UNKNOWN}, then it is unknown whether the page was changed; implementations
* are free to follow a sensible default behavior.
- * @return adjusted page information, including all original information. NOTE: this may
- * be a different instance than {@param datum}, but implementations should make sure that
- * it contains at least all information from {@param datum}.
- */
- public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
- long prevFetchTime, long prevModifiedTime,
- long fetchTime, long modifiedTime, int state);
-
+ */
+ public void setFetchSchedule(String url, WebTableRow row,
+ long prevFetchTime, long prevModifiedTime,
+ long fetchTime, long modifiedTime, int state);
+
/**
* This method specifies how to schedule refetching of pages
* marked as GONE. Default implementation increases fetchInterval by 50%,
* and if it exceeds the <code>maxInterval</code> it calls
* {@link #forceRefetch(Text, CrawlDatum, boolean)}.
* @param url URL of the page
- * @param datum datum instance to be adjusted
- * @return adjusted page information, including all original information.
- * NOTE: this may be a different instance than {@param datum}, but
- * implementations should make sure that it contains at least all
- * information from {@param datum}.
- */
- public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum,
- long prevFetchTime, long prevModifiedTime, long fetchTime);
-
+ * @param row url's row
+ */
+ public void setPageGoneSchedule(String url, WebTableRow row,
+ long prevFetchTime, long prevModifiedTime, long fetchTime);
+
/**
* This method adjusts the fetch schedule if fetching needs to be
* re-tried due to transient errors. The default implementation
* sets the next fetch time 1 day in the future and increases the
* retry counter.
* @param url URL of the page
- * @param datum page information
+ * @param row url's row
* @param prevFetchTime previous fetch time
* @param prevModifiedTime previous modified time
* @param fetchTime current fetch time
- * @return adjusted page information, including all original information.
- * NOTE: this may be a different instance than {@param datum}, but
- * implementations should make sure that it contains at least all
- * information from {@param datum}.
- */
- public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum,
- long prevFetchTime, long prevModifiedTime, long fetchTime);
-
+ */
+ public void setPageRetrySchedule(String url, WebTableRow row,
+ long prevFetchTime, long prevModifiedTime, long fetchTime);
+
/**
* Calculates last fetch time of the given CrawlDatum.
* @return the date as a long.
*/
- public long calculateLastFetchTime(CrawlDatum datum);
+ public long calculateLastFetchTime(WebTableRow row);
/**
* This method provides information whether the page is suitable for
@@ -129,26 +118,24 @@
* check that fetchTime is not too remote (more than <code>maxInterval</code>),
* in which case it lowers the interval and returns true.
* @param url URL of the page
- * @param datum datum instance
+ * @param row url's row
* @param curTime reference time (usually set to the time when the
* fetchlist generation process was started).
* @return true, if the page should be considered for inclusion in the current
* fetchlist, otherwise false.
*/
- public boolean shouldFetch(Text url, CrawlDatum datum, long curTime);
-
+ public boolean shouldFetch(String url, WebTableRow row, long curTime);
+
/**
* This method resets fetchTime, fetchInterval, modifiedTime and
* page signature, so that it forces refetching.
* @param url URL of the page
- * @param datum datum instance
+ * @param row url's row
* @param asap if true, force refetch as soon as possible - this sets
* the fetchTime to now. If false, force refetch whenever the next fetch
* time is set.
- * @return adjusted page information, including all original information.
- * NOTE: this may be a different instance than {@param datum}, but
- * implementations should make sure that it contains at least all
- * information from {@param datum}.
*/
- public CrawlDatum forceRefetch(Text url, CrawlDatum datum, boolean asap);
+ public void forceRefetch(String url, WebTableRow row, boolean asap);
+
+ public Set<HbaseColumn> getColumns();
}
Modified: lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java (original)
+++ lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java Sun Aug 16 22:25:12 2009
@@ -37,7 +37,7 @@
if (impl == null) {
try {
LOG.info("Using FetchSchedule impl: " + clazz);
- Class implClass = Class.forName(clazz);
+ Class<?> implClass = Class.forName(clazz);
impl = (FetchSchedule)implClass.newInstance();
impl.setConf(conf);
objectCache.setObject(clazz, impl);