You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by do...@apache.org on 2009/08/17 00:25:17 UTC
svn commit: r804789 [1/6] - in /lucene/nutch/branches/nutchbase: ./ bin/
conf/ lib/ src/java/org/apache/nutch/analysis/
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/
src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/ind...
Author: dogacan
Date: Sun Aug 16 22:25:12 2009
New Revision: 804789
URL: http://svn.apache.org/viewvc?rev=804789&view=rev
Log:
NUTCH-650 - Hbase integration
Added:
lucene/nutch/branches/nutchbase/conf/hbase-site.xml.template
lucene/nutch/branches/nutchbase/lib/hadoop-0.20.0-core.jar (with props)
lucene/nutch/branches/nutchbase/lib/hbase-0.20.0-r804408.jar (with props)
lucene/nutch/branches/nutchbase/lib/zookeeper-r785019-hbase-1329.jar (with props)
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDatumHbase.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/GeneratorMapper.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/GeneratorReducer.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/PartitionSelectorByHost.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/TableUpdateMapper.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/TableUpdateReducer.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/TableUpdater.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/fetcher/FetchEntry.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/fetcher/FetcherReducer.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/fetcher/PartitionUrlByHost.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/IndexerReducer.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/TableParser.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/plugin/TablePluggable.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/scoring/ScoreDatum.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/NutchJobConf.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/hbase/
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/hbase/ColumnData.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/hbase/ColumnDescriptor.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/hbase/HbaseColumn.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/hbase/RowMutation.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/hbase/TableRow.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/hbase/TableUtil.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/hbase/WebTableColumns.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/hbase/WebTableCreator.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/hbase/WebTableRow.java
lucene/nutch/branches/nutchbase/src/test/org/apache/nutch/util/TestTableUtil.java
Removed:
lucene/nutch/branches/nutchbase/lib/hadoop-0.19.1-core.jar
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDbReader.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/LinkDb.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/LinkDbFilter.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/LinkDbMerger.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/LinkDbReader.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/fetcher/OldFetcher.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParseImpl.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParseOutputFormat.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParseResult.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParseSegment.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParserChecker.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/searcher/DistributedSegmentBean.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/searcher/LinkDbInlinks.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/segment/SegmentMerger.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/tools/FreeGenerator.java
lucene/nutch/branches/nutchbase/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
lucene/nutch/branches/nutchbase/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java
Modified:
lucene/nutch/branches/nutchbase/CHANGES.txt
lucene/nutch/branches/nutchbase/bin/nutch
lucene/nutch/branches/nutchbase/build.xml
lucene/nutch/branches/nutchbase/conf/log4j.properties
lucene/nutch/branches/nutchbase/conf/nutch-default.xml
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDatum.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDb.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/FetchSchedule.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/Injector.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/MD5Signature.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/NutchWritable.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/Signature.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/SignatureComparator.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/SignatureFactory.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/TextProfileSignature.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/fetcher/FetcherOutput.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/IndexerOutputFormat.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/IndexingFilter.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/IndexingFilters.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/NutchIndexWriter.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/solr/SolrIndexer.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/HTMLMetaTags.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/HtmlParseFilter.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/HtmlParseFilters.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/Outlink.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/OutlinkExtractor.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/Parse.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParseData.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParsePluginList.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParsePluginsReader.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParseStatus.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParseText.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParseUtil.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/Parser.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/parse/ParserFactory.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/plugin/Pluggable.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/protocol/Content.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/protocol/Protocol.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/protocol/ProtocolFactory.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/scoring/ScoringFilter.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/scoring/ScoringFilters.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/searcher/Hit.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/searcher/NutchBean.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/searcher/SegmentBean.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/searcher/SolrSearchBean.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/segment/SegmentReader.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/EncodingDetector.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/NutchJob.java
lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/util/domain/DomainStatistics.java
lucene/nutch/branches/nutchbase/src/plugin/build.xml
lucene/nutch/branches/nutchbase/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
lucene/nutch/branches/nutchbase/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java
lucene/nutch/branches/nutchbase/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
lucene/nutch/branches/nutchbase/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
lucene/nutch/branches/nutchbase/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
lucene/nutch/branches/nutchbase/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
lucene/nutch/branches/nutchbase/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
lucene/nutch/branches/nutchbase/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
lucene/nutch/branches/nutchbase/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
lucene/nutch/branches/nutchbase/src/plugin/parse-oo/src/java/org/apache/nutch/parse/oo/OOParser.java
lucene/nutch/branches/nutchbase/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
lucene/nutch/branches/nutchbase/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
lucene/nutch/branches/nutchbase/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
lucene/nutch/branches/nutchbase/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
Modified: lucene/nutch/branches/nutchbase/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/CHANGES.txt?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/CHANGES.txt (original)
+++ lucene/nutch/branches/nutchbase/CHANGES.txt Sun Aug 16 22:25:12 2009
@@ -5,6 +5,8 @@
1. NUTCH-735 - crawl-tool.xml must be read before nutch-site.xml when
invoked using crawl command (Susam Pal via dogacan)
+ 2. NUTCH-650 - Hbase Integration (dogacan)
+
Release 1.0 - 2009-03-23
1. NUTCH-474 - Fetcher2 crawlDelay and blocking fix (Dogacan Guney via ab)
Modified: lucene/nutch/branches/nutchbase/bin/nutch
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/bin/nutch?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/bin/nutch (original)
+++ lucene/nutch/branches/nutchbase/bin/nutch Sun Aug 16 22:25:12 2009
@@ -32,21 +32,22 @@
if [ $# = 0 ]; then
echo "Usage: nutch [-core] COMMAND"
echo "where COMMAND is one of:"
- echo " crawl one-step crawler for intranets"
- echo " readdb read / dump crawl db"
- echo " convdb convert crawl db from pre-0.9 format"
- echo " mergedb merge crawldb-s, with optional filtering"
- echo " readlinkdb read / dump link db"
+# echo " crawl one-step crawler for intranets"
+# echo " readdb read / dump crawl db"
+# echo " convdb convert crawl db from pre-0.9 format"
+# echo " mergedb merge crawldb-s, with optional filtering"
+# echo " readlinkdb read / dump link db"
+ echo " createtable create a new webtable in hbase"
echo " inject inject new urls into the database"
echo " generate generate new segments to fetch from crawl db"
- echo " freegen generate new segments to fetch from text files"
- echo " fetch fetch a segment's pages"
- echo " parse parse a segment's pages"
- echo " readseg read / dump segment data"
- echo " mergesegs merge several segments, with optional filtering and slicing"
- echo " updatedb update crawl db from segments after fetching"
- echo " invertlinks create a linkdb from parsed segments"
- echo " mergelinkdb merge linkdb-s, with optional filtering"
+# echo " freegen generate new segments to fetch from text files"
+ echo " fetch fetch URLs marked during generate"
+ echo " parse parse URLs marked during fetch"
+# echo " readseg read / dump segment data"
+# echo " mergesegs merge several segments, with optional filtering and slicing"
+# echo " updatedb update crawl db from segments after fetching"
+# echo " invertlinks create a linkdb from parsed segments"
+# echo " mergelinkdb merge linkdb-s, with optional filtering"
echo " index run the indexer on parsed segments and linkdb"
echo " solrindex run the solr indexer on parsed segments and linkdb"
echo " merge merge several segment indexes"
@@ -186,7 +187,7 @@
NUTCH_LOG_DIR=`cygpath -p -w "$NUTCH_LOG_DIR"`
fi
-NUTCH_OPTS="$NUTCH_OPTS -Dhadoop.log.dir=$NUTCH_LOG_DIR"
+NUTCH_OPTS="$NUTCH_OPTS -Dhadoop.log.dir=$NUTCH_LOG_DIR -Djavax.xml.parsers.DocumentBuilderFactory=com.sun.org.apache.xerces.internal.jaxp.DocumentBuilderFactoryImpl"
NUTCH_OPTS="$NUTCH_OPTS -Dhadoop.log.file=$NUTCH_LOGFILE"
if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
@@ -196,39 +197,39 @@
# figure out which class to run
if [ "$COMMAND" = "crawl" ] ; then
CLASS=org.apache.nutch.crawl.Crawl
+elif [ "$COMMAND" = "createtable" ] ; then
+ CLASS=org.apache.nutch.util.hbase.WebTableCreator
elif [ "$COMMAND" = "inject" ] ; then
CLASS=org.apache.nutch.crawl.Injector
elif [ "$COMMAND" = "generate" ] ; then
CLASS=org.apache.nutch.crawl.Generator
-elif [ "$COMMAND" = "freegen" ] ; then
- CLASS=org.apache.nutch.tools.FreeGenerator
+#elif [ "$COMMAND" = "freegen" ] ; then
+# CLASS=org.apache.nutch.tools.FreeGenerator
elif [ "$COMMAND" = "fetch" ] ; then
CLASS=org.apache.nutch.fetcher.Fetcher
-elif [ "$COMMAND" = "fetch2" ] ; then
- CLASS=org.apache.nutch.fetcher.Fetcher2
elif [ "$COMMAND" = "parse" ] ; then
- CLASS=org.apache.nutch.parse.ParseSegment
-elif [ "$COMMAND" = "readdb" ] ; then
- CLASS=org.apache.nutch.crawl.CrawlDbReader
-elif [ "$COMMAND" = "convdb" ] ; then
- CLASS=org.apache.nutch.tools.compat.CrawlDbConverter
-elif [ "$COMMAND" = "mergedb" ] ; then
- CLASS=org.apache.nutch.crawl.CrawlDbMerger
-elif [ "$COMMAND" = "readlinkdb" ] ; then
- CLASS=org.apache.nutch.crawl.LinkDbReader
-elif [ "$COMMAND" = "readseg" ] ; then
- CLASS=org.apache.nutch.segment.SegmentReader
-elif [ "$COMMAND" = "segread" ] ; then
- echo "[DEPRECATED] Command 'segread' is deprecated, use 'readseg' instead."
- CLASS=org.apache.nutch.segment.SegmentReader
-elif [ "$COMMAND" = "mergesegs" ] ; then
- CLASS=org.apache.nutch.segment.SegmentMerger
-elif [ "$COMMAND" = "updatedb" ] ; then
- CLASS=org.apache.nutch.crawl.CrawlDb
-elif [ "$COMMAND" = "invertlinks" ] ; then
- CLASS=org.apache.nutch.crawl.LinkDb
-elif [ "$COMMAND" = "mergelinkdb" ] ; then
- CLASS=org.apache.nutch.crawl.LinkDbMerger
+ CLASS=org.apache.nutch.parse.TableParser
+#elif [ "$COMMAND" = "readdb" ] ; then
+# CLASS=org.apache.nutch.crawl.CrawlDbReader
+#elif [ "$COMMAND" = "convdb" ] ; then
+# CLASS=org.apache.nutch.tools.compat.CrawlDbConverter
+#elif [ "$COMMAND" = "mergedb" ] ; then
+# CLASS=org.apache.nutch.crawl.CrawlDbMerger
+#elif [ "$COMMAND" = "readlinkdb" ] ; then
+# CLASS=org.apache.nutch.crawl.LinkDbReader
+#elif [ "$COMMAND" = "readseg" ] ; then
+# CLASS=org.apache.nutch.segment.SegmentReader
+#elif [ "$COMMAND" = "segread" ] ; then
+# echo "[DEPRECATED] Command 'segread' is deprecated, use 'readseg' instead."
+# CLASS=org.apache.nutch.segment.SegmentReader
+#elif [ "$COMMAND" = "mergesegs" ] ; then
+# CLASS=org.apache.nutch.segment.SegmentMerger
+elif [ "$COMMAND" = "updatetable" ] ; then
+ CLASS=org.apache.nutch.crawl.TableUpdater
+#elif [ "$COMMAND" = "invertlinks" ] ; then
+# CLASS=org.apache.nutch.crawl.LinkDb
+#elif [ "$COMMAND" = "mergelinkdb" ] ; then
+# CLASS=org.apache.nutch.crawl.LinkDbMerger
elif [ "$COMMAND" = "index" ] ; then
CLASS=org.apache.nutch.indexer.Indexer
elif [ "$COMMAND" = "solrindex" ] ; then
Modified: lucene/nutch/branches/nutchbase/build.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/build.xml?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/build.xml (original)
+++ lucene/nutch/branches/nutchbase/build.xml Sun Aug 16 22:25:12 2009
@@ -99,6 +99,7 @@
encoding="${build.encoding}"
srcdir="${src.dir}"
includes="org/apache/nutch/**/*.java"
+ excludes="org/apache/nutch/scoring/webgraph/**/*.java,org/apache/nutch/tools/compat/**/*.java,org/apache/nutch/tools/arc/**/*.java,org/apache/nutch/indexer/field/**/*.java"
destdir="${build.classes}"
debug="${javac.debug}"
optimize="${javac.optimize}"
@@ -181,6 +182,8 @@
<include name="lucene*.jar"/>
<include name="taglibs-*.jar"/>
<include name="hadoop-*.jar"/>
+ <include name="hbase-*.jar"/>
+ <include name="zookeeper-*.jar"/>
<include name="dom4j-*.jar"/>
<include name="xerces-*.jar"/>
<include name="tika-*.jar"/>
@@ -215,7 +218,8 @@
<javac
encoding="${build.encoding}"
srcdir="${test.src.dir}"
- includes="org/apache/nutch/**/*.java"
+ includes="org/apache/nutch*/**/*.java"
+ excludes="org/apache/nutch/scoring/webgraph/**/*.java,org/apache/nutch/tools/compat/**/*.java,org/apache/nutch/tools/arc/**/*.java"
destdir="${test.build.classes}"
debug="${javac.debug}"
optimize="${javac.optimize}"
Added: lucene/nutch/branches/nutchbase/conf/hbase-site.xml.template
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/conf/hbase-site.xml.template?rev=804789&view=auto
==============================================================================
--- lucene/nutch/branches/nutchbase/conf/hbase-site.xml.template (added)
+++ lucene/nutch/branches/nutchbase/conf/hbase-site.xml.template Sun Aug 16 22:25:12 2009
@@ -0,0 +1,25 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+/**
+ * Copyright 2009 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+-->
+<configuration>
+</configuration>
Modified: lucene/nutch/branches/nutchbase/conf/log4j.properties
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/conf/log4j.properties?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/conf/log4j.properties (original)
+++ lucene/nutch/branches/nutchbase/conf/log4j.properties Sun Aug 16 22:25:12 2009
@@ -28,6 +28,7 @@
log4j.logger.org.apache.nutch=INFO
log4j.logger.org.apache.hadoop=WARN
+log4j.logger.org.apache.zookeeper=WARN
#
# Daily Rolling File Appender
Modified: lucene/nutch/branches/nutchbase/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/conf/nutch-default.xml?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/conf/nutch-default.xml (original)
+++ lucene/nutch/branches/nutchbase/conf/nutch-default.xml Sun Aug 16 22:25:12 2009
@@ -886,8 +886,8 @@
<property>
<name>plugin.includes</name>
- <value>protocol-http|urlfilter-regex|parse-(text|html|js)|index-(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
- <description>Regular expression naming plugin directory names to
+ <value>protocol-http|urlfilter-regex|parse-(html|js)|index-basic|urlnormalizer-(pass|regex|basic)|scoring-opic|query-(basic|site|url)|summary-basic</value>
+ <description>Regular expression naming plugin directory names to
include. Any plugin not matching this expression is excluded.
In any case you need at least include the nutch-extensionpoints plugin. By
default Nutch includes crawling just HTML and plain text via HTTP,
Added: lucene/nutch/branches/nutchbase/lib/hadoop-0.20.0-core.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/lib/hadoop-0.20.0-core.jar?rev=804789&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/branches/nutchbase/lib/hadoop-0.20.0-core.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/branches/nutchbase/lib/hbase-0.20.0-r804408.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/lib/hbase-0.20.0-r804408.jar?rev=804789&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/branches/nutchbase/lib/hbase-0.20.0-r804408.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/branches/nutchbase/lib/zookeeper-r785019-hbase-1329.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/lib/zookeeper-r785019-hbase-1329.jar?rev=804789&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/branches/nutchbase/lib/zookeeper-r785019-hbase-1329.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java (original)
+++ lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java Sun Aug 16 22:25:12 2009
@@ -19,12 +19,14 @@
// JDK imports
import java.io.Reader;
import java.io.IOException;
+import java.util.Collection;
// Lucene imports
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
+import org.apache.nutch.util.hbase.HbaseColumn;
import org.apache.hadoop.conf.Configuration;
/**
Modified: lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java (original)
+++ lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Sun Aug 16 22:25:12 2009
@@ -17,12 +17,17 @@
package org.apache.nutch.crawl;
+import java.util.HashSet;
+import java.util.Set;
+
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.FetchSchedule;
+import org.apache.nutch.util.hbase.HbaseColumn;
+import org.apache.nutch.util.hbase.WebTableColumns;
+import org.apache.nutch.util.hbase.WebTableRow;
/**
* This class provides common methods for implementations of
@@ -30,12 +35,22 @@
*
* @author Andrzej Bialecki
*/
-public abstract class AbstractFetchSchedule extends Configured implements FetchSchedule {
+public abstract class AbstractFetchSchedule
+extends Configured
+implements FetchSchedule {
private static final Log LOG = LogFactory.getLog(AbstractFetchSchedule.class);
protected int defaultInterval;
protected int maxInterval;
+ private static final Set<HbaseColumn> COLUMNS = new HashSet<HbaseColumn>();
+
+ static {
+ COLUMNS.add(new HbaseColumn(WebTableColumns.FETCH_TIME));
+ COLUMNS.add(new HbaseColumn(WebTableColumns.RETRIES));
+ COLUMNS.add(new HbaseColumn(WebTableColumns.FETCH_INTERVAL));
+ }
+
public AbstractFetchSchedule() {
super(null);
}
@@ -49,10 +64,14 @@
if (conf == null) return;
int oldDefaultInterval = conf.getInt("db.default.fetch.interval", 0);
defaultInterval = conf.getInt("db.fetch.interval.default", 0);
- if (oldDefaultInterval > 0 && defaultInterval == 0) defaultInterval = oldDefaultInterval * SECONDS_PER_DAY;
+ if (oldDefaultInterval > 0 && defaultInterval == 0) {
+ defaultInterval = oldDefaultInterval * FetchSchedule.SECONDS_PER_DAY;
+ }
int oldMaxInterval = conf.getInt("db.max.fetch.interval", 0);
maxInterval = conf.getInt("db.fetch.interval.max", 0 );
- if (oldMaxInterval > 0 && maxInterval == 0) maxInterval = oldMaxInterval * FetchSchedule.SECONDS_PER_DAY;
+ if (oldMaxInterval > 0 && maxInterval == 0) {
+ maxInterval = oldMaxInterval * FetchSchedule.SECONDS_PER_DAY;
+ }
LOG.info("defaultInterval=" + defaultInterval);
LOG.info("maxInterval=" + maxInterval);
}
@@ -64,13 +83,12 @@
* default <code>fetchInterval</code>.
*
* @param url URL of the page.
- * @param datum datum instance to be initialized (modified in place).
+ * @param row url's row
*/
- public CrawlDatum initializeSchedule(Text url, CrawlDatum datum) {
- datum.setFetchTime(System.currentTimeMillis());
- datum.setFetchInterval(defaultInterval);
- datum.setRetriesSinceFetch(0);
- return datum;
+ public void initializeSchedule(String url, WebTableRow row) {
+ row.setFetchTime(System.currentTimeMillis());
+ row.setFetchInterval(defaultInterval);
+ row.setRetriesSinceFetch(0);
}
/**
@@ -79,11 +97,10 @@
* retry counter - extending classes should call super.setFetchSchedule() to
* preserve this behavior.
*/
- public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
+ public void setFetchSchedule(String url, WebTableRow row,
long prevFetchTime, long prevModifiedTime,
long fetchTime, long modifiedTime, int state) {
- datum.setRetriesSinceFetch(0);
- return datum;
+ row.setRetriesSinceFetch(0);
}
/**
@@ -92,20 +109,20 @@
* and if it exceeds the <code>maxInterval</code> it calls
* {@link #forceRefetch(Text, CrawlDatum, boolean)}.
* @param url URL of the page
- * @param datum datum instance to be adjusted
+ * @param row url's row
* @return adjusted page information, including all original information.
* NOTE: this may be a different instance than {@param datum}, but
* implementations should make sure that it contains at least all
* information from {@param datum}.
*/
- public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum,
+ public void setPageGoneSchedule(String url, WebTableRow row,
long prevFetchTime, long prevModifiedTime, long fetchTime) {
// no page is truly GONE ... just increase the interval by 50%
// and try much later.
- datum.setFetchInterval(datum.getFetchInterval() * 1.5f);
- datum.setFetchTime(fetchTime + (long)datum.getFetchInterval() * 1000);
- if (maxInterval < datum.getFetchInterval()) forceRefetch(url, datum, false);
- return datum;
+ int newFetchInterval = (int) (row.getFetchInterval() * 1.5f);
+ row.setFetchInterval(newFetchInterval);
+ row.setFetchTime(fetchTime + newFetchInterval * 1000L);
+ if (maxInterval < newFetchInterval) forceRefetch(url, row, false);
}
/**
@@ -114,28 +131,24 @@
* sets the next fetch time 1 day in the future and increases
* the retry counter.
* @param url URL of the page
- * @param datum page information
+ * @param row url's row
* @param prevFetchTime previous fetch time
* @param prevModifiedTime previous modified time
* @param fetchTime current fetch time
- * @return adjusted page information, including all original information.
- * NOTE: this may be a different instance than {@param datum}, but
- * implementations should make sure that it contains at least all
- * information from {@param datum}.
*/
- public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum,
+ public void setPageRetrySchedule(String url, WebTableRow row,
long prevFetchTime, long prevModifiedTime, long fetchTime) {
- datum.setFetchTime(fetchTime + (long)SECONDS_PER_DAY);
- datum.setRetriesSinceFetch(datum.getRetriesSinceFetch() + 1);
- return datum;
+ row.setFetchTime(fetchTime + (long)FetchSchedule.SECONDS_PER_DAY);
+ int oldRetries = row.getRetriesSinceFetch();
+ row.setRetriesSinceFetch(oldRetries + 1);
}
/**
* This method return the last fetch time of the CrawlDatum
* @return the date as a long.
*/
- public long calculateLastFetchTime(CrawlDatum datum) {
- return datum.getFetchTime() - (long)datum.getFetchInterval() * 1000;
+ public long calculateLastFetchTime(WebTableRow row) {
+ return row.getFetchTime() - row.getFetchInterval() * 1000L;
}
/**
@@ -148,21 +161,22 @@
* check that fetchTime is not too remote (more than <code>maxInterval</code),
* in which case it lowers the interval and returns true.
* @param url URL of the page
- * @param datum datum instance
+ * @param row url's row
* @param curTime reference time (usually set to the time when the
* fetchlist generation process was started).
* @return true, if the page should be considered for inclusion in the current
* fetchlist, otherwise false.
*/
- public boolean shouldFetch(Text url, CrawlDatum datum, long curTime) {
+ public boolean shouldFetch(String url, WebTableRow row, long curTime) {
// pages are never truly GONE - we have to check them from time to time.
// pages with too long fetchInterval are adjusted so that they fit within
// maximum fetchInterval (segment retention period).
- if (datum.getFetchTime() - curTime > (long) maxInterval * 1000) {
- datum.setFetchInterval(maxInterval * 0.9f);
- datum.setFetchTime(curTime);
+ long fetchTime = row.getFetchTime();
+ if (fetchTime - curTime > maxInterval * 1000L) {
+ row.setFetchInterval(Math.round(maxInterval * 0.9f));
+ row.setFetchTime(curTime);
}
- if (datum.getFetchTime() > curTime) {
+ if (fetchTime > curTime) {
return false; // not time yet
}
return true;
@@ -172,21 +186,25 @@
* This method resets fetchTime, fetchInterval, modifiedTime,
* retriesSinceFetch and page signature, so that it forces refetching.
* @param url URL of the page
- * @param datum datum instance
+ * @param row url's row
* @param asap if true, force refetch as soon as possible - this sets
* the fetchTime to now. If false, force refetch whenever the next fetch
* time is set.
*/
- public CrawlDatum forceRefetch(Text url, CrawlDatum datum, boolean asap) {
+ public void forceRefetch(String url, WebTableRow row, boolean asap) {
// reduce fetchInterval so that it fits within the max value
- if (datum.getFetchInterval() > maxInterval)
- datum.setFetchInterval(maxInterval * 0.9f);
- datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
- datum.setRetriesSinceFetch(0);
- datum.setSignature(null);
- datum.setModifiedTime(0L);
- if (asap) datum.setFetchTime(System.currentTimeMillis());
- return datum;
+ if (row.getFetchInterval() > maxInterval)
+ row.setFetchInterval(Math.round(maxInterval * 0.9f));
+ row.setStatus(CrawlDatumHbase.STATUS_UNFETCHED);
+ row.setRetriesSinceFetch(0);
+ // TODO: row.setSignature(null) ??
+ row.setModifiedTime(0L);
+ if (asap) row.setFetchTime(System.currentTimeMillis());
+ }
+
+
+ public Set<HbaseColumn> getColumns() {
+ return COLUMNS;
}
}
Modified: lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java (original)
+++ lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java Sun Aug 16 22:25:12 2009
@@ -18,9 +18,8 @@
package org.apache.nutch.crawl;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.crawl.FetchSchedule;
+import org.apache.nutch.util.hbase.WebTableRow;
/**
* This class implements an adaptive re-fetch algorithm. This works as follows:
@@ -63,7 +62,7 @@
private boolean SYNC_DELTA;
- private double SYNC_DELTA_RATE;
+ private float SYNC_DELTA_RATE;
public void setConf(Configuration conf) {
super.setConf(conf);
@@ -71,20 +70,22 @@
INC_RATE = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f);
DEC_RATE = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f);
MIN_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.min_interval", 60);
- MAX_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.max_interval", SECONDS_PER_DAY * 365 ); // 1 year
+ MAX_INTERVAL =
+ conf.getInt("db.fetch.schedule.adaptive.max_interval",
+ FetchSchedule.SECONDS_PER_DAY * 365 ); // 1 year
SYNC_DELTA = conf.getBoolean("db.fetch.schedule.adaptive.sync_delta", true);
SYNC_DELTA_RATE = conf.getFloat("db.fetch.schedule.adaptive.sync_delta_rate", 0.2f);
}
@Override
- public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
+ public void setFetchSchedule(String url, WebTableRow row,
long prevFetchTime, long prevModifiedTime,
long fetchTime, long modifiedTime, int state) {
- super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
+ super.setFetchSchedule(url, row, prevFetchTime, prevModifiedTime,
fetchTime, modifiedTime, state);
long refTime = fetchTime;
if (modifiedTime <= 0) modifiedTime = fetchTime;
- float interval = datum.getFetchInterval();
+ int interval = row.getFetchInterval();
switch (state) {
case FetchSchedule.STATUS_MODIFIED:
interval *= (1.0f - DEC_RATE);
@@ -95,69 +96,19 @@
case FetchSchedule.STATUS_UNKNOWN:
break;
}
+ row.setFetchInterval(interval);
if (SYNC_DELTA) {
// try to synchronize with the time of change
- long delta = (fetchTime - modifiedTime) / 1000L;
+ // TODO: different from normal class (is delta in seconds)?
+ int delta = (int) ((fetchTime - modifiedTime) / 1000L) ;
if (delta > interval) interval = delta;
- refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000);
+ refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE);
}
- if (interval < MIN_INTERVAL) {
- interval = MIN_INTERVAL;
- } else if (interval > MAX_INTERVAL) {
- interval = MAX_INTERVAL;
- }
- datum.setFetchInterval(interval);
- datum.setFetchTime(refTime + Math.round(interval * 1000.0));
- datum.setModifiedTime(modifiedTime);
- return datum;
+ if (interval < MIN_INTERVAL) interval = MIN_INTERVAL;
+ if (interval > MAX_INTERVAL) interval = MAX_INTERVAL;
+ row.setFetchTime(refTime + interval * 1000L);
+ row.setModifiedTime(modifiedTime);
}
- public static void main(String[] args) throws Exception {
- FetchSchedule fs = new AdaptiveFetchSchedule();
- fs.setConf(NutchConfiguration.create());
- // we start the time at 0, for simplicity
- long curTime = 0;
- long delta = 1000L * 3600L * 24L; // 2 hours
- // we trigger the update of the page every 30 days
- long update = 1000L * 3600L * 24L * 30L; // 30 days
- boolean changed = true;
- long lastModified = 0;
- int miss = 0;
- int totalMiss = 0;
- int maxMiss = 0;
- int fetchCnt = 0;
- int changeCnt = 0;
- // initial fetchInterval is 10 days
- CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);
- p.setFetchTime(0);
- System.out.println(p);
- // let's move the timeline a couple of deltas
- for (int i = 0; i < 10000; i++) {
- if (lastModified + update < curTime) {
- //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime);
- changed = true;
- changeCnt++;
- lastModified = curTime;
- }
- System.out.println(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
- + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days" + "\t missed " + miss);
- if (p.getFetchTime() <= curTime) {
- fetchCnt++;
- fs.setFetchSchedule(new Text("http://www.example.com"), p,
- p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
- changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED);
- System.out.println("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval "
- + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days");
- if (!changed) miss++;
- if (miss > maxMiss) maxMiss = miss;
- changed = false;
- totalMiss += miss;
- miss = 0;
- }
- if (changed) miss++;
- curTime += delta;
- }
- System.out.println("Total missed: " + totalMiss + ", max miss: " + maxMiss);
- System.out.println("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times.");
- }
+
}
Modified: lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDatum.java (original)
+++ lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDatum.java Sun Aug 16 22:25:12 2009
@@ -340,7 +340,7 @@
return (that.fetchInterval - this.fetchInterval) > 0 ? 1 : -1;
if (that.modifiedTime != this.modifiedTime)
return (that.modifiedTime - this.modifiedTime) > 0 ? 1 : -1;
- return SignatureComparator._compare(this, that);
+ return SignatureComparator.compare(this.signature, that.signature);
}
/** A Comparator optimized for CrawlDatum. */
@@ -427,7 +427,7 @@
(this.modifiedTime == other.modifiedTime) &&
(this.retries == other.retries) &&
(this.fetchInterval == other.fetchInterval) &&
- (SignatureComparator._compare(this.signature, other.signature) == 0) &&
+ (SignatureComparator.compare(this.signature, other.signature) == 0) &&
(this.score == other.score);
if (!res) return res;
return metadataEquals(other.metaData);
Added: lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDatumHbase.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDatumHbase.java?rev=804789&view=auto
==============================================================================
--- lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDatumHbase.java (added)
+++ lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDatumHbase.java Sun Aug 16 22:25:12 2009
@@ -0,0 +1,38 @@
+package org.apache.nutch.crawl;
+
+import java.util.HashMap;
+import java.util.Map;
+
+public class CrawlDatumHbase {
+ /** Page was not fetched yet. */
+ public static final byte STATUS_UNFETCHED = 0x01;
+ /** Page was successfully fetched. */
+ public static final byte STATUS_FETCHED = 0x02;
+ /** Page no longer exists. */
+ public static final byte STATUS_GONE = 0x03;
+ /** Page temporarily redirects to other page. */
+ public static final byte STATUS_REDIR_TEMP = 0x04;
+ /** Page permanently redirects to other page. */
+ public static final byte STATUS_REDIR_PERM = 0x05;
+ /** Fetching unsuccessful, needs to be retried (transient errors). */
+ public static final byte STATUS_RETRY = 0x22;
+ /** Fetching successful - page is not modified. */
+ public static final byte STATUS_NOTMODIFIED = 0x26;
+
+ private static final Map<Byte, String> NAMES = new HashMap<Byte, String>();
+
+ static {
+ NAMES.put(STATUS_UNFETCHED, "status_unfetched");
+ NAMES.put(STATUS_FETCHED, "status_fetched");
+ NAMES.put(STATUS_GONE, "status_gone");
+ NAMES.put(STATUS_REDIR_TEMP, "status_redir_temp");
+ NAMES.put(STATUS_REDIR_PERM, "status_redir_perm");
+ NAMES.put(STATUS_RETRY, "status_retry");
+ NAMES.put(STATUS_NOTMODIFIED, "status_notmodified");
+ }
+
+ public static String getName(byte status) {
+ return NAMES.get(status);
+ }
+
+}
Modified: lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/CrawlDb.java Sun Aug 16 22:25:12 2009
@@ -17,180 +17,10 @@
package org.apache.nutch.crawl;
-import java.io.*;
-import java.util.*;
-
-// Commons Logging imports
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-import org.apache.hadoop.io.*;
-import org.apache.hadoop.fs.*;
-import org.apache.hadoop.conf.*;
-import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.*;
-
-import org.apache.nutch.util.HadoopFSUtil;
-import org.apache.nutch.util.LockUtil;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.NutchJob;
-
-/**
- * This class takes the output of the fetcher and updates the
- * crawldb accordingly.
- */
-public class CrawlDb extends Configured implements Tool {
- public static final Log LOG = LogFactory.getLog(CrawlDb.class);
-
+public interface CrawlDb {
public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed";
public static final String CURRENT_NAME = "current";
public static final String LOCK_NAME = ".locked";
-
- public CrawlDb() {}
-
- public CrawlDb(Configuration conf) {
- setConf(conf);
- }
-
- public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter) throws IOException {
- boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
- update(crawlDb, segments, normalize, filter, additionsAllowed, false);
- }
-
- public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter, boolean additionsAllowed, boolean force) throws IOException {
- FileSystem fs = FileSystem.get(getConf());
- Path lock = new Path(crawlDb, LOCK_NAME);
- LockUtil.createLockFile(fs, lock, force);
- if (LOG.isInfoEnabled()) {
- LOG.info("CrawlDb update: starting");
- LOG.info("CrawlDb update: db: " + crawlDb);
- LOG.info("CrawlDb update: segments: " + Arrays.asList(segments));
- LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
- LOG.info("CrawlDb update: URL normalizing: " + normalize);
- LOG.info("CrawlDb update: URL filtering: " + filter);
- }
-
- JobConf job = CrawlDb.createJob(getConf(), crawlDb);
- job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
- job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
- job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);
- for (int i = 0; i < segments.length; i++) {
- Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
- Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME);
- if (fs.exists(fetch) && fs.exists(parse)) {
- FileInputFormat.addInputPath(job, fetch);
- FileInputFormat.addInputPath(job, parse);
- } else {
- LOG.info(" - skipping invalid segment " + segments[i]);
- }
- }
-
- if (LOG.isInfoEnabled()) {
- LOG.info("CrawlDb update: Merging segment data into db.");
- }
- try {
- JobClient.runJob(job);
- } catch (IOException e) {
- LockUtil.removeLockFile(fs, lock);
- Path outPath = FileOutputFormat.getOutputPath(job);
- if (fs.exists(outPath) ) fs.delete(outPath, true);
- throw e;
- }
-
- CrawlDb.install(job, crawlDb);
- if (LOG.isInfoEnabled()) { LOG.info("CrawlDb update: done"); }
- }
-
- public static JobConf createJob(Configuration config, Path crawlDb)
- throws IOException {
- Path newCrawlDb =
- new Path(crawlDb,
- Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
-
- JobConf job = new NutchJob(config);
- job.setJobName("crawldb " + crawlDb);
-
-
- Path current = new Path(crawlDb, CURRENT_NAME);
- if (FileSystem.get(job).exists(current)) {
- FileInputFormat.addInputPath(job, current);
- }
- job.setInputFormat(SequenceFileInputFormat.class);
-
- job.setMapperClass(CrawlDbFilter.class);
- job.setReducerClass(CrawlDbReducer.class);
-
- FileOutputFormat.setOutputPath(job, newCrawlDb);
- job.setOutputFormat(MapFileOutputFormat.class);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(CrawlDatum.class);
-
- return job;
- }
-
- public static void install(JobConf job, Path crawlDb) throws IOException {
- Path newCrawlDb = FileOutputFormat.getOutputPath(job);
- FileSystem fs = new JobClient(job).getFs();
- Path old = new Path(crawlDb, "old");
- Path current = new Path(crawlDb, CURRENT_NAME);
- if (fs.exists(current)) {
- if (fs.exists(old)) fs.delete(old, true);
- fs.rename(current, old);
- }
- fs.mkdirs(crawlDb);
- fs.rename(newCrawlDb, current);
- if (fs.exists(old)) fs.delete(old, true);
- Path lock = new Path(crawlDb, LOCK_NAME);
- LockUtil.removeLockFile(fs, lock);
- }
-
- public static void main(String[] args) throws Exception {
- int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDb(), args);
- System.exit(res);
- }
-
- public int run(String[] args) throws Exception {
- if (args.length < 2) {
- System.err.println("Usage: CrawlDb <crawldb> (-dir <segments> | <seg1> <seg2> ...) [-force] [-normalize] [-filter] [-noAdditions]");
- System.err.println("\tcrawldb\tCrawlDb to update");
- System.err.println("\t-dir segments\tparent directory containing all segments to update from");
- System.err.println("\tseg1 seg2 ...\tlist of segment names to update from");
- System.err.println("\t-force\tforce update even if CrawlDb appears to be locked (CAUTION advised)");
- System.err.println("\t-normalize\tuse URLNormalizer on urls in CrawlDb and segment (usually not needed)");
- System.err.println("\t-filter\tuse URLFilters on urls in CrawlDb and segment");
- System.err.println("\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs");
- return -1;
- }
- boolean normalize = false;
- boolean filter = false;
- boolean force = false;
- final FileSystem fs = FileSystem.get(getConf());
- boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
- HashSet<Path> dirs = new HashSet<Path>();
- for (int i = 1; i < args.length; i++) {
- if (args[i].equals("-normalize")) {
- normalize = true;
- } else if (args[i].equals("-filter")) {
- filter = true;
- } else if (args[i].equals("-force")) {
- force = true;
- } else if (args[i].equals("-noAdditions")) {
- additionsAllowed = false;
- } else if (args[i].equals("-dir")) {
- FileStatus[] paths = fs.listStatus(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs));
- dirs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
- } else {
- dirs.add(new Path(args[i]));
- }
- }
- try {
- update(new Path(args[0]), dirs.toArray(new Path[dirs.size()]), normalize, filter, additionsAllowed, force);
- return 0;
- } catch (Exception e) {
- LOG.fatal("CrawlDb update: " + StringUtils.stringifyException(e));
- return -1;
- }
- }
}
Modified: lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java (original)
+++ lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java Sun Aug 16 22:25:12 2009
@@ -17,7 +17,7 @@
package org.apache.nutch.crawl;
-import org.apache.hadoop.io.Text;
+import org.apache.nutch.util.hbase.WebTableRow;
/**
* This class implements the default re-fetch schedule. That is, no matter
@@ -30,13 +30,13 @@
public class DefaultFetchSchedule extends AbstractFetchSchedule {
@Override
- public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
+ public void setFetchSchedule(String url, WebTableRow row,
long prevFetchTime, long prevModifiedTime,
long fetchTime, long modifiedTime, int state) {
- datum = super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
+ super.setFetchSchedule(url, row, prevFetchTime, prevModifiedTime,
fetchTime, modifiedTime, state);
- datum.setFetchTime(fetchTime + (long)datum.getFetchInterval() * 1000);
- datum.setModifiedTime(modifiedTime);
- return datum;
+ row.setFetchTime(fetchTime + row.getFetchInterval() * 1000L);
+ row.setModifiedTime(modifiedTime);
}
+
}
Modified: lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/FetchSchedule.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/FetchSchedule.java?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/FetchSchedule.java (original)
+++ lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/FetchSchedule.java Sun Aug 16 22:25:12 2009
@@ -17,8 +17,12 @@
package org.apache.nutch.crawl;
+import java.util.Set;
+
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.io.Text;
+import org.apache.nutch.util.hbase.HbaseColumn;
+import org.apache.nutch.util.hbase.WebTableRow;
/**
* This interface defines the contract for implementations that manipulate
@@ -27,15 +31,16 @@
* @author Andrzej Bialecki
*/
public interface FetchSchedule extends Configurable {
-
+
/** It is unknown whether page was changed since our last visit. */
public static final int STATUS_UNKNOWN = 0;
/** Page is known to have been modified since our last visit. */
public static final int STATUS_MODIFIED = 1;
/** Page is known to remain unmodified since our last visit. */
public static final int STATUS_NOTMODIFIED = 2;
-
+
public static final int SECONDS_PER_DAY = 3600 * 24;
+
/**
* Initialize fetch schedule related data. Implementations should at least
* set the <code>fetchTime</code> and <code>fetchInterval</code>. The default
@@ -43,14 +48,10 @@
* default <code>fetchInterval</code>.
*
* @param url URL of the page.
- * @param datum datum instance to be initialized.
- * @return adjusted page information, including all original information.
- * NOTE: this may be a different instance than {@param datum}, but
- * implementations should make sure that it contains at least all
- * information from {@param datum}.
+ * @param row url's row
*/
- public CrawlDatum initializeSchedule(Text url, CrawlDatum datum);
-
+ public void initializeSchedule(String url, WebTableRow row);
+
/**
* Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a
* successfully fetched page.
@@ -58,10 +59,9 @@
* schedules.
*
* @param url url of the page
- * @param datum page description to be adjusted. NOTE: this instance, passed by reference,
- * may be modified inside the method.
- * @param prevFetchTime previous value of fetch time, or 0 if not available
- * @param prevModifiedTime previous value of modifiedTime, or 0 if not available
+ * @param row url's row
+ * @param prevFetchTime previous value of fetch time, or -1 if not available
+ * @param prevModifiedTime previous value of modifiedTime, or -1 if not available
* @param fetchTime the latest time, when the page was recently re-fetched. Most FetchSchedule
* implementations should update the value in {@param datum} to something greater than this value.
* @param modifiedTime last time the content was modified. This information comes from
@@ -72,52 +72,41 @@
* This information may be obtained by comparing page signatures before and after fetching. If this
* is set to {@link #STATUS_UNKNOWN}, then it is unknown whether the page was changed; implementations
* are free to follow a sensible default behavior.
- * @return adjusted page information, including all original information. NOTE: this may
- * be a different instance than {@param datum}, but implementations should make sure that
- * it contains at least all information from {@param datum}.
- */
- public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
- long prevFetchTime, long prevModifiedTime,
- long fetchTime, long modifiedTime, int state);
-
+ */
+ public void setFetchSchedule(String url, WebTableRow row,
+ long prevFetchTime, long prevModifiedTime,
+ long fetchTime, long modifiedTime, int state);
+
/**
* This method specifies how to schedule refetching of pages
* marked as GONE. Default implementation increases fetchInterval by 50%,
* and if it exceeds the <code>maxInterval</code> it calls
* {@link #forceRefetch(Text, CrawlDatum, boolean)}.
* @param url URL of the page
- * @param datum datum instance to be adjusted
- * @return adjusted page information, including all original information.
- * NOTE: this may be a different instance than {@param datum}, but
- * implementations should make sure that it contains at least all
- * information from {@param datum}.
- */
- public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum,
- long prevFetchTime, long prevModifiedTime, long fetchTime);
-
+ * @param row url's row
+ */
+ public void setPageGoneSchedule(String url, WebTableRow row,
+ long prevFetchTime, long prevModifiedTime, long fetchTime);
+
/**
* This method adjusts the fetch schedule if fetching needs to be
* re-tried due to transient errors. The default implementation
* sets the next fetch time 1 day in the future and increases the
* retry counter.
* @param url URL of the page
- * @param datum page information
+ * @param row url's row
* @param prevFetchTime previous fetch time
* @param prevModifiedTime previous modified time
* @param fetchTime current fetch time
- * @return adjusted page information, including all original information.
- * NOTE: this may be a different instance than {@param datum}, but
- * implementations should make sure that it contains at least all
- * information from {@param datum}.
- */
- public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum,
- long prevFetchTime, long prevModifiedTime, long fetchTime);
-
+ */
+ public void setPageRetrySchedule(String url, WebTableRow row,
+ long prevFetchTime, long prevModifiedTime, long fetchTime);
+
/**
* Calculates last fetch time of the given CrawlDatum.
* @return the date as a long.
*/
- public long calculateLastFetchTime(CrawlDatum datum);
+ public long calculateLastFetchTime(WebTableRow row);
/**
* This method provides information whether the page is suitable for
@@ -129,26 +118,24 @@
* check that fetchTime is not too remote (more than <code>maxInterval</code>),
* in which case it lowers the interval and returns true.
* @param url URL of the page
- * @param datum datum instance
+ * @param row url's row
* @param curTime reference time (usually set to the time when the
* fetchlist generation process was started).
* @return true, if the page should be considered for inclusion in the current
* fetchlist, otherwise false.
*/
- public boolean shouldFetch(Text url, CrawlDatum datum, long curTime);
-
+ public boolean shouldFetch(String url, WebTableRow row, long curTime);
+
/**
* This method resets fetchTime, fetchInterval, modifiedTime and
* page signature, so that it forces refetching.
* @param url URL of the page
- * @param datum datum instance
+ * @param row url's row
* @param asap if true, force refetch as soon as possible - this sets
* the fetchTime to now. If false, force refetch whenever the next fetch
* time is set.
- * @return adjusted page information, including all original information.
- * NOTE: this may be a different instance than {@param datum}, but
- * implementations should make sure that it contains at least all
- * information from {@param datum}.
*/
- public CrawlDatum forceRefetch(Text url, CrawlDatum datum, boolean asap);
+ public void forceRefetch(String url, WebTableRow row, boolean asap);
+
+ public Set<HbaseColumn> getColumns();
}
Modified: lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java
URL: http://svn.apache.org/viewvc/lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java?rev=804789&r1=804788&r2=804789&view=diff
==============================================================================
--- lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java (original)
+++ lucene/nutch/branches/nutchbase/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java Sun Aug 16 22:25:12 2009
@@ -37,7 +37,7 @@
if (impl == null) {
try {
LOG.info("Using FetchSchedule impl: " + clazz);
- Class implClass = Class.forName(clazz);
+ Class<?> implClass = Class.forName(clazz);
impl = (FetchSchedule)implClass.newInstance();
impl.setConf(conf);
objectCache.setObject(clazz, impl);