You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by cu...@apache.org on 2005/06/03 21:11:56 UTC
svn commit: r179858 [1/3] - in /lucene/nutch/branches/mapred: ./ conf/ site/
src/java/org/apache/nutch/analysis/ src/java/org/apache/nutch/crawl/
src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/
src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/plugin/
src/java/org/apache/nutch/protocol/ src/java/org/apache/nutch/searcher/
src/java/org/apache/nutch/tools/ src/java/org/apache/nutch/util/
src/plugin/ src/plugin/creativecommons/src/java/org/creativecommons/nutch/
src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/
src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/
src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/
src/plugin/parse-html/ src/plugin/parse-html/lib/
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/
src/plugin/parse-html/src/test/org/apache/nutch/parse/html/
src/plugin/parse-js/ src/plugin/parse-js/src/ src/plugin/parse-js/src/java/
src/plugin/parse-js/src/java/org/ src/plugin/parse-js/src/java/org/apache/
src/plugin/parse-js/src/java/org/apache/nutch/
src/plugin/parse-js/src/java/org/apache/nutch/parse/
src/plugin/parse-js/src/java/org/apache/nutch/parse/js/
src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/
src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/
src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/
src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/
src/plugin/parse-text/src/java/org/apache/nutch/parse/text/
src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/
src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/
src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/
src/plugin/protocol-httpclient/ src/plugin/protocol-httpclient/lib/
src/plugin/protocol-httpclient/src/ src/plugin/protocol-httpclient/src/java/
src/plugin/protocol-httpclient/src/java/org/
src/plugin/protocol-httpclient/src/java/org/apache/
src/plugin/protocol-httpclient/src/java/org/apache/nutch/
src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/
src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/
src/plugin/urlfilter-prefix/src/java/org/apache/nutch/net/
src/site/src/documentation/ src/site/src/documentation/content/xdocs/
src/test/org/apache/nutch/analysis/ src/test/org/apache/nutch/fetcher/
src/test/org/apache/nutch/parse/ src/test/org/apache/nutch/tools/
src/test/org/apache/nutch/util/
Author: cutting
Date: Fri Jun 3 12:11:51 2005
New Revision: 179858
URL: http://svn.apache.org/viewcvs?rev=179858&view=rev
Log:
merge r171187:179837 from trunk
Added:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/HTMLMetaTags.java (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseStatus.java (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolOutput.java (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolOutput.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/ProtocolStatus.java (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java
lucene/nutch/branches/mapred/src/plugin/parse-html/lib/tagsoup-1.0rc3.jar (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/parse-html/lib/tagsoup-1.0rc3.jar
lucene/nutch/branches/mapred/src/plugin/parse-html/lib/tagsoup.LICENSE.txt (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/parse-html/lib/tagsoup.LICENSE.txt
lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
lucene/nutch/branches/mapred/src/plugin/parse-js/
- copied from r179837, lucene/nutch/trunk/src/plugin/parse-js/
lucene/nutch/branches/mapred/src/plugin/parse-js/build.xml (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/parse-js/build.xml
lucene/nutch/branches/mapred/src/plugin/parse-js/plugin.xml (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/parse-js/plugin.xml
lucene/nutch/branches/mapred/src/plugin/parse-js/src/
- copied from r179837, lucene/nutch/trunk/src/plugin/parse-js/src/
lucene/nutch/branches/mapred/src/plugin/parse-js/src/java/
- copied from r179837, lucene/nutch/trunk/src/plugin/parse-js/src/java/
lucene/nutch/branches/mapred/src/plugin/parse-js/src/java/org/
- copied from r179837, lucene/nutch/trunk/src/plugin/parse-js/src/java/org/
lucene/nutch/branches/mapred/src/plugin/parse-js/src/java/org/apache/
- copied from r179837, lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/
lucene/nutch/branches/mapred/src/plugin/parse-js/src/java/org/apache/nutch/
- copied from r179837, lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/
lucene/nutch/branches/mapred/src/plugin/parse-js/src/java/org/apache/nutch/package.html (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html
lucene/nutch/branches/mapred/src/plugin/parse-js/src/java/org/apache/nutch/parse/
- copied from r179837, lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/
lucene/nutch/branches/mapred/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/
- copied from r179837, lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/
lucene/nutch/branches/mapred/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/
- copied from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/build.xml (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/lib/
- copied from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/lib/
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/lib/commons-codec.jar (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-codec.jar
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/lib/commons-httpclient-3.0-rc2.jar (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-httpclient-3.0-rc2.jar
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/plugin.xml (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/
- copied from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/src/
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/
- copied from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/
- copied from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/
- copied from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/
- copied from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/
- copied from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/
- copied from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpError.java (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpError.java
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpException.java (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpException.java
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package.html (props changed)
- copied unchanged from r179837, lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package.html
Removed:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/SoftHashMap.java
lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/RobotsMetaProcessor.java
lucene/nutch/branches/mapred/src/test/org/apache/nutch/util/TestSoftHashMap.java
Modified:
lucene/nutch/branches/mapred/build.xml
lucene/nutch/branches/mapred/conf/nutch-default.xml
lucene/nutch/branches/mapred/site/about.html
lucene/nutch/branches/mapred/site/about.pdf
lucene/nutch/branches/mapred/site/bot.html
lucene/nutch/branches/mapred/site/credits.html
lucene/nutch/branches/mapred/site/faq.html
lucene/nutch/branches/mapred/site/i18n.html
lucene/nutch/branches/mapred/site/index.html
lucene/nutch/branches/mapred/site/issue_tracking.html
lucene/nutch/branches/mapred/site/linkmap.html
lucene/nutch/branches/mapred/site/mailing_lists.html
lucene/nutch/branches/mapred/site/tutorial.html
lucene/nutch/branches/mapred/site/version_control.html
lucene/nutch/branches/mapred/site/version_control.pdf
lucene/nutch/branches/mapred/src/java/org/apache/nutch/analysis/NutchAnalysis.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/analysis/NutchAnalysis.jj
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDatum.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseSegment.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/FetcherOutput.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/IndexSegment.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/HtmlParseFilter.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/HtmlParseFilters.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/Parse.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParseData.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/Parser.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/ParserChecker.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/plugin/PluginManifestParser.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/plugin/PluginRepository.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Protocol.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/OpenSearchServlet.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/ParseSegment.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/tools/UpdateDatabaseTool.java
lucene/nutch/branches/mapred/src/plugin/build.xml
lucene/nutch/branches/mapred/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
lucene/nutch/branches/mapred/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
lucene/nutch/branches/mapred/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
lucene/nutch/branches/mapred/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
lucene/nutch/branches/mapred/src/plugin/parse-html/plugin.xml
lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
lucene/nutch/branches/mapred/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
lucene/nutch/branches/mapred/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
lucene/nutch/branches/mapred/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
lucene/nutch/branches/mapred/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
lucene/nutch/branches/mapred/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
lucene/nutch/branches/mapred/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
lucene/nutch/branches/mapred/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/net/PrefixURLFilter.java
lucene/nutch/branches/mapred/src/site/src/documentation/content/xdocs/i18n.xml
lucene/nutch/branches/mapred/src/site/src/documentation/content/xdocs/site.xml
lucene/nutch/branches/mapred/src/site/src/documentation/content/xdocs/version_control.xml
lucene/nutch/branches/mapred/src/site/src/documentation/skinconf.xml
lucene/nutch/branches/mapred/src/test/org/apache/nutch/analysis/TestQueryParser.java
lucene/nutch/branches/mapred/src/test/org/apache/nutch/fetcher/TestFetcherOutput.java
lucene/nutch/branches/mapred/src/test/org/apache/nutch/parse/TestParseData.java
lucene/nutch/branches/mapred/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java
Modified: lucene/nutch/branches/mapred/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/build.xml?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/build.xml (original)
+++ lucene/nutch/branches/mapred/build.xml Fri Jun 3 12:11:51 2005
@@ -208,7 +208,9 @@
<packageset dir="${plugins.dir}/protocol-file/src/java"/>
<packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
<packageset dir="${plugins.dir}/protocol-http/src/java"/>
+ <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
<packageset dir="${plugins.dir}/parse-html/src/java"/>
+ <packageset dir="${plugins.dir}/parse-js/src/java"/>
<packageset dir="${plugins.dir}/parse-text/src/java"/>
<packageset dir="${plugins.dir}/parse-pdf/src/java"/>
<packageset dir="${plugins.dir}/parse-rtf/src/java"/>
Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/conf/nutch-default.xml (original)
+++ lucene/nutch/branches/mapred/conf/nutch-default.xml Fri Jun 3 12:11:51 2005
@@ -578,7 +578,7 @@
<property>
<name>plugin.includes</name>
- <value>protocol-http|urlfilter-regex|parse-(text|html)|index-basic|query-(basic|site|url)</value>
+ <value>protocol-httpclient|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)</value>
<description>Regular expression naming plugin directory names to
include. Any plugin not matching this expression is excluded. By
default Nutch includes crawling just HTML and plain text via HTTP,
@@ -598,6 +598,14 @@
<value>windows-1252</value>
<description>The character encoding to fall back to when no other information
is available</description>
+</property>
+
+<property>
+ <name>parser.html.impl</name>
+ <value>neko</value>
+ <description>HTML Parser implementation. Currently the following keywords
+ are recognized: "neko" uses NekoHTML, "tagsoup" uses TagSoup.
+ </description>
</property>
<!-- urlfilter plugin properties -->
Modified: lucene/nutch/branches/mapred/site/about.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/site/about.html?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/site/about.html (original)
+++ lucene/nutch/branches/mapred/site/about.html Fri Jun 3 12:11:51 2005
@@ -20,7 +20,7 @@
|breadtrail
+-->
<div class="breadtrail">
-<a href="http://incubator.apache.org/">Incubator</a> > <a href="http://incubator.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+<a href="http://lucene.apache.org/">Lucene</a> > <a href="http://lucene.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
</div>
<!--+
|header
@@ -30,7 +30,7 @@
|start group logo
+-->
<div class="grouplogo">
-<a href="http://incubator.apache.org/"><img class="logoImage" alt="Incubator" src="http://incubator.apache.org/images/apache-incubator-logo.png" title="Apache Incubator"></a>
+<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="http://lucene.apache.org/java/docs/images/lucene_green_150.gif" title="Apache Lucene"></a>
</div>
<!--+
|end group logo
@@ -39,7 +39,7 @@
|start Project Logo
+-->
<div class="projectlogoA1">
-<a href="http://incubator.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
+<a href="http://lucene.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
</div>
<!--+
|end Project Logo
@@ -138,7 +138,7 @@
<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Related Projects</div>
<div id="menu_1.4" class="menuitemgroup">
<div class="menuitem">
-<a title="" href="http://jakarta.apache.org/lucene/">Lucene</a>
+<a title="" href="http://lucene.apache.org/">Lucene</a>
</div>
</div>
<div id="credit"></div>
@@ -176,7 +176,7 @@
<a name="N1000C"></a><a name="Overview"></a>
<h2 class="h3">Overview</h2>
<div class="section">
-<p>Nutch is open source web-search software. It builds on <a href="http://jakarta.apache.org/lucene/">Lucene</a>, adding web-specifics, such as a
+<p>Nutch is open source web-search software. It builds on <a href="http://lucene.apache.org/">Lucene</a>, adding web-specifics, such as a
crawler, a link-graph database, parsers for HTML and other
document formats, etc.</p>
<p>For more information about Nutch, please see the <a href="http://wiki.apache.org/nutch/">Nutch wiki.</a>
Modified: lucene/nutch/branches/mapred/site/about.pdf
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/site/about.pdf?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/site/about.pdf (original)
+++ lucene/nutch/branches/mapred/site/about.pdf Fri Jun 3 12:11:51 2005
@@ -47,10 +47,10 @@
>>
endobj
12 0 obj
-<< /Length 1428 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 1425 /Filter [ /ASCII85Decode /FlateDecode ]
>>
stream
-Gasapa`?E"&A@B[E3u[c&GKaRSg;HVP@P<,C(>4n&.+42PCs,CX\uksbtVkBe;VQeJgS)+4ZpApIFt^RS!^\dFi_!n3FomFM$&5[8KE$l"[7)-Lhpktc^d_tab(E'n*g7'jm#k8j$!4)Ck%'[+`Y1ML`O9PBb6@&s6+Q*RLl2NcgnjjZYF'L12bZ5^3aNR;g<L:$[lQCURHY^k'>R8.`VjaWR@EIX08S/5k=ZqAEARYNLTtUFf6+?ReWjT;f>uD`1]a>P9Y?VrO)L+S[YU^3<M,^%+^JF`Z%,']Va4NqV]8XcZc5:L\L--(\Y_1As=<<<cT\?IU*C[f$F'pa?0"7nm"$K$^qPf'8=M2oNg3[iYZ-YRE&"+A&[!1`ICp_H$S$mFsn_+(7HodWAC6iR:\P3EYR]P<6H$!>"E3?^MJE"LiPC63\t&^oOTY]Y3(_P!nBtSW*EtWZ,G:ma].7(Vlup`65.T\;FmTQ[Z__(3iEn5X#T$+Cicr`"LiL94#Dqi0TS\Ullse\P0>T9Uq8X?_)>T(jc"A;5,J;(Ph,s6MZ?=B,;6PEi*>JkUT*l6?3FQD+$Q4H(t3kZ^*:HAF?c38mVk<9(G]$'+$\DFkBLJn%pMMeFh<S'r_^%*`C/Ntn(Zt?_M&M5GMJM^66o)H8q,_:&eXh$L2uke"GjeXR[gFl['$rAW0ulXc-;0eJLn-VRV^HcfgnXhKj2&Hi)K[anbMVCC%H`P*W@`N`P6R_EI=_n<-;3/7Dk2:$2lO#pd*r2'8\;Q7gg"3)-5-6gH6/l]9ONo(E(]7>ll$sTq/fe/oQm\G+i$=<L;rF@EZi)?$B#8!1:V%AGKF><o$s/ea179C4MiE(*P<`/%-Ac>#C.V0MA.F#[%.o5_#9>eZ<I=9c90G>&Ch:;.q6:$WDf:dT9OmkSui5f;F?ko=5R#`T":[!1E9/APG$[0B.j%)UGo3hH.[\YU"qa:fL8VPnP[s3bC?G0R4e[3XXY2VM!6n2atop<!6YkQQ0M,,`:Ve8TB<:0JoYKANV8Vs!NiE8MiT(Ui<].7,Hu;qDhI/mW8O[k[:Y5#C^&CAQsUR$39$->DDc]bmt1Lj^RuUU6MNp"Ne]@[O0kXBp/(W\$TBR"[RYHYr(]jWB\?4Y_CqIAJ90becVXB'5BZ+/X,Fn$/HfsndaJTj<]N'LL;(K9qHcH(f:9EN@k<#JsT@G#3u$/RjOUb#f:b;jadP!9oZ'A.6^[fH(OtJS/l:BlU?ic)t0fi#;D=::fcpDe4kNLgZ(C:`X1W;;mTg?27cOVit/A+^OSm,,`7dpUgUlUYHiiln.g;Gs#K(7aZcsm2n;%UX#'/JDS8"8&OA&E5oPG&ER$rJ&+X[;@UGk@T=rT?OW>f+X%?=O_@..=)@mPB)`(;eZu'XPj'p*cU,@]J#d6@gL:I6akeMgu~>
+Gasapa`?E"&A@B[E3uZ^M#/qd:D.6<8MZ-&22=-rL]g/b,bY6)X-Be`M]3QIQ#ZUD^r#F*LPOK`O5&32',*<t%bVUJL[8(pET#.hW<7KEi)A6gaL3FFh7<0L4F5t3s6fV?Y20:GhNP^rp(m]8M<bTn2Lsu=+<Ji.f'07m&>D%Sjaf@&e]LWugoQQapXjM`nEEshqT'@@3W6&(k?TI8*!^/s9Rk]t^931nae(e\PoiIPEWSe]TjCE-ihnD']kLaaLF-G'NoYKYa6M].3i_ma0%<.R>!I-IDtk._e*<jI!,oUKE`["(pon(+r?9;n/1W`GC;55.2^LSFBK!V#E_`s]@<jJor<7$CSr#dEm/s)c_aH8u&i1Uc`Jt[7%u^Ih7.^8b=6tHVEi.NM[2b_s0\(2tn-c5>ihu1IT+H=4UYF3C=%hY0Bh4f"gg"Gsbo:N9f1b4&\^'C_:@-G?`7a(EC8m<a6o.(8M:BXjZ9pp)<TBf(%P/a23Ee[R[rggq1lWV4O%-ek,LBe\9H3HKi<3S()kYo+6p&7GV#S1QY6"P/J0_"S(Yf.\':fZ&9TS15d)Gb=N0rb>o7Q9<U$'jobO$CrkXrkKm-=m\o,M[6o<<p*=gu#Qn*`]7<D:oK,&`oU#`h$:gi?Wi$]uQk./=D"OGL+W1)-2o+@,S4Y0"6.9C&"(Kq+RFd+l!QlH(gIkKucDc:1-^&T6RdJD^-#M(a'Kc^U9uDt&Nt;uCDY'ZX"_5Zq0R=Z$%YS^@DBN#sl?3Ki$(G-jL?aKUOs%@1H9)7l.X0c(_V]Nm!J!8!XNTIt/G$[/4GX&YMN:15BkA6jE(3<1acrQ0)n*3?`ZW2nUPFk'</``lD^akj8PJ-1^%r/4\jNCCoTE=)9LWOSBS%AkS,F@E5^QTY\8Ub$@hTspOJ/37k`+Hl%p6_aBSMTTDu>!l,Smb25H2,Z*Y'dY].l>$)S/#ItG.4g7KJul&@=II_t_ek"I""Df$i%?ckKKd'W/]Ua$M!8/b>k9@@N,p;a"WkP-R*q8iPdp[a3/3W1RR0=!C,(RXaX)%SUq#7F7\uc1`F^+bJantAq$ec<#m]-=Ub?M`=;We0]:Tn91lW^c5"rIDnbQ"EbWG9-\HW5&!"WOQo(,b/`"0)+hIpJ1ku+Y\ME[)$j+20;9G^Fi(7CM-kb]OW.9sRigPBb<lpJ[H!_@*G)&O"!P^5"a:HS$6CaK]mgC9Hq')XRDSi0M0;i5jlr+MH<BW:;dNn56=WojU@P,+1?loNWjJg&js1`aRBiY#1r?hHFj:/'[ZT'(A"P?:?i-cJ?eHK/=#$Q\.Up;m'mM*D4AbkBi1no1bWE#Q7ZiZLVl\_m^,'U$D&-gC*\iM=E+7!ZOlTo5!Q-T%l$<PuZ^'Z1h"'Z5qK-n%,7[/+4Ya)9:n_p=&rSl8#3Ij2jcHN~>
endstream
endobj
13 0 obj
@@ -76,7 +76,7 @@
/Rect [ 356.604 629.666 391.92 617.666 ]
/C [ 0 0 0 ]
/Border [ 0 0 0 ]
-/A << /URI (http://jakarta.apache.org/lucene/)
+/A << /URI (http://lucene.apache.org/)
/S /URI >>
/H /I
>>
@@ -200,32 +200,32 @@
xref
0 27
0000000000 65535 f
-0000004364 00000 n
-0000004429 00000 n
-0000004521 00000 n
+0000004353 00000 n
+0000004418 00000 n
+0000004510 00000 n
0000000015 00000 n
0000000071 00000 n
0000000569 00000 n
0000000689 00000 n
0000000721 00000 n
-0000004644 00000 n
+0000004633 00000 n
0000000856 00000 n
-0000004707 00000 n
+0000004696 00000 n
0000000992 00000 n
-0000002513 00000 n
-0000002636 00000 n
-0000002684 00000 n
-0000002870 00000 n
-0000003052 00000 n
-0000003252 00000 n
-0000004773 00000 n
-0000003452 00000 n
-0000003591 00000 n
-0000003808 00000 n
-0000003921 00000 n
-0000004031 00000 n
-0000004139 00000 n
-0000004255 00000 n
+0000002510 00000 n
+0000002633 00000 n
+0000002681 00000 n
+0000002859 00000 n
+0000003041 00000 n
+0000003241 00000 n
+0000004762 00000 n
+0000003441 00000 n
+0000003580 00000 n
+0000003797 00000 n
+0000003910 00000 n
+0000004020 00000 n
+0000004128 00000 n
+0000004244 00000 n
trailer
<<
/Size 27
@@ -233,5 +233,5 @@
/Info 4 0 R
>>
startxref
-4824
+4813
%%EOF
Modified: lucene/nutch/branches/mapred/site/bot.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/site/bot.html?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/site/bot.html (original)
+++ lucene/nutch/branches/mapred/site/bot.html Fri Jun 3 12:11:51 2005
@@ -20,7 +20,7 @@
|breadtrail
+-->
<div class="breadtrail">
-<a href="http://incubator.apache.org/">Incubator</a> > <a href="http://incubator.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+<a href="http://lucene.apache.org/">Lucene</a> > <a href="http://lucene.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
</div>
<!--+
|header
@@ -30,7 +30,7 @@
|start group logo
+-->
<div class="grouplogo">
-<a href="http://incubator.apache.org/"><img class="logoImage" alt="Incubator" src="http://incubator.apache.org/images/apache-incubator-logo.png" title="Apache Incubator"></a>
+<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="http://lucene.apache.org/java/docs/images/lucene_green_150.gif" title="Apache Lucene"></a>
</div>
<!--+
|end group logo
@@ -39,7 +39,7 @@
|start Project Logo
+-->
<div class="projectlogoA1">
-<a href="http://incubator.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
+<a href="http://lucene.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
</div>
<!--+
|end Project Logo
@@ -138,7 +138,7 @@
<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Related Projects</div>
<div id="menu_1.4" class="menuitemgroup">
<div class="menuitem">
-<a title="" href="http://jakarta.apache.org/lucene/">Lucene</a>
+<a title="" href="http://lucene.apache.org/">Lucene</a>
</div>
</div>
<div id="credit"></div>
Modified: lucene/nutch/branches/mapred/site/credits.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/site/credits.html?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/site/credits.html (original)
+++ lucene/nutch/branches/mapred/site/credits.html Fri Jun 3 12:11:51 2005
@@ -20,7 +20,7 @@
|breadtrail
+-->
<div class="breadtrail">
-<a href="http://incubator.apache.org/">Incubator</a> > <a href="http://incubator.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+<a href="http://lucene.apache.org/">Lucene</a> > <a href="http://lucene.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
</div>
<!--+
|header
@@ -30,7 +30,7 @@
|start group logo
+-->
<div class="grouplogo">
-<a href="http://incubator.apache.org/"><img class="logoImage" alt="Incubator" src="http://incubator.apache.org/images/apache-incubator-logo.png" title="Apache Incubator"></a>
+<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="http://lucene.apache.org/java/docs/images/lucene_green_150.gif" title="Apache Lucene"></a>
</div>
<!--+
|end group logo
@@ -39,7 +39,7 @@
|start Project Logo
+-->
<div class="projectlogoA1">
-<a href="http://incubator.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
+<a href="http://lucene.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
</div>
<!--+
|end Project Logo
@@ -138,7 +138,7 @@
<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Related Projects</div>
<div id="menu_1.4" class="menuitemgroup">
<div class="menuitem">
-<a title="" href="http://jakarta.apache.org/lucene/">Lucene</a>
+<a title="" href="http://lucene.apache.org/">Lucene</a>
</div>
</div>
<div id="credit"></div>
Modified: lucene/nutch/branches/mapred/site/faq.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/site/faq.html?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/site/faq.html (original)
+++ lucene/nutch/branches/mapred/site/faq.html Fri Jun 3 12:11:51 2005
@@ -20,7 +20,7 @@
|breadtrail
+-->
<div class="breadtrail">
-<a href="http://incubator.apache.org/">Incubator</a> > <a href="http://incubator.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+<a href="http://lucene.apache.org/">Lucene</a> > <a href="http://lucene.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
</div>
<!--+
|header
@@ -30,7 +30,7 @@
|start group logo
+-->
<div class="grouplogo">
-<a href="http://incubator.apache.org/"><img class="logoImage" alt="Incubator" src="http://incubator.apache.org/images/apache-incubator-logo.png" title="Apache Incubator"></a>
+<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="http://lucene.apache.org/java/docs/images/lucene_green_150.gif" title="Apache Lucene"></a>
</div>
<!--+
|end group logo
@@ -39,7 +39,7 @@
|start Project Logo
+-->
<div class="projectlogoA1">
-<a href="http://incubator.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
+<a href="http://lucene.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
</div>
<!--+
|end Project Logo
@@ -138,7 +138,7 @@
<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Related Projects</div>
<div id="menu_1.4" class="menuitemgroup">
<div class="menuitem">
-<a title="" href="http://jakarta.apache.org/lucene/">Lucene</a>
+<a title="" href="http://lucene.apache.org/">Lucene</a>
</div>
</div>
<div id="credit"></div>
Modified: lucene/nutch/branches/mapred/site/i18n.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/site/i18n.html?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/site/i18n.html (original)
+++ lucene/nutch/branches/mapred/site/i18n.html Fri Jun 3 12:11:51 2005
@@ -20,7 +20,7 @@
|breadtrail
+-->
<div class="breadtrail">
-<a href="http://incubator.apache.org/">Incubator</a> > <a href="http://incubator.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+<a href="http://lucene.apache.org/">Lucene</a> > <a href="http://lucene.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
</div>
<!--+
|header
@@ -30,7 +30,7 @@
|start group logo
+-->
<div class="grouplogo">
-<a href="http://incubator.apache.org/"><img class="logoImage" alt="Incubator" src="http://incubator.apache.org/images/apache-incubator-logo.png" title="Apache Incubator"></a>
+<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="http://lucene.apache.org/java/docs/images/lucene_green_150.gif" title="Apache Lucene"></a>
</div>
<!--+
|end group logo
@@ -39,7 +39,7 @@
|start Project Logo
+-->
<div class="projectlogoA1">
-<a href="http://incubator.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
+<a href="http://lucene.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
</div>
<!--+
|end Project Logo
@@ -138,7 +138,7 @@
<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Related Projects</div>
<div id="menu_1.4" class="menuitemgroup">
<div class="menuitem">
-<a title="" href="http://jakarta.apache.org/lucene/">Lucene</a>
+<a title="" href="http://lucene.apache.org/">Lucene</a>
</div>
</div>
<div id="credit"></div>
@@ -257,7 +257,7 @@
<p>Each item typically includes an HTML anchor, one for each of the
top-level pages in the translation.</p>
<p>For example, the header file for an English translation is filed
-as <a href="http://svn.apache.org/repos/asf/incubator/nutch/trunk/src/web/include/en/header.xml"><tt>src/web/include/en/header.xml</tt></a>.</p>
+as <a href="http://svn.apache.org/repos/asf/lucene/nutch/trunk/src/web/include/en/header.xml"><tt>src/web/include/en/header.xml</tt></a>.</p>
</div>
@@ -290,7 +290,7 @@
entities in your data, you'll need to declare these too. Look at
existing translations for examples of this.</p>
<p>For example, the English language "about" page is filed
-as <a href="http://svn.apache.org/repos/asf/incubator/nutch/trunk/src/web/pages/en/about.xml"><tt>src/web/pages/en/about.xml</tt></a>.</p>
+as <a href="http://svn.apache.org/repos/asf/lucene/nutch/trunk/src/web/pages/en/about.xml"><tt>src/web/pages/en/about.xml</tt></a>.</p>
</div>
@@ -305,10 +305,10 @@
page.</p>
<p>These property files are filed as
<tt>src/web/locale/org/nutch/jsp/<i>page</i>_<i>language</i>.xml</tt>
-where <i>page</i> is the name of the JSP page in <a href="http://svn.apache.org/repos/asf/incubator/nutch/trunk/src/web/jsp/"><tt>src/web/jsp/</tt></a>
+where <i>page</i> is the name of the JSP page in <a href="http://svn.apache.org/repos/asf/lucene/nutch/trunk/src/web/jsp/"><tt>src/web/jsp/</tt></a>
and <i>language</i> is the IS0639 language code, as above.</p>
<p>For example, text for the English language search results page is filed
-as <a href="http://svn.apache.org/repos/asf/incubator/nutch/trunk/src/web/locale/org/nutch/jsp/search_en.properties"><tt>src/web/locale/org/nutch/jsp/search_en.properties</tt></a>.
+as <a href="http://svn.apache.org/repos/asf/lucene/nutch/trunk/src/web/locale/org/nutch/jsp/search_en.properties"><tt>src/web/locale/org/nutch/jsp/search_en.properties</tt></a>.
This contains something like:</p>
<pre>
title = search results
Modified: lucene/nutch/branches/mapred/site/index.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/site/index.html?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/site/index.html (original)
+++ lucene/nutch/branches/mapred/site/index.html Fri Jun 3 12:11:51 2005
@@ -20,7 +20,7 @@
|breadtrail
+-->
<div class="breadtrail">
-<a href="http://incubator.apache.org/">Incubator</a> > <a href="http://incubator.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+<a href="http://lucene.apache.org/">Lucene</a> > <a href="http://lucene.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
</div>
<!--+
|header
@@ -30,7 +30,7 @@
|start group logo
+-->
<div class="grouplogo">
-<a href="http://incubator.apache.org/"><img class="logoImage" alt="Incubator" src="http://incubator.apache.org/images/apache-incubator-logo.png" title="Apache Incubator"></a>
+<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="http://lucene.apache.org/java/docs/images/lucene_green_150.gif" title="Apache Lucene"></a>
</div>
<!--+
|end group logo
@@ -39,7 +39,7 @@
|start Project Logo
+-->
<div class="projectlogoA1">
-<a href="http://incubator.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
+<a href="http://lucene.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
</div>
<!--+
|end Project Logo
@@ -138,7 +138,7 @@
<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Related Projects</div>
<div id="menu_1.4" class="menuitemgroup">
<div class="menuitem">
-<a title="" href="http://jakarta.apache.org/lucene/">Lucene</a>
+<a title="" href="http://lucene.apache.org/">Lucene</a>
</div>
</div>
<div id="credit">
Modified: lucene/nutch/branches/mapred/site/issue_tracking.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/site/issue_tracking.html?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/site/issue_tracking.html (original)
+++ lucene/nutch/branches/mapred/site/issue_tracking.html Fri Jun 3 12:11:51 2005
@@ -20,7 +20,7 @@
|breadtrail
+-->
<div class="breadtrail">
-<a href="http://incubator.apache.org/">Incubator</a> > <a href="http://incubator.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+<a href="http://lucene.apache.org/">Lucene</a> > <a href="http://lucene.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
</div>
<!--+
|header
@@ -30,7 +30,7 @@
|start group logo
+-->
<div class="grouplogo">
-<a href="http://incubator.apache.org/"><img class="logoImage" alt="Incubator" src="http://incubator.apache.org/images/apache-incubator-logo.png" title="Apache Incubator"></a>
+<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="http://lucene.apache.org/java/docs/images/lucene_green_150.gif" title="Apache Lucene"></a>
</div>
<!--+
|end group logo
@@ -39,7 +39,7 @@
|start Project Logo
+-->
<div class="projectlogoA1">
-<a href="http://incubator.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
+<a href="http://lucene.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
</div>
<!--+
|end Project Logo
@@ -138,7 +138,7 @@
<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Related Projects</div>
<div id="menu_1.4" class="menuitemgroup">
<div class="menuitem">
-<a title="" href="http://jakarta.apache.org/lucene/">Lucene</a>
+<a title="" href="http://lucene.apache.org/">Lucene</a>
</div>
</div>
<div id="credit"></div>
Modified: lucene/nutch/branches/mapred/site/linkmap.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/site/linkmap.html?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/site/linkmap.html (original)
+++ lucene/nutch/branches/mapred/site/linkmap.html Fri Jun 3 12:11:51 2005
@@ -20,7 +20,7 @@
|breadtrail
+-->
<div class="breadtrail">
-<a href="http://incubator.apache.org/">Incubator</a> > <a href="http://incubator.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+<a href="http://lucene.apache.org/">Lucene</a> > <a href="http://lucene.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
</div>
<!--+
|header
@@ -30,7 +30,7 @@
|start group logo
+-->
<div class="grouplogo">
-<a href="http://incubator.apache.org/"><img class="logoImage" alt="Incubator" src="http://incubator.apache.org/images/apache-incubator-logo.png" title="Apache Incubator"></a>
+<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="http://lucene.apache.org/java/docs/images/lucene_green_150.gif" title="Apache Lucene"></a>
</div>
<!--+
|end group logo
@@ -39,7 +39,7 @@
|start Project Logo
+-->
<div class="projectlogoA1">
-<a href="http://incubator.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
+<a href="http://lucene.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
</div>
<!--+
|end Project Logo
@@ -138,7 +138,7 @@
<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Related Projects</div>
<div id="menu_1.4" class="menuitemgroup">
<div class="menuitem">
-<a title="" href="http://jakarta.apache.org/lucene/">Lucene</a>
+<a title="" href="http://lucene.apache.org/">Lucene</a>
</div>
</div>
<div id="credit"></div>
@@ -259,7 +259,7 @@
<ul>
<li>
-<a href="http://jakarta.apache.org/lucene/">Lucene</a> _________________________ <em>lucene</em>
+<a href="http://lucene.apache.org/">Lucene</a> _________________________ <em>lucene</em>
</li>
</ul>
Modified: lucene/nutch/branches/mapred/site/mailing_lists.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/site/mailing_lists.html?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/site/mailing_lists.html (original)
+++ lucene/nutch/branches/mapred/site/mailing_lists.html Fri Jun 3 12:11:51 2005
@@ -20,7 +20,7 @@
|breadtrail
+-->
<div class="breadtrail">
-<a href="http://incubator.apache.org/">Incubator</a> > <a href="http://incubator.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+<a href="http://lucene.apache.org/">Lucene</a> > <a href="http://lucene.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
</div>
<!--+
|header
@@ -30,7 +30,7 @@
|start group logo
+-->
<div class="grouplogo">
-<a href="http://incubator.apache.org/"><img class="logoImage" alt="Incubator" src="http://incubator.apache.org/images/apache-incubator-logo.png" title="Apache Incubator"></a>
+<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="http://lucene.apache.org/java/docs/images/lucene_green_150.gif" title="Apache Lucene"></a>
</div>
<!--+
|end group logo
@@ -39,7 +39,7 @@
|start Project Logo
+-->
<div class="projectlogoA1">
-<a href="http://incubator.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
+<a href="http://lucene.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
</div>
<!--+
|end Project Logo
@@ -138,7 +138,7 @@
<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Related Projects</div>
<div id="menu_1.4" class="menuitemgroup">
<div class="menuitem">
-<a title="" href="http://jakarta.apache.org/lucene/">Lucene</a>
+<a title="" href="http://lucene.apache.org/">Lucene</a>
</div>
</div>
<div id="credit"></div>
Modified: lucene/nutch/branches/mapred/site/tutorial.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/site/tutorial.html?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/site/tutorial.html (original)
+++ lucene/nutch/branches/mapred/site/tutorial.html Fri Jun 3 12:11:51 2005
@@ -20,7 +20,7 @@
|breadtrail
+-->
<div class="breadtrail">
-<a href="http://incubator.apache.org/">Incubator</a> > <a href="http://incubator.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+<a href="http://lucene.apache.org/">Lucene</a> > <a href="http://lucene.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
</div>
<!--+
|header
@@ -30,7 +30,7 @@
|start group logo
+-->
<div class="grouplogo">
-<a href="http://incubator.apache.org/"><img class="logoImage" alt="Incubator" src="http://incubator.apache.org/images/apache-incubator-logo.png" title="Apache Incubator"></a>
+<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="http://lucene.apache.org/java/docs/images/lucene_green_150.gif" title="Apache Lucene"></a>
</div>
<!--+
|end group logo
@@ -39,7 +39,7 @@
|start Project Logo
+-->
<div class="projectlogoA1">
-<a href="http://incubator.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
+<a href="http://lucene.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
</div>
<!--+
|end Project Logo
@@ -138,7 +138,7 @@
<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Related Projects</div>
<div id="menu_1.4" class="menuitemgroup">
<div class="menuitem">
-<a title="" href="http://jakarta.apache.org/lucene/">Lucene</a>
+<a title="" href="http://lucene.apache.org/">Lucene</a>
</div>
</div>
<div id="credit"></div>
Modified: lucene/nutch/branches/mapred/site/version_control.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/site/version_control.html?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/site/version_control.html (original)
+++ lucene/nutch/branches/mapred/site/version_control.html Fri Jun 3 12:11:51 2005
@@ -20,7 +20,7 @@
|breadtrail
+-->
<div class="breadtrail">
-<a href="http://incubator.apache.org/">Incubator</a> > <a href="http://incubator.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+<a href="http://lucene.apache.org/">Lucene</a> > <a href="http://lucene.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
</div>
<!--+
|header
@@ -30,7 +30,7 @@
|start group logo
+-->
<div class="grouplogo">
-<a href="http://incubator.apache.org/"><img class="logoImage" alt="Incubator" src="http://incubator.apache.org/images/apache-incubator-logo.png" title="Apache Incubator"></a>
+<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="http://lucene.apache.org/java/docs/images/lucene_green_150.gif" title="Apache Lucene"></a>
</div>
<!--+
|end group logo
@@ -39,7 +39,7 @@
|start Project Logo
+-->
<div class="projectlogoA1">
-<a href="http://incubator.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
+<a href="http://lucene.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
</div>
<!--+
|end Project Logo
@@ -138,7 +138,7 @@
<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Related Projects</div>
<div id="menu_1.4" class="menuitemgroup">
<div class="menuitem">
-<a title="" href="http://jakarta.apache.org/lucene/">Lucene</a>
+<a title="" href="http://lucene.apache.org/">Lucene</a>
</div>
</div>
<div id="credit"></div>
@@ -197,7 +197,7 @@
<div class="section">
<p>
The source code can be browsed via the Web at
- <a href="http://svn.apache.org/viewcvs.cgi/incubator/nutch/">http://svn.apache.org/viewcvs.cgi/incubator/nutch/</a>.
+ <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/">http://svn.apache.org/viewcvs.cgi/lucene/nutch/</a>.
No SVN client software is required.
</p>
</div>
@@ -208,7 +208,7 @@
<div class="section">
<p>
The SVN URL for anonymous users is
- <a href="http://svn.apache.org/repos/asf/incubator/nutch/">http://svn.apache.org/repos/asf/incubator/nutch/</a>.
+ <a href="http://svn.apache.org/repos/asf/lucene/nutch/">http://svn.apache.org/repos/asf/lucene/nutch/</a>.
Instructions for anonymous SVN access are
<a href="http://www.apache.org/dev/version-control.html#anon-svn">here</a>.
</p>
@@ -220,7 +220,7 @@
<div class="section">
<p>
The SVN URL for committers is
- <a href="https://svn.apache.org/repos/asf/incubator/nutch/">https://svn.apache.org/repos/asf/incubator/nutch/</a>.
+ <a href="https://svn.apache.org/repos/asf/lucene/nutch/">https://svn.apache.org/repos/asf/lucene/nutch/</a>.
Instructions for committer SVN access are
<a href="http://www.apache.org/dev/version-control.html#https-svn">here</a>.
</p>
Modified: lucene/nutch/branches/mapred/site/version_control.pdf
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/site/version_control.pdf?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/site/version_control.pdf (original)
+++ lucene/nutch/branches/mapred/site/version_control.pdf Fri Jun 3 12:11:51 2005
@@ -72,7 +72,7 @@
<< /Length 1528 /Filter [ /ASCII85Decode /FlateDecode ]
>>
stream
-Gatm<96...@Egh>,+O>bGm`d?_V"#oSPZKn2bspjl]*@#GNS4Y@VL+UNXgaUNb$r\eN,A68UA:ap:)h,h5>O\/dqY&-j.:PD]FH,;1q:DCkPYk]g*u&fd$oO_%]?JL,%p.Z=Kk.OpXe<.POZQakU3EE=17`l@9JB!7]oa$2'r;`c#k85T%u<9?iA=s5;6b<`1FWcRD#k*fdEk'+"b"qn8iD4j22ZDQfUjK/'DcpK[11C3M"K]<Q-`pDB&i*.+L#,Z^ppj\'AkAf4ZNh*44>:KLVl/=qXei10/XZW;]UP<(qOS]t^pTaB8_V&/Lie1p&T?DQ9erPhMLa#F,V@:TRZmIZPj3S-R(,JS(b6[3S(ll&T;l;]7e/"Ea5H!Y+515-2p0C'WkINn-u'FWTOdRRB^H.h#=Gg_@E^WN['9)bs8-""?S/sIZLKQ1ct?l<ujMXl$C.eQ"P%>B?Ep3AYjRZX)35Q0k\QKbY?Q%B#u-K+N<MNXhKdNHIBCk/>^-@<VH5r*.>%4IMiVj!k.Lut[_RZ\<'((1qNROo4gS+]Y5QV,FNe$*5Yl'.1K:1",\Q1"T2^a'-A$%+W>#4Ptd#JmbN\d,Lq34Dsdhf$<NrUFhdfNbWTIblY[=.(JE'Lf?-rD\,n"bCqtq[?k@@0(4F;QCjV@"lk5k47&K365@rf*`d10ol.RqjrD\HAa,8!.5kTOp)8W%(*lBM^aTA1.WaW^jonI?ACuJ?k!r_/Du^L&5^rF41,9uj1'MM%?MiMdPmh\^(#bsXK55#]jMJ*",Q'Y\JhCO#43L[Et't2E>JX99$I@)*/Q,%Hk6*o<sQV^HPG*+^[uHW^#5bfB`nsm9(VG;a)sJ/I<Z3Z(sJQI!,o(M8WKBG^?0Y=O[74_@:0_4^dhau^VcP);+>AG]^Fqk*0Z16MbG3nXX%E_lFVOG]+MYfRBJR'?p,.b>KEDVbc.Yjp<`in0R.Dnj&U1JWcFBUWJR`mKuHrb>?PSKo5&?"2A#U)gJ9.^~>
+Gatm<96...@A>K.c8c9"'AQ*5[0N'.Y\mE?8J]Pq._W(b"R`FX!(j_Nr$'YO_RF6M^IOMI*WZ8XcR(d(%T*\Vsjo<9.&d#J7@2<LIJnU!!Z(I</^TmVB;-n5[!,5F71qSD_?QdPN@IL0$)k'5Jl^;!^9.%nMd+C(8%XX+<V-gO&\FY)0ZCKUVmS6^QF>Hd.aadF]C-.WJ=?!s%EBhgC%n&Pt(94XL-W?M-RO=fSA(^OMT#;&+J`(:asaibLc[;5;J%?IPWKapundLGBdUQb)#GFfDUYZ,8*mdOm0c@'m#,&]),R9)nC;JZh6d,X@L-T$eb-=3HZ(j*r94]#*5lXge.!F<NM?EA[-A0[odBe/*e`oPjQ`.#3,UaD.Oo?~>
endstream
endobj
17 0 obj
@@ -156,10 +156,10 @@
24 0 obj
<< /Type /Annot
/Subtype /Link
-/Rect [ 90.0 498.132 332.976 486.132 ]
+/Rect [ 90.0 498.132 318.972 486.132 ]
/C [ 0 0 0 ]
/Border [ 0 0 0 ]
-/A << /URI (http://svn.apache.org/viewcvs.cgi/incubator/nutch/)
+/A << /URI (http://svn.apache.org/viewcvs.cgi/lucene/nutch/)
/S /URI >>
/H /I
>>
@@ -167,10 +167,10 @@
25 0 obj
<< /Type /Annot
/Subtype /Link
-/Rect [ 279.648 445.798 508.956 433.798 ]
+/Rect [ 279.648 445.798 494.952 433.798 ]
/C [ 0 0 0 ]
/Border [ 0 0 0 ]
-/A << /URI (http://svn.apache.org/repos/asf/incubator/nutch/)
+/A << /URI (http://svn.apache.org/repos/asf/lucene/nutch/)
/S /URI >>
/H /I
>>
@@ -189,10 +189,10 @@
27 0 obj
<< /Type /Annot
/Subtype /Link
-/Rect [ 250.656 367.064 484.632 355.064 ]
+/Rect [ 250.656 367.064 470.628 355.064 ]
/C [ 0 0 0 ]
/Border [ 0 0 0 ]
-/A << /URI (https://svn.apache.org/repos/asf/incubator/nutch/)
+/A << /URI (https://svn.apache.org/repos/asf/lucene/nutch/)
/S /URI >>
/H /I
>>
@@ -200,7 +200,7 @@
28 0 obj
<< /Type /Annot
/Subtype /Link
-/Rect [ 297.288 353.864 317.94 341.864 ]
+/Rect [ 237.624 353.864 258.276 341.864 ]
/C [ 0 0 0 ]
/Border [ 0 0 0 ]
/A << /URI (http://www.apache.org/dev/version-control.html#https-svn)
@@ -322,21 +322,21 @@
xref
0 39
0000000000 65535 f
-0000006626 00000 n
-0000006691 00000 n
-0000006783 00000 n
+0000006618 00000 n
+0000006683 00000 n
+0000006775 00000 n
0000000015 00000 n
0000000071 00000 n
0000000669 00000 n
0000000789 00000 n
0000000835 00000 n
-0000006906 00000 n
+0000006898 00000 n
0000000970 00000 n
-0000006969 00000 n
+0000006961 00000 n
0000001107 00000 n
-0000007035 00000 n
+0000007027 00000 n
0000001244 00000 n
-0000007101 00000 n
+0000007093 00000 n
0000001381 00000 n
0000003002 00000 n
0000003125 00000 n
@@ -346,20 +346,20 @@
0000003785 00000 n
0000003967 00000 n
0000004145 00000 n
-0000004346 00000 n
-0000004548 00000 n
-0000004756 00000 n
-0000004959 00000 n
-0000007167 00000 n
-0000005168 00000 n
-0000005307 00000 n
-0000005540 00000 n
-0000005809 00000 n
-0000006070 00000 n
-0000006183 00000 n
-0000006293 00000 n
-0000006401 00000 n
-0000006517 00000 n
+0000004343 00000 n
+0000004542 00000 n
+0000004750 00000 n
+0000004950 00000 n
+0000007159 00000 n
+0000005160 00000 n
+0000005299 00000 n
+0000005532 00000 n
+0000005801 00000 n
+0000006062 00000 n
+0000006175 00000 n
+0000006285 00000 n
+0000006393 00000 n
+0000006509 00000 n
trailer
<<
/Size 39
@@ -367,5 +367,5 @@
/Info 4 0 R
>>
startxref
-7218
+7210
%%EOF
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/analysis/NutchAnalysis.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/analysis/NutchAnalysis.java?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/analysis/NutchAnalysis.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/analysis/NutchAnalysis.java Fri Jun 3 12:11:51 2005
@@ -122,8 +122,11 @@
nonOpOrTerm();
String[] array = (String[])terms.toArray(new String[terms.size()]);
- if (stop && terms.size()==1 && isStopWord(array[0])) {
- // ignore stop words only when single, unadorned terms
+ if (stop
+ && field == Clause.DEFAULT_FIELD
+ && terms.size()==1
+ && isStopWord(array[0])) {
+ // ignore stop words only when single, unadorned terms in default field
} else {
if (prohibited)
query.addProhibitedPhrase(array, field);
@@ -443,35 +446,11 @@
finally { jj_save(2, xla); }
}
- final private boolean jj_3R_17() {
- Token xsp;
- xsp = jj_scanpos;
- if (jj_scan_token(10)) {
- jj_scanpos = xsp;
- if (jj_scan_token(11)) {
- jj_scanpos = xsp;
- if (jj_scan_token(12)) {
- jj_scanpos = xsp;
- if (jj_scan_token(13)) {
- jj_scanpos = xsp;
- if (jj_scan_token(14)) return true;
- }
- }
- }
- }
- return false;
- }
-
final private boolean jj_3R_26() {
if (jj_3R_16()) return true;
return false;
}
- final private boolean jj_3R_8() {
- if (jj_3R_14()) return true;
- return false;
- }
-
final private boolean jj_3R_16() {
Token xsp;
xsp = jj_scanpos;
@@ -496,6 +475,11 @@
return false;
}
+ final private boolean jj_3R_8() {
+ if (jj_3R_14()) return true;
+ return false;
+ }
+
final private boolean jj_3R_15() {
if (jj_3R_11()) return true;
Token xsp;
@@ -524,18 +508,6 @@
return false;
}
- final private boolean jj_3_1() {
- if (jj_scan_token(WORD)) return true;
- if (jj_scan_token(COLON)) return true;
- Token xsp;
- xsp = jj_scanpos;
- if (jj_3R_8()) {
- jj_scanpos = xsp;
- if (jj_3R_9()) return true;
- }
- return false;
- }
-
final private boolean jj_3R_23() {
if (jj_3R_24()) return true;
return false;
@@ -562,6 +534,18 @@
return false;
}
+ final private boolean jj_3_1() {
+ if (jj_scan_token(WORD)) return true;
+ if (jj_scan_token(COLON)) return true;
+ Token xsp;
+ xsp = jj_scanpos;
+ if (jj_3R_8()) {
+ jj_scanpos = xsp;
+ if (jj_3R_9()) return true;
+ }
+ return false;
+ }
+
final private boolean jj_3R_24() {
Token xsp;
xsp = jj_scanpos;
@@ -610,13 +594,13 @@
return false;
}
- final private boolean jj_3R_9() {
- if (jj_3R_15()) return true;
+ final private boolean jj_3R_19() {
+ if (jj_3R_24()) return true;
return false;
}
- final private boolean jj_3R_19() {
- if (jj_3R_24()) return true;
+ final private boolean jj_3R_9() {
+ if (jj_3R_15()) return true;
return false;
}
@@ -651,6 +635,25 @@
if (jj_scan_token(9)) {
jj_scanpos = xsp;
if (jj_scan_token(0)) return true;
+ }
+ return false;
+ }
+
+ final private boolean jj_3R_17() {
+ Token xsp;
+ xsp = jj_scanpos;
+ if (jj_scan_token(10)) {
+ jj_scanpos = xsp;
+ if (jj_scan_token(11)) {
+ jj_scanpos = xsp;
+ if (jj_scan_token(12)) {
+ jj_scanpos = xsp;
+ if (jj_scan_token(13)) {
+ jj_scanpos = xsp;
+ if (jj_scan_token(14)) return true;
+ }
+ }
+ }
}
return false;
}
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/analysis/NutchAnalysis.jj
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/analysis/NutchAnalysis.jj?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/analysis/NutchAnalysis.jj (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/analysis/NutchAnalysis.jj Fri Jun 3 12:11:51 2005
@@ -204,8 +204,11 @@
{
String[] array = (String[])terms.toArray(new String[terms.size()]);
- if (stop && terms.size()==1 && isStopWord(array[0])) {
- // ignore stop words only when single, unadorned terms
+ if (stop
+ && field == Clause.DEFAULT_FIELD
+ && terms.size()==1
+ && isStopWord(array[0])) {
+ // ignore stop words only when single, unadorned terms in default field
} else {
if (prohibited)
query.addProhibitedPhrase(array, field);
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDatum.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDatum.java Fri Jun 3 12:11:51 2005
@@ -38,8 +38,8 @@
public static final byte STATUS_DB_GONE = 3;
public static final byte STATUS_LINKED = 4;
public static final byte STATUS_FETCH_SUCCESS = 5;
- public static final byte STATUS_FETCH_FAIL_TEMP = 6;
- public static final byte STATUS_FETCH_FAIL_PERM = 7;
+ public static final byte STATUS_FETCH_RETRY = 6;
+ public static final byte STATUS_FETCH_GONE = 7;
private byte status;
private long nextFetch = System.currentTimeMillis();
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Fri Jun 3 12:11:51 2005
@@ -77,7 +77,7 @@
result.setStatus(CrawlDatum.STATUS_DB_FETCHED);
break;
- case CrawlDatum.STATUS_FETCH_FAIL_TEMP: // temporary failure
+ case CrawlDatum.STATUS_FETCH_RETRY: // temporary failure
result = highest; // use new entry
if (highest.getRetriesSinceFetch() < retryMax) {
result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
@@ -86,7 +86,7 @@
}
break;
- case CrawlDatum.STATUS_FETCH_FAIL_PERM: // permanent failure
+ case CrawlDatum.STATUS_FETCH_GONE: // permanent failure
result = highest; // use new entry
result.setStatus(CrawlDatum.STATUS_DB_GONE);
break;
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Fri Jun 3 12:11:51 2005
@@ -39,6 +39,7 @@
private OutputCollector output;
private int activeThreads;
+ private int maxRedirect;
private long start = System.currentTimeMillis(); // start time of fetcher run
@@ -70,21 +71,63 @@
try {
LOG.info("fetching " + url); // fetch the page
- Protocol protocol = ProtocolFactory.getProtocol(url);
- Content content = protocol.getContent(url);
+ boolean redirecting;
+ int redirectCount = 0;
+ do {
+ redirecting = false;
+ LOG.fine("redirectCount=" + redirectCount);
+ Protocol protocol = ProtocolFactory.getProtocol(url);
+ ProtocolOutput output = protocol.getProtocolOutput(url);
+ ProtocolStatus status = output.getStatus();
+ Content content = output.getContent();
+
+ switch(status.getCode()) {
+
+ case ProtocolStatus.SUCCESS: // got a page
+ output(key, datum, content, CrawlDatum.STATUS_FETCH_SUCCESS);
+ updateStatus(content.getContent().length);
+ break;
+
+ case ProtocolStatus.MOVED: // redirect
+ case ProtocolStatus.TEMP_MOVED:
+ url = status.getMessage();
+ if (url != null) {
+ redirecting = true;
+ redirectCount++;
+ LOG.info(" - protocol redirect to " + url);
+ }
+ break;
+
+ case ProtocolStatus.RETRY: // retry
+ case ProtocolStatus.EXCEPTION:
+ output(key, datum, null, CrawlDatum.STATUS_FETCH_RETRY);
+ break;
+
+ case ProtocolStatus.GONE: // gone
+ case ProtocolStatus.NOT_FOUND:
+ case ProtocolStatus.ACCESS_DENIED:
+ case ProtocolStatus.ROBOTS_DENIED:
+ case ProtocolStatus.NOTMODIFIED:
+ output(key, datum, null, CrawlDatum.STATUS_FETCH_GONE);
+ break;
+
+ default:
+ LOG.warning("Unknown ProtocolStatus: " + status.getCode());
+ output(key, datum, null, CrawlDatum.STATUS_FETCH_GONE);
+ }
+
+ if (redirecting && redirectCount >= maxRedirect) {
+ LOG.info(" - redirect count exceeded " + url);
+ output(key, datum, null, CrawlDatum.STATUS_FETCH_GONE);
+ }
- output(url, datum, content, CrawlDatum.STATUS_FETCH_SUCCESS);
-
- updateStatus(content.getContent().length);
+ } while (redirecting && (redirectCount < maxRedirect));
- } catch (ResourceGone e) { // don't retry
- logError(url, e);
- output(url, datum, null, CrawlDatum.STATUS_FETCH_FAIL_PERM);
- } catch (Throwable t) { // retry all others
+ } catch (Throwable t) { // unexpected exception
logError(url, t);
- output(url, datum, null, CrawlDatum.STATUS_FETCH_FAIL_TEMP);
-
+ output(key, datum, null, CrawlDatum.STATUS_FETCH_GONE);
+
}
}
@@ -103,13 +146,18 @@
}
}
- private void output(String url, CrawlDatum datum,
+ private void output(UTF8 key, CrawlDatum datum,
Content content, int status) {
+
datum.setStatus(status);
- if (content == null)
+
+ if (content == null) {
+ String url = key.toString();
content = new Content(url, url, new byte[0], "", new Properties());
+ }
+
try {
- output.collect(new UTF8(url), new FetcherOutput(datum, content));
+ output.collect(key, new FetcherOutput(datum, content));
} catch (IOException e) {
LOG.severe("fetcher caught:"+e.toString());
}
@@ -152,6 +200,8 @@
this.input = input;
this.output = output;
+ this.maxRedirect = getConf().getInt("http.redirect.max", 3);
+
int threadCount = getConf().getInt("fetcher.threads.fetch", 10);
for (int i = 0; i < threadCount; i++) { // spawn threads
new FetcherThread().start();
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseSegment.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseSegment.java?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseSegment.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseSegment.java Fri Jun 3 12:11:51 2005
@@ -49,15 +49,22 @@
public void map(WritableComparable key, Writable value,
OutputCollector output) throws IOException {
Content content = (Content)value;
+
+ Parse parse = null;
+ ParseStatus status;
try {
Parser parser = ParserFactory.getParser(content.getContentType(),
content.getBaseUrl());
- Parse parse = parser.getParse(content);
-
+ parse = parser.getParse(content);
+ status = parse.getData().getStatus();
+ } catch (Exception e) {
+ status = new ParseStatus(e);
+ }
+
+ if (status.isSuccess()) {
output.collect(key, new ParseImpl(parse.getText(), parse.getData()));
-
- } catch (ParseException t) {
- LOG.warning("Error parsing: "+key+": "+t.toString());
+ } else {
+ LOG.warning("Error parsing: "+key+": "+status.toString());
}
}
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/Fetcher.java Fri Jun 3 12:11:51 2005
@@ -20,6 +20,7 @@
import java.io.File;
import java.util.Properties;
+import org.apache.nutch.net.URLFilters;
import org.apache.nutch.pagedb.FetchListEntry;
import org.apache.nutch.io.*;
import org.apache.nutch.db.*;
@@ -68,6 +69,10 @@
private int threadCount = // max number of threads
NutchConf.get().getInt("fetcher.threads.fetch", 10);
+ private static final float NEW_INJECTED_PAGE_SCORE =
+ NutchConf.get().getFloat("db.score.injected", 2.0f);
+ private static final int MAX_REDIRECT =
+ NutchConf.get().getInt("http.redirect.max", 3);
// All threads (FetcherThread or thread started by it) belong to
// group "fetcher". Each FetcherThread is named as "fetcherXX",
@@ -110,45 +115,84 @@
if (!fle.getFetch()) { // should we fetch this page?
if (LOG.isLoggable(Level.FINE))
LOG.fine("not fetching " + url);
- handleNoFetch(fle, FetcherOutput.SUCCESS);
+ handleNoFetch(fle, ProtocolStatus.STATUS_NOTFETCHING);
continue;
}
- LOG.info("fetching " + url); // fetch the page
-
- Protocol protocol = ProtocolFactory.getProtocol(url);
- Content content = protocol.getContent(url);
-
- handleFetch(url, fle, content);
-
- synchronized (Fetcher.this) { // update status
- pages++;
- bytes += content.getContent().length;
- if ((pages % 100) == 0) { // show status every 100pp
- status();
+ // support multiple redirects, if requested by protocol
+ // or content meta-tags (the latter requires running Fetcher
+ // in parsing mode). Protocol-level redirects take precedence over
+ // content-level redirects. Some plugins can handle redirects
+ // automatically, so that only the final success or failure will be
+ // shown here.
+ boolean refetch = false;
+ int redirCnt = 0;
+ do {
+ LOG.fine("redirCnt=" + redirCnt);
+ refetch = false;
+ LOG.info("fetching " + url); // fetch the page
+ Protocol protocol = ProtocolFactory.getProtocol(url);
+ ProtocolOutput output = protocol.getProtocolOutput(fle);
+ ProtocolStatus pstat = output.getStatus();
+ Content content = output.getContent();
+ switch(pstat.getCode()) {
+ case ProtocolStatus.SUCCESS:
+ if (content != null) {
+ synchronized (Fetcher.this) { // update status
+ pages++;
+ bytes += content.getContent().length;
+ if ((pages % 100) == 0) { // show status every 100pp
+ status();
+ }
+ }
+ ParseStatus ps = handleFetch(url, fle, output);
+ if (ps != null && ps.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
+ url = ps.getMessage();
+ url = URLFilters.filter(url);
+ if (url != null) {
+ refetch = true;
+ redirCnt++;
+ fle = new FetchListEntry(true, new Page(url, NEW_INJECTED_PAGE_SCORE), new String[0]);
+ LOG.info(" - content redirect to " + url);
+ }
+ }
+ }
+ break;
+ case ProtocolStatus.MOVED: // try to redirect immediately
+ case ProtocolStatus.TEMP_MOVED: // try to redirect immediately
+ // record the redirect. perhaps the DB will want to know this.
+ handleNoFetch(fle, pstat);
+ url = pstat.getMessage();
+ if (url != null) {
+ refetch = true;
+ redirCnt++;
+ // create new entry.
+ fle = new FetchListEntry(true, new Page(url, NEW_INJECTED_PAGE_SCORE), new String[0]);
+ LOG.info(" - protocol redirect to " + url);
+ }
+ break;
+ case ProtocolStatus.GONE:
+ case ProtocolStatus.NOT_FOUND:
+ case ProtocolStatus.ACCESS_DENIED:
+ case ProtocolStatus.ROBOTS_DENIED:
+ case ProtocolStatus.RETRY:
+ case ProtocolStatus.NOTMODIFIED:
+ handleNoFetch(fle, pstat);
+ break;
+ case ProtocolStatus.EXCEPTION:
+ logError(url, fle, new Exception(pstat.getMessage())); // retry?
+ handleNoFetch(fle, pstat);
+ break;
+ default:
+ LOG.warning("Unknown ProtocolStatus: " + pstat.getCode());
+ handleNoFetch(fle, pstat);
}
- }
- } catch (ResourceGone e) { // don't retry
- logError(url, fle, e);
- handleNoFetch(fle, FetcherOutput.NOT_FOUND);
-
- // dealt with in handleFetch() below
- //} catch (ParseException e) { // don't retry
- // logError(url, fle, e);
- // handleNoFetch(fle, FetcherOutput.CANT_PARSE);
-
- } catch (RetryLater e) { // explicit retry
- logError(url, fle, e);
- handleNoFetch(fle, FetcherOutput.RETRY);
-
- } catch (ProtocolException e) { // implicit retry
- logError(url, fle, e);
- handleNoFetch(fle, FetcherOutput.RETRY);
+ } while (refetch && (redirCnt < MAX_REDIRECT));
} catch (Throwable t) { // an unchecked exception
if (fle != null) {
logError(url, fle, t); // retry?
- handleNoFetch(fle, FetcherOutput.RETRY);
+ handleNoFetch(fle, new ProtocolStatus(t));
}
}
}
@@ -176,36 +220,44 @@
}
}
- private void handleFetch(String url, FetchListEntry fle, Content content) {
+ private ParseStatus handleFetch(String url, FetchListEntry fle, ProtocolOutput output) {
+ Content content = output.getContent();
+ ProtocolStatus protocolStatus = output.getStatus();
if (!Fetcher.this.parsing) {
outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
- FetcherOutput.SUCCESS),
+ protocolStatus),
content, null, null);
- return;
+ return null;
}
- try {
String contentType = content.getContentType();
- Parser parser = ParserFactory.getParser(contentType, url);
- Parse parse = parser.getParse(content);
- outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
- FetcherOutput.SUCCESS),
- content, new ParseText(parse.getText()), parse.getData());
- } catch (ParseException e) {
- // 20041026, xing
- // If fetching succeeds, but parsing fails, content should be saved
- // so that we can try to parse again in separate pass, possibly
- // using better/alternative parser.
- LOG.info("fetch okay, but can't parse " + url + ", reason: "
- + e.getMessage());
- outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
- FetcherOutput.CANT_PARSE),
- content, new ParseText(""),
- new ParseData("", new Outlink[0], new Properties()));
- }
+ Parser parser = null;
+ Parse parse = null;
+ ParseStatus status = null;
+ try {
+ parser = ParserFactory.getParser(contentType, url);
+ parse = parser.getParse(content);
+ status = parse.getData().getStatus();
+ } catch (Exception e) {
+ e.printStackTrace();
+ status = new ParseStatus(e);
+ }
+ if (status.isSuccess()) {
+ outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
+ protocolStatus),
+ content, new ParseText(parse.getText()), parse.getData());
+ } else {
+ LOG.info("fetch okay, but can't parse " + url + ", reason: "
+ + status.toString());
+ outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
+ protocolStatus),
+ content, new ParseText(""),
+ new ParseData(status, "", new Outlink[0], new Properties()));
+ }
+ return status;
}
- private void handleNoFetch(FetchListEntry fle, int status) {
+ private void handleNoFetch(FetchListEntry fle, ProtocolStatus status) {
String url = fle.getPage().getURL().toString();
MD5Hash hash = MD5Hash.digest(url);
@@ -213,7 +265,7 @@
outputPage(new FetcherOutput(fle, hash, status),
new Content(url, url, new byte[0], "", new Properties()),
new ParseText(""),
- new ParseData("", new Outlink[0], new Properties()));
+ new ParseData(ParseStatus.STATUS_NOTPARSED, "", new Outlink[0], new Properties()));
} else {
outputPage(new FetcherOutput(fle, hash, status),
new Content(url, url, new byte[0], "", new Properties()),
@@ -234,6 +286,7 @@
}
} catch (Throwable t) {
LOG.severe("error writing output:" + t.toString());
+ t.printStackTrace();
}
}
@@ -429,7 +482,7 @@
}
// set log level
- fetcher.setLogLevel(Level.parse(logLevel.toUpperCase()));
+ setLogLevel(Level.parse(logLevel.toUpperCase()));
if (showThreadID) {
LogFormatter.setShowThreadIDs(showThreadID);
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/FetcherOutput.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/FetcherOutput.java?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/FetcherOutput.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fetcher/FetcherOutput.java Fri Jun 3 12:11:51 2005
@@ -26,6 +26,8 @@
import org.apache.nutch.pagedb.FetchListEntry;
import org.apache.nutch.tools.UpdateDatabaseTool;
import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.protocol.ProtocolStatus;
/*********************************************
* An entry in the fetcher's output. This includes all of the fetcher output
@@ -50,25 +52,34 @@
public static final String DONE_NAME = "fetcher.done";
public static final String ERROR_NAME = "fetcher.error";
- private final static byte VERSION = 4;
+ private final static byte VERSION = 5;
- public final static byte RETRY = 0;
- public final static byte SUCCESS = 1;
- public final static byte NOT_FOUND = 2;
- public final static byte CANT_PARSE = 4; // fetched, but can't be parsed
+ // backwards compatibility codes
+ private final static byte RETRY = 0;
+ private final static byte SUCCESS = 1;
+ private final static byte NOT_FOUND = 2;
+ private final static byte CANT_PARSE = 4; // fetched, but can't be parsed
+
+ private static final byte[] oldToNewMap = {
+ ProtocolStatus.RETRY,
+ ProtocolStatus.SUCCESS,
+ ProtocolStatus.NOT_FOUND,
+ ProtocolStatus.FAILED,
+ ProtocolStatus.RETRY
+ };
private FetchListEntry fetchListEntry;
private MD5Hash md5Hash;
- private int status;
+ private ProtocolStatus protocolStatus;
private long fetchDate;
public FetcherOutput() {}
public FetcherOutput(FetchListEntry fetchListEntry,
- MD5Hash md5Hash, int status) {
+ MD5Hash md5Hash, ProtocolStatus protocolStatus) {
this.fetchListEntry = fetchListEntry;
this.md5Hash = md5Hash;
- this.status = status;
+ this.protocolStatus = protocolStatus;
this.fetchDate = System.currentTimeMillis();
}
@@ -78,7 +89,12 @@
byte version = in.readByte(); // read version
fetchListEntry = FetchListEntry.read(in);
md5Hash = MD5Hash.read(in);
- status = in.readByte();
+ if (version < 5) {
+ int status = in.readByte();
+ protocolStatus = new ProtocolStatus(oldToNewMap[status]);
+ } else {
+ protocolStatus = ProtocolStatus.read(in);
+ }
if (version < 4) {
UTF8.readString(in); // read & ignore title
@@ -95,7 +111,7 @@
out.writeByte(VERSION); // store current version
fetchListEntry.write(out);
md5Hash.write(out);
- out.writeByte(status);
+ protocolStatus.write(out);
out.writeLong(fetchDate);
}
@@ -110,8 +126,8 @@
//
public FetchListEntry getFetchListEntry() { return fetchListEntry; }
public MD5Hash getMD5Hash() { return md5Hash; }
- public int getStatus() { return status; }
- public void setStatus(int status) { this.status = status; }
+ public ProtocolStatus getProtocolStatus() { return protocolStatus; }
+ public void setProtocolStatus(ProtocolStatus protocolStatus) { this.protocolStatus = protocolStatus; }
public long getFetchDate() { return fetchDate; }
public void setFetchDate(long fetchDate) { this.fetchDate = fetchDate; }
@@ -126,7 +142,7 @@
return
this.fetchListEntry.equals(other.fetchListEntry) &&
this.md5Hash.equals(other.md5Hash) &&
- (this.status == other.status);
+ this.protocolStatus.equals(other.protocolStatus);
}
@@ -134,7 +150,7 @@
StringBuffer buffer = new StringBuffer();
buffer.append("FetchListEntry: " + fetchListEntry + "Fetch Result:\n" );
buffer.append("MD5Hash: " + md5Hash + "\n" );
- buffer.append("Status: " + status + "\n" );
+ buffer.append("ProtocolStatus: " + protocolStatus + "\n" );
buffer.append("FetchDate: " + new Date(fetchDate) + "\n" );
return buffer.toString();
}
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/IndexSegment.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/IndexSegment.java?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/IndexSegment.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/IndexSegment.java Fri Jun 3 12:11:51 2005
@@ -134,7 +134,7 @@
if (!sr.next(fetcherOutput, null, parseText, parseData)) continue;
// only index the page if it was fetched correctly
- if (fetcherOutput.getStatus() != FetcherOutput.SUCCESS) {
+ if (!fetcherOutput.getProtocolStatus().isSuccess()) {
continue;
}
Copied: lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/HTMLMetaTags.java (from r179837, lucene/nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java)
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/HTMLMetaTags.java?p2=lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/HTMLMetaTags.java&p1=lucene/nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java&r1=179837&r2=179858&rev=179858&view=diff
==============================================================================
(empty)
Propchange: lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/HTMLMetaTags.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/HtmlParseFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/HtmlParseFilter.java?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/HtmlParseFilter.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/HtmlParseFilter.java Fri Jun 3 12:11:51 2005
@@ -30,6 +30,5 @@
/** Adds metadata or otherwise modifies a parse of HTML content, given
* the DOM tree of a page. */
- Parse filter(Content content, Parse parse, DocumentFragment doc)
- throws ParseException;
+ Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc);
}
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/HtmlParseFilters.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/HtmlParseFilters.java?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/HtmlParseFilters.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/HtmlParseFilters.java Fri Jun 3 12:11:51 2005
@@ -45,11 +45,11 @@
private HtmlParseFilters() {} // no public ctor
/** Run all defined filters. */
- public static Parse filter(Content content,Parse parse,DocumentFragment doc)
- throws ParseException {
+ public static Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
for (int i = 0 ; i < CACHE.length; i++) {
- parse = CACHE[i].filter(content, parse, doc);
+ parse = CACHE[i].filter(content, parse, metaTags, doc);
+ if (!parse.getData().getStatus().isSuccess()) break;
}
return parse;
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/Parse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/Parse.java?rev=179858&r1=179857&r2=179858&view=diff
==============================================================================
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/Parse.java (original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/parse/Parse.java Fri Jun 3 12:11:51 2005
@@ -20,6 +20,7 @@
* @see Parser#getParse(FetcherOutput,Content)
*/
public interface Parse {
+
/** The textual content of the page. This is indexed, searched, and used when
* generating snippets.*/
String getText();