You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2007/03/08 00:37:24 UTC
svn commit: r515844 - in /lucene/nutch/trunk: ./ conf/
src/java/org/apache/nutch/metadata/
src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/ src/web/jsp/
Author: ab
Date: Wed Mar 7 15:37:21 2007
New Revision: 515844
URL: http://svn.apache.org/viewvc?view=rev&rev=515844
Log:
NUTCH-167 - Observation of robots "noarchive" directive.
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
lucene/nutch/trunk/src/web/jsp/cached.jsp
lucene/nutch/trunk/src/web/jsp/search.jsp
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=515844&r1=515843&r2=515844
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Mar 7 15:37:21 2007
@@ -151,7 +151,9 @@
50. NUTCH-432 - Fix a bug where platform name with spaces would break the
bin/nutch script. (Brian Whitman via ab)
-51. Upgrade to Hadoop 0.11.2 and Lucene 2.1.0 release.
+51. Upgrade to Hadoop 0.11.2 and Lucene 2.1.0 release. (ab)
+
+52. NUTCH-167 - Observation of robots "noarchive" directive. (ab)
Release 0.8 - 2006-07-25
Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?view=diff&rev=515844&r1=515843&r2=515844
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Wed Mar 7 15:37:21 2007
@@ -773,6 +773,17 @@
</property>
<property>
+ <name>parser.caching.forbidden.policy</name>
+ <value>content</value>
+ <description>If a site (or a page) requests through its robot metatags
+ that it should not be shown as cached content, apply this policy. Currently
+ three keywords are recognized: "none" ignores any "noarchive" directives.
+ "content" doesn't show the content, but shows summaries (snippets).
+ "all" doesn't show either content or summaries.</description>
+</property>
+
+
+<property>
<name>parser.html.impl</name>
<value>neko</value>
<description>HTML Parser implementation. Currently the following keywords
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java?view=diff&rev=515844&r1=515843&r2=515844
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java Wed Mar 7 15:37:21 2007
@@ -47,4 +47,16 @@
public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(PROTO_STATUS_KEY);
+ /** Sites may request that search engines don't provide access to cached documents. */
+ public static final String CACHING_FORBIDDEN_KEY = "caching.forbidden";
+
+ /** Show both original forbidden content and summaries (ignore "noarchive"). */
+ public static final String CACHING_FORBIDDEN_NONE = "none";
+
+ /** Don't show either original forbidden content or summaries. */
+ public static final String CACHING_FORBIDDEN_ALL = "all";
+
+ /** Don't show original forbidden content, but show summaries. */
+ public static final String CACHING_FORBIDDEN_CONTENT = "content";
+
}
Modified: lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?view=diff&rev=515844&r1=515843&r2=515844
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Wed Mar 7 15:37:21 2007
@@ -24,6 +24,7 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.indexer.IndexingFilter;
@@ -89,6 +90,11 @@
}
// add title indexed and stored so that it can be displayed
doc.add(new Field("title", title, Field.Store.YES, Field.Index.TOKENIZED));
+ // add cached content/summary display policy, if available
+ String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY);
+ if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
+ doc.add(new Field("cache", caching, Field.Store.YES, Field.Index.NO));
+ }
// add timestamp when fetched, for deduplication
doc.add(new Field("tstamp",
Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java?view=diff&rev=515844&r1=515843&r2=515844
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java Wed Mar 7 15:37:21 2007
@@ -105,6 +105,11 @@
if (index >= 0) {
metaTags.setNoFollow();
}
+
+ index = directives.indexOf("noarchive");
+ if (index >= 0) {
+ metaTags.setNoCache();
+ }
}
} // end if (name == robots)
Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?view=diff&rev=515844&r1=515843&r2=515844
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Wed Mar 7 15:37:21 2007
@@ -33,6 +33,7 @@
import org.apache.commons.logging.LogFactory;
import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
import org.apache.hadoop.conf.*;
@@ -100,7 +101,9 @@
private DOMContentUtils utils;
private HtmlParseFilters htmlParseFilters;
-
+
+ private String cachingPolicy;
+
public Parse getParse(Content content) {
HTMLMetaTags metaTags = new HTMLMetaTags();
@@ -202,10 +205,6 @@
}
}
- if (!metaTags.getNoCache()) { // okay to cache
- // ??? FIXME ???
- }
-
ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
if (metaTags.getRefresh()) {
status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
@@ -217,7 +216,11 @@
Parse parse = new ParseImpl(text, parseData);
// run filters on parse
- return this.htmlParseFilters.filter(content, parse, metaTags, root);
+ parse = this.htmlParseFilters.filter(content, parse, metaTags, root);
+ if (metaTags.getNoCache()) { // not okay to cache
+ parse.getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
+ }
+ return parse;
}
private DocumentFragment parse(InputSource input) throws Exception {
@@ -302,6 +305,8 @@
this.defaultCharEncoding = getConf().get(
"parser.character.encoding.default", "windows-1252");
this.utils = new DOMContentUtils(conf);
+ this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
+ Nutch.CACHING_FORBIDDEN_CONTENT);
}
public Configuration getConf() {
Modified: lucene/nutch/trunk/src/web/jsp/cached.jsp
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/web/jsp/cached.jsp?view=diff&rev=515844&r1=515843&r2=515844
==============================================================================
--- lucene/nutch/trunk/src/web/jsp/cached.jsp (original)
+++ lucene/nutch/trunk/src/web/jsp/cached.jsp Wed Mar 7 15:37:21 2007
@@ -23,6 +23,7 @@
import="org.apache.nutch.searcher.*"
import="org.apache.nutch.parse.ParseData"
import="org.apache.nutch.metadata.Metadata"
+ import="org.apache.nutch.metadata.Nutch"
import="org.apache.hadoop.conf.Configuration"
import="org.apache.nutch.util.NutchConfiguration"
%><%
@@ -82,6 +83,17 @@
FIXME: have to sanitize 'content' : e.g. removing unncessary part
of head elememt
-->
+<%
+ String caching = details.getValue("cache");
+ String url = details.getValue("url");
+ if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
+%>
+Display of this content was administratively prohibited by the webmaster.
+You may visit the original page instead: <a href="<%=url%>"><%=url%></a>.
+<%
+ return;
+ }
+%>
<% if (contentType.startsWith("text/html")) {%>
<% if (content != null && !content.equals("")) {%>
Modified: lucene/nutch/trunk/src/web/jsp/search.jsp
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/web/jsp/search.jsp?view=diff&rev=515844&r1=515843&r2=515844
==============================================================================
--- lucene/nutch/trunk/src/web/jsp/search.jsp (original)
+++ lucene/nutch/trunk/src/web/jsp/search.jsp Wed Mar 7 15:37:21 2007
@@ -24,6 +24,7 @@
import="java.net.*"
import="org.apache.nutch.html.Entities"
+ import="org.apache.nutch.metadata.Nutch"
import="org.apache.nutch.searcher.*"
import="org.apache.nutch.plugin.*"
import="org.apache.nutch.clustering.*"
@@ -194,7 +195,6 @@
Hit[] show = hits.getHits(start, realEnd-start);
HitDetails[] details = bean.getDetails(show);
Summary[] summaries = bean.getSummary(details, query);
-
bean.LOG.info("total hits: " + hits.getTotal());
%>
@@ -228,6 +228,13 @@
String url = detail.getValue("url");
String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo();
String summary = summaries[i].toHtml(true);
+ String caching = detail.getValue("cache");
+ boolean showSummary = true;
+ boolean showCached = true;
+ if (caching != null) { // "cache" field holds the stored caching-forbidden policy
+ showSummary = !caching.equals(Nutch.CACHING_FORBIDDEN_ALL);
+ showCached = caching.equals(Nutch.CACHING_FORBIDDEN_NONE); // hide link whenever cached.jsp would refuse the content
+ }
if (title == null || title.equals("")) { // use url for docs w/o title
title = url;
@@ -235,12 +242,16 @@
%>
<b><a href="<%=url%>"><%=Entities.encode(title)%></a></b>
<%@ include file="more.jsp" %>
- <% if (!"".equals(summary)) { %>
+ <% if (!"".equals(summary) && showSummary) { %>
<br><%=summary%>
<% } %>
<br>
<span class="url"><%=Entities.encode(url)%></span>
- (<a href="../cached.jsp?<%=id%>"><i18n:message key="cached"/></a>)
+ <%
+ if (showCached) {
+ %>(<a href="../cached.jsp?<%=id%>"><i18n:message key="cached"/></a>) <%
+ }
+ %>
(<a href="../explain.jsp?<%=id%>&query=<%=URLEncoder.encode(queryString, "UTF-8")%>&lang=<%=queryLang%>"><i18n:message key="explain"/></a>)
(<a href="../anchors.jsp?<%=id%>"><i18n:message key="anchors"/></a>)
<% if (hit.moreFromDupExcluded()) {