You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2007/03/08 00:37:24 UTC

svn commit: r515844 - in /lucene/nutch/trunk: ./ conf/ src/java/org/apache/nutch/metadata/ src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/ src/web/jsp/

Author: ab
Date: Wed Mar  7 15:37:21 2007
New Revision: 515844

URL: http://svn.apache.org/viewvc?view=rev&rev=515844
Log:
NUTCH-167 - Observation of robots "noarchive" directive.

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/conf/nutch-default.xml
    lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
    lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
    lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
    lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
    lucene/nutch/trunk/src/web/jsp/cached.jsp
    lucene/nutch/trunk/src/web/jsp/search.jsp

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=515844&r1=515843&r2=515844
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Mar  7 15:37:21 2007
@@ -151,7 +151,9 @@
 50. NUTCH-432 - Fix a bug where platform name with spaces would break the
     bin/nutch script. (Brian Whitman via ab)
 
-51. Upgrade to Hadoop 0.11.2 and Lucene 2.1.0 release.
+51. Upgrade to Hadoop 0.11.2 and Lucene 2.1.0 release. (ab)
+
+52. NUTCH-167 - Observation of robots "noarchive" directive. (ab)
 
 
 Release 0.8 - 2006-07-25

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?view=diff&rev=515844&r1=515843&r2=515844
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Wed Mar  7 15:37:21 2007
@@ -773,6 +773,17 @@
 </property>
 
 <property>
+  <name>parser.caching.forbidden.policy</name>
+  <value>content</value>
+  <description>If a site (or a page) requests through its robots meta tags
+  that it should not be shown as cached content, apply this policy. Currently
+  three keywords are recognized: "none" ignores any "noarchive" directives.
+  "content" doesn't show the content, but shows summaries (snippets).
+  "all" doesn't show either content or summaries.</description>
+</property>
+
+
+<property>
   <name>parser.html.impl</name>
   <value>neko</value>
   <description>HTML Parser implementation. Currently the following keywords

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java?view=diff&rev=515844&r1=515843&r2=515844
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java Wed Mar  7 15:37:21 2007
@@ -47,4 +47,16 @@
 
   public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(PROTO_STATUS_KEY);
 
+  /** Sites may request that search engines don't provide access to cached documents. */
+  public static final String CACHING_FORBIDDEN_KEY = "caching.forbidden";
+
+  /** Show both original forbidden content and summaries. */
+  public static final String CACHING_FORBIDDEN_NONE = "none";
+
+  /** Don't show either original forbidden content or summaries. */
+  public static final String CACHING_FORBIDDEN_ALL = "all";
+
+  /** Don't show original forbidden content, but show summaries. */
+  public static final String CACHING_FORBIDDEN_CONTENT = "content";
+
 }

Modified: lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?view=diff&rev=515844&r1=515843&r2=515844
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Wed Mar  7 15:37:21 2007
@@ -24,6 +24,7 @@
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 
+import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.parse.Parse;
 
 import org.apache.nutch.indexer.IndexingFilter;
@@ -89,6 +90,11 @@
     }
     // add title indexed and stored so that it can be displayed
     doc.add(new Field("title", title, Field.Store.YES, Field.Index.TOKENIZED));
+    // add cached content/summary display policy, if available
+    String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY);
+    if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
+      doc.add(new Field("cache", caching, Field.Store.YES, Field.Index.NO));
+    }
     
     // add timestamp when fetched, for deduplication
     doc.add(new Field("tstamp",

Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java?view=diff&rev=515844&r1=515843&r2=515844
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java Wed Mar  7 15:37:21 2007
@@ -105,6 +105,11 @@
                 if (index >= 0) {
                   metaTags.setNoFollow();
                 }
+                
+                index = directives.indexOf("noarchive");
+                if (index >= 0) {
+                  metaTags.setNoCache();
+                }
               } 
   
             } // end if (name == robots)

Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?view=diff&rev=515844&r1=515843&r2=515844
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Wed Mar  7 15:37:21 2007
@@ -33,6 +33,7 @@
 import org.apache.commons.logging.LogFactory;
 
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.Content;
 import org.apache.hadoop.conf.*;
@@ -100,7 +101,9 @@
   private DOMContentUtils utils;
 
   private HtmlParseFilters htmlParseFilters;
-
+  
+  private String cachingPolicy;
+  
   public Parse getParse(Content content) {
     HTMLMetaTags metaTags = new HTMLMetaTags();
 
@@ -202,10 +205,6 @@
       }
     }
     
-    if (!metaTags.getNoCache()) {             // okay to cache
-      // ??? FIXME ???
-    }
-    
     ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
     if (metaTags.getRefresh()) {
       status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
@@ -217,7 +216,11 @@
     Parse parse = new ParseImpl(text, parseData);
 
     // run filters on parse
-    return this.htmlParseFilters.filter(content, parse, metaTags, root);
+    parse = this.htmlParseFilters.filter(content, parse, metaTags, root);
+    if (metaTags.getNoCache()) {             // not okay to cache
+      parse.getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
+    }
+    return parse;
   }
 
   private DocumentFragment parse(InputSource input) throws Exception {
@@ -302,6 +305,8 @@
     this.defaultCharEncoding = getConf().get(
         "parser.character.encoding.default", "windows-1252");
     this.utils = new DOMContentUtils(conf);
+    this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
+        Nutch.CACHING_FORBIDDEN_CONTENT);
   }
 
   public Configuration getConf() {

Modified: lucene/nutch/trunk/src/web/jsp/cached.jsp
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/web/jsp/cached.jsp?view=diff&rev=515844&r1=515843&r2=515844
==============================================================================
--- lucene/nutch/trunk/src/web/jsp/cached.jsp (original)
+++ lucene/nutch/trunk/src/web/jsp/cached.jsp Wed Mar  7 15:37:21 2007
@@ -23,6 +23,7 @@
   import="org.apache.nutch.searcher.*"
   import="org.apache.nutch.parse.ParseData"
   import="org.apache.nutch.metadata.Metadata"
+  import="org.apache.nutch.metadata.Nutch"
   import="org.apache.hadoop.conf.Configuration"
   import="org.apache.nutch.util.NutchConfiguration"
 %><%
@@ -82,6 +83,17 @@
    FIXME: have to sanitize 'content' : e.g. removing unncessary part
         of head elememt
 -->
+<%
+   String caching = details.getValue("cache");
+   String url = details.getValue("url");
+   if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
+%>
+Display of this content was administratively prohibited by the webmaster.
+You may visit the original page instead: <a href="<%=url%>"><%=url%></a>.
+<%
+     return;
+   }
+%>
 <% if (contentType.startsWith("text/html")) {%>
 
 <% if (content != null && !content.equals("")) {%>

Modified: lucene/nutch/trunk/src/web/jsp/search.jsp
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/web/jsp/search.jsp?view=diff&rev=515844&r1=515843&r2=515844
==============================================================================
--- lucene/nutch/trunk/src/web/jsp/search.jsp (original)
+++ lucene/nutch/trunk/src/web/jsp/search.jsp Wed Mar  7 15:37:21 2007
@@ -24,6 +24,7 @@
   import="java.net.*"
 
   import="org.apache.nutch.html.Entities"
+  import="org.apache.nutch.metadata.Nutch"
   import="org.apache.nutch.searcher.*"
   import="org.apache.nutch.plugin.*"
   import="org.apache.nutch.clustering.*"
@@ -194,7 +195,6 @@
    Hit[] show = hits.getHits(start, realEnd-start);
    HitDetails[] details = bean.getDetails(show);
    Summary[] summaries = bean.getSummary(details, query);
-
    bean.LOG.info("total hits: " + hits.getTotal());
 %>
 
@@ -228,6 +228,13 @@
     String url = detail.getValue("url");
     String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo();
     String summary = summaries[i].toHtml(true);
+    String caching = detail.getValue("cache");
+    boolean showSummary = true;
+    boolean showCached = true;
+    if (caching != null) {
+      showSummary = !caching.equals(Nutch.CACHING_FORBIDDEN_ALL);
+      showCached = !caching.equals(Nutch.CACHING_FORBIDDEN_NONE);
+    }
 
     if (title == null || title.equals("")) {      // use url for docs w/o title
       title = url;
@@ -235,12 +242,16 @@
     %>
     <b><a href="<%=url%>"><%=Entities.encode(title)%></a></b>
     <%@ include file="more.jsp" %>
-    <% if (!"".equals(summary)) { %>
+    <% if (!"".equals(summary) && showSummary) { %>
     <br><%=summary%>
     <% } %>
     <br>
     <span class="url"><%=Entities.encode(url)%></span>
-    (<a href="../cached.jsp?<%=id%>"><i18n:message key="cached"/></a>)
+    <%
+      if (showCached) {
+        %>(<a href="../cached.jsp?<%=id%>"><i18n:message key="cached"/></a>) <%
+    }
+    %>
     (<a href="../explain.jsp?<%=id%>&query=<%=URLEncoder.encode(queryString, "UTF-8")%>&lang=<%=queryLang%>"><i18n:message key="explain"/></a>)
     (<a href="../anchors.jsp?<%=id%>"><i18n:message key="anchors"/></a>)
     <% if (hit.moreFromDupExcluded()) {