You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2005/06/02 00:20:05 UTC

svn commit: r179436 [1/3] - in /incubator/nutch/trunk: ./ conf/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/protocol/ src/java/org/apache/nutch/tools/ src/plugin/ src/plugin/creativecommons/src/java/org/creativecommons/nutch/ src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/ src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/ src/plugin/parse-html/ src/plugin/parse-html/lib/ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/ src/plugin/parse-html/src/test/org/apache/nutch/parse/html/ src/plugin/parse-js/ src/plugin/parse-js/src/ src/plugin/parse-js/src/java/ src/plugin/parse-js/src/java/org/ src/plugin/parse-js/src/java/org/apache/ src/plugin/parse-js/src/java/org/apache/nutch/ src/plugin/parse-js/src/java/org/apache/nutch/parse/ src/plugin/parse-js/src/java/org/apache/nutch/parse/js/ src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/ src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/ src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/ src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/ src/plugin/parse-text/src/java/org/apache/nutch/parse/text/ src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/ src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/ src/plugin/protocol-httpclient/ src/plugin/protocol-httpclient/lib/ src/plugin/protocol-httpclient/src/ src/plugin/protocol-httpclient/src/java/ src/plugin/protocol-httpclient/src/java/org/ src/plugin/protocol-httpclient/src/java/org/apache/ src/plugin/protocol-httpclient/src/java/org/apache/nutch/ src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/ src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/ src/test/org/apache/nutch/fetcher/ src/test/org/apache/nutch/parse/ 
src/test/org/apache/nutch/tools/

Author: ab
Date: Wed Jun  1 15:20:01 2005
New Revision: 179436

URL: http://svn.apache.org/viewcvs?rev=179436&view=rev
Log:
This patchset contains improvements to Fetcher, described in NUTCH-54,
specifically the following:

* protocol- and content-based redirection handling in Fetcher.

* parse-js: heuristic link extractor for JavaScript

* protocol-httpclient: HTTP and HTTPS protocol handler, based on
Jakarta Commons HttpClient library.

* alternative HTML parser based on TagSoup.

* improved status reporting for protocol and parse plugins. Status
information is persisted in segment data, so that other plugins can
use it.

* and other assorted fixes...

This work has been sponsored by EvaluMetrix LLC (http://www.evalumetrix.com).
Thank you!


Added:
    incubator/nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java   (with props)
    incubator/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java   (with props)
    incubator/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolOutput.java   (with props)
    incubator/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java   (with props)
    incubator/nutch/trunk/src/plugin/parse-html/lib/tagsoup-1.0rc3.jar   (with props)
    incubator/nutch/trunk/src/plugin/parse-html/lib/tagsoup.LICENSE.txt   (with props)
    incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java   (with props)
    incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java   (with props)
    incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java   (with props)
    incubator/nutch/trunk/src/plugin/parse-js/
    incubator/nutch/trunk/src/plugin/parse-js/build.xml   (with props)
    incubator/nutch/trunk/src/plugin/parse-js/plugin.xml   (with props)
    incubator/nutch/trunk/src/plugin/parse-js/src/
    incubator/nutch/trunk/src/plugin/parse-js/src/java/
    incubator/nutch/trunk/src/plugin/parse-js/src/java/org/
    incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/
    incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/
    incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html   (with props)
    incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/
    incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/
    incubator/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java   (with props)
    incubator/nutch/trunk/src/plugin/protocol-httpclient/
    incubator/nutch/trunk/src/plugin/protocol-httpclient/build.xml   (with props)
    incubator/nutch/trunk/src/plugin/protocol-httpclient/lib/
    incubator/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-codec.jar   (with props)
    incubator/nutch/trunk/src/plugin/protocol-httpclient/lib/commons-httpclient-3.0-rc2.jar   (with props)
    incubator/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml   (with props)
    incubator/nutch/trunk/src/plugin/protocol-httpclient/src/
    incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/
    incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/
    incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/
    incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/
    incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/
    incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/
    incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java   (with props)
    incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java   (with props)
    incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java   (with props)
    incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java   (with props)
    incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java   (with props)
    incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java   (with props)
    incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java   (with props)
    incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpError.java   (with props)
    incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpException.java   (with props)
    incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java   (with props)
    incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/MultiProperties.java   (with props)
    incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java   (with props)
    incubator/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package.html   (with props)
Removed:
    incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/RobotsMetaProcessor.java
Modified:
    incubator/nutch/trunk/build.xml
    incubator/nutch/trunk/conf/nutch-default.xml
    incubator/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
    incubator/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java
    incubator/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
    incubator/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java
    incubator/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java
    incubator/nutch/trunk/src/java/org/apache/nutch/parse/Parse.java
    incubator/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
    incubator/nutch/trunk/src/java/org/apache/nutch/parse/Parser.java
    incubator/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
    incubator/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java
    incubator/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java
    incubator/nutch/trunk/src/java/org/apache/nutch/tools/UpdateDatabaseTool.java
    incubator/nutch/trunk/src/plugin/build.xml
    incubator/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
    incubator/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
    incubator/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
    incubator/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
    incubator/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
    incubator/nutch/trunk/src/plugin/parse-html/plugin.xml
    incubator/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
    incubator/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
    incubator/nutch/trunk/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
    incubator/nutch/trunk/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
    incubator/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
    incubator/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
    incubator/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
    incubator/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
    incubator/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
    incubator/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
    incubator/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
    incubator/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcherOutput.java
    incubator/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java
    incubator/nutch/trunk/src/test/org/apache/nutch/tools/TestSegmentMergeTool.java

Modified: incubator/nutch/trunk/build.xml
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/build.xml?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/build.xml (original)
+++ incubator/nutch/trunk/build.xml Wed Jun  1 15:20:01 2005
@@ -209,7 +209,9 @@
     	<packageset dir="${plugins.dir}/protocol-file/src/java"/>
     	<packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
        	<packageset dir="${plugins.dir}/protocol-http/src/java"/>
+       	<packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
     	<packageset dir="${plugins.dir}/parse-html/src/java"/>
+    	<packageset dir="${plugins.dir}/parse-js/src/java"/>
     	<packageset dir="${plugins.dir}/parse-text/src/java"/>
     	<packageset dir="${plugins.dir}/parse-pdf/src/java"/>
 	<packageset dir="${plugins.dir}/parse-rtf/src/java"/>

Modified: incubator/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/conf/nutch-default.xml?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/conf/nutch-default.xml (original)
+++ incubator/nutch/trunk/conf/nutch-default.xml Wed Jun  1 15:20:01 2005
@@ -578,7 +578,7 @@
 
 <property>
   <name>plugin.includes</name>
-  <value>protocol-http|urlfilter-regex|parse-(text|html)|index-basic|query-(basic|site|url)</value>
+  <value>protocol-httpclient|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)</value>
   <description>Regular expression naming plugin directory names to
   include.  Any plugin not matching this expression is excluded.  By
   default Nutch includes crawling just HTML and plain text via HTTP,
@@ -599,6 +599,13 @@
   <description>The character encoding to fall back to when no other information
   is available</description>
 </property>
+
+<property>
+  <name>parser.html.impl</name>
+  <value>neko</value>
+  <description>HTML Parser implementation. Currently the following keywords
+  are recognized: "neko" uses NekoHTML, "tagsoup" uses TagSoup.
+  </description>
 
 <!-- urlfilter plugin properties -->
 

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Jun  1 15:20:01 2005
@@ -20,6 +20,7 @@
 import java.io.File;
 import java.util.Properties;
 
+import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.pagedb.FetchListEntry;
 import org.apache.nutch.io.*;
 import org.apache.nutch.db.*;
@@ -68,6 +69,10 @@
 
   private int threadCount =                       // max number of threads
     NutchConf.get().getInt("fetcher.threads.fetch", 10);
+  private static final float NEW_INJECTED_PAGE_SCORE =
+    NutchConf.get().getFloat("db.score.injected", 2.0f);
+  private static final int MAX_REDIRECT =
+    NutchConf.get().getInt("http.redirect.max", 3);
 
   // All threads (FetcherThread or thread started by it) belong to
   // group "fetcher". Each FetcherThread is named as "fetcherXX",
@@ -110,45 +115,84 @@
           if (!fle.getFetch()) {                  // should we fetch this page?
             if (LOG.isLoggable(Level.FINE))
               LOG.fine("not fetching " + url);
-            handleNoFetch(fle, FetcherOutput.SUCCESS);
+            handleNoFetch(fle, ProtocolStatus.STATUS_NOTFETCHING);
             continue;
           }
 
-          LOG.info("fetching " + url);            // fetch the page
-
-          Protocol protocol = ProtocolFactory.getProtocol(url);
-          Content content = protocol.getContent(url);
-
-          handleFetch(url, fle, content);
-
-          synchronized (Fetcher.this) {           // update status
-            pages++;
-            bytes += content.getContent().length;
-            if ((pages % 100) == 0) {             // show status every 100pp
-              status();
+          // support multiple redirects, if requested by protocol
+          // or content meta-tags (the latter requires running Fetcher
+          // in parsing mode). Protocol-level redirects take precedence over
+          // content-level redirects. Some plugins can handle redirects
+          // automatically, so that only the final success or failure will be
+          // shown here.
+          boolean refetch = false;
+          int redirCnt = 0;
+          do {
+            LOG.fine("redirCnt=" + redirCnt);
+            refetch = false;
+            LOG.info("fetching " + url);            // fetch the page
+            Protocol protocol = ProtocolFactory.getProtocol(url);
+            ProtocolOutput output = protocol.getProtocolOutput(fle);
+            ProtocolStatus pstat = output.getStatus();
+            Content content = output.getContent();
+            switch(pstat.getCode()) {
+              case ProtocolStatus.SUCCESS:
+                if (content != null) {
+                  synchronized (Fetcher.this) {           // update status
+                    pages++;
+                    bytes += content.getContent().length;
+                    if ((pages % 100) == 0) {             // show status every 100pp
+                      status();
+                    }
+                  }
+                  ParseStatus ps = handleFetch(url, fle, output);
+                  if (ps != null && ps.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
+                    url = ps.getMessage();
+                    url = URLFilters.filter(url);
+                    if (url != null) {
+                      refetch = true;
+                      redirCnt++;
+                      fle = new FetchListEntry(true, new Page(url, NEW_INJECTED_PAGE_SCORE), new String[0]);
+                      LOG.info(" - content redirect to " + url);
+                    }
+                  }
+                }
+                break;
+              case ProtocolStatus.MOVED: // try to redirect immediately
+              case ProtocolStatus.TEMP_MOVED: // try to redirect immediately
+                // record the redirect. perhaps the DB will want to know this.
+                handleNoFetch(fle, pstat);
+                url = pstat.getMessage();
+                if (url != null) {
+                  refetch = true;
+                  redirCnt++;
+                  // create new entry.
+                  fle = new FetchListEntry(true, new Page(url, NEW_INJECTED_PAGE_SCORE), new String[0]);
+                  LOG.info(" - protocol redirect to " + url);
+                }
+                break;
+              case ProtocolStatus.GONE:
+              case ProtocolStatus.NOT_FOUND:
+              case ProtocolStatus.ACCESS_DENIED:
+              case ProtocolStatus.ROBOTS_DENIED:
+              case ProtocolStatus.RETRY:
+              case ProtocolStatus.NOTMODIFIED:
+                handleNoFetch(fle, pstat);
+                break;
+              case ProtocolStatus.EXCEPTION:
+                logError(url, fle, new Exception(pstat.getMessage()));                // retry?
+                handleNoFetch(fle, pstat);
+              break;
+              default:
+                LOG.warning("Unknown ProtocolStatus: " + pstat.getCode());
+                handleNoFetch(fle, pstat);
             }
-          }
-        } catch (ResourceGone e) {                // don't retry
-          logError(url, fle, e);
-          handleNoFetch(fle, FetcherOutput.NOT_FOUND);
-
-        // dealt with in handleFetch() below
-        //} catch (ParseException e) {              // don't retry
-        //  logError(url, fle, e);
-        //  handleNoFetch(fle, FetcherOutput.CANT_PARSE);
-
-        } catch (RetryLater e) {                  // explicit retry
-          logError(url, fle, e);
-          handleNoFetch(fle, FetcherOutput.RETRY);
-
-        } catch (ProtocolException e) {           // implicit retry
-          logError(url, fle, e);
-          handleNoFetch(fle, FetcherOutput.RETRY);
+          } while (refetch && (redirCnt < MAX_REDIRECT));
 
         } catch (Throwable t) {                   // an unchecked exception
           if (fle != null) {
             logError(url, fle, t);                // retry?
-            handleNoFetch(fle, FetcherOutput.RETRY);
+            handleNoFetch(fle, new ProtocolStatus(t));
           }
         }
       }
@@ -176,36 +220,44 @@
       }
     }
 
-    private void handleFetch(String url, FetchListEntry fle, Content content) {
+    private ParseStatus handleFetch(String url, FetchListEntry fle, ProtocolOutput output) {
+      Content content = output.getContent();
+      ProtocolStatus protocolStatus = output.getStatus();
       if (!Fetcher.this.parsing) {
         outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
-                                    FetcherOutput.SUCCESS),
+                protocolStatus),
                 content, null, null);
-        return;
+        return null;
       }
 
-      try {
         String contentType = content.getContentType();
-        Parser parser = ParserFactory.getParser(contentType, url);
-        Parse parse = parser.getParse(content);
-        outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
-                                    FetcherOutput.SUCCESS),
-                content, new ParseText(parse.getText()), parse.getData());
-      } catch (ParseException e) {
-        // 20041026, xing
-        // If fetching succeeds, but parsing fails, content should be saved
-        // so that we can try to parse again in separate pass, possibly
-        // using better/alternative parser.
-        LOG.info("fetch okay, but can't parse " + url + ", reason: "
-          + e.getMessage());
-        outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
-                                    FetcherOutput.CANT_PARSE),
-                content, new ParseText(""),
-                new ParseData("", new Outlink[0], new Properties()));
-      }
+        Parser parser = null;
+        Parse parse = null;
+        ParseStatus status = null;
+        try {
+          parser = ParserFactory.getParser(contentType, url);
+          parse = parser.getParse(content);
+          status = parse.getData().getStatus();
+        } catch (Exception e) {
+          e.printStackTrace();
+          status = new ParseStatus(e);
+        }
+        if (status.isSuccess()) {
+          outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
+                  protocolStatus),
+                  content, new ParseText(parse.getText()), parse.getData());
+        } else {
+          LOG.info("fetch okay, but can't parse " + url + ", reason: "
+                  + status.toString());
+          outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
+                  protocolStatus),
+                  content, new ParseText(""),
+                  new ParseData(status, "", new Outlink[0], new Properties()));
+        }
+        return status;
     }
 
-    private void handleNoFetch(FetchListEntry fle, int status) {
+    private void handleNoFetch(FetchListEntry fle, ProtocolStatus status) {
       String url = fle.getPage().getURL().toString();
       MD5Hash hash = MD5Hash.digest(url);
 
@@ -213,7 +265,7 @@
         outputPage(new FetcherOutput(fle, hash, status),
                    new Content(url, url, new byte[0], "", new Properties()),
                    new ParseText(""),
-                   new ParseData("", new Outlink[0], new Properties()));
+                   new ParseData(ParseStatus.STATUS_NOTPARSED, "", new Outlink[0], new Properties()));
       } else {
         outputPage(new FetcherOutput(fle, hash, status),
                    new Content(url, url, new byte[0], "", new Properties()),
@@ -234,6 +286,7 @@
         }
       } catch (Throwable t) {
         LOG.severe("error writing output:" + t.toString());
+        t.printStackTrace();
       }
     }
                                        
@@ -429,7 +482,7 @@
     }
 
     // set log level
-    fetcher.setLogLevel(Level.parse(logLevel.toUpperCase()));
+    setLogLevel(Level.parse(logLevel.toUpperCase()));
 
     if (showThreadID) {
       LogFormatter.setShowThreadIDs(showThreadID);

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java Wed Jun  1 15:20:01 2005
@@ -26,6 +26,8 @@
 import org.apache.nutch.pagedb.FetchListEntry;
 import org.apache.nutch.tools.UpdateDatabaseTool;
 import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.protocol.ProtocolStatus;
 
 /*********************************************
  * An entry in the fetcher's output.  This includes all of the fetcher output
@@ -50,25 +52,34 @@
   public static final String DONE_NAME = "fetcher.done";
   public static final String ERROR_NAME = "fetcher.error";
 
-  private final static byte VERSION = 4;
+  private final static byte VERSION = 5;
 
-  public final static byte RETRY = 0;
-  public final static byte SUCCESS = 1;
-  public final static byte NOT_FOUND = 2;
-  public final static byte CANT_PARSE = 4; // fetched, but can't be parsed
+  // backwards compatibility codes
+  private final static byte RETRY = 0;
+  private final static byte SUCCESS = 1;
+  private final static byte NOT_FOUND = 2;
+  private final static byte CANT_PARSE = 4; // fetched, but can't be parsed
+  
+  private static final byte[] oldToNewMap = {
+          ProtocolStatus.RETRY,
+          ProtocolStatus.SUCCESS,
+          ProtocolStatus.NOT_FOUND,
+          ProtocolStatus.FAILED,
+          ProtocolStatus.RETRY
+  };
 
   private FetchListEntry fetchListEntry;
   private MD5Hash md5Hash;
-  private int status;
+  private ProtocolStatus protocolStatus;
   private long fetchDate;
 
   public FetcherOutput() {}
 
   public FetcherOutput(FetchListEntry fetchListEntry,
-                       MD5Hash md5Hash, int status) {
+                       MD5Hash md5Hash, ProtocolStatus protocolStatus) {
     this.fetchListEntry = fetchListEntry;
     this.md5Hash = md5Hash;
-    this.status = status;
+    this.protocolStatus = protocolStatus;
     this.fetchDate = System.currentTimeMillis();
   }
 
@@ -78,7 +89,12 @@
     byte version = in.readByte();                 // read version
     fetchListEntry = FetchListEntry.read(in);
     md5Hash = MD5Hash.read(in);
-    status = in.readByte();
+    if (version < 5) {
+      int status = in.readByte();
+      protocolStatus = new ProtocolStatus(oldToNewMap[status]);
+    } else {
+      protocolStatus = ProtocolStatus.read(in);
+    }
 
     if (version < 4) {
       UTF8.readString(in);                        // read & ignore title
@@ -95,7 +111,7 @@
     out.writeByte(VERSION);                       // store current version
     fetchListEntry.write(out);
     md5Hash.write(out);
-    out.writeByte(status);
+    protocolStatus.write(out);
     out.writeLong(fetchDate);
   }
 
@@ -110,8 +126,8 @@
   //
   public FetchListEntry getFetchListEntry() { return fetchListEntry; }
   public MD5Hash getMD5Hash() { return md5Hash; }
-  public int getStatus() { return status; }
-  public void setStatus(int status) { this.status = status; }
+  public ProtocolStatus getProtocolStatus() { return protocolStatus; }
+  public void setProtocolStatus(ProtocolStatus protocolStatus) { this.protocolStatus = protocolStatus; }
   public long getFetchDate() { return fetchDate; }
   public void setFetchDate(long fetchDate) { this.fetchDate = fetchDate; }
 
@@ -126,7 +142,7 @@
     return
       this.fetchListEntry.equals(other.fetchListEntry) &&
       this.md5Hash.equals(other.md5Hash) &&
-      (this.status == other.status);
+      this.protocolStatus.equals(other.protocolStatus);
   }
 
 
@@ -134,7 +150,7 @@
     StringBuffer buffer = new StringBuffer();
     buffer.append("FetchListEntry: " + fetchListEntry + "Fetch Result:\n" );
     buffer.append("MD5Hash: " + md5Hash + "\n" );
-    buffer.append("Status: " + status + "\n" );
+    buffer.append("ProtocolStatus: " + protocolStatus + "\n" );
     buffer.append("FetchDate: " + new Date(fetchDate) + "\n" );
     return buffer.toString();
   }

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java Wed Jun  1 15:20:01 2005
@@ -134,7 +134,7 @@
             if (!sr.next(fetcherOutput, null, parseText, parseData)) continue;
 
               // only index the page if it was fetched correctly
-              if (fetcherOutput.getStatus() != FetcherOutput.SUCCESS) {
+              if (!fetcherOutput.getProtocolStatus().isSuccess()) {
                   continue;                              
               }
 

Added: incubator/nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java (added)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java Wed Jun  1 15:20:01 2005
@@ -0,0 +1,203 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.net.URL;
+import java.util.Iterator;
+import java.util.Properties;
+
+/**
+ * This class holds the information about HTML "meta" tags extracted from 
+ * a page. Some special tags have convenience methods for easy checking.
+ */
+public class HTMLMetaTags {
+  private boolean noIndex = false;
+
+  private boolean noFollow = false;
+
+  private boolean noCache = false;
+
+  private URL baseHref = null;
+
+  private boolean refresh = false;
+
+  private int refreshTime = 0;
+
+  private URL refreshHref = null;
+
+  private Properties generalTags = new Properties();
+
+  private Properties httpEquivTags = new Properties();
+
+  /**
+   * Sets all boolean values to <code>false</code>. Clears all other tags.
+   */
+  public void reset() {
+    noIndex = false;
+    noFollow = false;
+    noCache = false;
+    refresh = false;
+    refreshTime = 0;
+    baseHref = null;
+    refreshHref = null;
+    generalTags.clear();
+    httpEquivTags.clear();
+  }
+
+  /**
+   * Sets <code>noFollow</code> to <code>true</code>.
+   */
+  public void setNoFollow() {
+    noFollow = true;
+  }
+
+  /**
+   * Sets <code>noIndex</code> to <code>true</code>.
+   */
+  public void setNoIndex() {
+    noIndex = true;
+  }
+
+  /**
+   * Sets <code>noCache</code> to <code>true</code>.
+   */
+  public void setNoCache() {
+    noCache = true;
+  }
+
+  /**
+   * Sets <code>refresh</code> to the supplied value.
+   */
+  public void setRefresh(boolean refresh) {
+    this.refresh = refresh;
+  }
+
+  /**
+   * Sets the <code>baseHref</code>.
+   */
+  public void setBaseHref(URL baseHref) {
+    this.baseHref = baseHref;
+  }
+
+  /**
+   * Sets the <code>refreshHref</code>.
+   */
+  public void setRefreshHref(URL refreshHref) {
+    this.refreshHref = refreshHref;
+  }
+
+  /**
+   * Sets the <code>refreshTime</code>.
+   */
+  public void setRefreshTime(int refreshTime) {
+    this.refreshTime = refreshTime;
+  }
+
+  /**
+   * A convenience method. Returns the current value of <code>noIndex</code>.
+   */
+  public boolean getNoIndex() {
+    return noIndex;
+  }
+
+  /**
+   * A convenience method. Returns the current value of <code>noFollow</code>.
+   */
+  public boolean getNoFollow() {
+    return noFollow;
+  }
+
+  /**
+   * A convenience method. Returns the current value of <code>noCache</code>.
+   */
+  public boolean getNoCache() {
+    return noCache;
+  }
+
+  /**
+   * A convenience method. Returns the current value of <code>refresh</code>.
+   */
+  public boolean getRefresh() {
+    return refresh;
+  }
+
+  /**
+   * A convenience method. Returns the <code>baseHref</code>, if set, or
+   * <code>null</code> otherwise.
+   */
+  public URL getBaseHref() {
+    return baseHref;
+  }
+
+  /**
+   * A convenience method. Returns the <code>refreshHref</code>, if set, or
+   * <code>null</code> otherwise. The value may be invalid if
+   * {@link #getRefresh()} returns <code>false</code>.
+   */
+  public URL getRefreshHref() {
+    return refreshHref;
+  }
+
+  /**
+   * A convenience method. Returns the current value of <code>refreshTime</code>.
+   * The value may be invalid if {@link #getRefresh()} returns
+   * <code>false</code>.
+   */
+  public int getRefreshTime() {
+    return refreshTime;
+  }
+
+  /**
+   * Returns all collected values of the general meta tags. Property names are
+   * tag names, property values are "content" values.
+   */
+  public Properties getGeneralTags() {
+    return generalTags;
+  }
+
+  /**
+   * Returns all collected values of the "http-equiv" meta tags. Property names
+   * are tag names, property values are "content" values.
+   */
+  public Properties getHttpEquivTags() {
+    return httpEquivTags;
+  }
+  
+  public String toString() {
+    StringBuffer sb = new StringBuffer();
+    sb.append("base=" + baseHref
+            + ", noCache=" + noCache
+            + ", noFollow=" + noFollow
+            + ", noIndex=" + noIndex
+            + ", refresh=" + refresh
+            + ", refreshHref=" + refreshHref + "\n"
+            );
+    sb.append(" * general tags:\n");
+    Iterator it = generalTags.keySet().iterator();
+    while (it.hasNext()) {
+      String key = (String)it.next();
+      sb.append("   - " + key + "\t=\t" + generalTags.get(key) + "\n");
+    }
+    sb.append(" * http-equiv tags:\n");
+    it = httpEquivTags.keySet().iterator();
+    while (it.hasNext()) {
+      String key = (String)it.next();
+      sb.append("   - " + key + "\t=\t" + httpEquivTags.get(key) + "\n");
+    }
+    return sb.toString();
+  }
+}

Propchange: incubator/nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java Wed Jun  1 15:20:01 2005
@@ -30,6 +30,5 @@
 
   /** Adds metadata or otherwise modifies a parse of HTML content, given
    * the DOM tree of a page. */
-  Parse filter(Content content, Parse parse, DocumentFragment doc)
-    throws ParseException;
+  Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc);
 }

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java Wed Jun  1 15:20:01 2005
@@ -45,11 +45,11 @@
   private  HtmlParseFilters() {}                  // no public ctor
 
   /** Run all defined filters. */
-  public static Parse filter(Content content,Parse parse,DocumentFragment doc)
-    throws ParseException {
+  public static Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
 
     for (int i = 0 ; i < CACHE.length; i++) {
-      parse = CACHE[i].filter(content, parse, doc);
+      parse = CACHE[i].filter(content, parse, metaTags, doc);
+      if (!parse.getData().getStatus().isSuccess()) break;
     }
 
     return parse;

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/parse/Parse.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/parse/Parse.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/parse/Parse.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/parse/Parse.java Wed Jun  1 15:20:01 2005
@@ -20,6 +20,7 @@
  * @see Parser#getParse(FetcherOutput,Content)
  */
 public interface Parse {
+  
   /** The textual content of the page. This is indexed, searched, and used when
    * generating snippets.*/ 
   String getText();

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java Wed Jun  1 15:20:01 2005
@@ -21,7 +21,6 @@
 
 import org.apache.nutch.io.*;
 import org.apache.nutch.fs.*;
-import org.apache.nutch.util.*;
 import org.apache.nutch.tools.UpdateDatabaseTool;
 
 
@@ -31,15 +30,17 @@
 public final class ParseData extends VersionedWritable {
   public static final String DIR_NAME = "parse_data";
 
-  private final static byte VERSION = 1;
+  private final static byte VERSION = 2;
 
   private String title;
   private Outlink[] outlinks;
   private Properties metadata;
+  private ParseStatus status;
 
   public ParseData() {}
 
-  public ParseData(String title, Outlink[] outlinks, Properties metadata) {
+  public ParseData(ParseStatus status, String title, Outlink[] outlinks, Properties metadata) {
+    this.status = status;
     this.title = title;
     this.outlinks = outlinks;
     this.metadata = metadata;
@@ -49,6 +50,9 @@
   // Accessor methods
   //
 
+  /** The status of parsing the page. */
+  public ParseStatus getStatus() { return status; }
+  
   /** The title of the page. */
   public String getTitle() { return title; }
 
@@ -70,8 +74,12 @@
   public byte getVersion() { return VERSION; }
 
   public final void readFields(DataInput in) throws IOException {
-    super.readFields(in);                         // check version
 
+    byte version = in.readByte();
+    if (version > 1)
+      status = ParseStatus.read(in);
+    else
+      status = ParseStatus.STATUS_SUCCESS;
     title = UTF8.readString(in);                   // read title
 
     int totalOutlinks = in.readInt();             // read outlinks
@@ -94,8 +102,8 @@
   }
 
   public final void write(DataOutput out) throws IOException {
-    super.write(out);                             // write version
-
+    out.writeByte(VERSION);                             // write version
+    status.write(out);                       // write status
     UTF8.writeString(out, title);                 // write title
 
     out.writeInt(outlinks.length);                // write outlinks
@@ -127,6 +135,7 @@
       return false;
     ParseData other = (ParseData)o;
     return
+      this.status.equals(other.status) &&
       this.title.equals(other.title) &&
       Arrays.equals(this.outlinks, other.outlinks) &&
       this.metadata.equals(other.metadata);
@@ -135,6 +144,7 @@
   public String toString() {
     StringBuffer buffer = new StringBuffer();
 
+    buffer.append("Status: " + status + "\n" );
     buffer.append("Title: " + title + "\n" );
 
     buffer.append("Outlinks: " + outlinks.length + "\n" );

Added: incubator/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java (added)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java Wed Jun  1 15:20:01 2005
@@ -0,0 +1,234 @@
+/*
+ * Created on Apr 28, 2005
+ * Author: Andrzej Bialecki &lt;ab@getopt.org&gt;
+ *
+ */
+package org.apache.nutch.parse;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Properties;
+
+import org.apache.nutch.io.Writable;
+import org.apache.nutch.io.WritableUtils;
+
+/**
+ * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
+ */
+public class ParseStatus implements Writable {
+  
+  // Primary status codes:
+  
+  /** Parsing was not performed. */
+  public static final byte NOTPARSED       = 0;
+  /** Parsing succeeded. */
+  public static final byte SUCCESS         = 1;
+  /** General failure. There may be a more specific error message in arguments. */
+  public static final byte FAILED          = 2;
+  
+  public static final String[] majorCodes = {
+          "notparsed",
+          "success",
+          "failed"
+  };
+  
+  // Secondary success codes go here:
+  
+  /** Parsed content contains a directive to redirect to another URL.
+   * The target URL can be retrieved from the arguments.
+   */
+  public static final short SUCCESS_REDIRECT          = 100;
+  
+  // Secondary failure codes go here:
+  
+  /** Parsing failed. An Exception occurred (which may be retrieved from the arguments). */
+  public static final short FAILED_EXCEPTION          = 200;
+  /** Parsing failed. The content was truncated. */
+  public static final short FAILED_TRUNCATED          = 202;
+  /** Parsing failed. Invalid format - the content may be corrupted or of wrong type. */
+  public static final short FAILED_INVALID_FORMAT     = 203;
+  /** Parsing failed. Other related parts of the content are needed to complete
+   * parsing. The list of URLs to missing parts may be provided in arguments.
+   * The Fetcher may decide to fetch these parts at once, then put them into
+   * Content.metadata, and supply them for re-parsing.
+   */
+  public static final short FAILED_MISSING_PARTS      = 204;
+  /** Parsing failed. There was no content to be parsed - probably caused
+   * by errors at protocol stage.
+   */
+  public static final short FAILED_MISSING_CONTENT    = 205;
+
+
+  public static final ParseStatus STATUS_NOTPARSED = new ParseStatus(NOTPARSED);
+  public static final ParseStatus STATUS_SUCCESS = new ParseStatus(SUCCESS);
+  public static final ParseStatus STATUS_FAILURE = new ParseStatus(FAILED);
+  
+  private byte majorCode = 0;
+  private short minorCode = 0;
+  private String[] args = null;
+  
+  protected ParseStatus() {
+    
+  }
+  
+  public ParseStatus(int majorCode, int minorCode, String[] args) {
+    this.args = args;
+    this.majorCode = (byte)majorCode;
+    this.minorCode = (short)minorCode;
+  }
+  
+  public ParseStatus(int majorCode) {
+    this(majorCode, 0, (String[])null);
+  }
+  
+  public ParseStatus(int majorCode, String[] args) {
+    this(majorCode, 0, args);
+  }
+  
+  public ParseStatus(int majorCode, int minorCode) {
+    this(majorCode, minorCode, (String[])null);
+  }
+  
+  /** Simplified constructor for passing just a text message. */
+  public ParseStatus(int majorCode, int minorCode, String message) {
+    this(majorCode, minorCode, new String[]{message});
+  }
+  
+  /** Simplified constructor for passing just a text message. */
+  public ParseStatus(int majorCode, String message) {
+    this(majorCode, 0, new String[]{message});
+  }
+  
+  public ParseStatus(Throwable t) {
+    this(FAILED, FAILED_EXCEPTION, new String[]{t.toString()});
+  }
+  
+  public static ParseStatus read(DataInput in) throws IOException {
+    ParseStatus res = new ParseStatus();
+    res.readFields(in);
+    return res;
+  }
+  
+  public void readFields(DataInput in) throws IOException {
+    majorCode = in.readByte();
+    minorCode = in.readShort();
+    args = WritableUtils.readCompressedStringArray(in);
+  }
+  
+  public void write(DataOutput out) throws IOException {
+    out.writeByte(majorCode);
+    out.writeShort(minorCode);
+    WritableUtils.writeCompressedStringArray(out, args);
+  }
+  
+  /** A convenience method. Returns true if majorCode is SUCCESS, false
+   * otherwise.
+   */
+  
+  public boolean isSuccess() {
+    return majorCode == SUCCESS;
+  }
+  
+  /** A convenience method. Return a String representation of the first
+   * argument, or null.
+   */
+  public String getMessage() {
+    if (args != null && args.length > 0 && args[0] != null)
+      return args[0].toString();
+    return null;
+  }
+  
+  public String[] getArgs() {
+    return args;
+  }
+  
+  public int getMajorCode() {
+    return majorCode;
+  }
+  
+  public int getMinorCode() {
+    return minorCode;
+  }
+  
+  /** A convenience method. Creates an empty Parse instance,
+   * which returns this status.
+   */
+  public Parse getEmptyParse() {
+    return new EmptyParseImpl(this);
+  }
+  
+  /** Renders "name(major,minor)" followed by the arguments, if any. A major
+   * code outside {@link #majorCodes} is printed as "unknown" instead of
+   * failing with an ArrayIndexOutOfBoundsException.
+   */
+  public String toString() {
+    // majorCode is a raw byte: a narrowing cast in the constructor can leave it out of range
+    String name = (majorCode >= 0 && majorCode < majorCodes.length) ? majorCodes[majorCode] : "unknown";
+    StringBuffer res = new StringBuffer(name + "(" + majorCode + "," + minorCode + ")");
+    if (args == null) return res.toString();
+    if (args.length == 1) return res.append(": " + String.valueOf(args[0])).toString();
+    for (int i = 0; i < args.length; i++) {
+      if (args[i] != null) res.append(", args[" + i + "]=" + String.valueOf(args[i]));
+    }
+    return res.toString();
+  }
+  
+  public void setArgs(String[] args) {
+    this.args = args;
+  }
+  
+  public void setMessage(String msg) {
+    if (args == null || args.length == 0) {
+      args = new String[1];
+    }
+    args[0] = msg;
+  }
+  
+  public void setMajorCode(byte majorCode) {
+    this.majorCode = majorCode;
+  }
+
+  public void setMinorCode(short minorCode) {
+    this.minorCode = minorCode;
+  }
+  
+  public boolean equals(Object o) {
+    if (o == null) return false;
+    if (!(o instanceof ParseStatus)) return false;
+    boolean res = true;
+    ParseStatus other = (ParseStatus)o;
+    res = res && (this.majorCode == other.majorCode) &&
+      (this.minorCode == other.minorCode);
+    if (!res) return res;
+    if (this.args == null) {
+      if (other.args == null) return true;
+      else return false;
+    } else {
+      if (other.args == null) return false;
+      if (other.args.length != this.args.length) return false;
+      for (int i = 0; i < this.args.length; i++) {
+        if (!this.args[i].equals(other.args[i])) return false;
+      }
+    }
+    return true;
+  }
+}
+
+class EmptyParseImpl implements Parse {
+  
+  private ParseData data = null;
+  
+  public EmptyParseImpl(ParseStatus status) {
+    data = new ParseData(status, "", new Outlink[0], new Properties());
+  }
+  
+  public ParseData getData() {
+    return data;
+  }
+
+  public String getText() {
+    return "";
+  }
+}
+

Propchange: incubator/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/parse/Parser.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/parse/Parser.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/parse/Parser.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/parse/Parser.java Wed Jun  1 15:20:01 2005
@@ -27,5 +27,5 @@
   public final static String X_POINT_ID = Parser.class.getName();
 
   /** Creates the parse for some content. */
-  Parse getParse(Content c) throws ParseException;
+  Parse getParse(Content c);
 }

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Wed Jun  1 15:20:01 2005
@@ -67,7 +67,7 @@
     LOG.info("fetching: "+url);
 
     Protocol protocol = ProtocolFactory.getProtocol(url);
-    Content content = protocol.getContent(url);
+    Content content = protocol.getProtocolOutput(url).getContent();
 
     if (force) {
       content.setContentType(contentType);

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java Wed Jun  1 15:20:01 2005
@@ -18,13 +18,21 @@
 
 import java.io.IOException;
 
+import org.apache.nutch.pagedb.FetchListEntry;
+
 /** A retriever of url content.  Implemented by protocol extensions. */
 public interface Protocol {
   /** The name of the extension point. */
   public final static String X_POINT_ID = Protocol.class.getName();
 
-  /** Returns the {@link Content} for a url.
+  /** Returns the {@link ProtocolOutput} for a url. This method may be
+   * more limited than {@link #getProtocolOutput(FetchListEntry)}.
+   * No checked exception is declared; failures are presumably reported via {@link ProtocolOutput#getStatus()}.
+   */
+  ProtocolOutput getProtocolOutput(String url);
+
+  /** Returns the {@link ProtocolOutput} for a fetchlist entry.
    * @throws IOException for any errors.
    */
-  Content getContent(String url) throws ProtocolException;
+  ProtocolOutput getProtocolOutput(FetchListEntry fle);
 }

Added: incubator/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolOutput.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolOutput.java?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolOutput.java (added)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolOutput.java Wed Jun  1 15:20:01 2005
@@ -0,0 +1,53 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol;
+
+/**
+ * Simple aggregate to pass from protocol plugins both content and
+ * protocol status.
+ * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
+ */
+public class ProtocolOutput {
+  private Content content;
+  private ProtocolStatus status;
+
+  public ProtocolOutput(Content content, ProtocolStatus status) {
+    this.content = content;
+    this.status = status;
+  }
+  
+  public ProtocolOutput(Content content) {
+    this.content = content;
+    this.status = ProtocolStatus.STATUS_SUCCESS;
+  }
+  
+  public Content getContent() {
+    return content;
+  }
+
+  public void setContent(Content content) {
+    this.content = content;
+  }
+
+  public ProtocolStatus getStatus() {
+    return status;
+  }
+
+  public void setStatus(ProtocolStatus status) {
+    this.status = status;
+  }
+}

Propchange: incubator/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolOutput.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java (added)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java Wed Jun  1 15:20:01 2005
@@ -0,0 +1,159 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.nutch.io.Writable;
+import org.apache.nutch.io.WritableUtils;
+import org.apache.nutch.parse.ParseStatus;
+
+/**
+ * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
+ */
+public class ProtocolStatus implements Writable {
+  
+  /** Content was retrieved without errors. */
+  public static final int SUCCESS              = 1;
+  /** Content was not retrieved. Any further errors may be indicated in args. */
+  public static final int FAILED               = 2;
+  
+  /** This protocol was not found.  Application may attempt to retry later. */
+  public static final int PROTO_NOT_FOUND      = 10;
+  /** Resource is gone. */
+  public static final int GONE                 = 11;
+  /** Resource has moved permanently. New url should be found in args. */
+  public static final int MOVED                = 12;
+  /** Resource has moved temporarily. New url should be found in args. */
+  public static final int TEMP_MOVED           = 13;
+  /** Resource was not found. */
+  public static final int NOT_FOUND            = 14;
+  /** Temporary failure. Application may retry immediately. */
+  public static final int RETRY                = 15;
+  /** Unspecified exception occurred. Further information may be provided in args. */
+  public static final int EXCEPTION            = 16;
+  /** Access denied - authorization required, but missing/incorrect. */
+  public static final int ACCESS_DENIED        = 17;
+  /** Access denied by robots.txt rules. */
+  public static final int ROBOTS_DENIED        = 18;
+  /** Too many redirects. */
+  public static final int REDIR_EXCEED         = 19;
+  /** Not fetching. */
+  public static final int NOTFETCHING          = 20;
+  /** Unchanged since the last fetch. */
+  public static final int NOTMODIFIED          = 21;
+  
+  
+  public static final ProtocolStatus STATUS_SUCCESS = new ProtocolStatus(SUCCESS);
+  public static final ProtocolStatus STATUS_NOTFETCHING = new ProtocolStatus(NOTFETCHING);
+  public static final ProtocolStatus STATUS_FAILED = new ProtocolStatus(FAILED);
+  public static final ProtocolStatus STATUS_NOTMODIFIED = new ProtocolStatus(NOTMODIFIED);
+  
+  private int code;
+  private String[] args;
+  
+  protected ProtocolStatus() {
+    
+  }
+
+  public ProtocolStatus(int code, String[] args) {
+    this.code = code;
+    this.args = args;
+  }
+  
+  public ProtocolStatus(int code) {
+    this(code, null);
+  }
+  
+  public ProtocolStatus(int code, Object message) {
+    this.code = code;
+    this.args = new String[]{String.valueOf(message)};
+  }
+  
+  public ProtocolStatus(Throwable t) {
+    this(EXCEPTION, t);
+  }
+
+  public static ProtocolStatus read(DataInput in) throws IOException {
+    ProtocolStatus res = new ProtocolStatus();
+    res.readFields(in);
+    return res;
+  }
+  
+  public void readFields(DataInput in) throws IOException {
+    code = in.readByte();
+    args = WritableUtils.readCompressedStringArray(in);
+  }
+  
+  public void write(DataOutput out) throws IOException {
+    out.writeByte((byte)code);
+    WritableUtils.writeCompressedStringArray(out, args);
+  }
+
+  public String[] getArgs() {
+    return args;
+  }
+
+  public int getCode() {
+    return code;
+  }
+  
+  public boolean isSuccess() {
+    return code == SUCCESS; 
+  }
+  
+  public String getMessage() {
+    if (args != null && args.length > 0) return args[0];
+    return null;
+  }
+  
+  public boolean equals(Object o) {
+    if (o == null) return false;
+    if (!(o instanceof ProtocolStatus)) return false;
+    ProtocolStatus other = (ProtocolStatus)o;
+    if (this.code != other.code) return false;
+    if (this.args == null) {
+      if (other.args == null) return true;
+      else return false;
+    } else {
+      if (other.args == null) return false;
+      if (other.args.length != this.args.length) return false;
+      for (int i = 0; i < this.args.length; i++) {
+        if (!this.args[i].equals(other.args[i])) return false;
+      }
+    }
+    return true;
+  }
+  
+  public String toString() {
+    StringBuffer res = new StringBuffer();
+    res.append("(" + code + ")");
+    if (args != null) {
+      if (args.length == 1) {
+        res.append(": " + String.valueOf(args[0]));
+      } else {
+        for (int i = 0; i < args.length; i++) {
+          if (args[i] != null)
+            res.append(", args[" + i + "]=" + String.valueOf(args[i]));
+        }
+      }
+    }
+    return res.toString();
+  }
+}

Propchange: incubator/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolStatus.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/tools/ParseSegment.java Wed Jun  1 15:20:01 2005
@@ -178,14 +178,15 @@
           // safe guard against mismatched files
           if (!url.equals(content.getUrl())) {
             LOG.severe("Mismatched entries under "
-              + FetcherOutput.DIR_NAME_NP + " and " + Content.DIR_NAME);
+              + FetcherOutput.DIR_NAME_NP + " (" + url +
+              ") and " + Content.DIR_NAME + " (" + content.getUrl() + ")");
             continue;
           }
 
           // if fetch was successful or
           // previously unable to parse (so try again)
-          if (fetcherOutput.getStatus() == FetcherOutput.SUCCESS ||
-              fetcherOutput.getStatus() == FetcherOutput.CANT_PARSE) {
+          ProtocolStatus ps = fetcherOutput.getProtocolStatus();
+          if (ps.isSuccess()) {
             handleContent(url, content);
             synchronized (ParseSegment.this) {
               pages++;                    // record successful parse
@@ -195,18 +196,18 @@
             }
           } else {
             // errored at fetch step
-            logError(url, new ProtocolException("Error at fetch stage"));
-            handleNoContent(ParserOutput.NOFETCH);
+            logError(url, new ProtocolException("Error at fetch stage: " + ps));
+            handleNoContent(new ParseStatus(ParseStatus.FAILED_MISSING_CONTENT));
           }
 
         } catch (ParseException e) {
           logError(url, e);
-          handleNoContent(ParserOutput.FAILURE);
+          handleNoContent(new ParseStatus(e));
 
         } catch (Throwable t) {                   // an unchecked exception
           if (fle != null) {
             logError(url, t);
-            handleNoContent(ParserOutput.UNKNOWN);
+            handleNoContent(new ParseStatus(t));
           } else {
             LOG.severe("Unexpected exception");
           }
@@ -238,27 +239,26 @@
       Parse parse = parser.getParse(content);
 
       outputPage
-        (new ParseText(parse.getText()), parse.getData(),ParserOutput.SUCCESS);
+        (new ParseText(parse.getText()), parse.getData());
     }
 
-    private void handleNoContent(int status) {
+    private void handleNoContent(ParseStatus status) {
       if (ParseSegment.this.dryRun) {
         LOG.info("To be handled as no content");
         return;
       }
       outputPage(new ParseText(""),
-                 new ParseData("", new Outlink[0], new Properties()),
-                 status);
+                 new ParseData(status, "", new Outlink[0], new Properties()));
     }
       
     private void outputPage
-      (ParseText parseText, ParseData parseData, int status) {
+      (ParseText parseText, ParseData parseData) {
       try {
         t3 = System.currentTimeMillis();
         synchronized (parserOutputWriter) {
           t4 = System.currentTimeMillis();
           parserOutputWriter.append(new LongWritable(myEntry),
-            new ParserOutput(parseData, parseText, status));
+            new ParserOutput(parseData, parseText));
           t5 = System.currentTimeMillis();
           if (LOG.isLoggable(Level.FINE))
             LOG.fine("Entry: "+myEntry
@@ -274,30 +274,21 @@
   }
 
   /**
-   * Inner class ParserOutput: ParseData + ParseText + status
+   * Inner class ParserOutput: ParseData + ParseText
    */
   private class ParserOutput extends VersionedWritable {
     public static final String DIR_NAME = "parser";
 
-    private final static byte VERSION = 1;
-
-    // could be more detailed
-    public final static byte UNKNOWN = (byte)0; // unknown problem in parsing
-    public final static byte SUCCESS = (byte)1; // parsing succeeded
-    public final static byte FAILURE = (byte)2; // parsing failed
-    public final static byte NOFETCH = (byte)3; // fetch was not a SUCCESS
-
-    private int status;
+    private final static byte VERSION = 2;
 
     private ParseData parseData = new ParseData();
     private ParseText parseText = new ParseText();
 
     public ParserOutput() {}
     
-    public ParserOutput(ParseData parseData, ParseText parseText, int status) {
+    public ParserOutput(ParseData parseData, ParseText parseText) {
       this.parseData = parseData;
       this.parseText = parseText;
-      this.status = status;
     }
 
     public byte getVersion() { return VERSION; }
@@ -310,13 +301,8 @@
       return this.parseText;
     }
 
-    public int getStatus() {
-      return this.status;
-    }
-
     public final void readFields(DataInput in) throws IOException {
       super.readFields(in);                         // check version
-      status = in.readByte();
       parseData.readFields(in);
       parseText.readFields(in);
       return;
@@ -324,7 +310,6 @@
 
     public final void write(DataOutput out) throws IOException {
       super.write(out);                             // write version
-      out.writeByte(status);
       parseData.write(out);
       parseText.write(out);
       return;
@@ -523,19 +508,6 @@
         if (fetcherNPReader.key() != key.get())
           throw new IOException("Mismatch between entries under "
             + FetcherOutput.DIR_NAME_NP + " and in " + sortedFile.getName());
-        // reset status in fo (FetcherOutput), using status in ParserOutput
-        switch (val.getStatus()) {
-        case ParserOutput.SUCCESS:
-          fo.setStatus(FetcherOutput.SUCCESS);
-          break;
-        case ParserOutput.UNKNOWN:
-        case ParserOutput.FAILURE:
-          fo.setStatus(FetcherOutput.CANT_PARSE);
-          break;
-        case ParserOutput.NOFETCH:
-        default:
-          // do not reset
-        }
         fetcherWriter.append(fo);
         parseDataWriter.append(val.getParseData());
         parseTextWriter.append(val.getParseText());

Modified: incubator/nutch/trunk/src/java/org/apache/nutch/tools/UpdateDatabaseTool.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/tools/UpdateDatabaseTool.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/tools/UpdateDatabaseTool.java (original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/tools/UpdateDatabaseTool.java Wed Jun  1 15:20:01 2005
@@ -29,6 +29,7 @@
 import org.apache.nutch.pagedb.*;
 import org.apache.nutch.fetcher.*;
 import org.apache.nutch.parse.*;
+import org.apache.nutch.protocol.ProtocolStatus;
 import org.apache.nutch.util.*;
 
 
@@ -108,14 +109,14 @@
             if (!fle.getFetch()) {                // didn't fetch
               pageContentsUnchanged(fo);          // treat as unchanged
 
-            } else if (fo.getStatus() == fo.SUCCESS) { // fetch succeed
+            } else if (fo.getProtocolStatus().isSuccess()) { // fetch succeed
               if (fo.getMD5Hash().equals(page.getMD5())) {
                 pageContentsUnchanged(fo);        // contents unchanged
               } else {
                 pageContentsChanged(fo, pd);      // contents changed
               }
 
-            } else if (fo.getStatus() == fo.RETRY &&
+            } else if (fo.getProtocolStatus().getCode() == ProtocolStatus.RETRY &&
                        page.getRetriesSinceFetch() < MAX_RETRIES) {
 
               pageRetry(fo);                      // retry later

Modified: incubator/nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/build.xml?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/build.xml (original)
+++ incubator/nutch/trunk/src/plugin/build.xml Wed Jun  1 15:20:01 2005
@@ -9,12 +9,14 @@
      <ant dir="protocol-file" target="deploy"/>
      <ant dir="protocol-ftp" target="deploy"/>
      <ant dir="protocol-http" target="deploy"/>
+     <ant dir="protocol-httpclient" target="deploy"/>
      <ant dir="parse-html" target="deploy"/>
+     <ant dir="parse-js" target="deploy"/>
      <ant dir="parse-text" target="deploy"/>
      <ant dir="parse-pdf" target="deploy"/>
      <ant dir="parse-msword" target="deploy"/>
-<!-- <ant dir="parse-mp3" target="deploy"/>      license: jid3 is LGPL-->
-<!-- <ant dir="parse-rtf" target="deploy"/>      license: parse-rtf is LGPL-->
+<!-- <ant dir="parse-mp3" target="deploy"/> -->
+<!-- <ant dir="parse-rtf" target="deploy"/> -->
      <ant dir="parse-ext" target="deploy"/>
      <ant dir="index-basic" target="deploy"/>
      <ant dir="index-more" target="deploy"/>
@@ -38,8 +40,8 @@
      <ant dir="parse-html" target="test"/>
      <ant dir="parse-pdf" target="test"/>
      <ant dir="parse-msword" target="test"/>
-<!-- <ant dir="parse-mp3" target="test"/> -->
-<!-- <ant dir="parse-rtf" target="test"/> -->
+ <!-- <ant dir="parse-mp3" target="test"/> -->
+ <!-- <ant dir="parse-rtf" target="test"/> -->
      <ant dir="parse-ext" target="test"/>
      <ant dir="creativecommons" target="test"/>
      <ant dir="languageidentifier" target="test"/>
@@ -53,7 +55,9 @@
     <ant dir="protocol-file" target="clean"/>
     <ant dir="protocol-ftp" target="clean"/>
     <ant dir="protocol-http" target="clean"/>
+    <ant dir="protocol-httpclient" target="clean"/>
     <ant dir="parse-html" target="clean"/>
+    <ant dir="parse-js" target="clean"/>
     <ant dir="parse-text" target="clean"/>
     <ant dir="parse-pdf" target="clean"/>
     <ant dir="parse-msword" target="clean"/>

Modified: incubator/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java (original)
+++ incubator/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java Wed Jun  1 15:20:01 2005
@@ -252,19 +252,22 @@
 
   /** Adds metadata or otherwise modifies a parse of an HTML document, given
    * the DOM tree of a page. */
-  public Parse filter(Content content, Parse parse, DocumentFragment doc)
-    throws ParseException {
+  public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
 
     // construct base url
     URL base;
     try {
       base = new URL(content.getBaseUrl());
     } catch (MalformedURLException e) {
-      throw new ParseException(e);
+      return new ParseStatus(e).getEmptyParse();
     }
 
-    // extract license metadata
-    Walker.walk(doc, base, parse.getData().getMetadata());
+    try {
+      // extract license metadata
+      Walker.walk(doc, base, parse.getData().getMetadata());
+    } catch (ParseException e) {
+      return new ParseStatus(e).getEmptyParse();
+    }
 
     return parse;
   }

Modified: incubator/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original)
+++ incubator/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Wed Jun  1 15:20:01 2005
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
 package org.apache.nutch.analysis.lang;
+import org.apache.nutch.parse.HTMLMetaTags;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.HtmlParseFilter;
-import org.apache.nutch.parse.ParseException;
 import org.apache.nutch.protocol.Content;
 import org.w3c.dom.*;
 
@@ -38,8 +38,7 @@
    * <li>3. meta http-equiv (content-language) (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2)
   * <br>Only the first occurrence of language is stored.
    */
-  public Parse filter(Content content, Parse parse, DocumentFragment doc)
-    throws ParseException {
+  public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
     String lang = findLanguage(doc);
 
     if (lang != null) {

Modified: incubator/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java (original)
+++ incubator/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java Wed Jun  1 15:20:01 2005
@@ -239,7 +239,7 @@
     Protocol protocol;
     try {
       protocol = ProtocolFactory.getProtocol(url);
-      Content content = protocol.getContent(url);
+      Content content = protocol.getProtocolOutput(url).getContent();
       String contentType = content.getContentType();
       Parser parser = ParserFactory.getParser(contentType, url);
       Parse parse = parser.getParse(content);

Modified: incubator/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java (original)
+++ incubator/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java Wed Jun  1 15:20:01 2005
@@ -17,15 +17,14 @@
 package org.apache.nutch.parse.ext;
 
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseImpl;
 import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.ParseException;
 
 import org.apache.nutch.util.LogFormatter;
-import org.apache.nutch.util.NutchConf;
 import org.apache.nutch.util.CommandRunner;
 
 import org.apache.nutch.plugin.Extension;
@@ -88,14 +87,14 @@
 
   public ExtParser () {}
 
-  public Parse getParse(Content content) throws ParseException {
+  public Parse getParse(Content content) {
 
     String contentType = content.getContentType();
 
     String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType);
     if (params == null)
-      throw new ParseException(
-        "No external command defined for contentType: " + contentType);
+      return new ParseStatus(ParseStatus.FAILED,
+                      "No external command defined for contentType: " + contentType).getEmptyParse();
 
     String command = params[0];
     int timeout = Integer.parseInt(params[1]);
@@ -114,8 +113,10 @@
         (String)content.getMetadata().get("Content-Length");
       if (contentLength != null
             && raw.length != Integer.parseInt(contentLength)) {
-          throw new ParseException("Content truncated at "+raw.length
-            +" bytes. Parser can't handle incomplete "+contentType+" file.");
+          return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
+                "Content truncated at " + raw.length
+            +" bytes. Parser can't handle incomplete "
+            + contentType + " file.").getEmptyParse();
       }
 
       ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE);
@@ -133,15 +134,14 @@
       cr.evaluate();
 
       if (cr.getExitValue() != 0)
-        throw new ParseException("External command "+command
-          +" failed with error: "+es.toString());
+        return new ParseStatus(ParseStatus.FAILED,
+                        "External command " + command
+                        + " failed with error: " + es.toString()).getEmptyParse();
 
       text = os.toString();
 
-    } catch (ParseException e) {
-      throw e;
     } catch (Exception e) { // run time exception
-      throw new ParseException("ExtParser failed. "+e);
+      return new ParseStatus(e).getEmptyParse();
     }
 
     if (text == null)
@@ -157,7 +157,7 @@
     Properties metaData = new Properties();
     metaData.putAll(content.getMetadata()); // copy through
 
-    ParseData parseData = new ParseData(title, outlinks, metaData);
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
     return new ParseImpl(text, parseData);
   }
 

Modified: incubator/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java (original)
+++ incubator/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java Wed Jun  1 15:20:01 2005
@@ -79,7 +79,7 @@
 
     // get nutch content
     Protocol protocol = ProtocolFactory.getProtocol(urlString);
-    content = protocol.getContent(urlString);
+    content = protocol.getProtocolOutput(urlString).getContent();
     protocol = null;
   }
 

Added: incubator/nutch/trunk/src/plugin/parse-html/lib/tagsoup-1.0rc3.jar
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-html/lib/tagsoup-1.0rc3.jar?rev=179436&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/nutch/trunk/src/plugin/parse-html/lib/tagsoup-1.0rc3.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: incubator/nutch/trunk/src/plugin/parse-html/lib/tagsoup.LICENSE.txt
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-html/lib/tagsoup.LICENSE.txt?rev=179436&view=auto
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-html/lib/tagsoup.LICENSE.txt (added)
+++ incubator/nutch/trunk/src/plugin/parse-html/lib/tagsoup.LICENSE.txt Wed Jun  1 15:20:01 2005
@@ -0,0 +1,49 @@
+The Academic Free License
+v. 2.1
+
+This Academic Free License (the "License") applies to any original work of authorship (the "Original Work") whose owner (the "Licensor") has placed the following notice immediately following the copyright notice for the Original Work:
+
+Licensed under the Academic Free License version 2.1
+
+1) Grant of Copyright License. Licensor hereby grants You a world-wide, royalty-free, non-exclusive, perpetual, sublicenseable license to do the following:
+
+a) to reproduce the Original Work in copies;
+
+b) to prepare derivative works ("Derivative Works") based upon the Original Work;
+
+c) to distribute copies of the Original Work and Derivative Works to the public;
+
+d) to perform the Original Work publicly; and
+
+e) to display the Original Work publicly.
+
+2) Grant of Patent License. Licensor hereby grants You a world-wide, royalty-free, non-exclusive, perpetual, sublicenseable license, under patent claims owned or controlled by the Licensor that are embodied in the Original Work as furnished by the Licensor, to make, use, sell and offer for sale the Original Work and Derivative Works.
+
+3) Grant of Source Code License. The term "Source Code" means the preferred form of the Original Work for making modifications to it and all available documentation describing how to modify the Original Work. Licensor hereby agrees to provide a machine-readable copy of the Source Code of the Original Work along with each copy of the Original Work that Licensor distributes. Licensor reserves the right to satisfy this obligation by placing a machine-readable copy of the Source Code in an information repository reasonably calculated to permit inexpensive and convenient access by You for as long as Licensor continues to distribute the Original Work, and by publishing the address of that information repository in a notice immediately following the copyright notice that applies to the Original Work.
+
+4) Exclusions From License Grant. Neither the names of Licensor, nor the names of any contributors to the Original Work, nor any of their trademarks or service marks, may be used to endorse or promote products derived from this Original Work without express prior written permission of the Licensor. Nothing in this License shall be deemed to grant any rights to trademarks, copyrights, patents, trade secrets or any other intellectual property of Licensor except as expressly stated herein. No patent license is granted to make, use, sell or offer to sell embodiments of any patent claims other than the licensed claims defined in Section 2. No right is granted to the trademarks of Licensor even if such marks are included in the Original Work. Nothing in this License shall be interpreted to prohibit Licensor from licensing under different terms from this License any Original Work that Licensor otherwise would have a right to license.
+
+5) This section intentionally omitted.
+
+6) Attribution Rights. You must retain, in the Source Code of any Derivative Works that You create, all copyright, patent or trademark notices from the Source Code of the Original Work, as well as any notices of licensing and any descriptive text identified therein as an "Attribution Notice." You must cause the Source Code for any Derivative Works that You create to carry a prominent Attribution Notice reasonably calculated to inform recipients that You have modified the Original Work.
+
+7) Warranty of Provenance and Disclaimer of Warranty. Licensor warrants that the copyright in and to the Original Work and the patent rights granted herein by Licensor are owned by the Licensor or are sublicensed to You under the terms of this License with the permission of the contributor(s) of those copyrights and patent rights. Except as expressly stated in the immediately proceeding sentence, the Original Work is provided under this License on an "AS IS" BASIS and WITHOUT WARRANTY, either express or implied, including, without limitation, the warranties of NON-INFRINGEMENT, MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY OF THE ORIGINAL WORK IS WITH YOU. This DISCLAIMER OF WARRANTY constitutes an essential part of this License. No license to Original Work is granted hereunder except under this disclaimer.
+
+8) Limitation of Liability. Under no circumstances and under no legal theory, whether in tort (including negligence), contract, or otherwise, shall the Licensor be liable to any person for any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or the use of the Original Work including, without limitation, damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses. This limitation of liability shall not apply to liability for death or personal injury resulting from Licensor's negligence to the extent applicable law prohibits such limitation. Some jurisdictions do not allow the exclusion or limitation of incidental or consequential damages, so this exclusion and limitation may not apply to You.
+
+9) Acceptance and Termination. If You distribute copies of the Original Work or a Derivative Work, You must make a reasonable effort under the circumstances to obtain the express assent of recipients to the terms of this License. Nothing else but this License (or another written agreement between Licensor and You) grants You permission to create Derivative Works based upon the Original Work or to exercise any of the rights granted in Section 1 herein, and any attempt to do so except under the terms of this License (or another written agreement between Licensor and You) is expressly prohibited by U.S. copyright law, the equivalent laws of other countries, and by international treaty. Therefore, by exercising any of the rights granted to You in Section 1 herein, You indicate Your acceptance of this License and all of its terms and conditions.
+
+10) Termination for Patent Action. This License shall terminate automatically and You may no longer exercise any of the rights granted to You by this License as of the date You commence an action, including a cross-claim or counterclaim, against Licensor or any licensee alleging that the Original Work infringes a patent. This termination provision shall not apply for an action alleging patent infringement by combinations of the Original Work with other software or hardware.
+
+11) Jurisdiction, Venue and Governing Law. Any action or suit relating to this License may be brought only in the courts of a jurisdiction wherein the Licensor resides or in which Licensor conducts its primary business, and under the laws of that jurisdiction excluding its conflict-of-law provisions. The application of the United Nations Convention on Contracts for the International Sale of Goods is expressly excluded. Any use of the Original Work outside the scope of this License or after its termination shall be subject to the requirements and penalties of the U.S. Copyright Act, 17 U.S.C. § 101 et seq., the equivalent laws of other countries, and international treaty. This section shall survive the termination of this License.
+
+12) Attorneys Fees. In any action to enforce the terms of this License or seeking damages relating thereto, the prevailing party shall be entitled to recover its costs and expenses, including, without limitation, reasonable attorneys' fees and costs incurred in connection with such action, including any appeal of such action. This section shall survive the termination of this License.
+
+13) Miscellaneous. This License represents the complete agreement concerning the subject matter hereof. If any provision of this License is held to be unenforceable, such provision shall be reformed only to the extent necessary to make it enforceable.
+
+14) Definition of "You" in This License. "You" throughout this License, whether in upper or lower case, means an individual or a legal entity exercising rights under, and complying with all of the terms of, this License. For legal entities, "You" includes any entity that controls, is controlled by, or is under common control with you. For purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
+
+15) Right to Use. You may use the Original Work in all ways not otherwise restricted or conditioned by this License or by law, and Licensor promises not to interfere with or be responsible for such uses by You.
+
+This license is Copyright (C) 2003-2004 Lawrence E. Rosen. All rights reserved. Permission is hereby granted to copy and distribute this license without modification. This license may not be modified without the express written permission of its copyright owner.
+

Propchange: incubator/nutch/trunk/src/plugin/parse-html/lib/tagsoup.LICENSE.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: incubator/nutch/trunk/src/plugin/parse-html/plugin.xml
URL: http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/plugin/parse-html/plugin.xml?rev=179436&r1=179435&r2=179436&view=diff
==============================================================================
--- incubator/nutch/trunk/src/plugin/parse-html/plugin.xml (original)
+++ incubator/nutch/trunk/src/plugin/parse-html/plugin.xml Wed Jun  1 15:20:01 2005
@@ -18,6 +18,7 @@
          <export name="*"/>
       </library>
       <library name="nekohtml-0.9.4.jar"/>
+      <library name="tagsoup-1.0rc3.jar"/>
    </runtime>
 
    <extension id="org.apache.nutch.parse.html"