You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by do...@apache.org on 2007/06/24 12:04:33 UTC

svn commit: r550196 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/test/ src/test/org/apache/nutch/fetcher/ src/testresources/fetch-test-site/

Author: dogacan
Date: Sun Jun 24 03:04:30 2007
New Revision: 550196

URL: http://svn.apache.org/viewvc?view=rev&rev=550196
Log:
NUTCH-504 - Parsing during fetching is broken.

Added:
    lucene/nutch/trunk/src/testresources/fetch-test-site/exception.html
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
    lucene/nutch/trunk/src/test/crawl-tests.xml
    lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=550196&r1=550195&r2=550196
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sun Jun 24 03:04:30 2007
@@ -62,6 +62,8 @@
 19. NUTCH-468 - Scoring filter should distribute score to all outlinks at 
     once. (dogacan)
 
+20. NUTCH-504 - NUTCH-443 broke parsing during fetching. (dogacan)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=550196&r1=550195&r2=550196
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Sun Jun 24 03:04:30 2007
@@ -311,41 +311,7 @@
             LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
           }
 
-          if (parseResult != null) {
-            for (Entry<Text, Parse> entry : parseResult) {
-              Text url = entry.getKey();
-              Parse parse = entry.getValue();
-              ParseStatus parseStatus = parse.getData().getStatus();
-              
-              if (!parseStatus.isSuccess()) {
-                LOG.warn("Error parsing: " + key + ": " + parseStatus);
-                parse = parseStatus.getEmptyParse(getConf());
-              }
-
-              // Calculate page signature. For non-parsing fetchers this will
-              // be done in ParseSegment
-              byte[] signature = 
-                SignatureFactory.getSignature(getConf()).calculate(content, parse);
-              // Ensure segment name and score are in parseData metadata
-              parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, 
-                  segmentName);
-              parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, 
-                  StringUtil.toHexString(signature));
-              // Pass fetch time to content meta
-              parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
-                  Long.toString(datum.getFetchTime()));
-              if (url.equals(key))
-                datum.setSignature(signature);
-              try {
-                scfilters.passScoreAfterParsing(url, content, parse);
-              } catch (Exception e) {
-                if (LOG.isWarnEnabled()) {
-                  e.printStackTrace(LogUtil.getWarnStream(LOG));
-                  LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
-                }
-              }
-            }
-          } else {
+          if (parseResult == null) {
             byte[] signature = 
               SignatureFactory.getSignature(getConf()).calculate(content, 
                   new ParseStatus().getEmptyParse(conf));
@@ -360,8 +326,40 @@
           output.collect(key, new ObjectWritable(content));
         if (parseResult != null) {
           for (Entry<Text, Parse> entry : parseResult) {
-            output.collect(entry.getKey(), 
-                new ObjectWritable(new ParseImpl(entry.getValue())));
+            Text url = entry.getKey();
+            Parse parse = entry.getValue();
+            ParseStatus parseStatus = parse.getData().getStatus();
+            
+            if (!parseStatus.isSuccess()) {
+              LOG.warn("Error parsing: " + key + ": " + parseStatus);
+              parse = parseStatus.getEmptyParse(getConf());
+            }
+
+            // Calculate page signature. For non-parsing fetchers this will
+            // be done in ParseSegment
+            byte[] signature = 
+              SignatureFactory.getSignature(getConf()).calculate(content, parse);
+            // Ensure segment name and signature are in parseData metadata
+            parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, 
+                segmentName);
+            parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, 
+                StringUtil.toHexString(signature));
+            // Pass fetch time to content meta
+            parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
+                Long.toString(datum.getFetchTime()));
+            if (url.equals(key))
+              datum.setSignature(signature);
+            try {
+              scfilters.passScoreAfterParsing(url, content, parse);
+            } catch (Exception e) {
+              if (LOG.isWarnEnabled()) {
+                e.printStackTrace(LogUtil.getWarnStream(LOG));
+                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+              }
+            }
+            output.collect(url, new ObjectWritable(
+                    new ParseImpl(new ParseText(parse.getText()), 
+                                  parse.getData(), parse.isCanonical())));
           }
         }
       } catch (IOException e) {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?view=diff&rev=550196&r1=550195&r2=550196
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Sun Jun 24 03:04:30 2007
@@ -685,41 +685,7 @@
             LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
           }
 
-          if (parseResult != null) {
-            for (Entry<Text, Parse> entry : parseResult) {
-              Text url = entry.getKey();
-              Parse parse = entry.getValue();
-              ParseStatus parseStatus = parse.getData().getStatus();
-
-              if (!parseStatus.isSuccess()) {
-                LOG.warn("Error parsing: " + key + ": " + parseStatus);
-                parse = parseStatus.getEmptyParse(getConf());
-              }
-
-              // Calculate page signature. For non-parsing fetchers this will
-              // be done in ParseSegment
-              byte[] signature = 
-                SignatureFactory.getSignature(getConf()).calculate(content, parse);
-              // Ensure segment name and score are in parseData metadata
-              parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, 
-                  segmentName);
-              parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, 
-                  StringUtil.toHexString(signature));
-              // Pass fetch time to content meta
-              parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
-                  Long.toString(datum.getFetchTime()));
-              if (url.equals(key))
-                datum.setSignature(signature);
-              try {
-                scfilters.passScoreAfterParsing(url, content, parse);
-              } catch (Exception e) {
-                if (LOG.isWarnEnabled()) {
-                  e.printStackTrace(LogUtil.getWarnStream(LOG));
-                  LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
-                }
-              }
-            }
-          } else {
+          if (parseResult == null) {
             byte[] signature = 
               SignatureFactory.getSignature(getConf()).calculate(content, 
                   new ParseStatus().getEmptyParse(conf));
@@ -730,12 +696,44 @@
 
       try {
         output.collect(key, new ObjectWritable(datum));
-        if (storingContent)
+        if (content != null && storingContent)
           output.collect(key, new ObjectWritable(content));
         if (parseResult != null) {
           for (Entry<Text, Parse> entry : parseResult) {
-            output.collect(entry.getKey(), 
-                new ObjectWritable(new ParseImpl(entry.getValue())));
+            Text url = entry.getKey();
+            Parse parse = entry.getValue();
+            ParseStatus parseStatus = parse.getData().getStatus();
+            
+            if (!parseStatus.isSuccess()) {
+              LOG.warn("Error parsing: " + key + ": " + parseStatus);
+              parse = parseStatus.getEmptyParse(getConf());
+            }
+
+            // Calculate page signature. For non-parsing fetchers this will
+            // be done in ParseSegment
+            byte[] signature = 
+              SignatureFactory.getSignature(getConf()).calculate(content, parse);
+            // Ensure segment name and signature are in parseData metadata
+            parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, 
+                segmentName);
+            parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, 
+                StringUtil.toHexString(signature));
+            // Pass fetch time to content meta
+            parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
+                Long.toString(datum.getFetchTime()));
+            if (url.equals(key))
+              datum.setSignature(signature);
+            try {
+              scfilters.passScoreAfterParsing(url, content, parse);
+            } catch (Exception e) {
+              if (LOG.isWarnEnabled()) {
+                e.printStackTrace(LogUtil.getWarnStream(LOG));
+                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
+              }
+            }
+            output.collect(url, new ObjectWritable(
+                    new ParseImpl(new ParseText(parse.getText()), 
+                                  parse.getData(), parse.isCanonical())));
           }
         }
       } catch (IOException e) {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?view=diff&rev=550196&r1=550195&r2=550196
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Sun Jun 24 03:04:30 2007
@@ -186,6 +186,10 @@
         || parseText == null || parseData == null) {
       return;                                     // only have inlinks
     }
+    
+    if (!parseData.getStatus().isSuccess()) {
+      return;
+    }
 
     Document doc = new Document();
     Metadata metadata = parseData.getContentMeta();

Modified: lucene/nutch/trunk/src/test/crawl-tests.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/crawl-tests.xml?view=diff&rev=550196&r1=550195&r2=550196
==============================================================================
--- lucene/nutch/trunk/src/test/crawl-tests.xml (original)
+++ lucene/nutch/trunk/src/test/crawl-tests.xml Sun Jun 24 03:04:30 2007
@@ -28,4 +28,10 @@
   <value>test-nutch</value>
 </property>
 
-</configuration>
\ No newline at end of file
+<property>
+  <name>http.robots.agents</name>
+  <value>test-nutch,*</value>
+</property>
+
+</configuration>
+

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java?view=diff&rev=550196&r1=550195&r2=550196
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Sun Jun 24 03:04:30 2007
@@ -28,6 +28,9 @@
 import org.apache.nutch.crawl.CrawlDBTestUtil;
 import org.apache.nutch.crawl.Generator;
 import org.apache.nutch.crawl.Injector;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.protocol.Content;
 import org.mortbay.jetty.Server;
 
@@ -78,6 +81,7 @@
     addUrl(urls,"pagea.html");
     addUrl(urls,"pageb.html");
     addUrl(urls,"dup_of_pagea.html");
+    addUrl(urls,"exception.html");
     
     CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
     
@@ -102,17 +106,17 @@
     int minimumTime=(int) ((urls.size()+1)*1000*conf.getFloat("fetcher.server.delay",5));
     assertTrue(time > minimumTime);
     
-    //verify results
+    //verify content
     Path content=new Path(new Path(generatedSegment, Content.DIR_NAME),"part-00000/data");
     SequenceFile.Reader reader=new SequenceFile.Reader(fs, content, conf);
     
     ArrayList<String> handledurls=new ArrayList<String>();
     
-    READ:
+    READ_CONTENT:
       do {
       Text key=new Text();
       Content value=new Content();
-      if(!reader.next(key, value)) break READ;
+      if(!reader.next(key, value)) break READ_CONTENT;
       String contentString=new String(value.getContent());
       if(contentString.indexOf("Nutch fetcher test page")!=-1) { 
         handledurls.add(key.toString());
@@ -130,7 +134,33 @@
     //verify that correct pages were handled
     assertTrue(handledurls.containsAll(urls));
     assertTrue(urls.containsAll(handledurls));
+    
+    handledurls.clear();
 
+    //verify parse data
+    Path parseData = new Path(new Path(generatedSegment, ParseData.DIR_NAME),"part-00000/data");
+    reader = new SequenceFile.Reader(fs, parseData, conf);
+    
+    READ_PARSE_DATA:
+      do {
+      Text key = new Text();
+      ParseData value = new ParseData();
+      if(!reader.next(key, value)) break READ_PARSE_DATA;
+      // make sure they all contain "nutch.segment.name" and "nutch.content.digest"
+      // keys in the content metadata of their parse data
+      Metadata contentMeta = value.getContentMeta();
+      if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null 
+            && contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
+        handledurls.add(key.toString());
+      }
+    } while(true);
+    
+    Collections.sort(handledurls);
+
+    assertEquals(urls.size(), handledurls.size());
+
+    assertTrue(handledurls.containsAll(urls));
+    assertTrue(urls.containsAll(handledurls));
   }
 
   private void addUrl(ArrayList<String> urls, String page) {

Added: lucene/nutch/trunk/src/testresources/fetch-test-site/exception.html
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/testresources/fetch-test-site/exception.html?view=auto&rev=550196
==============================================================================
--- lucene/nutch/trunk/src/testresources/fetch-test-site/exception.html (added)
+++ lucene/nutch/trunk/src/testresources/fetch-test-site/exception.html Sun Jun 24 03:04:30 2007
@@ -0,0 +1,13 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
+<HTML>
+<HEAD>
+<TITLE>Exception</TITLE>
+<META http-equiv="Content-Type" content="text/html; charset=unicode">
+</HEAD>
+<BODY>
+!!Trying to parse this one will fail with a MalformedInputException!!
+
+Nutch fetcher test page.
+</BODY>
+</HTML>
+