You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by fe...@apache.org on 2012/07/25 14:51:38 UTC

svn commit: r1365562 - in /nutch/branches/2.x: CHANGES.txt src/java/org/apache/nutch/host/HostInjectorJob.java

Author: ferdy
Date: Wed Jul 25 12:51:37 2012
New Revision: 1365562

URL: http://svn.apache.org/viewvc?rev=1365562&view=rev
Log:
NUTCH-1437 HostInjectorJob to accept lines with or without protocol

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1365562&r1=1365561&r2=1365562&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Jul 25 12:51:37 2012
@@ -1,6 +1,7 @@
 Nutch Change Log
 
 Release 2.1 - Current Development
+* NUTCH-1437 HostInjectorJob to accept lines with or without protocol (ferdy)
 
 * NUTCH-1435 Host jobs throw NullPointerException with MySQL (ferdy via lewismc)
 

Modified: nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java?rev=1365562&r1=1365561&r2=1365562&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java Wed Jul 25 12:51:37 2012
@@ -17,6 +17,7 @@
 package org.apache.nutch.host;
 
 import java.io.IOException;
+import java.net.URL;
 import java.nio.ByteBuffer;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -50,7 +51,7 @@ import org.slf4j.LoggerFactory;
  * The files contain one host name per line, optionally followed by custom
  * metadata separated by tabs with the metadata key is separated from the
  * corresponding value by '='. <br>
- * The URLs must contain the protocol as well as the host name <br>
+ * The URLs can contain the protocol. It will be stripped if present <br>
  * e.g. http://www.nutch.org \t nutch.score=10 \t nutch.fetchInterval=2592000 \t
  * userType=open_source
  **/
@@ -96,7 +97,7 @@ public class HostInjectorJob implements 
     @Override
     protected void map(LongWritable key, Text value, Context context)
         throws IOException, InterruptedException {
-      String url = value.toString();
+      String url = value.toString().trim();
 
       // skip empty lines
       if (url.trim().length() == 0)
@@ -113,8 +114,8 @@ public class HostInjectorJob implements 
             // skip anything without a =
             continue;
           }
-          String metaname = splits[s].substring(0, indexEquals);
-          String metavalue = splits[s].substring(indexEquals + 1);
+          String metaname = splits[s].substring(0, indexEquals).trim();
+          String metavalue = splits[s].substring(indexEquals + 1).trim();
           metadata.put(metaname, metavalue);
         }
       }
@@ -128,8 +129,13 @@ public class HostInjectorJob implements 
         String valuemd = metadata.get(keymd);
         host.putToMetadata(new Utf8(keymd), ByteBuffer.wrap(valuemd.getBytes()));
       }
-
-      String hostkey = TableUtil.reverseHost(url);
+      String hostname;
+      if (url.indexOf("://")> -1) {
+        hostname=new URL(url).getHost();
+      } else {
+        hostname=new URL("http://"+url).getHost();
+      }
+      String hostkey = TableUtil.reverseHost(hostname);
       context.write(hostkey, host);
     }
   }