You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by ol...@apache.org on 2008/11/08 17:40:53 UTC

svn commit: r712408 - in /incubator/droids/trunk: droids-core/src/main/java/org/apache/droids/protocol/http/ droids-norobots/src/main/java/org/apache/droids/norobots/ droids-norobots/src/test/java/org/ droids-norobots/src/test/java/org/apache/ droids-n...

Author: olegk
Date: Sat Nov  8 09:40:52 2008
New Revision: 712408

URL: http://svn.apache.org/viewvc?rev=712408&view=rev
Log:
Refactored NoRobotClient to depend on the abstract ContentLoader interface for content retrieval instead of the URLConnection class

Added:
    incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/ContentLoader.java
    incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/SimpleContentLoader.java
    incubator/droids/trunk/droids-norobots/src/test/java/org/
    incubator/droids/trunk/droids-norobots/src/test/java/org/apache/
    incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/
    incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/
    incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java
    incubator/droids/trunk/droids-norobots/src/test/resources/
    incubator/droids/trunk/droids-norobots/src/test/resources/simple-robots.txt
Modified:
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/Http.java
    incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/Http.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/Http.java?rev=712408&r1=712407&r2=712408&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/Http.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/Http.java Sat Nov  8 09:40:52 2008
@@ -20,12 +20,15 @@
 import java.io.InputStream;
 import java.net.HttpURLConnection;
 import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URISyntaxException;
 import java.net.URL;
 
 import org.apache.droids.api.Protocol;
 import org.apache.droids.net.UrlHelper;
 import org.apache.droids.norobots.NoRobotClient;
 import org.apache.droids.norobots.NoRobotException;
+import org.apache.droids.norobots.SimpleContentLoader;
 import org.apache.droids.protocol.HttpBase;
 
 /**
@@ -78,11 +81,13 @@
     robotsUrl = UrlHelper.findRobotsUrl(base);
     boolean test = false;
     if (null != robotsUrl) {
-      NoRobotClient nrc = new NoRobotClient(userAgent);
+      NoRobotClient nrc = new NoRobotClient(new SimpleContentLoader(), userAgent);
       try {
-        nrc.parse(robotsUrl);
-        URL url = new URL(location);
+        nrc.parse(robotsUrl.toURI());
+        URI url = new URI(location);
         test = nrc.isUrlAllowed(url);
+      } catch (URISyntaxException e) {
+          throw new MalformedURLException("Invalid URL: " + e.getInput());
       } catch (IOException e) {
         log.fatal(e);
       } catch (NoRobotException e) {

Added: incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/ContentLoader.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/ContentLoader.java?rev=712408&view=auto
==============================================================================
--- incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/ContentLoader.java (added)
+++ incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/ContentLoader.java Sat Nov  8 09:40:52 2008
@@ -0,0 +1,42 @@
+/*
+ * ====================================================================
+ *
+ *  Copyright 2005 The Apache Software Foundation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ *
+ */
+
+package org.apache.droids.norobots;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+
+/**
+ * An abstract loader intended for retrieving content identified by a URI.
+ */
+public interface ContentLoader
+{
+
+  boolean exists(URI uri) throws IOException;
+
+  InputStream load(URI uri) throws IOException;
+
+}

Modified: incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java?rev=712408&r1=712407&r2=712408&view=diff
==============================================================================
--- incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java (original)
+++ incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java Sat Nov  8 09:40:52 2008
@@ -25,16 +25,17 @@
 
 package org.apache.droids.norobots;
 
-import java.io.IOException;
-import java.io.StringReader;
 import java.io.BufferedReader;
-import java.io.InputStreamReader;
+import java.io.IOException;
 import java.io.InputStream;
-import java.net.URL;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.StringReader;
+import java.io.StringWriter;
+import java.io.UnsupportedEncodingException;
+import java.net.URI;
+import java.net.URISyntaxException;
 import java.net.URLDecoder;
-import java.net.MalformedURLException;
-import java.net.HttpURLConnection;
-import java.net.URLConnection;
 
 /**
  * A Client which may be used to decide which urls on a website 
@@ -44,17 +45,26 @@
  */
 public class NoRobotClient {
 
-  private String userAgent;
+  private static final String US_ASCII = "US-ASCII";
+  
+  private final ContentLoader contentLoader;
+  private final String userAgent;
+  
+  private URI baseURI;
   private RulesEngine rules;
   private RulesEngine wildcardRules;
-  private URL baseUrl;
 
   /**
    * Create a Client for a particular user-agent name. 
    *
    * @param userAgent name for the robot
    */
-  public NoRobotClient(String userAgent) {
+  public NoRobotClient(ContentLoader contentLoader, String userAgent) {
+    super();
+    if (contentLoader == null) {
+      throw new IllegalArgumentException("Content loader may not be null");
+    }
+    this.contentLoader = contentLoader;
     this.userAgent = userAgent;
     this.rules = new RulesEngine();
   }
@@ -66,42 +76,44 @@
    *
    * @param baseUrl of the site
    */
-  public void parse(URL baseUrl) throws NoRobotException {
-    URL txtUrl = null;
+  public void parse(URI baseUri) throws IOException, NoRobotException {
+    URI uri;
     try {
-      // fetch baseUrl+"robots.txt"
-      txtUrl = new URL(baseUrl, "/robots.txt");
-    } catch(MalformedURLException murle) {
-      throw new NoRobotException("Bad URL: "+baseUrl+", robots.txt. ", murle);
+      uri = baseUri.resolve(new URI("/robots.txt"));
+    } catch (URISyntaxException ex) {
+      throw new NoRobotException("Invalid URI", ex);
+    }
+    // fetch baseUrl+"robots.txt"
+    if (!contentLoader.exists(uri)) {
+      return;
     }
-    
-    if (txtUrl!=null){
-      this.baseUrl = baseUrl;
-
-      String txt = null;
-      try {
-        txt = loadContent(txtUrl, this.userAgent);
-        if(txt == null) {
-          throw new NoRobotException("No content found for: "+txtUrl);
-        }
-      } catch(IOException ioe) {
-        throw new NoRobotException("Unable to get content for: "+txtUrl, ioe);
-      }
-
-      try {
-        parseText(txt);
-      } catch(NoRobotException nre) {
-        throw new NoRobotException("Problem while parsing "+txtUrl, nre);
-      }
+    InputStream instream = contentLoader.load(uri);
+    try {
+      parseText(instream);
+    } finally {
+      instream.close();
     }
+    baseURI = baseUri;
   }
 
-  public void parseText(String txt) throws NoRobotException {
+  public void parseText(InputStream instream) throws IOException {
+    StringWriter writer = new StringWriter();
+    Reader reader = new InputStreamReader(instream, US_ASCII);
+    try {
+      char[] tmp = new char[2048];
+      int l;
+      while ((l = reader.read(tmp)) != -1) {
+        writer.write(tmp, 0, l);
+      }
+    } finally {
+      reader.close();
+    }
+    String txt = writer.toString();
     this.rules = parseTextForUserAgent(txt, this.userAgent);
     this.wildcardRules = parseTextForUserAgent(txt, "*");
   }
 
-  private RulesEngine parseTextForUserAgent(String txt, String userAgent) throws NoRobotException {
+  private RulesEngine parseTextForUserAgent(String txt, String userAgent) throws IOException {
 
     RulesEngine engine = new RulesEngine();
 
@@ -113,65 +125,59 @@
     String line = "";
     String value = null;
     boolean parsingAllowBlock = false;
-    try {
-      while( (line = rdr.readLine()) != null ) {
-        // trim whitespace from either side
-        line = line.trim();
-
-        // ignore startsWith('#')
-        if(line.startsWith("#")) {
-          continue;
-        }
-
-        // if User-agent == userAgent 
-        // record the rest up until end or next User-agent
-        // then quit (? check spec)
-        if(line.startsWith("User-agent:")) {
-
-          if(parsingAllowBlock) {
-            // we've just finished reading allows/disallows
-            if(engine.isEmpty()) {
-              // multiple user agents in a line, let's 
-              // wait til we get rules
-              continue;
-            } else {
-              break;
-            }
-          }
+    while( (line = rdr.readLine()) != null ) {
+      // trim whitespace from either side
+      line = line.trim();
+
+      // ignore startsWith('#')
+      if(line.startsWith("#")) {
+        continue;
+      }
 
-          value = line.substring("User-agent:".length()).trim();
-          if(value.equalsIgnoreCase(userAgent)) {
-            parsingAllowBlock = true;
+      // if User-agent == userAgent 
+      // record the rest up until end or next User-agent
+      // then quit (? check spec)
+      if(line.startsWith("User-agent:")) {
+
+        if(parsingAllowBlock) {
+          // we've just finished reading allows/disallows
+          if(engine.isEmpty()) {
+            // multiple user agents in a line, let's 
+            // wait til we get rules
             continue;
+          } else {
+            break;
           }
-        } 
-        else {
-          // if not, then store if we're currently the user agent
-          if(parsingAllowBlock) {
-            if(line.startsWith("Allow:")) {
-              value = line.substring("Allow:".length()).trim();
-              value = URLDecoder.decode(value);
-              engine.allowPath( value );
-            } else 
-            if(line.startsWith("Disallow:")) {
-              value = line.substring("Disallow:".length()).trim();
-              value = URLDecoder.decode(value);
-              engine.disallowPath( value );
-            } else {
-              // ignore
-              continue;
-            }
+        }
+
+        value = line.substring("User-agent:".length()).trim();
+        if(value.equalsIgnoreCase(userAgent)) {
+          parsingAllowBlock = true;
+          continue;
+        }
+      } 
+      else {
+        // if not, then store if we're currently the user agent
+        if(parsingAllowBlock) {
+          if(line.startsWith("Allow:")) {
+            value = line.substring("Allow:".length()).trim();
+            value = URLDecoder.decode(value, US_ASCII);
+            engine.allowPath( value );
+          } else 
+          if(line.startsWith("Disallow:")) {
+            value = line.substring("Disallow:".length()).trim();
+            value = URLDecoder.decode(value, US_ASCII);
+            engine.disallowPath( value );
           } else {
             // ignore
             continue;
           }
+        } else {
+          // ignore
+          continue;
         }
       }
-    } catch (IOException ioe) {
-      // As this is parsing a String, it should not have an IOE
-      throw new NoRobotException("Problem while parsing text. ", ioe);
     }
-
     return engine;
   }
 
@@ -187,30 +193,34 @@
    *
    * @throws IllegalStateException when parse has not been called
    */
-  public boolean isUrlAllowed(URL url) throws IllegalStateException, IllegalArgumentException {
+  public boolean isUrlAllowed(URI uri) throws IllegalStateException, IllegalArgumentException {
     if(rules == null) {
       throw new IllegalStateException("You must call parse before you call this method.  ");
     }
 
-    if( !baseUrl.getHost().equals(url.getHost()) ||
-         baseUrl.getPort() != url.getPort() ||
-        !baseUrl.getProtocol().equals(url.getProtocol()) )
+    if (baseURI != null && (!baseURI.getHost().equals(uri.getHost()) ||
+         baseURI.getPort() != uri.getPort() ||
+        !baseURI.getScheme().equals(uri.getScheme())))
     {
       throw new IllegalArgumentException(
-          "Illegal to use a different url, " + url.toExternalForm() + 
-          ",  for this robots.txt: "+this.baseUrl.toExternalForm());
+          "Illegal to use a different url, " + uri.toString() + 
+          ",  for this robots.txt: " + baseURI.toString());
     }
     
-    if(url.sameFile(baseUrl)) {
+    if((uri.equals(baseURI))) {
       return true;
     }
     
-    String urlStr = url.toExternalForm();
-    
-    urlStr = URLDecoder.decode( urlStr );
-    Boolean allowed = this.rules.isAllowed( urlStr );
+    String uriStr = uri.toString();
+    try {
+      uriStr = URLDecoder.decode(uriStr, US_ASCII);
+    } catch (UnsupportedEncodingException ex) {
+      // ASCII always supported
+      return false;
+    }
+    Boolean allowed = this.rules.isAllowed( uriStr );
     if(allowed == null) {
-      allowed = this.wildcardRules.isAllowed( urlStr );
+      allowed = this.wildcardRules.isAllowed( uriStr );
     }
     if(allowed == null) {
       allowed = Boolean.TRUE;
@@ -219,24 +229,4 @@
     return allowed.booleanValue();
   }
 
-  // INLINE: as such from genjava/gj-core's net package. Simple method 
-  // stolen from Payload too.
-  private static String loadContent(URL url, String userAgent) throws IOException {
-    URLConnection urlConn = url.openConnection();
-    if(urlConn instanceof HttpURLConnection) {
-      if(userAgent != null) {
-        ((HttpURLConnection)urlConn).addRequestProperty("User-Agent", userAgent);
-      }
-    }
-    InputStream in = urlConn.getInputStream();
-    BufferedReader rdr = new BufferedReader(new InputStreamReader(in));
-    StringBuffer buffer = new StringBuffer();
-    String line = "";
-    while( (line = rdr.readLine()) != null) {
-      buffer.append(line);
-      buffer.append("\n");
-    }
-    in.close();
-    return buffer.toString();
-  }
 }

Added: incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/SimpleContentLoader.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/SimpleContentLoader.java?rev=712408&view=auto
==============================================================================
--- incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/SimpleContentLoader.java (added)
+++ incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/SimpleContentLoader.java Sat Nov  8 09:40:52 2008
@@ -0,0 +1,57 @@
+/*
+ * ====================================================================
+ *
+ *  Copyright 2005 The Apache Software Foundation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ *
+ */
+
+package org.apache.droids.norobots;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.net.URL;
+import java.net.URLConnection;
+
+/**
+ * A simple implementation of {@link ContentLoader} based on {@link URLConnection}.
+ */
+public class SimpleContentLoader implements ContentLoader
+{
+
+  public boolean exists(URI uri) throws IOException
+  {
+    URL url = uri.toURL();
+    try {
+      URLConnection conn = url.openConnection();
+      return conn != null;
+    } catch (IOException ex) {
+      return false;
+    }
+  }
+
+  public InputStream load(URI uri) throws IOException {
+    URL url = uri.toURL();
+    URLConnection conn = url.openConnection();
+    return conn.getInputStream();
+  }
+
+}

Added: incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java?rev=712408&view=auto
==============================================================================
--- incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java (added)
+++ incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java Sat Nov  8 09:40:52 2008
@@ -0,0 +1,25 @@
+package org.apache.droids.norobots;
+
+import java.net.URI;
+import java.net.URL;
+
+import junit.framework.Assert;
+
+import org.junit.Test;
+
+public class TestNorobotsClient
+{
+  
+  @Test
+  public void testSimpleRobotsFile() throws Exception {
+    ClassLoader cl = getClass().getClassLoader();
+    URL url = cl.getResource("simple-robots.txt");
+    Assert.assertNotNull(url);
+    NoRobotClient nrc = new NoRobotClient(new SimpleContentLoader(), "whatever");
+    nrc.parseText(url.openStream());
+    Assert.assertTrue(nrc.isUrlAllowed(new URI("/whatever/")));
+    Assert.assertFalse(nrc.isUrlAllowed(new URI("/~mine/")));
+    Assert.assertFalse(nrc.isUrlAllowed(new URI("/tmp/")));
+  }
+  
+}

Added: incubator/droids/trunk/droids-norobots/src/test/resources/simple-robots.txt
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-norobots/src/test/resources/simple-robots.txt?rev=712408&view=auto
==============================================================================
--- incubator/droids/trunk/droids-norobots/src/test/resources/simple-robots.txt (added)
+++ incubator/droids/trunk/droids-norobots/src/test/resources/simple-robots.txt Sat Nov  8 09:40:52 2008
@@ -0,0 +1,4 @@
+User-agent: *
+Disallow: /cgi-bin/
+Disallow: /tmp/
+Disallow: /~mine/
\ No newline at end of file