You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by ol...@apache.org on 2008/11/08 22:46:02 UTC

svn commit: r712445 - in /incubator/droids/trunk/droids-norobots/src: main/java/org/apache/droids/norobots/NoRobotClient.java test/java/org/apache/droids/norobots/TestNorobotsClient.java test/resources/simple-robots.txt

Author: olegk
Date: Sat Nov  8 14:46:02 2008
New Revision: 712445

URL: http://svn.apache.org/viewvc?rev=712445&view=rev
Log:
* Eliminated in-memory content buffering in the NoRobotClient
* Added method to return a complete set of rules for all user agents

Removed:
    incubator/droids/trunk/droids-norobots/src/test/resources/simple-robots.txt
Modified:
    incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java
    incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java

Modified: incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java?rev=712445&r1=712444&r2=712445&view=diff
==============================================================================
--- incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java (original)
+++ incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java Sat Nov  8 14:46:02 2008
@@ -29,13 +29,15 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
-import java.io.Reader;
-import java.io.StringReader;
-import java.io.StringWriter;
 import java.io.UnsupportedEncodingException;
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.net.URLDecoder;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
 
 /**
  * A Client which may be used to decide which urls on a website 
@@ -65,8 +67,11 @@
       throw new IllegalArgumentException("Content loader may not be null");
     }
     this.contentLoader = contentLoader;
-    this.userAgent = userAgent;
-    this.rules = new RulesEngine();
+    if (userAgent != null) {
+      this.userAgent = userAgent.toLowerCase(Locale.ENGLISH);
+    } else {
+      this.userAgent = null;
+    }
   }
 
   /**
@@ -97,34 +102,44 @@
   }
 
   public void parseText(InputStream instream) throws IOException {
-    StringWriter writer = new StringWriter();
-    Reader reader = new InputStreamReader(instream, US_ASCII);
+    Map<String, RulesEngine> map = parse(instream);
+    this.rules = map.get(this.userAgent);
+    if (this.rules == null) {
+      this.rules = new RulesEngine();
+    }
+    this.wildcardRules = map.get("*");
+    if (this.wildcardRules == null) {
+      this.wildcardRules = new RulesEngine();
+    }
+  }
+
+  public static Map<String, RulesEngine> parse(InputStream instream) throws IOException {
     try {
-      char[] tmp = new char[2048];
-      int l;
-      while ((l = reader.read(tmp)) != -1) {
-        writer.write(tmp, 0, l);
-      }
+      return doParse(instream);
     } finally {
-      reader.close();
+      instream.close();
     }
-    String txt = writer.toString();
-    this.rules = parseTextForUserAgent(txt, this.userAgent);
-    this.wildcardRules = parseTextForUserAgent(txt, "*");
   }
+  
+  enum ParserState 
+  {
+    USER_AGENT_DEF, ALLOW_DISALLOW_DEF
+  }
+  
+  private static Map<String, RulesEngine> doParse(InputStream instream) throws IOException {
 
-  private RulesEngine parseTextForUserAgent(String txt, String userAgent) throws IOException {
-
-    RulesEngine engine = new RulesEngine();
-
+    Map<String, RulesEngine> map = new HashMap<String, RulesEngine>();
     // Classic basic parser style, read an element at a time, 
     // changing a state variable [parsingAllowBlock]
 
     // take each line, one at a time
-    BufferedReader rdr = new BufferedReader( new StringReader(txt) );
+    BufferedReader rdr = new BufferedReader(new InputStreamReader(instream, US_ASCII));
+    
+    Set<RulesEngine> engines = new HashSet<RulesEngine>();
+    
+    ParserState state = ParserState.ALLOW_DISALLOW_DEF;
+    
     String line = "";
-    String value = null;
-    boolean parsingAllowBlock = false;
     while( (line = rdr.readLine()) != null ) {
       // trim whitespace from either side
       line = line.trim();
@@ -134,43 +149,38 @@
         continue;
       }
 
-      // if User-agent == userAgent 
-      // record the rest up until end or next User-agent
-      // then quit (? check spec)
       if(line.startsWith("User-agent:")) {
-
-        if(parsingAllowBlock) {
-          // we've just finished reading allows/disallows
-          if(engine.isEmpty()) {
-            // multiple user agents in a line, let's 
-            // wait til we get rules
-            continue;
-          } else {
-            break;
-          }
+        if (state == ParserState.ALLOW_DISALLOW_DEF) {
+          engines.clear();
         }
-
-        value = line.substring("User-agent:".length()).trim();
-        if(value.equalsIgnoreCase(userAgent)) {
-          parsingAllowBlock = true;
-          continue;
+        state = ParserState.USER_AGENT_DEF;
+        String userAgent = line.substring("User-agent:".length());
+        userAgent = userAgent.trim().toLowerCase(Locale.ENGLISH);
+        RulesEngine engine = map.get(userAgent);
+        if (engine == null) {
+          engine = new RulesEngine();
+          map.put(userAgent, engine);
         }
+        engines.add(engine);
       } 
       else {
-        // if not, then store if we're currently the user agent
-        if(parsingAllowBlock) {
-          if(line.startsWith("Allow:")) {
-            value = line.substring("Allow:".length()).trim();
-            value = URLDecoder.decode(value, US_ASCII);
+        if (engines.isEmpty()) {
+          continue;
+        }
+        if(line.startsWith("Allow:")) {
+          state = ParserState.ALLOW_DISALLOW_DEF;
+          String value = line.substring("Allow:".length()).trim();
+          value = URLDecoder.decode(value, US_ASCII);
+          for (RulesEngine engine: engines) {
             engine.allowPath( value );
-          } else 
-          if(line.startsWith("Disallow:")) {
-            value = line.substring("Disallow:".length()).trim();
-            value = URLDecoder.decode(value, US_ASCII);
+          }
+        } else 
+        if(line.startsWith("Disallow:")) {
+          state = ParserState.ALLOW_DISALLOW_DEF;
+          String value = line.substring("Disallow:".length()).trim();
+          value = URLDecoder.decode(value, US_ASCII);
+          for (RulesEngine engine: engines) {
             engine.disallowPath( value );
-          } else {
-            // ignore
-            continue;
           }
         } else {
           // ignore
@@ -178,7 +188,7 @@
         }
       }
     }
-    return engine;
+    return map;
   }
 
   /**

Modified: incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java?rev=712445&r1=712444&r2=712445&view=diff
==============================================================================
--- incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java (original)
+++ incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java Sat Nov  8 14:46:02 2008
@@ -1,7 +1,8 @@
 package org.apache.droids.norobots;
 
+import java.io.ByteArrayInputStream;
 import java.net.URI;
-import java.net.URL;
+import java.util.Map;
 
 import junit.framework.Assert;
 
@@ -9,14 +10,84 @@
 
 public class TestNorobotsClient
 {
+
+  @Test
+  public void testRobotsParsing() throws Exception {
+    String s = 
+      "User-agent: *\r\n" +
+      "Disallow: /tmp/\r\n" +
+      "User-agent: BadRobot\r\n" +
+      "Disallow: /cgi-bin/\r\n" +
+      "Disallow: /blah/";
+    Map<String, RulesEngine> map = NoRobotClient.parse(
+        new ByteArrayInputStream(s.getBytes("US-ASCII")));
+    Assert.assertNotNull(map);
+    Assert.assertEquals(2, map.size());
+    Assert.assertNotNull(map.get("*"));
+    Assert.assertNotNull(map.get("badrobot"));
+    Assert.assertNull(map.get("BadRobot"));
+    Assert.assertNull(map.get("wnatever"));
+  }
+  
+  @Test
+  public void testComplexRobotsParsing() throws Exception {
+    String s = 
+      "User-agent: *\r\n" +
+      "Disallow: /tmp/\r\n" +
+      "User-agent: BadRobot1\r\n" +
+      "User-agent: BadRobot2\r\n" +
+      "User-agent: BadRobot3\r\n" +
+      "Disallow: /cgi-bin/\r\n" +
+      "Disallow: /blah/\r\n" +
+      "User-agent: BadRobot1\r\n" +
+      "Disallow: /yada/\r\n" +
+      "User-agent: BadRobot3\r\n" +
+      "Allow: /haha/";
+    Map<String, RulesEngine> map = NoRobotClient.parse(
+        new ByteArrayInputStream(s.getBytes("US-ASCII")));
+    Assert.assertNotNull(map);
+    Assert.assertEquals(4, map.size());
+    Assert.assertNotNull(map.get("*"));
+    Assert.assertNotNull(map.get("badrobot1"));
+    Assert.assertNotNull(map.get("badrobot2"));
+    Assert.assertNotNull(map.get("badrobot3"));
+    Assert.assertNull(map.get("badrobot4"));
+    Assert.assertNull(map.get("wnatever"));
+    
+    RulesEngine e1 = map.get("*");
+    Assert.assertEquals(Boolean.FALSE, e1.isAllowed("/tmp/"));
+    Assert.assertNull(e1.isAllowed("/blah/"));
+    Assert.assertNull(e1.isAllowed("/yada/"));
+    Assert.assertNull(e1.isAllowed("/haha/"));
+
+    RulesEngine e2 = map.get("badrobot1");
+    Assert.assertEquals(Boolean.FALSE, e2.isAllowed("/cgi-bin/"));
+    Assert.assertEquals(Boolean.FALSE, e2.isAllowed("/blah/"));
+    Assert.assertEquals(Boolean.FALSE, e2.isAllowed("/yada/"));
+    Assert.assertNull(e2.isAllowed("/haha/"));
+
+    RulesEngine e3 = map.get("badrobot2");
+    Assert.assertEquals(Boolean.FALSE, e3.isAllowed("/cgi-bin/"));
+    Assert.assertEquals(Boolean.FALSE, e3.isAllowed("/blah/"));
+    Assert.assertNull(e3.isAllowed("/yada/"));
+    Assert.assertNull(e3.isAllowed("/haha/"));
+
+    RulesEngine e4 = map.get("badrobot3");
+    Assert.assertEquals(Boolean.FALSE, e4.isAllowed("/cgi-bin/"));
+    Assert.assertEquals(Boolean.FALSE, e4.isAllowed("/blah/"));
+    Assert.assertNull(e4.isAllowed("/yada/"));
+    Assert.assertEquals(Boolean.TRUE, e4.isAllowed("/haha/"));
+  }
   
   @Test
-  public void testSimpleRobotsFile() throws Exception {
-    ClassLoader cl = getClass().getClassLoader();
-    URL url = cl.getResource("simple-robots.txt");
-    Assert.assertNotNull(url);
+  public void testSimpleRobotsCheck() throws Exception {
+    String s = 
+      "User-agent: *\r\n" +
+      "Disallow: /cgi-bin/\r\n" +
+      "Disallow: /tmp/\r\n" +
+      "Disallow: /~mine/";
     NoRobotClient nrc = new NoRobotClient(new SimpleContentLoader(), "whatever");
-    nrc.parseText(url.openStream());
+    nrc.parseText(new ByteArrayInputStream(s.getBytes("US-ASCII")));
     Assert.assertTrue(nrc.isUrlAllowed(new URI("/whatever/")));
     Assert.assertFalse(nrc.isUrlAllowed(new URI("/~mine/")));
     Assert.assertFalse(nrc.isUrlAllowed(new URI("/tmp/")));