You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by ol...@apache.org on 2008/11/08 22:46:02 UTC
svn commit: r712445 - in /incubator/droids/trunk/droids-norobots/src:
main/java/org/apache/droids/norobots/NoRobotClient.java
test/java/org/apache/droids/norobots/TestNorobotsClient.java
test/resources/simple-robots.txt
Author: olegk
Date: Sat Nov 8 14:46:02 2008
New Revision: 712445
URL: http://svn.apache.org/viewvc?rev=712445&view=rev
Log:
* Eliminated in-memory content buffering in the NoRobotClient
* Added method to return a complete set of rules for all user agents
Removed:
incubator/droids/trunk/droids-norobots/src/test/resources/simple-robots.txt
Modified:
incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java
incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java
Modified: incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java?rev=712445&r1=712444&r2=712445&view=diff
==============================================================================
--- incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java (original)
+++ incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java Sat Nov 8 14:46:02 2008
@@ -29,13 +29,15 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
-import java.io.Reader;
-import java.io.StringReader;
-import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
/**
* A Client which may be used to decide which urls on a website
@@ -65,8 +67,11 @@
throw new IllegalArgumentException("Content loader may not be null");
}
this.contentLoader = contentLoader;
- this.userAgent = userAgent;
- this.rules = new RulesEngine();
+ if (userAgent != null) {
+ this.userAgent = userAgent.toLowerCase(Locale.ENGLISH);
+ } else {
+ this.userAgent = null;
+ }
}
/**
@@ -97,34 +102,44 @@
}
public void parseText(InputStream instream) throws IOException {
- StringWriter writer = new StringWriter();
- Reader reader = new InputStreamReader(instream, US_ASCII);
+ Map<String, RulesEngine> map = parse(instream);
+ this.rules = map.get(this.userAgent);
+ if (this.rules == null) {
+ this.rules = new RulesEngine();
+ }
+ this.wildcardRules = map.get("*");
+ if (this.wildcardRules == null) {
+ this.wildcardRules = new RulesEngine();
+ }
+ }
+
+ public static Map<String, RulesEngine> parse(InputStream instream) throws IOException {
try {
- char[] tmp = new char[2048];
- int l;
- while ((l = reader.read(tmp)) != -1) {
- writer.write(tmp, 0, l);
- }
+ return doParse(instream);
} finally {
- reader.close();
+ instream.close();
}
- String txt = writer.toString();
- this.rules = parseTextForUserAgent(txt, this.userAgent);
- this.wildcardRules = parseTextForUserAgent(txt, "*");
}
+
+ enum ParserState
+ {
+ USER_AGENT_DEF, ALLOW_DISALLOW_DEF
+ }
+
+ private static Map<String, RulesEngine> doParse(InputStream instream) throws IOException {
- private RulesEngine parseTextForUserAgent(String txt, String userAgent) throws IOException {
-
- RulesEngine engine = new RulesEngine();
-
+ Map<String, RulesEngine> map = new HashMap<String, RulesEngine>();
// Classic basic parser style, read an element at a time,
// changing a state variable [parsingAllowBlock]
// take each line, one at a time
- BufferedReader rdr = new BufferedReader( new StringReader(txt) );
+ BufferedReader rdr = new BufferedReader(new InputStreamReader(instream, US_ASCII));
+
+ Set<RulesEngine> engines = new HashSet<RulesEngine>();
+
+ ParserState state = ParserState.ALLOW_DISALLOW_DEF;
+
String line = "";
- String value = null;
- boolean parsingAllowBlock = false;
while( (line = rdr.readLine()) != null ) {
// trim whitespace from either side
line = line.trim();
@@ -134,43 +149,38 @@
continue;
}
- // if User-agent == userAgent
- // record the rest up until end or next User-agent
- // then quit (? check spec)
if(line.startsWith("User-agent:")) {
-
- if(parsingAllowBlock) {
- // we've just finished reading allows/disallows
- if(engine.isEmpty()) {
- // multiple user agents in a line, let's
- // wait til we get rules
- continue;
- } else {
- break;
- }
+ if (state == ParserState.ALLOW_DISALLOW_DEF) {
+ engines.clear();
}
-
- value = line.substring("User-agent:".length()).trim();
- if(value.equalsIgnoreCase(userAgent)) {
- parsingAllowBlock = true;
- continue;
+ state = ParserState.USER_AGENT_DEF;
+ String userAgent = line.substring("User-agent:".length());
+ userAgent = userAgent.trim().toLowerCase(Locale.ENGLISH);
+ RulesEngine engine = map.get(userAgent);
+ if (engine == null) {
+ engine = new RulesEngine();
+ map.put(userAgent, engine);
}
+ engines.add(engine);
}
else {
- // if not, then store if we're currently the user agent
- if(parsingAllowBlock) {
- if(line.startsWith("Allow:")) {
- value = line.substring("Allow:".length()).trim();
- value = URLDecoder.decode(value, US_ASCII);
+ if (engines.isEmpty()) {
+ continue;
+ }
+ if(line.startsWith("Allow:")) {
+ state = ParserState.ALLOW_DISALLOW_DEF;
+ String value = line.substring("Allow:".length()).trim();
+ value = URLDecoder.decode(value, US_ASCII);
+ for (RulesEngine engine: engines) {
engine.allowPath( value );
- } else
- if(line.startsWith("Disallow:")) {
- value = line.substring("Disallow:".length()).trim();
- value = URLDecoder.decode(value, US_ASCII);
+ }
+ } else
+ if(line.startsWith("Disallow:")) {
+ state = ParserState.ALLOW_DISALLOW_DEF;
+ String value = line.substring("Disallow:".length()).trim();
+ value = URLDecoder.decode(value, US_ASCII);
+ for (RulesEngine engine: engines) {
engine.disallowPath( value );
- } else {
- // ignore
- continue;
}
} else {
// ignore
@@ -178,7 +188,7 @@
}
}
}
- return engine;
+ return map;
}
/**
Modified: incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java?rev=712445&r1=712444&r2=712445&view=diff
==============================================================================
--- incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java (original)
+++ incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java Sat Nov 8 14:46:02 2008
@@ -1,7 +1,8 @@
package org.apache.droids.norobots;
+import java.io.ByteArrayInputStream;
import java.net.URI;
-import java.net.URL;
+import java.util.Map;
import junit.framework.Assert;
@@ -9,14 +10,84 @@
public class TestNorobotsClient
{
+
+ @Test
+ public void testRobotsParsing() throws Exception {
+ String s =
+ "User-agent: *\r\n" +
+ "Disallow: /tmp/\r\n" +
+ "User-agent: BadRobot\r\n" +
+ "Disallow: /cgi-bin/\r\n" +
+ "Disallow: /blah/";
+ Map<String, RulesEngine> map = NoRobotClient.parse(
+ new ByteArrayInputStream(s.getBytes("US-ASCII")));
+ Assert.assertNotNull(map);
+ Assert.assertEquals(2, map.size());
+ Assert.assertNotNull(map.get("*"));
+ Assert.assertNotNull(map.get("badrobot"));
+ Assert.assertNull(map.get("BadRobot"));
+ Assert.assertNull(map.get("wnatever"));
+ }
+
+ @Test
+ public void testComplexRobotsParsing() throws Exception {
+ String s =
+ "User-agent: *\r\n" +
+ "Disallow: /tmp/\r\n" +
+ "User-agent: BadRobot1\r\n" +
+ "User-agent: BadRobot2\r\n" +
+ "User-agent: BadRobot3\r\n" +
+ "Disallow: /cgi-bin/\r\n" +
+ "Disallow: /blah/\r\n" +
+ "User-agent: BadRobot1\r\n" +
+ "Disallow: /yada/\r\n" +
+ "User-agent: BadRobot3\r\n" +
+ "Allow: /haha/";
+ Map<String, RulesEngine> map = NoRobotClient.parse(
+ new ByteArrayInputStream(s.getBytes("US-ASCII")));
+ Assert.assertNotNull(map);
+ Assert.assertEquals(4, map.size());
+ Assert.assertNotNull(map.get("*"));
+ Assert.assertNotNull(map.get("badrobot1"));
+ Assert.assertNotNull(map.get("badrobot2"));
+ Assert.assertNotNull(map.get("badrobot3"));
+ Assert.assertNull(map.get("badrobot4"));
+ Assert.assertNull(map.get("wnatever"));
+
+ RulesEngine e1 = map.get("*");
+ Assert.assertEquals(Boolean.FALSE, e1.isAllowed("/tmp/"));
+ Assert.assertNull(e1.isAllowed("/blah/"));
+ Assert.assertNull(e1.isAllowed("/yada/"));
+ Assert.assertNull(e1.isAllowed("/haha/"));
+
+ RulesEngine e2 = map.get("badrobot1");
+ Assert.assertEquals(Boolean.FALSE, e2.isAllowed("/cgi-bin/"));
+ Assert.assertEquals(Boolean.FALSE, e2.isAllowed("/blah/"));
+ Assert.assertEquals(Boolean.FALSE, e2.isAllowed("/yada/"));
+ Assert.assertNull(e2.isAllowed("/haha/"));
+
+ RulesEngine e3 = map.get("badrobot2");
+ Assert.assertEquals(Boolean.FALSE, e3.isAllowed("/cgi-bin/"));
+ Assert.assertEquals(Boolean.FALSE, e3.isAllowed("/blah/"));
+ Assert.assertNull(e3.isAllowed("/yada/"));
+ Assert.assertNull(e3.isAllowed("/haha/"));
+
+ RulesEngine e4 = map.get("badrobot3");
+ Assert.assertEquals(Boolean.FALSE, e4.isAllowed("/cgi-bin/"));
+ Assert.assertEquals(Boolean.FALSE, e4.isAllowed("/blah/"));
+ Assert.assertNull(e4.isAllowed("/yada/"));
+ Assert.assertEquals(Boolean.TRUE, e4.isAllowed("/haha/"));
+ }
@Test
- public void testSimpleRobotsFile() throws Exception {
- ClassLoader cl = getClass().getClassLoader();
- URL url = cl.getResource("simple-robots.txt");
- Assert.assertNotNull(url);
+ public void testSimpleRobotsCheck() throws Exception {
+ String s =
+ "User-agent: *\r\n" +
+ "Disallow: /cgi-bin/\r\n" +
+ "Disallow: /tmp/\r\n" +
+ "Disallow: /~mine/";
NoRobotClient nrc = new NoRobotClient(new SimpleContentLoader(), "whatever");
- nrc.parseText(url.openStream());
+ nrc.parseText(new ByteArrayInputStream(s.getBytes("US-ASCII")));
Assert.assertTrue(nrc.isUrlAllowed(new URI("/whatever/")));
Assert.assertFalse(nrc.isUrlAllowed(new URI("/~mine/")));
Assert.assertFalse(nrc.isUrlAllowed(new URI("/tmp/")));