You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by ol...@apache.org on 2008/11/08 17:40:53 UTC
svn commit: r712408 - in /incubator/droids/trunk:
droids-core/src/main/java/org/apache/droids/protocol/http/
droids-norobots/src/main/java/org/apache/droids/norobots/
droids-norobots/src/test/java/org/
droids-norobots/src/test/java/org/apache/ droids-n...
Author: olegk
Date: Sat Nov 8 09:40:52 2008
New Revision: 712408
URL: http://svn.apache.org/viewvc?rev=712408&view=rev
Log:
Refactored NoRobotClient to depend on the abstract ContentLoader interface for content retrieval instead of the URLConnection class
Added:
incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/ContentLoader.java
incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/SimpleContentLoader.java
incubator/droids/trunk/droids-norobots/src/test/java/org/
incubator/droids/trunk/droids-norobots/src/test/java/org/apache/
incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/
incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/
incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java
incubator/droids/trunk/droids-norobots/src/test/resources/
incubator/droids/trunk/droids-norobots/src/test/resources/simple-robots.txt
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/Http.java
incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java
Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/Http.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/Http.java?rev=712408&r1=712407&r2=712408&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/Http.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/protocol/http/Http.java Sat Nov 8 09:40:52 2008
@@ -20,12 +20,15 @@
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URISyntaxException;
import java.net.URL;
import org.apache.droids.api.Protocol;
import org.apache.droids.net.UrlHelper;
import org.apache.droids.norobots.NoRobotClient;
import org.apache.droids.norobots.NoRobotException;
+import org.apache.droids.norobots.SimpleContentLoader;
import org.apache.droids.protocol.HttpBase;
/**
@@ -78,11 +81,13 @@
robotsUrl = UrlHelper.findRobotsUrl(base);
boolean test = false;
if (null != robotsUrl) {
- NoRobotClient nrc = new NoRobotClient(userAgent);
+ NoRobotClient nrc = new NoRobotClient(new SimpleContentLoader(), userAgent);
try {
- nrc.parse(robotsUrl);
- URL url = new URL(location);
+ nrc.parse(robotsUrl.toURI());
+ URI url = new URI(location);
test = nrc.isUrlAllowed(url);
+ } catch (URISyntaxException e) {
+ throw new MalformedURLException("Invalid URL: " + e.getInput());
} catch (IOException e) {
log.fatal(e);
} catch (NoRobotException e) {
Added: incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/ContentLoader.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/ContentLoader.java?rev=712408&view=auto
==============================================================================
--- incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/ContentLoader.java (added)
+++ incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/ContentLoader.java Sat Nov 8 09:40:52 2008
@@ -0,0 +1,42 @@
+/*
+ * ====================================================================
+ *
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ *
+ */
+
+package org.apache.droids.norobots;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+
+/**
+ * An abstract loader intended for retrieving content identified by a URI.
+ */
+public interface ContentLoader
+{
+
+ boolean exists(URI uri) throws IOException;
+
+ InputStream load(URI uri) throws IOException;
+
+}
Modified: incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java?rev=712408&r1=712407&r2=712408&view=diff
==============================================================================
--- incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java (original)
+++ incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java Sat Nov 8 09:40:52 2008
@@ -25,16 +25,17 @@
package org.apache.droids.norobots;
-import java.io.IOException;
-import java.io.StringReader;
import java.io.BufferedReader;
-import java.io.InputStreamReader;
+import java.io.IOException;
import java.io.InputStream;
-import java.net.URL;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.StringReader;
+import java.io.StringWriter;
+import java.io.UnsupportedEncodingException;
+import java.net.URI;
+import java.net.URISyntaxException;
import java.net.URLDecoder;
-import java.net.MalformedURLException;
-import java.net.HttpURLConnection;
-import java.net.URLConnection;
/**
* A Client which may be used to decide which urls on a website
@@ -44,17 +45,26 @@
*/
public class NoRobotClient {
- private String userAgent;
+ private static final String US_ASCII = "US-ASCII";
+
+ private final ContentLoader contentLoader;
+ private final String userAgent;
+
+ private URI baseURI;
private RulesEngine rules;
private RulesEngine wildcardRules;
- private URL baseUrl;
/**
* Create a Client for a particular user-agent name.
*
* @param userAgent name for the robot
*/
- public NoRobotClient(String userAgent) {
+ public NoRobotClient(ContentLoader contentLoader, String userAgent) {
+ super();
+ if (contentLoader == null) {
+ throw new IllegalArgumentException("Content loader may not be null");
+ }
+ this.contentLoader = contentLoader;
this.userAgent = userAgent;
this.rules = new RulesEngine();
}
@@ -66,42 +76,44 @@
*
* @param baseUrl of the site
*/
- public void parse(URL baseUrl) throws NoRobotException {
- URL txtUrl = null;
+ public void parse(URI baseUri) throws IOException, NoRobotException {
+ URI uri;
try {
- // fetch baseUrl+"robots.txt"
- txtUrl = new URL(baseUrl, "/robots.txt");
- } catch(MalformedURLException murle) {
- throw new NoRobotException("Bad URL: "+baseUrl+", robots.txt. ", murle);
+ uri = baseUri.resolve(new URI("/robots.txt"));
+ } catch (URISyntaxException ex) {
+ throw new NoRobotException("Invalid URI", ex);
+ }
+ // fetch "/robots.txt" resolved against baseUri
+ if (!contentLoader.exists(uri)) {
+ return;
}
-
- if (txtUrl!=null){
- this.baseUrl = baseUrl;
-
- String txt = null;
- try {
- txt = loadContent(txtUrl, this.userAgent);
- if(txt == null) {
- throw new NoRobotException("No content found for: "+txtUrl);
- }
- } catch(IOException ioe) {
- throw new NoRobotException("Unable to get content for: "+txtUrl, ioe);
- }
-
- try {
- parseText(txt);
- } catch(NoRobotException nre) {
- throw new NoRobotException("Problem while parsing "+txtUrl, nre);
- }
+ InputStream instream = contentLoader.load(uri);
+ try {
+ parseText(instream);
+ } finally {
+ instream.close();
}
+ baseURI = baseUri;
}
- public void parseText(String txt) throws NoRobotException {
+ public void parseText(InputStream instream) throws IOException {
+ StringWriter writer = new StringWriter();
+ Reader reader = new InputStreamReader(instream, US_ASCII);
+ try {
+ char[] tmp = new char[2048];
+ int l;
+ while ((l = reader.read(tmp)) != -1) {
+ writer.write(tmp, 0, l);
+ }
+ } finally {
+ reader.close();
+ }
+ String txt = writer.toString();
this.rules = parseTextForUserAgent(txt, this.userAgent);
this.wildcardRules = parseTextForUserAgent(txt, "*");
}
- private RulesEngine parseTextForUserAgent(String txt, String userAgent) throws NoRobotException {
+ private RulesEngine parseTextForUserAgent(String txt, String userAgent) throws IOException {
RulesEngine engine = new RulesEngine();
@@ -113,65 +125,59 @@
String line = "";
String value = null;
boolean parsingAllowBlock = false;
- try {
- while( (line = rdr.readLine()) != null ) {
- // trim whitespace from either side
- line = line.trim();
-
- // ignore startsWith('#')
- if(line.startsWith("#")) {
- continue;
- }
-
- // if User-agent == userAgent
- // record the rest up until end or next User-agent
- // then quit (? check spec)
- if(line.startsWith("User-agent:")) {
-
- if(parsingAllowBlock) {
- // we've just finished reading allows/disallows
- if(engine.isEmpty()) {
- // multiple user agents in a line, let's
- // wait til we get rules
- continue;
- } else {
- break;
- }
- }
+ while( (line = rdr.readLine()) != null ) {
+ // trim whitespace from either side
+ line = line.trim();
+
+ // ignore startsWith('#')
+ if(line.startsWith("#")) {
+ continue;
+ }
- value = line.substring("User-agent:".length()).trim();
- if(value.equalsIgnoreCase(userAgent)) {
- parsingAllowBlock = true;
+ // if User-agent == userAgent
+ // record the rest up until end or next User-agent
+ // then quit (? check spec)
+ if(line.startsWith("User-agent:")) {
+
+ if(parsingAllowBlock) {
+ // we've just finished reading allows/disallows
+ if(engine.isEmpty()) {
+ // multiple user agents in a line, let's
+ // wait til we get rules
continue;
+ } else {
+ break;
}
- }
- else {
- // if not, then store if we're currently the user agent
- if(parsingAllowBlock) {
- if(line.startsWith("Allow:")) {
- value = line.substring("Allow:".length()).trim();
- value = URLDecoder.decode(value);
- engine.allowPath( value );
- } else
- if(line.startsWith("Disallow:")) {
- value = line.substring("Disallow:".length()).trim();
- value = URLDecoder.decode(value);
- engine.disallowPath( value );
- } else {
- // ignore
- continue;
- }
+ }
+
+ value = line.substring("User-agent:".length()).trim();
+ if(value.equalsIgnoreCase(userAgent)) {
+ parsingAllowBlock = true;
+ continue;
+ }
+ }
+ else {
+ // if not, then store if we're currently the user agent
+ if(parsingAllowBlock) {
+ if(line.startsWith("Allow:")) {
+ value = line.substring("Allow:".length()).trim();
+ value = URLDecoder.decode(value, US_ASCII);
+ engine.allowPath( value );
+ } else
+ if(line.startsWith("Disallow:")) {
+ value = line.substring("Disallow:".length()).trim();
+ value = URLDecoder.decode(value, US_ASCII);
+ engine.disallowPath( value );
} else {
// ignore
continue;
}
+ } else {
+ // ignore
+ continue;
}
}
- } catch (IOException ioe) {
- // As this is parsing a String, it should not have an IOE
- throw new NoRobotException("Problem while parsing text. ", ioe);
}
-
return engine;
}
@@ -187,30 +193,34 @@
*
* @throws IllegalStateException when parse has not been called
*/
- public boolean isUrlAllowed(URL url) throws IllegalStateException, IllegalArgumentException {
+ public boolean isUrlAllowed(URI uri) throws IllegalStateException, IllegalArgumentException {
if(rules == null) {
throw new IllegalStateException("You must call parse before you call this method. ");
}
- if( !baseUrl.getHost().equals(url.getHost()) ||
- baseUrl.getPort() != url.getPort() ||
- !baseUrl.getProtocol().equals(url.getProtocol()) )
+ if (baseURI != null && (!baseURI.getHost().equals(uri.getHost()) ||
+ baseURI.getPort() != uri.getPort() ||
+ !baseURI.getScheme().equals(uri.getScheme())))
{
throw new IllegalArgumentException(
- "Illegal to use a different url, " + url.toExternalForm() +
- ", for this robots.txt: "+this.baseUrl.toExternalForm());
+ "Illegal to use a different url, " + uri.toString() +
+ ", for this robots.txt: " + baseURI.toString());
}
- if(url.sameFile(baseUrl)) {
+ if((uri.equals(baseURI))) {
return true;
}
- String urlStr = url.toExternalForm();
-
- urlStr = URLDecoder.decode( urlStr );
- Boolean allowed = this.rules.isAllowed( urlStr );
+ String uriStr = uri.toString();
+ try {
+ uriStr = URLDecoder.decode(uriStr, US_ASCII);
+ } catch (UnsupportedEncodingException ex) {
+ // ASCII always supported
+ return false;
+ }
+ Boolean allowed = this.rules.isAllowed( uriStr );
if(allowed == null) {
- allowed = this.wildcardRules.isAllowed( urlStr );
+ allowed = this.wildcardRules.isAllowed( uriStr );
}
if(allowed == null) {
allowed = Boolean.TRUE;
@@ -219,24 +229,4 @@
return allowed.booleanValue();
}
- // INLINE: as such from genjava/gj-core's net package. Simple method
- // stolen from Payload too.
- private static String loadContent(URL url, String userAgent) throws IOException {
- URLConnection urlConn = url.openConnection();
- if(urlConn instanceof HttpURLConnection) {
- if(userAgent != null) {
- ((HttpURLConnection)urlConn).addRequestProperty("User-Agent", userAgent);
- }
- }
- InputStream in = urlConn.getInputStream();
- BufferedReader rdr = new BufferedReader(new InputStreamReader(in));
- StringBuffer buffer = new StringBuffer();
- String line = "";
- while( (line = rdr.readLine()) != null) {
- buffer.append(line);
- buffer.append("\n");
- }
- in.close();
- return buffer.toString();
- }
}
Added: incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/SimpleContentLoader.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/SimpleContentLoader.java?rev=712408&view=auto
==============================================================================
--- incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/SimpleContentLoader.java (added)
+++ incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/SimpleContentLoader.java Sat Nov 8 09:40:52 2008
@@ -0,0 +1,57 @@
+/*
+ * ====================================================================
+ *
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ *
+ */
+
+package org.apache.droids.norobots;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.net.URL;
+import java.net.URLConnection;
+
+/**
+ * A simple implementation of {@link ContentLoader} based on {@link URLConnection}.
+ */
+public class SimpleContentLoader implements ContentLoader
+{
+
+ public boolean exists(URI uri) throws IOException
+ {
+ URL url = uri.toURL();
+ try {
+ URLConnection conn = url.openConnection();
+ return conn != null;
+ } catch (IOException ex) {
+ return false;
+ }
+ }
+
+ public InputStream load(URI uri) throws IOException {
+ URL url = uri.toURL();
+ URLConnection conn = url.openConnection();
+ return conn.getInputStream();
+ }
+
+}
Added: incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java?rev=712408&view=auto
==============================================================================
--- incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java (added)
+++ incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java Sat Nov 8 09:40:52 2008
@@ -0,0 +1,25 @@
+package org.apache.droids.norobots;
+
+import java.net.URI;
+import java.net.URL;
+
+import junit.framework.Assert;
+
+import org.junit.Test;
+
+public class TestNorobotsClient
+{
+
+ @Test
+ public void testSimpleRobotsFile() throws Exception {
+ ClassLoader cl = getClass().getClassLoader();
+ URL url = cl.getResource("simple-robots.txt");
+ Assert.assertNotNull(url);
+ NoRobotClient nrc = new NoRobotClient(new SimpleContentLoader(), "whatever");
+ nrc.parseText(url.openStream());
+ Assert.assertTrue(nrc.isUrlAllowed(new URI("/whatever/")));
+ Assert.assertFalse(nrc.isUrlAllowed(new URI("/~mine/")));
+ Assert.assertFalse(nrc.isUrlAllowed(new URI("/tmp/")));
+ }
+
+}
Added: incubator/droids/trunk/droids-norobots/src/test/resources/simple-robots.txt
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-norobots/src/test/resources/simple-robots.txt?rev=712408&view=auto
==============================================================================
--- incubator/droids/trunk/droids-norobots/src/test/resources/simple-robots.txt (added)
+++ incubator/droids/trunk/droids-norobots/src/test/resources/simple-robots.txt Sat Nov 8 09:40:52 2008
@@ -0,0 +1,4 @@
+User-agent: *
+Disallow: /cgi-bin/
+Disallow: /tmp/
+Disallow: /~mine/
\ No newline at end of file