You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2021/03/16 15:56:45 UTC
[nutch] branch master updated: NUTCH-2596 Upgrade from
org.mortbay.jetty to org.eclipse.jetty - remove Jetty (serving JSP pages)
for HTTP protocol plugin tests - replace JSP pages by header/content
strings held in unit test classes
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new d193137 NUTCH-2596 Upgrade from org.mortbay.jetty to org.eclipse.jetty - remove Jetty (serving JSP pages) for HTTP protocol plugin tests - replace JSP pages by header/content strings held in unit test classes
new 81fb7bc Merge pull request #574 from sebastian-nagel/NUTCH-2596-http-protocol-plugin-test-remove-jsp
d193137 is described below
commit d193137585f4f8cf653bb5cf678a494bab087784
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Sat Jan 18 23:42:53 2020 +0100
NUTCH-2596 Upgrade from org.mortbay.jetty to org.eclipse.jetty
- remove Jetty (serving JSP pages) for HTTP protocol plugin tests
- replace JSP pages by header/content strings held in unit test classes
---
ivy/ivy.xml | 7 -
src/plugin/protocol-http/build.xml | 6 -
src/plugin/protocol-http/jsp/basic-http.jsp | 44 ---
src/plugin/protocol-http/jsp/brokenpage.jsp | 47 ---
src/plugin/protocol-http/jsp/redirect301.jsp | 49 ---
src/plugin/protocol-http/jsp/redirect302.jsp | 49 ---
.../apache/nutch/protocol/http/HttpResponse.java | 4 +
.../protocol/http/TestBadServerResponses.java | 160 ++--------
.../nutch/protocol/http/TestProtocolHttp.java | 119 +------
src/plugin/protocol-httpclient/build.xml | 6 -
src/plugin/protocol-httpclient/jsp/basic.jsp | 74 -----
src/plugin/protocol-httpclient/jsp/cookies.jsp | 63 ----
src/plugin/protocol-httpclient/jsp/digest.jsp | 68 ----
src/plugin/protocol-httpclient/jsp/noauth.jsp | 36 ---
src/plugin/protocol-httpclient/jsp/ntlm.jsp | 89 ------
.../src/test/conf/nutch-site-test.xml | 5 +
.../httpclient/TestProtocolHttpClient.java | 348 ++++++++++++++-------
src/plugin/protocol-okhttp/build.xml | 6 -
src/plugin/protocol-okhttp/jsp/basic-http.jsp | 44 ---
src/plugin/protocol-okhttp/jsp/brokenpage.jsp | 47 ---
src/plugin/protocol-okhttp/jsp/redirect301.jsp | 49 ---
src/plugin/protocol-okhttp/jsp/redirect302.jsp | 49 ---
.../protocol/okhttp/TestBadServerResponses.java | 164 +---------
.../nutch/protocol/okhttp/TestProtocolOkHttp.java | 129 +-------
.../protocol/AbstractHttpProtocolPluginTest.java | 298 ++++++++++++++++++
25 files changed, 601 insertions(+), 1359 deletions(-)
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 3f1faf3..00d67eb 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -111,13 +111,6 @@
<exclude org="log4j" module="log4j" />
</dependency>
<dependency org="org.mortbay.jetty" name="jetty-client" rev="6.1.26" conf="test->default" />
- <dependency org="org.mortbay.jetty" name="jetty" rev="6.1.26" conf="test->default" />
- <dependency org="org.mortbay.jetty" name="jetty-util" rev="6.1.26" conf="test->default" />
- <dependency org="tomcat" name="jasper-runtime" rev="5.5.23" conf="test->default" />
- <dependency org="tomcat" name="jasper-compiler" rev="5.5.23" conf="test->default">
- <exclude org="ant" name="ant" />
- </dependency>
- <!-- end of test artifacts -->
<!-- web app dependencies -->
<dependency org="org.mortbay.jetty" name="jetty" rev="6.1.26" />
diff --git a/src/plugin/protocol-http/build.xml b/src/plugin/protocol-http/build.xml
index 30720f1..f00c9c2 100755
--- a/src/plugin/protocol-http/build.xml
+++ b/src/plugin/protocol-http/build.xml
@@ -41,10 +41,4 @@
</copy>
</target>
- <!-- for junit test -->
- <mkdir dir="${build.test}/data" />
- <copy todir="${build.test}/data">
- <fileset dir="jsp"/>
- </copy>
-
</project>
diff --git a/src/plugin/protocol-http/jsp/basic-http.jsp b/src/plugin/protocol-http/jsp/basic-http.jsp
deleted file mode 100644
index bf1f8bd..0000000
--- a/src/plugin/protocol-http/jsp/basic-http.jsp
+++ /dev/null
@@ -1,44 +0,0 @@
-<%--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
---%><%--
- Example JSP Page to Test Protocol-Http Plugin
---%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
-String path = request.getContextPath();
-String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
-%>
-
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<html>
- <head>
- <base href="<%=basePath%>">
-
- <title>HelloWorld</title>
- <meta http-equiv="content-type" content="text/html;charset=utf-8" />
- <meta name="Language" content="en" />
- <meta http-equiv="pragma" content="no-cache">
- <meta http-equiv="cache-control" content="no-cache">
- <meta http-equiv="expires" content="0">
- <meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
- <meta http-equiv="description" content="This is my page">
- <!--
- <link rel="stylesheet" type="text/css" href="styles.css">
- -->
- </head>
-
- <body>
- Hello World!!! <br>
- </body>
-</html>
diff --git a/src/plugin/protocol-http/jsp/brokenpage.jsp b/src/plugin/protocol-http/jsp/brokenpage.jsp
deleted file mode 100644
index f3f7c4a..0000000
--- a/src/plugin/protocol-http/jsp/brokenpage.jsp
+++ /dev/null
@@ -1,47 +0,0 @@
-<%--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
---%><%--
- Example JSP Page to Test Protocol-Http Plugin
---%>
-
-@ page language="java" import="java.util.*" pageEncoding="UTF-8"
-
-String path = request.getContextPath();
-String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
-
-
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<html>
- <head>
- <base href="<%=basePath%>">
-
- <title>HelloWorld</title>
- <meta http-equiv="content-type" content="text/html;charset=utf-8" />
- <meta name="Language" content="en" />
- <meta http-equiv="pragma" content="no-cache">
- <meta http-equiv="cache-control" content="no-cache">
- <meta http-equiv="expires" content="0">
- <meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
- <meta http-equiv="description" content="This is my page">
- <!--
- <link rel="stylesheet" type="text/css" href="styles.css">
- -->
- </head>
-
- <body>
- Hello World!!! <br>
- </body>
-</html>
diff --git a/src/plugin/protocol-http/jsp/redirect301.jsp b/src/plugin/protocol-http/jsp/redirect301.jsp
deleted file mode 100644
index ac16501..0000000
--- a/src/plugin/protocol-http/jsp/redirect301.jsp
+++ /dev/null
@@ -1,49 +0,0 @@
-<%--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
---%><%--
- Example JSP Page to Test Protocol-Http Plugin
---%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
-String path = request.getContextPath();
-String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
-%>
-
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<html>
- <head>
- <base href="<%=basePath%>">
-
- <title>My JSP page</title>
-
- <meta http-equiv="pragma" content="no-cache">
- <meta http-equiv="cache-control" content="no-cache">
- <meta http-equiv="expires" content="0">
- <meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
- <meta http-equiv="description" content="This is my page">
- <!--
- <link rel="stylesheet" type="text/css" href="styles.css">
- -->
-
- </head>
-
- <body>
- <%
- response.setStatus(301);
- response.setHeader( "Location", "https://nutch.apache.org/");
- response.setHeader( "Connection", "close" );
- %>
- You are redirected by JSP<br>
- </body>
-</html>
diff --git a/src/plugin/protocol-http/jsp/redirect302.jsp b/src/plugin/protocol-http/jsp/redirect302.jsp
deleted file mode 100644
index 8a92fee..0000000
--- a/src/plugin/protocol-http/jsp/redirect302.jsp
+++ /dev/null
@@ -1,49 +0,0 @@
-<%--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
---%><%--
- Example JSP Page to Test Protocol-Http Plugin
---%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
-String path = request.getContextPath();
-String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
-%>
-
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<html>
- <head>
- <base href="<%=basePath%>">
-
- <title>My JSP page</title>
-
- <meta http-equiv="pragma" content="no-cache">
- <meta http-equiv="cache-control" content="no-cache">
- <meta http-equiv="expires" content="0">
- <meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
- <meta http-equiv="description" content="This is my page">
- <!--
- <link rel="stylesheet" type="text/css" href="styles.css">
- -->
-
- </head>
-
- <body>
- <%
- response.setStatus(302);
- response.setHeader( "Location", "https://nutch.apache.org/");
- response.setHeader( "Connection", "close" );
- %>
- You are sucessfully redirected by JSP<br>
- </body>
-</html>
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
index 5228f33..157c9ee 100644
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
+++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
@@ -321,6 +321,10 @@ public class HttpResponse implements Response {
+ code + ":",
e);
content = null;
+ if (httpHeaders != null) {
+ httpHeaders.append("\r\n");
+ headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
+ }
} else {
// If the page is a "200 OK" response, we do not want to go further
// with processing the invalid payload.
diff --git a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestBadServerResponses.java b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestBadServerResponses.java
index 03ddfe9..13c2558 100644
--- a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestBadServerResponses.java
+++ b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestBadServerResponses.java
@@ -20,23 +20,11 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
-import java.io.BufferedReader;
-import java.io.InputStreamReader;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
import java.lang.invoke.MethodHandles;
-import java.net.InetSocketAddress;
-import java.net.ServerSocket;
-import java.net.Socket;
-import java.net.URL;
import java.nio.charset.StandardCharsets;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.net.protocols.Response;
-import org.junit.After;
+import org.apache.nutch.protocol.AbstractHttpProtocolPluginTest;
+import org.apache.nutch.protocol.ProtocolOutput;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -45,114 +33,18 @@ import org.slf4j.LoggerFactory;
* Test cases for protocol-http - robustness regarding bad server responses:
* malformed HTTP header lines, etc. See, NUTCH-2549.
*/
-public class TestBadServerResponses {
+public class TestBadServerResponses extends AbstractHttpProtocolPluginTest {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
- private Http http;
- private ServerSocket server;
- private Configuration conf;
- private int port = 47505;
-
- private static final String responseHeader = "HTTP/1.1 200 OK\r\n";
- private static final String simpleContent = "Content-Type: text/html\r\n\r\nThis is a text.";
-
- public void setUp() throws Exception {
- conf = new Configuration();
- conf.addResource("nutch-default.xml");
- conf.addResource("nutch-site-test.xml");
- conf.setBoolean("store.http.headers", true);
-
- http = new Http();
- http.setConf(conf);
- }
-
- @After
- public void tearDown() throws Exception {
- server.close();
- }
-
- /**
- * Starts the test server at a specified port and constant response.
- *
- * @param portno
- * Port number.
- * @param response
- * response sent on every request
- */
- private void runServer(int port, String response) throws Exception {
- server = new ServerSocket();
- server.bind(new InetSocketAddress("127.0.0.1", port));
- Pattern requestPattern = Pattern.compile("(?i)^GET\\s+(\\S+)");
- while (true) {
- LOG.info("Listening on port {}", port);
- Socket socket = server.accept();
- LOG.info("Connection received");
- try (
- BufferedReader in = new BufferedReader(new InputStreamReader(
- socket.getInputStream(), StandardCharsets.UTF_8));
- PrintWriter out = new PrintWriter(new OutputStreamWriter(
- socket.getOutputStream(), StandardCharsets.UTF_8), true)) {
-
- String line;
- while ((line = in.readLine()) != null) {
- LOG.info("Request: {}", line);
- if (line.trim().isEmpty()) {
- break;
- }
- Matcher m = requestPattern.matcher(line);
- if (m.find()) {
- LOG.info("Requested {}", m.group(1));
- if (!m.group(1).startsWith("/")) {
- response = "HTTP/1.1 400 Bad request\r\n\r\n";
- }
- }
- }
- LOG.info("Response: {}",
- response.substring(0, Math.min(1024, response.length())));
- out.print(response);
- } catch (Exception e) {
- LOG.warn("Exception in test server:", e);
- }
- }
- }
-
- private void launchServer(String response) throws InterruptedException {
- Thread serverThread = new Thread(() -> {
- try {
- runServer(port, response);
- } catch (Exception e) {
- LOG.warn("Test server died:", e);
- }
- });
- serverThread.start();
- Thread.sleep(50);
- }
-
- /**
- * Fetches the specified <code>page</code> from the local test server and
- * checks whether the HTTP response status code matches with the expected
- * code.
- *
- * @param page
- * Page to be fetched.
- * @param expectedCode
- * HTTP response status code expected while fetching the page.
- */
- private Response fetchPage(String page, int expectedCode) throws Exception {
- URL url = new URL("http", "127.0.0.1", port, page);
- LOG.info("Fetching {}", url);
- CrawlDatum crawlDatum = new CrawlDatum();
- Response response = http.getResponse(url, crawlDatum, true);
- assertEquals("HTTP Status Code for " + url, expectedCode,
- response.getCode());
- return response;
+ @Override
+ protected String getPluginClassName() {
+ return "org.apache.nutch.protocol.http.Http";
}
@Test
public void testBadHttpServer() throws Exception {
- setUp();
// test with trivial well-formed content, to make sure the server is
// responding
launchServer(responseHeader + simpleContent);
@@ -164,8 +56,8 @@ public class TestBadServerResponses {
*/
@Test
public void testRequestNotStartingWithSlash() throws Exception {
- setUp();
- launchServer(responseHeader + simpleContent);
+ launchServer("/?171", responseHeader + simpleContent);
+ // request ?171 should be normalized to /?171
fetchPage("?171", 200);
}
@@ -175,7 +67,6 @@ public class TestBadServerResponses {
*/
@Test
public void testContentLengthNotANumber() throws Exception {
- setUp();
launchServer(
responseHeader + "Content-Length: thousand\r\n" + simpleContent);
fetchPage("/", 200);
@@ -186,7 +77,6 @@ public class TestBadServerResponses {
*/
@Test
public void testHeaderWithColon() throws Exception {
- setUp();
launchServer("HTTP/1.1 200: OK\r\n" + simpleContent);
fetchPage("/", 200);
}
@@ -196,7 +86,6 @@ public class TestBadServerResponses {
*/
@Test
public void testHeaderSpellChecking() throws Exception {
- setUp();
launchServer(responseHeader + "Client-Transfer-Encoding: chunked\r\n"
+ simpleContent);
fetchPage("/", 200);
@@ -208,13 +97,12 @@ public class TestBadServerResponses {
*/
@Test
public void testIgnoreErrorInRedirectPayload() throws Exception {
- setUp();
launchServer("HTTP/1.1 302 Found\r\nLocation: http://example.com/\r\n"
+ "Transfer-Encoding: chunked\r\n\r\nNot a valid chunk.");
- Response fetched = fetchPage("/", 302);
- assertNotNull("No redirect Location.", fetched.getHeader("Location"));
+ ProtocolOutput fetched = fetchPage("/", 302);
+ assertNotNull("No redirect Location.", getHeader(fetched, "Location"));
assertEquals("Wrong redirect Location.", "http://example.com/",
- fetched.getHeader("Location"));
+ getHeader(fetched, "Location"));
}
/**
@@ -222,12 +110,11 @@ public class TestBadServerResponses {
*/
@Test
public void testNoStatusLine() throws Exception {
- setUp();
String text = "This is a text containing non-ASCII characters: \u00e4\u00f6\u00fc\u00df";
launchServer(text);
- Response fetched = fetchPage("/", 200);
+ ProtocolOutput fetched = fetchPage("/", 200);
assertEquals("Wrong text returned for response with no status line.", text,
- new String(fetched.getContent(), StandardCharsets.UTF_8));
+ new String(fetched.getContent().getContent(), StandardCharsets.UTF_8));
server.close();
text = "<!DOCTYPE html>\n<html>\n<head>\n"
+ "<title>Testing no HTTP header èéâ</title>\n"
@@ -237,7 +124,7 @@ public class TestBadServerResponses {
launchServer(text);
fetched = fetchPage("/", 200);
assertEquals("Wrong text returned for response with no status line.", text,
- new String(fetched.getContent(), StandardCharsets.UTF_8));
+ new String(fetched.getContent().getContent(), StandardCharsets.UTF_8));
}
/**
@@ -246,24 +133,22 @@ public class TestBadServerResponses {
*/
@Test
public void testMultiLineHeader() throws Exception {
- setUp();
launchServer(responseHeader
+ "Set-Cookie: UserID=JohnDoe;\r\n Max-Age=3600;\r\n Version=1\r\n"
+ simpleContent);
- Response fetched = fetchPage("/", 200);
- LOG.info("Headers: {}", fetched.getHeaders());
- assertNotNull("Failed to set multi-line \"Set-Cookie\" header.", fetched.getHeader("Set-Cookie"));
+ ProtocolOutput fetched = fetchPage("/", 200);
+ LOG.info("Headers: {}", getHeaders(fetched));
+ assertNotNull("Failed to set multi-line \"Set-Cookie\" header.",
+ getHeader(fetched, "Set-Cookie"));
assertTrue("Failed to set multi-line \"Set-Cookie\" header.",
- fetched.getHeader("Set-Cookie").contains("Version=1"));
+ getHeader(fetched, "Set-Cookie").contains("Version=1"));
}
/**
* NUTCH-2561 protocol-http can be made to read arbitrarily large HTTP
* responses
*/
- @Test(expected = Exception.class)
public void testOverlongHeader() throws Exception {
- setUp();
StringBuilder response = new StringBuilder();
response.append(responseHeader);
for (int i = 0; i < 80; i++) {
@@ -276,7 +161,7 @@ public class TestBadServerResponses {
response.append("\r\n" + simpleContent);
launchServer(response.toString());
// should throw exception because of overlong header
- fetchPage("/", 200);
+ fetchPage("/", -1);
}
/**
@@ -286,7 +171,6 @@ public class TestBadServerResponses {
*/
@Test
public void testChunkedContent() throws Exception {
- setUp();
StringBuilder response = new StringBuilder();
response.append(responseHeader);
response.append("Content-Type: text/html\r\n");
@@ -303,10 +187,10 @@ public class TestBadServerResponses {
}
response.append("\r\n0\r\n\r\n");
launchServer(response.toString());
- Response fetched = fetchPage("/", 200);
+ ProtocolOutput fetched = fetchPage("/", 200);
assertEquals(
"Chunked content not truncated according to http.content.limit", 65536,
- fetched.getContent().length);
+ fetched.getContent().getContent().length);
}
}
diff --git a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java
index 18db917..fb76573 100644
--- a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java
+++ b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java
@@ -16,124 +16,39 @@
*/
package org.apache.nutch.protocol.http;
-import static org.junit.Assert.assertEquals;
+import static java.nio.charset.StandardCharsets.UTF_8;
-import java.net.URL;
+import java.util.Map;
+import java.util.TreeMap;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolOutput;
-import org.junit.After;
+import org.apache.nutch.protocol.AbstractHttpProtocolPluginTest;
import org.junit.Test;
-import org.mortbay.jetty.Server;
-import org.mortbay.jetty.nio.SelectChannelConnector;
-import org.mortbay.jetty.servlet.Context;
-import org.mortbay.jetty.servlet.ServletHolder;
/**
* Test cases for protocol-http
*/
-public class TestProtocolHttp {
- private static final String RES_DIR = System.getProperty("test.data", ".");
+public class TestProtocolHttp extends AbstractHttpProtocolPluginTest {
- private Http http;
- private Server server;
- private Context root;
- private Configuration conf;
- private int port;
-
- public void setUp(boolean redirection) throws Exception {
- conf = new Configuration();
- conf.addResource("nutch-default.xml");
- conf.addResource("nutch-site-test.xml");
-
- http = new Http();
- http.setConf(conf);
-
- server = new Server();
-
- if (redirection) {
- root = new Context(server, "/redirection", Context.SESSIONS);
- root.setAttribute("newContextURL", "/redirect");
- } else {
- root = new Context(server, "/", Context.SESSIONS);
- }
-
- ServletHolder sh = new ServletHolder(
- org.apache.jasper.servlet.JspServlet.class);
- root.addServlet(sh, "*.jsp");
- root.setResourceBase(RES_DIR);
- }
-
- @After
- public void tearDown() throws Exception {
- server.stop();
+ @Override
+ protected String getPluginClassName() {
+ return "org.apache.nutch.protocol.http.Http";
}
@Test
public void testStatusCode() throws Exception {
- startServer(47504, false);
- fetchPage("/basic-http.jsp", 200);
+ Map<String, byte[]> responses = new TreeMap<>();
+ responses.put("/basic-http.jsp",
+ (responseHeader + simpleContent).getBytes(UTF_8));
+ responses.put("/redirect301.jsp", redirect301.getBytes(UTF_8));
+ responses.put("/redirect302.jsp", redirect302.getBytes(UTF_8));
+ responses.put("/brokenpage.jsp", serverError.getBytes(UTF_8));
+ launchServer(responses);
+
+ fetchPage("/basic-http.jsp", 200, "text/html");
fetchPage("/redirect301.jsp", 301);
fetchPage("/redirect302.jsp", 302);
fetchPage("/nonexists.html", 404);
fetchPage("/brokenpage.jsp", 500);
}
- @Test
- public void testRedirectionJetty() throws Exception {
- // Redirection via Jetty
- startServer(47503, true);
- fetchPage("/redirection", 302);
- }
-
- /**
- * Starts the Jetty server at a specified port and redirection parameter.
- *
- * @param portno
- * Port number.
- * @param redirection
- * whether redirection
- */
- private void startServer(int portno, boolean redirection) throws Exception {
- port = portno;
- setUp(redirection);
- SelectChannelConnector connector = new SelectChannelConnector();
- connector.setHost("127.0.0.1");
- connector.setPort(port);
-
- server.addConnector(connector);
- server.start();
- }
-
- /**
- * Fetches the specified <code>page</code> from the local Jetty server and
- * checks whether the HTTP response status code matches with the expected
- * code. Also use jsp pages for redirection.
- *
- * @param page
- * Page to be fetched.
- * @param expectedCode
- * HTTP response status code expected while fetching the page.
- */
- private void fetchPage(String page, int expectedCode) throws Exception {
- URL url = new URL("http", "127.0.0.1", port, page);
- CrawlDatum crawlDatum = new CrawlDatum();
- Response response = http.getResponse(url, crawlDatum, true);
- ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()),
- crawlDatum);
- Content content = out.getContent();
- assertEquals("HTTP Status Code for " + url, expectedCode,
- response.getCode());
-
- if (page.compareTo("/nonexists.html") != 0
- && page.compareTo("/brokenpage.jsp") != 0
- && page.compareTo("/redirection") != 0) {
- assertEquals("ContentType " + url, "text/html",
- content.getContentType());
- }
- }
}
diff --git a/src/plugin/protocol-httpclient/build.xml b/src/plugin/protocol-httpclient/build.xml
index b66eb97..8da5c0c 100644
--- a/src/plugin/protocol-httpclient/build.xml
+++ b/src/plugin/protocol-httpclient/build.xml
@@ -36,10 +36,4 @@
</copy>
</target>
- <!-- for junit test -->
- <mkdir dir="${build.test}/data" />
- <copy todir="${build.test}/data">
- <fileset dir="jsp"/>
- </copy>
-
</project>
diff --git a/src/plugin/protocol-httpclient/jsp/basic.jsp b/src/plugin/protocol-httpclient/jsp/basic.jsp
deleted file mode 100644
index c5bfb89..0000000
--- a/src/plugin/protocol-httpclient/jsp/basic.jsp
+++ /dev/null
@@ -1,74 +0,0 @@
-<%--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
---%><%--
- This JSP demonstrates basic authentication. When this JSP page is
- requested with no query parameters, then the user must enter the
- username as 'userx' and password as 'passx' when prompted for
- authentication. Apart from this there are a few other test cases,
- which can be used by passing a test case number as query parameter in
- the following manner: basic.jsp?case=1, basic.jsp?case=2, etc.
- The credentials for each test case can be easily figured out from the
- code below.
-
- Author: Susam Pal
---%><%@ page
- import = "sun.misc.BASE64Decoder"
-%><%
- String authHeader = request.getHeader("Authorization");
- String realm = null;
- String username = null;
- String password = null;
- int testCase = 0;
- try {
- testCase = Integer.parseInt(request.getParameter("case"));
- } catch (Exception ex) {
- // do nothing
- }
- switch (testCase) {
- case 1:
- realm = "realm1"; username = "user1"; password = "pass1";
- break;
-
- case 2:
- realm = "realm2"; username = "user2"; password = "pass2";
- break;
-
- default:
- realm = "realmx"; username = "userx"; password = "passx";
- break;
- }
-
- boolean authenticated = false;
- if (authHeader != null && authHeader.toUpperCase().startsWith("BASIC")) {
- String creds[] = new String(new BASE64Decoder().decodeBuffer(
- authHeader.substring(6))).split(":", 2);
- if (creds[0].equals(username) && creds[1].equals(password))
- authenticated = true;
- }
- if (!authenticated) {
- response.setHeader("WWW-Authenticate", "Basic realm=\"" + realm + "\"");
- response.sendError(response.SC_UNAUTHORIZED);
- } else {
-%>
-<html>
-<head><title>Basic Authentication Test</title></head>
-<body>
-<p>Hi <%= username %>, you have been successfully authenticated.</p>
-</body>
-</html>
-<%
- }
-%>
\ No newline at end of file
diff --git a/src/plugin/protocol-httpclient/jsp/cookies.jsp b/src/plugin/protocol-httpclient/jsp/cookies.jsp
deleted file mode 100644
index ae2ace2..0000000
--- a/src/plugin/protocol-httpclient/jsp/cookies.jsp
+++ /dev/null
@@ -1,63 +0,0 @@
-<%--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
---%><%--
- This JSP tests whether the client can remember cookies. When the JSP
- is fetched for the first time without any query parameters, it sets
- a few cookies in the client. On a second request, with the query
- parameter, 'cookie=yes', it checks whether all the client has sent
- the cookies. If the cookies are found, HTTP 200 response is returned.
- If the cookies are not found, HTTP 403 response is returned.
-
- Author: Susam Pal
---%><%
- String cookieParam = request.getParameter("cookie");
- if (!"yes".equals(cookieParam)) { // Send cookies
- response.addCookie(new Cookie("var1", "val1"));
- response.addCookie(new Cookie("var2", "val2"));
-%>
-<html>
-<head><title>Cookies Set</title></head>
-<body><p>Cookies have been set.</p></body>
-</html>
-<%
- } else { // Check cookies
- int cookiesCount = 0;
-
- Cookie[] cookies = request.getCookies();
- if (cookies != null) {
- for (int i = 0; i < cookies.length; i++) {
- if (cookies[i].getName().equals("var1")
- && cookies[i].getValue().equals("val1"))
- cookiesCount++;
-
- if (cookies[i].getName().equals("var2")
- && cookies[i].getValue().equals("val2"))
- cookiesCount++;
- }
- }
-
- if (cookiesCount != 2) {
- response.sendError(response.SC_FORBIDDEN);
- } else {
-%>
-<html>
-<head><title>Cookies Found</title></head>
-<body><p>Cookies found!</p></body>
-</html>
-<%
- }
- }
-%>
\ No newline at end of file
diff --git a/src/plugin/protocol-httpclient/jsp/digest.jsp b/src/plugin/protocol-httpclient/jsp/digest.jsp
deleted file mode 100644
index c657484..0000000
--- a/src/plugin/protocol-httpclient/jsp/digest.jsp
+++ /dev/null
@@ -1,68 +0,0 @@
-<%--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
---%><%--
- This JSP tests digest authentication. It generates an HTTP response
- with authorization header for digest authentication and checks the
- user-name supplied by the client. It does not check the other
- parameters and hashes as controlled JUnit tests would be performed
- against this and only the proper submission of credentials need to
- be tested.
-
- Author: Susam Pal
---%><%@ page
- import = "java.util.StringTokenizer"
- import = "java.util.HashMap"
-%><%
- String username = "digest_user";
- String authHeader = request.getHeader("Authorization");
-
- boolean authenticated = false;
- if (authHeader != null && authHeader.toUpperCase().startsWith("DIGEST")) {
- HashMap map = new HashMap();
- StringTokenizer tokenizer = new StringTokenizer(
- authHeader.substring(7).trim(), ",");
- while (tokenizer.hasMoreTokens()) {
- String[] param = tokenizer.nextToken().trim().split("=", 2);
- if (param[1].charAt(0) == '"') {
- param[1] = param[1].substring(1, param[1].length() - 1);
- }
- map.put(param[0], param[1]);
- }
-
- if (username.equals((String)map.get("username")))
- authenticated = true;
- }
-
- if (!authenticated) {
- String realm = "realm=\"realm1\"";
- String qop = "qop=\"auth,auth-int\"";
- String nonce = "nonce=\"dcd98b7102dd2f0e8b11d0f600bfb0c093\"";
- String opaque = "opaque=\"5ccc069c403ebaf9f0171e9517f40e41\"";
-
- response.setHeader("WWW-Authenticate", "Digest " + realm + ", "
- + qop + ", " + nonce + ", " + opaque);
- response.sendError(response.SC_UNAUTHORIZED);
- } else {
-%>
-<html>
-<head><title>Digest Authentication Test</title></head>
-<body>
-<p>Hi <%= username %>, you have been successfully authenticated.</p>
-</body>
-</html>
-<%
- }
-%>
\ No newline at end of file
diff --git a/src/plugin/protocol-httpclient/jsp/noauth.jsp b/src/plugin/protocol-httpclient/jsp/noauth.jsp
deleted file mode 100644
index c726b0f..0000000
--- a/src/plugin/protocol-httpclient/jsp/noauth.jsp
+++ /dev/null
@@ -1,36 +0,0 @@
-<%--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
---%><%--
- This JSP tests whether the client is sending any pre-emptive
- authentication headers. The client is expected not to send pre-emptive
- authentication headers. If such authentication headers are found, this
- JSP will return an HTTP 403 response; HTTP 200 response otherwise.
-
- Author: Susam Pal
---%><%
- if (request.getHeader("Authorization") != null) {
- response.sendError(response.SC_UNAUTHORIZED);
- } else {
-%>
-<html>
-<head><title>No authorization headers found</title></head>
-<body>
-<p>No authorization headers found.</p>
-</body>
-</html>
-<%
- }
-%>
\ No newline at end of file
diff --git a/src/plugin/protocol-httpclient/jsp/ntlm.jsp b/src/plugin/protocol-httpclient/jsp/ntlm.jsp
deleted file mode 100644
index 6ad921e..0000000
--- a/src/plugin/protocol-httpclient/jsp/ntlm.jsp
+++ /dev/null
@@ -1,89 +0,0 @@
-<%--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
---%><%--
- This JSP tests NTLM authentication. It generates an HTTP response
- with authorization header for NTLM authentication and checks the
- user-name supplied by the client. It does not check the other
- parameters and hashes as controlled JUnit tests would be performed
- against this and only the proper submission of credentials need to
- be tested.
-
- Author: Susam Pal
---%><%@ page
- import = "sun.misc.BASE64Decoder"
- import = "sun.misc.BASE64Encoder"
-%><%
- String authHeader = request.getHeader("Authorization");
- String username = null;
- String domain = null;
- String host = null;
-
- boolean authenticated = false;
- if (authHeader != null && authHeader.startsWith("NTLM")) {
- byte[] msg = new BASE64Decoder().decodeBuffer(
- authHeader.substring(5));
- if (msg[8] == 1) {
- byte[] type2msg = {
- 'N', 'T', 'L', 'M', 'S', 'S', 'P', 0, // NTLMSSP Signature
- 2, 0, 0, 0, // Type 2 Indicator
- 10, 0, 10, 0, 32, 0, 0, 0, // length, offset
- 0x00, 0x02, (byte) 0x81, 0, // Flags
- 1, 2, 3, 4, 5, 6, 7, 8, // Challenge
- 'N', 'U', 'T', 'C', 'H' // NUTCH (Domain)
- };
- response.setHeader("WWW-Authenticate", "NTLM "
- + new BASE64Encoder().encodeBuffer(type2msg));
- response.sendError(response.SC_UNAUTHORIZED);
- return;
- } else if (msg[8] == 3) {
- int length;
- int offset;
-
- // Get domain name
- length = msg[30] + msg[31] * 256;
- offset = msg[32] + msg[33] * 256;
- domain = new String(msg, offset, length);
-
- // Get user name
- length = msg[38] + msg[39] * 256;
- offset = msg[40] + msg[41] * 256;
- username = new String(msg, offset, length);
-
- // Get password
- length = msg[46] + msg[47] * 256;
- offset = msg[48] + msg[49] * 256;
- host = new String(msg, offset, length);
-
- if ("ntlm_user".equalsIgnoreCase(username)
- && "NUTCH".equalsIgnoreCase(domain))
- authenticated = true;
- }
- }
-
- if (!authenticated) {
- response.setHeader("WWW-Authenticate", "NTLM");
- response.sendError(response.SC_UNAUTHORIZED);
- } else {
-%>
-<html>
-<head>NTLM Authentication Test</head>
-<body>
-<p>Hi <%= username %>, You have been successfully authenticated.</p>
-</body>
-</html>
-<%
- }
-%>
\ No newline at end of file
diff --git a/src/plugin/protocol-httpclient/src/test/conf/nutch-site-test.xml b/src/plugin/protocol-httpclient/src/test/conf/nutch-site-test.xml
index 856ea15..12100a5 100644
--- a/src/plugin/protocol-httpclient/src/test/conf/nutch-site-test.xml
+++ b/src/plugin/protocol-httpclient/src/test/conf/nutch-site-test.xml
@@ -20,6 +20,11 @@
<configuration>
<property>
+ <name>plugin.includes</name>
+ <value>protocol-httpclient</value>
+</property>
+
+<property>
<name>http.robots.agents</name>
<value>Nutch-Test,*</value>
<description></description>
diff --git a/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java b/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java
index a897cd4..f7277bd 100644
--- a/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java
+++ b/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java
@@ -16,61 +16,36 @@
*/
package org.apache.nutch.protocol.httpclient;
-import java.net.URL;
-import org.junit.After;
-import org.junit.Assert;
-import org.junit.Before;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.lang.invoke.MethodHandles;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.StringTokenizer;
+import java.util.TreeMap;
+
+import org.apache.commons.codec.binary.Base64;
+
+import org.apache.nutch.protocol.AbstractHttpProtocolPluginTest;
+
import org.junit.Test;
-import org.mortbay.jetty.Server;
-import org.mortbay.jetty.bio.SocketConnector;
-import org.mortbay.jetty.handler.ContextHandler;
-import org.mortbay.jetty.servlet.ServletHandler;
-import org.mortbay.jetty.servlet.SessionHandler;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.net.protocols.Response;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
- * Test cases for protocol-httpclient.
+ * Test cases for protocol-httpclient. See also
+ * src/test/conf/httpclient-auth-test.xml
*/
-public class TestProtocolHttpClient {
-
- private Server server;
- private Configuration conf;
- private static final String RES_DIR = System.getProperty("test.data", ".");
- private int port;
- private Http http = new Http();
-
- @Before
- public void setUp() throws Exception {
-
- ContextHandler context = new ContextHandler();
- context.setContextPath("/");
- context.setResourceBase(RES_DIR);
- ServletHandler sh = new ServletHandler();
- sh.addServletWithMapping("org.apache.jasper.servlet.JspServlet", "*.jsp");
- context.addHandler(sh);
- context.addHandler(new SessionHandler());
-
- server = new Server();
- server.addHandler(context);
-
- conf = new Configuration();
- conf.addResource("nutch-default.xml");
- conf.addResource("nutch-site-test.xml");
-
- http = new Http();
- http.setConf(conf);
- }
+public class TestProtocolHttpClient extends AbstractHttpProtocolPluginTest {
- @After
- public void tearDown() throws Exception {
- server.stop();
- for (int i = 0; i < 5; i++) {
- if (!server.isStopped()) {
- Thread.sleep(1000);
- }
- }
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
+
+ @Override
+ protected String getPluginClassName() {
+ return "org.apache.nutch.protocol.httpclient.Http";
}
/**
@@ -81,9 +56,40 @@ public class TestProtocolHttpClient {
*/
@Test
public void testCookies() throws Exception {
- startServer(47500);
- fetchPage("/cookies.jsp", 200);
- fetchPage("/cookies.jsp?cookie=yes", 200);
+ int port = 47500;
+ String responseSetCookies = responseHeader //
+ + "Set-Cookie: var1=val1\r\n" //
+ + "Set-Cookie: var2=val2\r\n" //
+ + "Content-Type: text/html\r\n\r\n" //
+ + "<html>\n" //
+ + "<head><title>Cookies Set</title></head>" //
+ + "<body><p>Cookies have been set.</p></body>" //
+ + "</html>";
+ String response = responseHeader //
+ + "Content-Type: text/html\r\n\r\n" //
+ + "<html>\n" //
+ + "<head><title>Cookies Found</title></head>" //
+ + "<body><p>Cookies found!</p></body>" //
+ + "</html>";
+ Map<String, byte[]> responses = new TreeMap<>();
+ responses.put("/cookies.jsp",
+ responseSetCookies.getBytes(StandardCharsets.UTF_8));
+ responses.put("/cookies.jsp?cookies=yes",
+ response.getBytes(StandardCharsets.UTF_8));
+ launchServer(port, (String requestPath) -> {
+ return responses.get(requestPath);
+ }, (List<String> requestLines) -> {
+ // verify that httpclient sends the cookies back in a Cookie request header
+ if (requestLines.get(0).contains("?cookies=yes")) {
+ return requestLines.stream().anyMatch((String line) -> {
+ return line.startsWith("Cookie:") && line.contains("var1=val1")
+ && line.contains("var2=val2");
+ });
+ }
+ return true;
+ });
+ fetchPage(port, "/cookies.jsp", 200, "text/html");
+ fetchPage(port, "/cookies.jsp?cookies=yes", 200, "text/html");
}
/**
@@ -94,10 +100,157 @@ public class TestProtocolHttpClient {
*/
@Test
public void testNoPreemptiveAuth() throws Exception {
- startServer(47500);
- fetchPage("/noauth.jsp", 200);
+ int port = 47500;
+ String response = responseHeader //
+ + "Content-Type: text/html\r\n\r\n" //
+ + "<html>\n" //
+ + "<head><title>No authorization headers found</title></head>" //
+ + "<body>" //
+ + "<p>No authorization headers found.</p>" //
+ + "</body>" //
+ + "</html>";
+ launchServer(port, (String requestPath) -> {
+ return response.getBytes(UTF_8);
+ }, (List<String> requestLines) -> {
+ // verify that no "Authorization" header is sent
+ return requestLines.stream().noneMatch((String line) -> {
+ if (line.startsWith("Authorization:")) {
+ LOG.error("Found `Authorization` header, none expected!");
+ return true;
+ }
+ LOG.debug("Verified header: {}", line);
+ return false;
+ });
+ });
+ fetchPage(port, "/noauth.jsp", 200, "text/html");
}
+ // see old basic.jsp, digest.jsp, ntlm.jsp
+ private static byte[] authenticationResponder(String requestPath, String[] requestHeaders) {
+
+ String authenticationType = "BASIC";
+ if (requestPath.startsWith("/digest.jsp")) {
+ authenticationType = "DIGEST";
+ } else if (requestPath.startsWith("/ntlm.jsp")) {
+ authenticationType = "NTLM";
+ }
+
+ char id = 'x';
+ if (requestPath.endsWith("?case=1")) {
+ id = '1';
+ } else if (requestPath.endsWith("?case=2")) {
+ id = '2';
+ }
+
+ String authHeader = getHeader(requestHeaders, "Authorization");
+ boolean authenticated = false;
+ String authReq = "Basic realm=\"realm" + id + "\"";
+ if (authHeader != null) {
+ if (authHeader.toUpperCase().startsWith("BASIC")) {
+ authenticationType = "BASIC";
+ String creds[] = new String(Base64.decodeBase64(authHeader.substring(6)), UTF_8).split(":", 2);
+ if (creds[0].equals("user" + id) && creds[1].equals("pass" + id)) {
+ authenticated = true;
+ }
+
+ } else if (authHeader.toUpperCase().startsWith("DIGEST")) {
+ authenticationType = "DIGEST";
+ Map<String, String> map = new HashMap<>();
+ StringTokenizer tokenizer = new StringTokenizer(
+ authHeader.substring(7).trim(), ",");
+ while (tokenizer.hasMoreTokens()) {
+ String[] param = tokenizer.nextToken().trim().split("=", 2);
+ if (param[1].charAt(0) == '"') {
+ param[1] = param[1].substring(1, param[1].length() - 1);
+ }
+ map.put(param[0], param[1]);
+ }
+ String username = "user" + id;
+ if (username.equals(map.get("username"))) {
+ authenticated = true;
+ }
+
+ } else if (authHeader.toUpperCase().startsWith("NTLM")) {
+ authenticationType = "NTLM";
+ String username = null;
+ String domain = null;
+ String host = null;
+ byte[] msg = Base64.decodeBase64(authHeader.substring(5));
+ if (msg[8] == 1) {
+ byte[] type2msg = {
+ 'N', 'T', 'L', 'M', 'S', 'S', 'P', 0, // NTLMSSP Signature
+ 2, 0, 0, 0, // Type 2 Indicator
+ 10, 0, 10, 0, 32, 0, 0, 0, // length, offset
+ 0x00, 0x02, (byte) 0x81, 0, // Flags
+ 1, 2, 3, 4, 5, 6, 7, 8, // Challenge
+ 'N', 'U', 'T', 'C', 'H' // NUTCH (Domain)
+ };
+ // request authentication
+ authReq = "NTLM " + Base64.encodeBase64String(type2msg);
+ } else if (msg[8] == 3) {
+ int length;
+ int offset;
+
+ // Get domain name
+ length = msg[30] + msg[31] * 256;
+ offset = msg[32] + msg[33] * 256;
+ domain = new String(msg, offset, length);
+
+ // Get user name
+ length = msg[38] + msg[39] * 256;
+ offset = msg[40] + msg[41] * 256;
+ username = new String(msg, offset, length);
+
+ // Get host name
+ length = msg[46] + msg[47] * 256;
+ offset = msg[48] + msg[49] * 256;
+ host = new String(msg, offset, length);
+
+ if ("ntlm_user".equalsIgnoreCase(username)
+ && "NUTCH".equalsIgnoreCase(domain)) {
+ authenticated = true;
+ }
+ }
+ }
+ }
+
+ if (!authenticated) {
+ LOG.info("Requesting authentication for realm{} and type {}", id,
+ authenticationType);
+
+ if ("DIGEST".equals(authenticationType)) {
+ String qop = "qop=\"auth,auth-int\"";
+ String nonce = "nonce=\"dcd98b7102dd2f0e8b11d0f600bfb0c093\"";
+ String opaque = "opaque=\"5ccc069c403ebaf9f0171e9517f40e41\"";
+ authReq = "Digest realm=\"realm" + id + "\", " + qop + ", " + nonce
+ + ", " + opaque;
+
+ } else if ("NTLM".equals(authenticationType)) {
+ if (!authReq.startsWith("NTLM")) {
+ authReq = "NTLM";
+ }
+ }
+
+ String requestAuthorization = "HTTP/1.1 401 Unauthorized\r\n" //
+ + "WWW-Authenticate: " + authReq + "\r\n" //
+ + "\r\n";
+ return requestAuthorization.getBytes(UTF_8);
+ }
+
+ LOG.info("User user{} (realm{}, auth. type {}) successfully authenticated",
+ id, id, authenticationType);
+ String responseAuthenticated = responseHeader //
+ + "Content-Type: text/html\r\n\r\n" //
+ + "<html>" //
+ + "<head><title>" + authenticationType //
+ + " Authentication Test</title></head>" //
+ + "<body>" //
+ + "<p>Hi user" + id + ", you have been successfully authenticated.</p>" //
+ + "</body>" //
+ + "</html>";
+ return responseAuthenticated.getBytes(UTF_8);
+ }
+
/**
* Tests default credentials.
*
@@ -106,8 +259,12 @@ public class TestProtocolHttpClient {
*/
@Test
public void testDefaultCredentials() throws Exception {
- startServer(47502);
- fetchPage("/basic.jsp", 200);
+ // the behavior when connecting to port 47502
+ // is not configured in httpclient-auth-test.xml
+ // which means that authentication is requested first
+ int port = 47502;
+ launchServer(port, TestProtocolHttpClient::authenticationResponder, null);
+ fetchPage(port, "/basic.jsp", 200, "text/html");
}
/**
@@ -118,11 +275,11 @@ public class TestProtocolHttpClient {
*/
@Test
public void testBasicAuth() throws Exception {
- startServer(47500);
- fetchPage("/basic.jsp", 200);
- fetchPage("/basic.jsp?case=1", 200);
- fetchPage("/basic.jsp?case=2", 200);
- server.start();
+ int port = 47500;
+ launchServer(port, TestProtocolHttpClient::authenticationResponder, null);
+ fetchPage(port, "/basic.jsp", 200, "text/html");
+ fetchPage(port, "/basic.jsp?case=1", 200, "text/html");
+ fetchPage(port, "/basic.jsp?case=2", 200, "text/html");
}
/**
@@ -135,10 +292,11 @@ public class TestProtocolHttpClient {
*/
@Test
public void testOtherRealmsNoAuth() throws Exception {
- startServer(47501);
- fetchPage("/basic.jsp", 200);
- fetchPage("/basic.jsp?case=1", 401);
- fetchPage("/basic.jsp?case=2", 401);
+ int port = 47501;
+ launchServer(port, TestProtocolHttpClient::authenticationResponder, null);
+ fetchPage(port, "/basic.jsp", 200, "text/html");
+ fetchPage(port, "/basic.jsp?case=1", 401, "text/html");
+ fetchPage(port, "/basic.jsp?case=2", 401, "text/html");
}
/**
@@ -149,8 +307,9 @@ public class TestProtocolHttpClient {
*/
@Test
public void testDigestAuth() throws Exception {
- startServer(47500);
- fetchPage("/digest.jsp", 200);
+ int port = 47500;
+ launchServer(port, TestProtocolHttpClient::authenticationResponder, null);
+ fetchPage(port, "/digest.jsp", 200, "text/html");
}
/**
@@ -161,56 +320,9 @@ public class TestProtocolHttpClient {
*/
@Test
public void testNtlmAuth() throws Exception {
- startServer(47501);
- fetchPage("/ntlm.jsp", 200);
- }
-
- /**
- * Starts the Jetty server at a specified port.
- *
- * Will try up to 10 ports to find an available port to use.
- *
- * @param portno
- * Port number.
- * @throws Exception
- * When an error occurs.
- */
- private void startServer(int portno) throws Exception {
- SocketConnector listener = new SocketConnector();
- listener.setHost("127.0.0.1");
- server.addConnector(listener);
- for (int p = portno; p < portno + 10; p++) {
- port = portno;
- listener.setPort(port);
- try {
- server.start();
- break;
- } catch (Exception e) {
- if (p == portno + 9) {
- throw e;
- }
- }
- }
+ int port = 47501;
+ launchServer(port, TestProtocolHttpClient::authenticationResponder, null);
+ fetchPage(port, "/ntlm.jsp", 200, "text/html");
}
- /**
- * Fetches the specified <code>page</code> from the local Jetty server and
- * checks whether the HTTP response status code matches with the expected
- * code.
- *
- * @param page
- * Page to be fetched.
- * @param expectedCode
- * HTTP response status code expected while fetching the page.
- * @throws Exception
- * When an error occurs or test case fails.
- */
- private void fetchPage(String page, int expectedCode) throws Exception {
- URL url = new URL("http", "127.0.0.1", port, page);
- Response response = null;
- response = http.getResponse(url, new CrawlDatum(), true);
-
- int code = response.getCode();
- Assert.assertEquals("HTTP Status Code for " + url, expectedCode, code);
- }
}
diff --git a/src/plugin/protocol-okhttp/build.xml b/src/plugin/protocol-okhttp/build.xml
index 644eeb0..b98e695 100755
--- a/src/plugin/protocol-okhttp/build.xml
+++ b/src/plugin/protocol-okhttp/build.xml
@@ -41,10 +41,4 @@
</copy>
</target>
- <!-- for junit test -->
- <mkdir dir="${build.test}/data" />
- <copy todir="${build.test}/data">
- <fileset dir="jsp"/>
- </copy>
-
</project>
diff --git a/src/plugin/protocol-okhttp/jsp/basic-http.jsp b/src/plugin/protocol-okhttp/jsp/basic-http.jsp
deleted file mode 100644
index bf1f8bd..0000000
--- a/src/plugin/protocol-okhttp/jsp/basic-http.jsp
+++ /dev/null
@@ -1,44 +0,0 @@
-<%--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
---%><%--
- Example JSP Page to Test Protocol-Http Plugin
---%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
-String path = request.getContextPath();
-String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
-%>
-
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<html>
- <head>
- <base href="<%=basePath%>">
-
- <title>HelloWorld</title>
- <meta http-equiv="content-type" content="text/html;charset=utf-8" />
- <meta name="Language" content="en" />
- <meta http-equiv="pragma" content="no-cache">
- <meta http-equiv="cache-control" content="no-cache">
- <meta http-equiv="expires" content="0">
- <meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
- <meta http-equiv="description" content="This is my page">
- <!--
- <link rel="stylesheet" type="text/css" href="styles.css">
- -->
- </head>
-
- <body>
- Hello World!!! <br>
- </body>
-</html>
diff --git a/src/plugin/protocol-okhttp/jsp/brokenpage.jsp b/src/plugin/protocol-okhttp/jsp/brokenpage.jsp
deleted file mode 100644
index f3f7c4a..0000000
--- a/src/plugin/protocol-okhttp/jsp/brokenpage.jsp
+++ /dev/null
@@ -1,47 +0,0 @@
-<%--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
---%><%--
- Example JSP Page to Test Protocol-Http Plugin
---%>
-
-@ page language="java" import="java.util.*" pageEncoding="UTF-8"
-
-String path = request.getContextPath();
-String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
-
-
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<html>
- <head>
- <base href="<%=basePath%>">
-
- <title>HelloWorld</title>
- <meta http-equiv="content-type" content="text/html;charset=utf-8" />
- <meta name="Language" content="en" />
- <meta http-equiv="pragma" content="no-cache">
- <meta http-equiv="cache-control" content="no-cache">
- <meta http-equiv="expires" content="0">
- <meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
- <meta http-equiv="description" content="This is my page">
- <!--
- <link rel="stylesheet" type="text/css" href="styles.css">
- -->
- </head>
-
- <body>
- Hello World!!! <br>
- </body>
-</html>
diff --git a/src/plugin/protocol-okhttp/jsp/redirect301.jsp b/src/plugin/protocol-okhttp/jsp/redirect301.jsp
deleted file mode 100644
index ac16501..0000000
--- a/src/plugin/protocol-okhttp/jsp/redirect301.jsp
+++ /dev/null
@@ -1,49 +0,0 @@
-<%--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
---%><%--
- Example JSP Page to Test Protocol-Http Plugin
---%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
-String path = request.getContextPath();
-String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
-%>
-
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<html>
- <head>
- <base href="<%=basePath%>">
-
- <title>My JSP page</title>
-
- <meta http-equiv="pragma" content="no-cache">
- <meta http-equiv="cache-control" content="no-cache">
- <meta http-equiv="expires" content="0">
- <meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
- <meta http-equiv="description" content="This is my page">
- <!--
- <link rel="stylesheet" type="text/css" href="styles.css">
- -->
-
- </head>
-
- <body>
- <%
- response.setStatus(301);
- response.setHeader( "Location", "https://nutch.apache.org/");
- response.setHeader( "Connection", "close" );
- %>
- You are redirected by JSP<br>
- </body>
-</html>
diff --git a/src/plugin/protocol-okhttp/jsp/redirect302.jsp b/src/plugin/protocol-okhttp/jsp/redirect302.jsp
deleted file mode 100644
index 8a92fee..0000000
--- a/src/plugin/protocol-okhttp/jsp/redirect302.jsp
+++ /dev/null
@@ -1,49 +0,0 @@
-<%--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
---%><%--
- Example JSP Page to Test Protocol-Http Plugin
---%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
-String path = request.getContextPath();
-String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
-%>
-
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<html>
- <head>
- <base href="<%=basePath%>">
-
- <title>My JSP page</title>
-
- <meta http-equiv="pragma" content="no-cache">
- <meta http-equiv="cache-control" content="no-cache">
- <meta http-equiv="expires" content="0">
- <meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
- <meta http-equiv="description" content="This is my page">
- <!--
- <link rel="stylesheet" type="text/css" href="styles.css">
- -->
-
- </head>
-
- <body>
- <%
- response.setStatus(302);
- response.setHeader( "Location", "https://nutch.apache.org/");
- response.setHeader( "Connection", "close" );
- %>
- You are sucessfully redirected by JSP<br>
- </body>
-</html>
diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestBadServerResponses.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestBadServerResponses.java
index dafe365..5a587fe 100644
--- a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestBadServerResponses.java
+++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestBadServerResponses.java
@@ -20,30 +20,14 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
-import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
-import java.io.InputStreamReader;
import java.lang.invoke.MethodHandles;
-import java.net.InetSocketAddress;
-import java.net.MalformedURLException;
-import java.net.ServerSocket;
-import java.net.Socket;
-import java.net.URL;
import java.nio.charset.StandardCharsets;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
import java.util.zip.GZIPOutputStream;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.AbstractHttpProtocolPluginTest;
import org.apache.nutch.protocol.ProtocolOutput;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.After;
import org.junit.Ignore;
import org.junit.Test;
import org.slf4j.Logger;
@@ -53,138 +37,18 @@ import org.slf4j.LoggerFactory;
* Test cases for protocol-http - robustness regarding bad server responses:
* malformed HTTP header lines, etc. See, NUTCH-2549.
*/
-public class TestBadServerResponses {
+public class TestBadServerResponses extends AbstractHttpProtocolPluginTest {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
- private Protocol http;
- private ServerSocket server;
- private Configuration conf;
- private int port = 47506;
-
- private static final String responseHeader = "HTTP/1.1 200 OK\r\n";
- private static final String simpleContent = "Content-Type: text/html\r\n\r\nThis is a text.";
-
- public void setUp() throws Exception {
- conf = NutchConfiguration.create();
- conf.addResource("nutch-default.xml");
- // plugin tests specific config file - adds protocol-okhttp to
- // plugin.includes
- conf.addResource("nutch-site-test.xml");
- conf.setBoolean("store.http.headers", true);
-
- http = new ProtocolFactory(conf)
- .getProtocolById("org.apache.nutch.protocol.okhttp.OkHttp");
- }
-
- @After
- public void tearDown() throws Exception {
- server.close();
- }
-
- public static String getHeaders(ProtocolOutput response) {
- return response.getContent().getMetadata().get(Response.RESPONSE_HEADERS);
- }
-
- public static String getHeader(ProtocolOutput response, String header) {
- for (String line : getHeaders(response).split("\r\n")) {
- String[] parts = line.split(": ", 1);
- if (parts[0].equals(header)) {
- return parts[1];
- }
- }
- return null;
- }
-
- /**
- * Starts the test server at a specified port and constant response.
- *
- * @param portno
- * Port number.
- * @param response
- * response sent on every request
- */
- private void runServer(int port, byte[] response) throws Exception {
- server = new ServerSocket();
- server.bind(new InetSocketAddress("127.0.0.1", port));
- Pattern requestPattern = Pattern.compile("(?i)^GET\\s+(\\S+)");
- while (true) {
- LOG.info("Listening on port {}", port);
- Socket socket = server.accept();
- LOG.info("Connection received");
- try (
- BufferedReader in = new BufferedReader(new InputStreamReader(
- socket.getInputStream(), StandardCharsets.UTF_8))) {
-
- String line;
- while ((line = in.readLine()) != null) {
- LOG.info("Request: {}", line);
- if (line.trim().isEmpty()) {
- break;
- }
- Matcher m = requestPattern.matcher(line);
- if (m.find()) {
- LOG.info("Requested {}", m.group(1));
- if (!m.group(1).startsWith("/")) {
- response = "HTTP/1.1 400 Bad request\r\n\r\n".getBytes(StandardCharsets.UTF_8);
- }
- }
- }
- socket.getOutputStream().write(response);
- } catch (Exception e) {
- LOG.warn("Exception in test server:", e);
- }
- }
- }
-
- private void launchServer(String response) throws InterruptedException {
- launchServer(response.getBytes(StandardCharsets.UTF_8));
- }
-
- private void launchServer(byte[] response) throws InterruptedException {
- Thread serverThread = new Thread(() -> {
- try {
- runServer(port, response);
- } catch (Exception e) {
- LOG.warn("Test server died:", e);
- }
- });
- serverThread.start();
- Thread.sleep(50);
- }
-
- /**
- * Fetches the specified <code>page</code> from the local test server and
- * checks whether the HTTP response status code matches with the expected
- * code.
- *
- * @param page
- * Page to be fetched.
- * @param expectedCode
- * HTTP response status code expected while fetching the page.
- */
- private ProtocolOutput fetchPage(String page, int expectedCode)
- throws MalformedURLException {
- URL url = new URL("http", "127.0.0.1", port, page);
- LOG.info("Fetching {}", url);
- CrawlDatum crawlDatum = new CrawlDatum();
- ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()),
- crawlDatum);
- int httpStatusCode = -1;
- if (crawlDatum.getMetaData().containsKey(Nutch.PROTOCOL_STATUS_CODE_KEY)) {
- httpStatusCode = Integer.parseInt(crawlDatum.getMetaData()
- .get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString());
- }
-
- assertEquals("HTTP Status Code for " + url, expectedCode, httpStatusCode);
-
- return out;
+ @Override
+ protected String getPluginClassName() {
+ return "org.apache.nutch.protocol.okhttp.OkHttp";
}
@Test
public void testBadHttpServer() throws Exception {
- setUp();
// test with trivial well-formed content, to make sure the server is
// responding
launchServer(responseHeader + simpleContent);
@@ -196,8 +60,8 @@ public class TestBadServerResponses {
*/
@Test
public void testRequestNotStartingWithSlash() throws Exception {
- setUp();
- launchServer(responseHeader + simpleContent);
+ launchServer("/?171", responseHeader + simpleContent);
+ // request ?171 should be normalized to /?171
fetchPage("?171", 200);
}
@@ -207,7 +71,6 @@ public class TestBadServerResponses {
*/
@Test
public void testContentLengthNotANumber() throws Exception {
- setUp();
launchServer(
responseHeader + "Content-Length: thousand\r\n" + simpleContent);
fetchPage("/", 200);
@@ -219,7 +82,6 @@ public class TestBadServerResponses {
@Ignore("Fails with okhttp 3.10.0")
@Test
public void testHeaderWithColon() throws Exception {
- setUp();
launchServer("HTTP/1.1 200: OK\r\n" + simpleContent);
fetchPage("/", 200);
}
@@ -229,7 +91,6 @@ public class TestBadServerResponses {
*/
@Test
public void testHeaderSpellChecking() throws Exception {
- setUp();
launchServer(responseHeader + "Client-Transfer-Encoding: chunked\r\n"
+ simpleContent);
fetchPage("/", 200);
@@ -242,7 +103,6 @@ public class TestBadServerResponses {
@Ignore("Fails with okhttp 3.10.0")
@Test
public void testIgnoreErrorInRedirectPayload() throws Exception {
- setUp();
launchServer("HTTP/1.1 302 Found\r\nLocation: http://example.com/\r\n"
+ "Transfer-Encoding: chunked\r\n\r\nNot a valid chunk.");
ProtocolOutput fetched = fetchPage("/", 302);
@@ -257,7 +117,6 @@ public class TestBadServerResponses {
@Ignore("Fails with okhttp 3.10.0")
@Test
public void testNoStatusLine() throws Exception {
- setUp();
String text = "This is a text containing non-ASCII characters: \u00e4\u00f6\u00fc\u00df";
launchServer(text);
ProtocolOutput fetched = fetchPage("/", 200);
@@ -282,7 +141,6 @@ public class TestBadServerResponses {
@Ignore("Fails with okhttp 3.10.0")
@Test
public void testMultiLineHeader() throws Exception {
- setUp();
launchServer(responseHeader
+ "Set-Cookie: UserID=JohnDoe;\r\n Max-Age=3600;\r\n Version=1\r\n"
+ simpleContent);
@@ -299,7 +157,6 @@ public class TestBadServerResponses {
* responses
*/
public void testOverlongHeader() throws Exception {
- setUp();
StringBuilder response = new StringBuilder();
response.append(responseHeader);
for (int i = 0; i < 80; i++) {
@@ -323,7 +180,6 @@ public class TestBadServerResponses {
*/
@Test
public void testChunkedContent() throws Exception {
- setUp();
StringBuilder response = new StringBuilder();
response.append(responseHeader);
response.append("Content-Type: text/html\r\n");
@@ -358,7 +214,6 @@ public class TestBadServerResponses {
*/
@Test
public void testTruncationMarking() throws Exception {
- setUp();
int[] kBs = { 63, 64, 65 };
for (int kB : kBs) {
StringBuilder response = new StringBuilder();
@@ -394,7 +249,6 @@ public class TestBadServerResponses {
*/
@Test
public void testTruncationMarkingGzip() throws Exception {
- setUp();
int[] kBs = { 63, 64, 65 };
for (int kB : kBs) {
StringBuilder payload = new StringBuilder();
@@ -417,7 +271,7 @@ public class TestBadServerResponses {
response.write(responseHead.toString().getBytes(StandardCharsets.UTF_8));
response.write(bytes.toByteArray());
- launchServer(response.toByteArray());
+ launchServer("/", response.toByteArray());
ProtocolOutput fetched = fetchPage("/", 200);
assertEquals("Content not truncated according to http.content.limit",
Math.min(kB * 1024, 65536), fetched.getContent().getContent().length);
@@ -439,7 +293,6 @@ public class TestBadServerResponses {
*/
@Test
public void testPartialContentTruncated() throws Exception {
- setUp();
conf.setBoolean("http.partial.truncated", true);
http.setConf(conf);
String testContent = "This is a text.";
@@ -454,7 +307,6 @@ public class TestBadServerResponses {
@Test
public void testNoContentLimit() throws Exception {
- setUp();
conf.setInt("http.content.limit", -1);
http.setConf(conf);
StringBuilder response = new StringBuilder();
diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestProtocolOkHttp.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestProtocolOkHttp.java
index 3650722..289e756 100644
--- a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestProtocolOkHttp.java
+++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestProtocolOkHttp.java
@@ -16,134 +16,39 @@
*/
package org.apache.nutch.protocol.okhttp;
-import static org.junit.Assert.assertEquals;
+import static java.nio.charset.StandardCharsets.UTF_8;
-import java.net.URL;
+import java.util.Map;
+import java.util.TreeMap;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.ProtocolOutput;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.After;
+import org.apache.nutch.protocol.AbstractHttpProtocolPluginTest;
import org.junit.Test;
-import org.mortbay.jetty.Server;
-import org.mortbay.jetty.nio.SelectChannelConnector;
-import org.mortbay.jetty.servlet.Context;
-import org.mortbay.jetty.servlet.ServletHolder;
/**
* Test cases for protocol-http
*/
-public class TestProtocolOkHttp {
- private static final String RES_DIR = System.getProperty("test.data", ".");
+public class TestProtocolOkHttp extends AbstractHttpProtocolPluginTest {
- private Protocol http;
- private Server server;
- private Context root;
- private Configuration conf;
- private int port;
-
- public void setUp(boolean redirection) throws Exception {
- conf = NutchConfiguration.create();
- conf.addResource("nutch-default.xml");
- // plugin tests specific config file - adds protocol-okhttp to
- // plugin.includes
- conf.addResource("nutch-site-test.xml");
-
- http = new ProtocolFactory(conf)
- .getProtocolById("org.apache.nutch.protocol.okhttp.OkHttp");
-
- server = new Server();
-
- if (redirection) {
- root = new Context(server, "/redirection", Context.SESSIONS);
- root.setAttribute("newContextURL", "/redirect");
- } else {
- root = new Context(server, "/", Context.SESSIONS);
- }
-
- ServletHolder sh = new ServletHolder(
- org.apache.jasper.servlet.JspServlet.class);
- root.addServlet(sh, "*.jsp");
- root.setResourceBase(RES_DIR);
- }
-
- @After
- public void tearDown() throws Exception {
- server.stop();
+ @Override
+ protected String getPluginClassName() {
+ return "org.apache.nutch.protocol.okhttp.OkHttp";
}
@Test
public void testStatusCode() throws Exception {
- startServer(47504, false);
- fetchPage("/basic-http.jsp", 200);
+ Map<String, byte[]> responses = new TreeMap<>();
+ responses.put("/basic-http.jsp",
+ (responseHeader + simpleContent).getBytes(UTF_8));
+ responses.put("/redirect301.jsp", redirect301.getBytes(UTF_8));
+ responses.put("/redirect302.jsp", redirect302.getBytes(UTF_8));
+ responses.put("/brokenpage.jsp", serverError.getBytes(UTF_8));
+ launchServer(responses);
+
+ fetchPage("/basic-http.jsp", 200, "text/html");
fetchPage("/redirect301.jsp", 301);
fetchPage("/redirect302.jsp", 302);
fetchPage("/nonexists.html", 404);
fetchPage("/brokenpage.jsp", 500);
}
- @Test
- public void testRedirectionJetty() throws Exception {
- // Redirection via Jetty
- startServer(47503, true);
- fetchPage("/redirection", 302);
- }
-
- /**
- * Starts the Jetty server at a specified port and redirection parameter.
- *
- * @param portno
- * Port number.
- * @param redirection
- * whether redirection
- */
- private void startServer(int portno, boolean redirection) throws Exception {
- port = portno;
- setUp(redirection);
- SelectChannelConnector connector = new SelectChannelConnector();
- connector.setHost("127.0.0.1");
- connector.setPort(port);
-
- server.addConnector(connector);
- server.start();
- }
-
- /**
- * Fetches the specified <code>page</code> from the local Jetty server and
- * checks whether the HTTP response status code matches with the expected
- * code. Also use jsp pages for redirection.
- *
- * @param page
- * Page to be fetched.
- * @param expectedCode
- * HTTP response status code expected while fetching the page.
- */
- private void fetchPage(String page, int expectedCode) throws Exception {
- URL url = new URL("http", "127.0.0.1", port, page);
- CrawlDatum crawlDatum = new CrawlDatum();
-
- ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()),
- crawlDatum);
- int httpStatusCode = -1;
- if (crawlDatum.getMetaData().containsKey(Nutch.PROTOCOL_STATUS_CODE_KEY)) {
- httpStatusCode = Integer.parseInt(crawlDatum.getMetaData()
- .get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString());
- }
- Content content = out.getContent();
-
- assertEquals("HTTP Status Code for " + url, expectedCode, httpStatusCode);
-
- if (page.compareTo("/nonexists.html") != 0
- && page.compareTo("/brokenpage.jsp") != 0
- && page.compareTo("/redirection") != 0) {
- assertEquals("ContentType " + url, "text/html",
- content.getContentType());
- }
- }
}
diff --git a/src/test/org/apache/nutch/protocol/AbstractHttpProtocolPluginTest.java b/src/test/org/apache/nutch/protocol/AbstractHttpProtocolPluginTest.java
new file mode 100644
index 0000000..3a90e21
--- /dev/null
+++ b/src/test/org/apache/nutch/protocol/AbstractHttpProtocolPluginTest.java
@@ -0,0 +1,298 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.lang.invoke.MethodHandles;
+import java.net.InetSocketAddress;
+import java.net.ServerSocket;
+import java.net.Socket;
+import java.net.SocketException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.function.BiFunction;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.protocols.Response;
+import org.junit.After;
+import org.junit.Before;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Abstract class providing methods to easily implement unit tests for HTTP
+ * protocol plugins.
+ */
+public abstract class AbstractHttpProtocolPluginTest {
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
+
+ protected Protocol http;
+ protected ServerSocket server;
+ protected Configuration conf;
+ protected int defaultPort = 47505;
+
+ protected static final String responseHeader = "HTTP/1.1 200 OK\r\n";
+ protected static final String simpleContent = "Content-Type: text/html\r\n\r\nThis is a text.";
+ protected static final String notFound = "HTTP/1.1 404 Not Found\r\n"
+ + "Content-Type: text/html\r\n\r\n" //
+ + "<html>\n<head><title>404 Not Found</title></head>\n" //
+ + "<body>\n<h1>404 Not Found</h1>\n</body>\n</html>";
+ protected static final String redirect301 = "HTTP/1.1 301 Moved Permanently\r\n" //
+ + "Content-Type: text/html; charset=UTF-8\r\n" //
+ + "Content-Length: 0\r\n" //
+ + "Location: https://nutch.apache.org/\r\n\r\n";
+ protected static final String redirect302 = "HTTP/1.1 302 Found\r\n" //
+ + "Content-Type: text/html; charset=UTF-8\r\n" //
+ + "Content-Length: 0\r\n" //
+ + "Location: https://nutch.apache.org/\r\n\r\n";
+ protected static final String serverError = "HTTP/1.1 500 Internal Server Error\r\n" //
+ + "Server: Nutch Test\r\n" //
+ + "Content-Length: 21\r\n" //
+ + "Content-Type: text/html\r\n\r\n" //
+ + "Internal Server Error";
+ protected static final String badRequest = "HTTP/1.1 400 Bad request\r\n\r\n";
+
+ protected abstract String getPluginClassName();
+
+ @Before
+ public void setUp() throws Exception {
+ conf = new Configuration();
+ conf.addResource("nutch-default.xml");
+ /*
+ * plugin tests specific config file - needs to add the tested plugin to
+ * plugin.includes
+ */
+ conf.addResource("nutch-site-test.xml");
+ conf.setBoolean("store.http.headers", true);
+
+ http = new ProtocolFactory(conf)
+ .getProtocolById(getPluginClassName());
+ http.setConf(conf);
+ }
+
+ @After
+ public void tearDown() throws Exception {
+ server.close();
+ }
+
+ /**
+ * Starts the test server on the given port; responses come from responder.
+ *
+ * @param port
+ * Port number.
+ * @param responder
+ * function to return a response (byte[] containing HTTP response
+ * header and payload content) for a given request header represented
+ * as list of request header lines
+ * @param requestChecker
+ * verify request passed as list of HTTP header lines
+ * @throws Exception
+ */
+ protected void runServer(int port,
+ BiFunction<String, String[], byte[]> responder,
+ Predicate<List<String>> requestChecker) throws Exception {
+ server = new ServerSocket();
+ server.bind(new InetSocketAddress("127.0.0.1", port));
+ Pattern requestPattern = Pattern.compile("(?i)^GET\\s+(\\S+)");
+ while (true) {
+ LOG.info("Listening on port {}", port);
+ if (server.isClosed()) {
+ server = new ServerSocket();
+ server.bind(new InetSocketAddress("127.0.0.1", port));
+ }
+ Socket socket = server.accept();
+ LOG.info("Connection received");
+ try (BufferedReader in = new BufferedReader(new InputStreamReader(
+ socket.getInputStream(), UTF_8))) {
+
+ List<String> requestLines = new ArrayList<>();
+ String line;
+ while ((line = in.readLine()) != null) {
+ LOG.info("Request: {}", line);
+ if (line.trim().isEmpty()) {
+ break;
+ }
+ requestLines.add(line);
+ }
+ String requestPath = null;
+ Matcher m = requestPattern.matcher(requestLines.get(0));
+ if (m.find()) {
+ requestPath = m.group(1);
+ LOG.info("Requested path {}", requestPath);
+ }
+ byte[] response = badRequest.getBytes(UTF_8);
+ if (requestChecker != null && !requestChecker.test(requestLines)) {
+ LOG.warn("Request validation failed!");
+ response = "HTTP/1.1 500 Internal Server Error\r\n\r\nRequest validation failed!"
+ .getBytes(UTF_8);
+ } else if (requestPath == null) {
+ LOG.warn("No request path!");
+ // bad request
+ } else if (!requestPath.startsWith("/")) {
+ // bad request
+ LOG.warn("Request path must start with `/`");
+ } else {
+ response = responder.apply(requestPath,
+ requestLines.toArray(new String[requestLines.size()]));
+ if (response == null) {
+ LOG.warn("No response found for given path `{}`", requestPath);
+ response = notFound.getBytes(UTF_8);
+ }
+ }
+ socket.getOutputStream().write(response);
+ } catch (Exception e) {
+ LOG.error("Exception in test server:", e);
+ }
+ }
+ }
+
+ protected void launchServer(int port, BiFunction<String, String[], byte[]> responder,
+ Predicate<List<String>> requestChecker) throws InterruptedException {
+ Thread serverThread = new Thread(() -> {
+ try {
+ runServer(port, responder, requestChecker);
+ } catch (SocketException e) {
+ LOG.info("Socket on port {} closed: {}", port, e.getMessage());
+ } catch (Exception e) {
+ LOG.warn("Test server died:", e);
+ }
+ });
+ serverThread.start();
+ Thread.sleep(50);
+ }
+
+ protected void launchServer(Function<String, byte[]> responder)
+ throws InterruptedException {
+ launchServer(responder, null);
+ }
+
+ protected void launchServer(Function<String, byte[]> responder,
+ Predicate<List<String>> requestChecker) throws InterruptedException {
+ launchServer(defaultPort, responder, requestChecker);
+ }
+
+ protected void launchServer(int port, Function<String, byte[]> responder,
+ Predicate<List<String>> requestChecker) throws InterruptedException {
+ BiFunction<String, String[], byte[]> responderBiFunc = (String path, String[] ignoredHeaders) -> {
+ return responder.apply(path);
+ };
+ launchServer(port, responderBiFunc, requestChecker);
+ }
+
+ protected void launchServer(Map<String, byte[]> responses)
+ throws InterruptedException {
+ launchServer(defaultPort, (String requestPath) -> {
+ return responses.get(requestPath);
+ }, null);
+ }
+
+ protected void launchServer(String response) throws InterruptedException {
+ launchServer("/", response);
+ }
+
+ protected void launchServer(String path, String response)
+ throws InterruptedException {
+ launchServer(path, response.getBytes(UTF_8));
+ }
+
+ protected void launchServer(String path, byte[] response)
+ throws InterruptedException {
+ Map<String, byte[]> responses = new TreeMap<>();
+ responses.put(path, response);
+ launchServer(responses);
+ }
+
+ protected ProtocolOutput fetchPage(String page, int expectedCode)
+ throws Exception {
+ return fetchPage(defaultPort, page, expectedCode, null);
+ }
+
+ protected ProtocolOutput fetchPage(String page, int expectedCode,
+ String expectedContentType) throws Exception { // uses defaultPort
+ return fetchPage(defaultPort, page, expectedCode, expectedContentType);
+ }
+
+ /**
+ * Fetches <code>page</code> from the local test server and checks the HTTP
+ * status code and, for 200 responses, the Content-Type if one is expected.
+ *
+ * @param port
+ * port server is running on
+ * @param page
+ * Page to be fetched
+ * @param expectedCode
+ * HTTP response status code expected while fetching the page
+ * @param expectedContentType
+ * Expected Content-Type, or <code>null</code> to skip this check
+ * @return the protocol output returned by the plugin for the fetched page
+ */
+ protected ProtocolOutput fetchPage(int port, String page, int expectedCode,
+ String expectedContentType) throws Exception {
+ URL url = new URL("http", "127.0.0.1", port, page);
+ LOG.info("Fetching {}", url);
+ CrawlDatum crawlDatum = new CrawlDatum();
+ ProtocolOutput protocolOutput = http
+ .getProtocolOutput(new Text(url.toString()), crawlDatum);
+ int httpStatusCode = -1;
+ if (crawlDatum.getMetaData().containsKey(Nutch.PROTOCOL_STATUS_CODE_KEY)) {
+ httpStatusCode = Integer.parseInt(crawlDatum.getMetaData()
+ .get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString());
+ }
+ assertEquals("HTTP Status Code for " + url, expectedCode, httpStatusCode);
+ if (httpStatusCode == 200 && expectedContentType != null) {
+ Content content = protocolOutput.getContent();
+ assertEquals("ContentType " + url, expectedContentType, content.getContentType());
+ }
+ return protocolOutput;
+ }
+
+ public static String getHeaders(ProtocolOutput response) {
+ return response.getContent().getMetadata().get(Response.RESPONSE_HEADERS);
+ }
+
+ public static String getHeader(ProtocolOutput response, String header) {
+ return getHeader(getHeaders(response).split("\r\n"), header);
+ }
+
+ public static String getHeader(String[] headers, String header) {
+ for (String line : headers) {
+ String[] parts = line.split(": ", 2);
+ if (parts.length == 2 && parts[0].equals(header)) { // needs "name: value"
+ return parts[1];
+ }
+ }
+ return null;
+ }
+
+}