You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2012/12/02 17:58:00 UTC
svn commit: r1416199 [3/3] - in /manifoldcf/trunk:
connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/
connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcraw...
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties?rev=1416199&r1=1416198&r2=1416199&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties Sun Dec 2 16:57:56 2012
@@ -52,8 +52,9 @@ WebcrawlerConnector.SessionBasedAccessCr
WebcrawlerConnector.LoginPages=Login pages
WebcrawlerConnector.LoginURLRegularExpression=Login URL regular expression
WebcrawlerConnector.PageType=Page type
-WebcrawlerConnector.FormNamelinkTargetRegularExpression=Form name/link target regular expression
+WebcrawlerConnector.FormNamelinkTargetRegularExpression=Identification regular expression
WebcrawlerConnector.OverrideFormParameters=Override form parameters
+WebcrawlerConnector.OverrideTargetURL=Override target URL
WebcrawlerConnector.NoLoginPagesSpecified=No login pages specified
WebcrawlerConnector.NoSessionBasedAccessCredentials=No session-based access credentials
WebcrawlerConnector.TrustCertificates=Trust certificates:
@@ -135,9 +136,10 @@ WebcrawlerConnector.ForLoginPage= for lo
WebcrawlerConnector.ForCredential= for credential #
WebcrawlerConnector.AddParameterToLoginPage=Add parameter to login page #
WebcrawlerConnector.AddLoginPageToCredential=Add login page to credential #
-WebcrawlerConnector.FormName=Form name
+WebcrawlerConnector.FormName=Form name/id/action
WebcrawlerConnector.LinkTarget=Link target
-WebcrawlerConnector.Redirection=Redirection
+WebcrawlerConnector.RedirectionTo=Redirection to
+WebcrawlerConnector.PageContent=Page content
WebcrawlerConnector.AddSessionAuthenticationUrlRegularExpression=Add session authentication url regular expression
WebcrawlerConnector.DeleteTrustUrlRegularExpression=Delete trust url regular expression #
WebcrawlerConnector.TrustEverything=Trust everything
@@ -158,7 +160,7 @@ WebcrawlerConnector.Exclusions=Exclusion
WebcrawlerConnector.Security=Security
WebcrawlerConnector.Metadata=Metadata
WebcrawlerConnector.FoundAnIllegalRegularExpressionIn=Found an illegal regular expression in
-WebcrawlerConnector.ErrorWas='. Error was:
+WebcrawlerConnector.ErrorWas=Error was:
WebcrawlerConnector.TypeInAnAccessToken=Type in an access token
WebcrawlerConnector.TypeInMetadataName=Type in metadata name
WebcrawlerConnector.TypeInMetadataValue=Type in metadata value
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties?rev=1416199&r1=1416198&r2=1416199&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties Sun Dec 2 16:57:56 2012
@@ -54,6 +54,7 @@ WebcrawlerConnector.LoginURLRegularExpre
WebcrawlerConnector.PageType=ページタイプ
WebcrawlerConnector.FormNamelinkTargetRegularExpression=フォーム名／リンクターゲット正規表現
WebcrawlerConnector.OverrideFormParameters=フォーム引数をオーバーライド
+WebcrawlerConnector.OverrideTargetURL=Override target URL
WebcrawlerConnector.NoLoginPagesSpecified=ãã°ã¤ã³ãã¼ã¸ãæå®ããã¦ãã¾ãã
WebcrawlerConnector.NoSessionBasedAccessCredentials=ã»ãã·ã§ã³ãã¼ã¹ã¢ã¯ã»ã¹ãµã¼ãã£ãã£ã±ã¼ããããã¾ãã
WebcrawlerConnector.TrustCertificates=ãã©ã¹ããµã¼ãã£ãã£ã±ã¼ãï¼
@@ -135,9 +136,10 @@ WebcrawlerConnector.ForLoginPage= ã
WebcrawlerConnector.ForCredential= èªè¨¼ #
WebcrawlerConnector.AddParameterToLoginPage=ãã°ã¤ã³ãã¼ã¸ã«å¼æ°ã追å #
WebcrawlerConnector.AddLoginPageToCredential=èªè¨¼ã«ãã°ã¤ã³ãã¼ã¸ã追å ï¼ #
-WebcrawlerConnector.FormName= ãã©ã¼ã å
-WebcrawlerConnector.LinkTarget= ãªã³ã¯ã¿ã¼ã²ãã
-WebcrawlerConnector.Redirection= ãªãã¤ã¬ã¯ã·ã§ã³
+WebcrawlerConnector.FormName=ãã©ã¼ã å
+WebcrawlerConnector.LinkTarget=ãªã³ã¯ã¿ã¼ã²ãã
+WebcrawlerConnector.RedirectionTo=ãªãã¤ã¬ã¯ã·ã§ã³
+WebcrawlerConnector.PageContent=Page content
WebcrawlerConnector.AddSessionAuthenticationUrlRegularExpression=ã»ãã·ã§ã³èªè¨¼URIæ£è¦è¡¨ç¾
WebcrawlerConnector.DeleteTrustUrlRegularExpression=ä¿¡ç¨URIæ£è¦è¡¨ç¾å¼ãåé¤ï¼ #
WebcrawlerConnector.TrustEverything=すべて信用
@@ -157,7 +159,7 @@ WebcrawlerConnector.Exclusions=é¤å�
WebcrawlerConnector.Security=セキュリティ
WebcrawlerConnector.Metadata=メタデータ
WebcrawlerConnector.FoundAnIllegalRegularExpressionIn=不正な正規表現式
-WebcrawlerConnector.ErrorWas=ãã¨ã©ã¼ï¼
+WebcrawlerConnector.ErrorWas=ã¨ã©ã¼ï¼
WebcrawlerConnector.TypeInAnAccessToken=アクセストークンの入力
WebcrawlerConnector.TypeInMetadataName=メタデータ名の入力
WebcrawlerConnector.TypeInMetadataValue=メタデータ値の入力
Added: manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/MockSessionWebService.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/MockSessionWebService.java?rev=1416199&view=auto
==============================================================================
--- manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/MockSessionWebService.java (added)
+++ manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/MockSessionWebService.java Sun Dec 2 16:57:56 2012
@@ -0,0 +1,223 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.webcrawler_tests;
+
+import org.eclipse.jetty.servlet.ServletHolder;
+import org.eclipse.jetty.server.Server;
+import org.eclipse.jetty.servlet.ServletContextHandler;
+import org.eclipse.jetty.util.thread.QueuedThreadPool;
+
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+
+import java.io.*;
+import java.util.*;
+
+/** Mock web service that requires session authentication: an embedded Jetty server on port 8191 hosting one SessionWebServlet under the /web context path. */
+public class MockSessionWebService
+{
+ Server server; // Embedded Jetty server; created in the constructor, started/stopped via start()/stop()
+ SessionWebServlet servlet; // The single servlet registered to answer requests under the /web context
+
+ public MockSessionWebService(int numContentDocs, String userName, String password) // Builds (does not start) the server; all three arguments are handed to SessionWebServlet
+ {
+ server = new Server(8191); // Port is fixed at 8191
+ server.setThreadPool(new QueuedThreadPool(100));
+ servlet = new SessionWebServlet(numContentDocs,userName,password);
+ ServletContextHandler context = new ServletContextHandler(ServletContextHandler.SESSIONS); // SESSIONS option: context is created with session support
+ context.setContextPath("/web");
+ server.setHandler(context);
+ context.addServlet(new ServletHolder(servlet), "/"); // Servlet is mapped at "/" within the /web context
+ }
+
+ public void start() throws Exception // Starts the embedded server
+ {
+ server.start();
+ }
+
+ public void stop() throws Exception // Stops the embedded server
+ {
+ server.stop();
+ }
+
+
+ public static class SessionWebServlet extends HttpServlet // Servlet that stores the expected login credentials and the number of content pages
+ {
+ protected final int contentPageCount; // Number of content pages (constructor argument)
+ protected final String loginUser; // User name supplied at construction
+ protected final String loginPassword; // Password supplied at construction
+
+ public SessionWebServlet(int contentPageCount, String loginUser, String loginPassword) // Stores the three values unchanged
+ {
+ this.contentPageCount = contentPageCount;
+ this.loginUser = loginUser;
+ this.loginPassword = loginPassword;
+ }
+
+ @Override
+ public void service(HttpServletRequest req, HttpServletResponse res) // Dispatches on the request path; IOExceptions are printed and rethrown
+ throws IOException
+ {
+ try {
+ // This mock web service resolves the following urls:
+ // /index.html (an index of all N content pages)
+ // /protectedcontent.html?id=N (N content pages)
+ // /loginpage.html (the login page, rendered either as a form,
+ // or as a redirection back to the content page, or as a redirection to the index page)
+
+ // Get path part of request URL
+ // MHL -- path extraction is not written yet: pathPart is always "", so none of the branches below can match as the code stands
+ String pathPart = "";
+
+ if (pathPart.equals("/loginpage.html"))
+ {
+ // Login page logic
+ String id = req.getParameter("id");
+ Integer idNumber; // null when no "id" parameter was supplied
+ if (id == null)
+ idNumber = null;
+ else
+ idNumber = new Integer(id); // NOTE(review): a non-numeric id would raise NumberFormatException here; nothing in this method catches it
+
+ String userName = req.getParameter("user");
+ String password = req.getParameter("password");
+
+ // MHL -- userName/password/idNumber are read but not yet checked or used
+ }
+ else if (pathPart.equals("/protectedcontent.html"))
+ {
+ // Content page logic
+ String id = req.getParameter("id");
+ if (id == null)
+ {
+ generateBadArgumentResponse(res);
+ }
+ // MHL -- NOTE(review): there is no return/else after the bad-argument response above, so control reaches this point even when id is null
+ }
+ else if (pathPart.equals("/index.html"))
+ {
+ // Index logic
+ // MHL
+ }
+ }
+ catch (IOException e)
+ {
+ e.printStackTrace(); // Trace is printed before the exception is rethrown to the container
+ throw e;
+ }
+ }
+
+ protected static void generateBadArgumentResponse(HttpServletResponse res) // Intended response for a missing/invalid argument
+ throws IOException
+ {
+ // MHL -- empty stub: nothing is written to the response yet
+ }
+
+ protected static void generateLoginRedirectPage(HttpServletResponse res, Integer returnID) // Picks where to go after login: the index when returnID is null, else the content page for returnID
+ throws IOException
+ {
+ String redirectTarget;
+ if (returnID == null)
+ redirectTarget = "/web/indexpage.html"; // NOTE(review): service() lists /index.html, not indexpage.html -- confirm which path is intended
+ else
+ redirectTarget = "/web/protectedcontent.html?id="+returnID;
+
+ // MHL -- redirectTarget is computed but no redirect is sent yet
+ }
+
+ protected static void generateLoginFormPage(HttpServletResponse res, Integer returnID) // Writes an HTML login form; the form action carries ?id=returnID when returnID is non-null
+ throws IOException
+ {
+ String actionURI = "/web/loginpage.html";
+ if (returnID != null)
+ actionURI += "?id="+returnID;
+ res.setStatus(HttpServletResponse.SC_OK);
+ res.setContentType("text/html; charset=utf-8");
+
+ res.getWriter().printf("<html>\n");
+ res.getWriter().printf(" <body>\n");
+ res.getWriter().printf(" <form name=\"login\" action=\""+actionURI+"\">\n"); // Form is named "login"; fields below are "user" and "password"
+ res.getWriter().printf(" User name: <input type=\"text\" name=\"user\" value=\"\" size=\"20\"/>\n");
+ res.getWriter().printf(" Password: <input type=\"password\" name=\"password\" value=\"\" size=\"20\"/>\n");
+ res.getWriter().printf(" <input type=\"submit\"/>\n");
+ res.getWriter().printf(" </form>\n");
+ res.getWriter().printf(" </body>\n");
+ res.getWriter().printf("</html>\n");
+
+ res.getWriter().flush();
+
+ }
+
+ protected static void generateContentRedirectPage(HttpServletResponse res, int itemNumber) // Intended redirect from a content page to the login page, carrying the item number as id
+ throws IOException
+ {
+ String redirectTarget = "/web/loginpage.html?id="+itemNumber;
+ // MHL -- redirectTarget is computed but no redirect is sent yet
+ }
+
+ protected static void generateContentDisplayPage(HttpServletResponse res, int itemNumber) // Writes a 200 HTML page whose body names the given item number
+ throws IOException
+ {
+ res.setStatus(HttpServletResponse.SC_OK);
+ res.setContentType("text/html; charset=utf-8");
+
+ res.getWriter().printf("<html>\n");
+ res.getWriter().printf(" <body>This is the document content for item "+itemNumber+"</body>");
+ res.getWriter().printf("</html>\n");
+
+ res.getWriter().flush();
+ }
+
+ protected static void generateIndexRedirectPage(HttpServletResponse res) // Intended redirect from the index to the login page (no id parameter)
+ throws IOException
+ {
+ String redirectTarget = "/web/loginpage.html";
+ // MHL -- redirectTarget is computed but no redirect is sent yet
+ }
+
+ protected static void generateIndexDisplayPage(HttpServletResponse res, int countItems) // Writes a 200 HTML page containing one link per item, numbered 0 .. countItems-1
+ throws IOException
+ {
+ res.setStatus(HttpServletResponse.SC_OK);
+ res.setContentType("text/html; charset=utf-8");
+
+ res.getWriter().printf("<html>\n");
+ res.getWriter().printf(" <body>\n");
+
+ for (int i = 0; i < countItems; i++)
+ {
+ generateContentLink(res,i);
+ }
+
+ res.getWriter().printf(" </body>\n");
+ res.getWriter().printf("</html>\n");
+ res.getWriter().flush();
+
+ }
+
+ protected static void generateContentLink(HttpServletResponse res, int itemNumber) // Writes one anchor pointing at /web/protectedcontent.html?id=itemNumber
+ throws IOException
+ {
+ res.getWriter().printf(" <a href=\"/web/protectedcontent.html?id="+itemNumber+"\">Item "+itemNumber+"</a>\n");
+ }
+
+ }
+}
Propchange: manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/MockSessionWebService.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/MockSessionWebService.java
------------------------------------------------------------------------------
svn:keywords = Id
Modified: manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/MockWebService.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/MockWebService.java?rev=1416199&r1=1416198&r2=1416199&view=diff
==============================================================================
--- manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/MockWebService.java (original)
+++ manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/MockWebService.java Sun Dec 2 16:57:56 2012
@@ -73,90 +73,90 @@ public class MockWebService
throws IOException
{
try {
- String resourceName = null;
-
- String site = req.getParameter("site"); // Site ID
- if (site == null)
- throw new IOException("Site ID parameter must be set");
-
- String level = req.getParameter("level"); // Level #
- if (site == null)
- throw new IOException("Level number parameter must be set");
-
- String item = req.getParameter("item"); // Item #
- if (item == null)
- throw new IOException("Item number parameter must be set");
+ String resourceName = null;
+
+ String site = req.getParameter("site"); // Site ID
+ if (site == null)
+ throw new IOException("Site ID parameter must be set");
+
+ String level = req.getParameter("level"); // Level #
+ if (site == null)
+ throw new IOException("Level number parameter must be set");
+
+ String item = req.getParameter("item"); // Item #
+ if (item == null)
+ throw new IOException("Item number parameter must be set");
- int theLevel;
- try
- {
- theLevel = Integer.parseInt(level);
- }
- catch (NumberFormatException e)
- {
- throw new IOException("Level number must be a number: "+level);
- }
-
- int theItem;
- try
- {
- theItem = Integer.parseInt(item);
- }
- catch (NumberFormatException e)
- {
- throw new IOException("Item number must be a number: "+item);
- }
+ int theLevel;
+ try
+ {
+ theLevel = Integer.parseInt(level);
+ }
+ catch (NumberFormatException e)
+ {
+ throw new IOException("Level number must be a number: "+level);
+ }
+
+ int theItem;
+ try
+ {
+ theItem = Integer.parseInt(item);
+ }
+ catch (NumberFormatException e)
+ {
+ throw new IOException("Item number must be a number: "+item);
+ }
- // Formulate the response.
- // First, calculate the number of docs on the current level
- int maxDocsThisLevel = 1;
- for (int i = 0 ; i < theLevel ; i++)
- {
- maxDocsThisLevel *= docsPerLevel;
- }
- if (theItem >= maxDocsThisLevel)
- // Not legal
- throw new IOException("Doc number too big: "+theItem+" ; level "+theLevel+" ; docsPerLevel "+docsPerLevel);
-
- // Generate the page
- res.setStatus(HttpServletResponse.SC_OK);
- res.setContentType("text/html; charset=utf-8");
- res.getWriter().printf("<html>\n");
- res.getWriter().printf(" <body>\n");
-
- res.getWriter().printf("This is doc number "+theItem+" and level number "+theLevel+" in site "+site+"\n");
-
- // Generate links to all parents
- int parentLevel = theLevel;
- int parentItem = theItem;
- while (parentLevel > 0)
- {
- parentLevel--;
- parentItem /= docsPerLevel;
- generateLink(res,site,parentLevel,parentItem);
- }
-
- // Temporary: Prevent links to children deeper than a certain level; this is to help
- // the debug process
- if (theLevel < 9)
- {
- // Generate links to direct children
- for (int i = 0; i < docsPerLevel; i++)
+ // Formulate the response.
+ // First, calculate the number of docs on the current level
+ int maxDocsThisLevel = 1;
+ for (int i = 0 ; i < theLevel ; i++)
{
- int docNumber = i + theItem * docsPerLevel;
- generateLink(res,site,theLevel+1,docNumber);
+ maxDocsThisLevel *= docsPerLevel;
}
- }
-
- // Generate some limited cross-links to other items at this level
- for (int i = theItem; i < maxDocsThisLevel && i < theItem + docsPerLevel; i++)
- {
- generateLink(res,site,theLevel,i);
- }
-
- res.getWriter().printf(" </body>\n");
- res.getWriter().printf("</html>\n");
- res.getWriter().flush();
+ if (theItem >= maxDocsThisLevel)
+ // Not legal
+ throw new IOException("Doc number too big: "+theItem+" ; level "+theLevel+" ; docsPerLevel "+docsPerLevel);
+
+ // Generate the page
+ res.setStatus(HttpServletResponse.SC_OK);
+ res.setContentType("text/html; charset=utf-8");
+ res.getWriter().printf("<html>\n");
+ res.getWriter().printf(" <body>\n");
+
+ res.getWriter().printf("This is doc number "+theItem+" and level number "+theLevel+" in site "+site+"\n");
+
+ // Generate links to all parents
+ int parentLevel = theLevel;
+ int parentItem = theItem;
+ while (parentLevel > 0)
+ {
+ parentLevel--;
+ parentItem /= docsPerLevel;
+ generateLink(res,site,parentLevel,parentItem);
+ }
+
+ // Temporary: Prevent links to children deeper than a certain level; this is to help
+ // the debug process
+ if (theLevel < 9)
+ {
+ // Generate links to direct children
+ for (int i = 0; i < docsPerLevel; i++)
+ {
+ int docNumber = i + theItem * docsPerLevel;
+ generateLink(res,site,theLevel+1,docNumber);
+ }
+ }
+
+ // Generate some limited cross-links to other items at this level
+ for (int i = theItem; i < maxDocsThisLevel && i < theItem + docsPerLevel; i++)
+ {
+ generateLink(res,site,theLevel,i);
+ }
+
+ res.getWriter().printf(" </body>\n");
+ res.getWriter().printf("</html>\n");
+ res.getWriter().flush();
}
catch (IOException e)
{
Added: manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/SessionTester.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/SessionTester.java?rev=1416199&view=auto
==============================================================================
--- manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/SessionTester.java (added)
+++ manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/SessionTester.java Sun Dec 2 16:57:56 2012
@@ -0,0 +1,180 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.webcrawler_tests;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.agents.interfaces.*;
+import org.apache.manifoldcf.crawler.interfaces.*;
+import org.apache.manifoldcf.crawler.system.ManifoldCF;
+
+import java.io.*;
+import java.util.*;
+
+import org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConnector;
+import org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConfig;
+
+/** Run a session-based crawl: configures a web crawler connection with session-login rules, crawls http://localhost:8191/web/index.html into a null output, and checks the processed-document count. */
+public class SessionTester
+{
+ protected org.apache.manifoldcf.crawler.tests.ManifoldCFInstance instance; // Test harness used to wait for job completion and deletion
+
+ public SessionTester(org.apache.manifoldcf.crawler.tests.ManifoldCFInstance instance) // Stores the harness instance for use by executeTest()
+ {
+ this.instance = instance;
+ }
+
+ public void executeTest() // Creates connections and a job, runs the crawl, verifies the count, then deletes the job
+ throws Exception
+ {
+ // Hey, we were able to install the web connector etc.
+ // Now, create a local test job and run it.
+ IThreadContext tc = ThreadContextFactory.make();
+
+ // Create a web crawler repository connection, and save it.
+ IRepositoryConnectionManager mgr = RepositoryConnectionManagerFactory.make(tc);
+ IRepositoryConnection conn = mgr.create();
+ conn.setName("Web Connection"); // This name is referenced again by job.setConnectionName below
+ conn.setDescription("Web Connection");
+ conn.setClassName("org.apache.manifoldcf.crawler.connectors.webcrawler.WebcrawlerConnector");
+ conn.setMaxConnections(100);
+ ConfigParams cp = conn.getConfigParams();
+
+ cp.setParameter(WebcrawlerConfig.PARAMETER_EMAIL,"someone@somewhere.com");
+ cp.setParameter(WebcrawlerConfig.PARAMETER_ROBOTSUSAGE,"none");
+
+ // Set session auth settings: one session-type access credential applying to URLs matching "/web/"
+ ConfigurationNode accessCredential = new ConfigurationNode(WebcrawlerConfig.NODE_ACCESSCREDENTIAL);
+ accessCredential.setAttribute(WebcrawlerConfig.ATTR_TYPE,WebcrawlerConfig.ATTRVALUE_SESSION);
+ accessCredential.setAttribute(WebcrawlerConfig.ATTR_URLREGEXP,"/web/");
+
+ // Add auth pages to accessCredential node; each is appended as the last child, in the order written here
+
+ // Redirection to login page (content URL with a query string -> login URL with a query string)
+ ConfigurationNode redirectToLogin = new ConfigurationNode(WebcrawlerConfig.NODE_AUTHPAGE);
+ redirectToLogin.setAttribute(WebcrawlerConfig.ATTR_URLREGEXP,"/protectedcontent\\.html\\?");
+ redirectToLogin.setAttribute(WebcrawlerConfig.ATTR_TYPE,WebcrawlerConfig.ATTRVALUE_REDIRECTION);
+ redirectToLogin.setAttribute(WebcrawlerConfig.ATTR_MATCHREGEXP,"/loginpage\\.html\\?");
+ accessCredential.addChild(accessCredential.getChildCount(),redirectToLogin);
+
+ // Redirection to login page from index (both patterns are anchored with $, i.e. no query string)
+ ConfigurationNode redirectFromIndex = new ConfigurationNode(WebcrawlerConfig.NODE_AUTHPAGE);
+ redirectFromIndex.setAttribute(WebcrawlerConfig.ATTR_URLREGEXP,"/index\\.html$");
+ redirectFromIndex.setAttribute(WebcrawlerConfig.ATTR_TYPE,WebcrawlerConfig.ATTRVALUE_REDIRECTION);
+ redirectFromIndex.setAttribute(WebcrawlerConfig.ATTR_MATCHREGEXP,"/loginpage\\.html$");
+ accessCredential.addChild(accessCredential.getChildCount(),redirectFromIndex);
+
+ // Login page (form type; the URL pattern accepts the login URL with or without a query string)
+ ConfigurationNode loginPage = new ConfigurationNode(WebcrawlerConfig.NODE_AUTHPAGE);
+ loginPage.setAttribute(WebcrawlerConfig.ATTR_URLREGEXP,"/loginpage\\.html(\\?|$)");
+ loginPage.setAttribute(WebcrawlerConfig.ATTR_TYPE,WebcrawlerConfig.ATTRVALUE_FORM);
+ loginPage.setAttribute(WebcrawlerConfig.ATTR_MATCHREGEXP,"/loginpage\\.html(\\?|$)");
+ // Set credentials ("foo"/"bar" presumably must equal what the mock web service was constructed with -- confirm against the calling test)
+ ConfigurationNode userParameter = new ConfigurationNode(WebcrawlerConfig.NODE_AUTHPARAMETER);
+ userParameter.setAttribute(WebcrawlerConfig.ATTR_NAMEREGEXP,"user");
+ userParameter.setAttribute(WebcrawlerConfig.ATTR_VALUE,"foo");
+ loginPage.addChild(loginPage.getChildCount(),userParameter);
+ ConfigurationNode passwordParameter = new ConfigurationNode(WebcrawlerConfig.NODE_AUTHPARAMETER);
+ passwordParameter.setAttribute(WebcrawlerConfig.ATTR_NAMEREGEXP,"password");
+ passwordParameter.setAttribute(WebcrawlerConfig.ATTR_VALUE,"bar");
+ loginPage.addChild(loginPage.getChildCount(),passwordParameter);
+ accessCredential.addChild(accessCredential.getChildCount(),loginPage);
+
+ // Redirection from login page to content
+ ConfigurationNode redirectFromLogin = new ConfigurationNode(WebcrawlerConfig.NODE_AUTHPAGE);
+ redirectFromLogin.setAttribute(WebcrawlerConfig.ATTR_URLREGEXP,"/loginpage\\.html\\?");
+ redirectFromLogin.setAttribute(WebcrawlerConfig.ATTR_TYPE,WebcrawlerConfig.ATTRVALUE_REDIRECTION);
+ redirectFromLogin.setAttribute(WebcrawlerConfig.ATTR_MATCHREGEXP,"/protectedcontent\\.html\\?");
+ accessCredential.addChild(accessCredential.getChildCount(),redirectFromLogin);
+
+ // Redirection from login page to index
+ ConfigurationNode redirectToIndexFromLogin = new ConfigurationNode(WebcrawlerConfig.NODE_AUTHPAGE);
+ redirectToIndexFromLogin.setAttribute(WebcrawlerConfig.ATTR_URLREGEXP,"/loginpage\\.html$");
+ redirectToIndexFromLogin.setAttribute(WebcrawlerConfig.ATTR_TYPE,WebcrawlerConfig.ATTRVALUE_REDIRECTION);
+ redirectToIndexFromLogin.setAttribute(WebcrawlerConfig.ATTR_MATCHREGEXP,"/index\\.html$");
+ accessCredential.addChild(accessCredential.getChildCount(),redirectToIndexFromLogin);
+
+ cp.addChild(cp.getChildCount(),accessCredential); // Attach the finished credential node to the connection's config params
+
+ // Now, save
+ mgr.save(conn);
+
+ // Create a basic null output connection, and save it.
+ IOutputConnectionManager outputMgr = OutputConnectionManagerFactory.make(tc);
+ IOutputConnection outputConn = outputMgr.create();
+ outputConn.setName("Null Connection"); // This name is referenced again by job.setOutputConnectionName below
+ outputConn.setDescription("Null Connection");
+ outputConn.setClassName("org.apache.manifoldcf.agents.output.nullconnector.NullConnector");
+ outputConn.setMaxConnections(100);
+ // Now, save
+ outputMgr.save(outputConn);
+
+ // Create a job that ties the two connections saved above together.
+ IJobManager jobManager = JobManagerFactory.make(tc);
+ IJobDescription job = jobManager.createJob();
+ job.setDescription("Test Job");
+ job.setConnectionName("Web Connection");
+ job.setOutputConnectionName("Null Connection");
+ job.setType(job.TYPE_SPECIFIED);
+ job.setStartMethod(job.START_DISABLE);
+ job.setHopcountMode(job.HOPCOUNT_NEVERDELETE);
+
+ // Now, set up the document specification.
+ DocumentSpecification ds = job.getSpecification();
+
+ // Set up the seed (port 8191 and the /web path are hard-coded here)
+ SpecificationNode sn = new SpecificationNode(WebcrawlerConfig.NODE_SEEDS);
+ sn.setValue("http://localhost:8191/web/index.html\n");
+ ds.addChild(ds.getChildCount(),sn);
+
+ sn = new SpecificationNode(WebcrawlerConfig.NODE_INCLUDES); // Include pattern ".*"
+ sn.setValue(".*\n");
+ ds.addChild(ds.getChildCount(),sn);
+
+ sn = new SpecificationNode(WebcrawlerConfig.NODE_INCLUDESINDEX); // Index-include pattern ".*"
+ sn.setValue(".*\n");
+ ds.addChild(ds.getChildCount(),sn);
+
+ // Set up the output specification.
+ OutputSpecification os = job.getOutputSpecification();
+ // Null output connections have no output specification, so this is a no-op.
+
+ // Save the job.
+ jobManager.save(job);
+
+ // Now, start the job, and wait until it completes.
+ long startTime = System.currentTimeMillis();
+ jobManager.manualStart(job.getID());
+ instance.waitJobInactiveNative(jobManager,job.getID(),600000L); // 600000L is presumably a timeout in milliseconds -- confirm against ManifoldCFInstance
+ System.err.println("Crawl required "+new Long(System.currentTimeMillis()-startTime).toString()+" milliseconds");
+
+ // Check to be sure we actually processed the right number of documents (101 is presumably 1 index page + 100 content pages -- confirm against the mock service setup).
+ JobStatus status = jobManager.getStatus(job.getID());
+ if (status.getDocumentsProcessed() != 101)
+ {
+ throw new ManifoldCFException("Wrong number of documents processed - expected 101, saw "+new Long(status.getDocumentsProcessed()).toString());
+ }
+
+ // Now, delete the job.
+ jobManager.deleteJob(job.getID());
+ instance.waitJobDeletedNative(jobManager,job.getID(),600000L); // Same 600000L wait limit as above
+
+ // Cleanup is automatic by the base class, so we can feel free to leave jobs and connections lying around.
+ }
+
+}
Propchange: manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/SessionTester.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/SessionTester.java
------------------------------------------------------------------------------
svn:keywords = Id