You are viewing a plain text version of this content. The canonical link for it is here.
Posted to svn@forrest.apache.org by rg...@apache.org on 2006/11/22 02:31:16 UTC
svn commit: r478001 - in /forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest: cli/CLI.java core/document/AbstractOutputDocument.java core/document/DefaultOutputDocument.java

Author: rgardler
Date: Tue Nov 21 17:31:15 2006
New Revision: 478001

URL: http://svn.apache.org/viewvc?view=rev&rev=478001
Log:
Add a (very basic) crawler.

Modified:
    forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/cli/CLI.java
    forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/AbstractOutputDocument.java
    forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/DefaultOutputDocument.java

Modified: forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/cli/CLI.java
URL: http://svn.apache.org/viewvc/forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/cli/CLI.java?view=diff&rev=478001&r1=478000&r2=478001
==============================================================================
--- forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/cli/CLI.java (original)
+++ forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/cli/CLI.java Tue Nov 21 17:31:15 2006
@@ -16,13 +16,19 @@
  */
 package org.apache.forrest.cli;
 
+import java.io.IOException;
+import java.net.MalformedURLException;
 import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.HashSet;
+import java.util.Set;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.forrest.core.Controller;
 import org.apache.forrest.core.IController;
 import org.apache.forrest.core.document.AbstractOutputDocument;
+import org.apache.forrest.core.exception.ProcessingException;
 
 /**
  * A command line interface for Forrest.
@@ -31,6 +37,12 @@
 public class CLI {
 	private static final Log log = LogFactory.getLog(CLI.class);
 
+	private static Set<String> processedUris = new HashSet<String>();
+
+	private static Set<String> unProcessedUris = new HashSet<String>();
+
+	private static IController controller;
+
 	/**
 	 * @param args
 	 */
@@ -45,19 +57,47 @@
 
 		try {
 			AbstractOutputDocument doc = null;
+			controller = new Controller();
 			System.out.println("\n Processing request for " + args[0]);
-			final URI requestURI = new URI(args[0]);
-			final IController controller = new Controller();
-			doc = controller.getOutputDocument(requestURI);
-
-			System.out.println("\n Resulting document for request " + args[0]
-					+ " is:\n");
-			System.out.println(doc.getContentAsString());
-
+			unProcessedUris.add(args[0]);
+			while (unProcessedUris.size() > 0) {
+				processURIs(unProcessedUris);
+			}
 		} catch (final Exception e) {
 			e.printStackTrace();
 			log.error(e);
 			System.exit(1);
+		}
+	}
+
+	/**
+	 * Processes each URI in the given set to get its response document. Any
+	 * local links found in a document are queued in unProcessedUris for a
+	 * later pass; URIs already in processedUris are skipped.
+	 * 
+	 * @param uris the URIs to process in this pass
+	 * @throws MalformedURLException
+	 * @throws ProcessingException
+	 * @throws IOException
+	 * @throws URISyntaxException if an entry of uris is not a valid URI
+	 */
+	private static void processURIs(final Set<String> uris)
+			throws MalformedURLException, ProcessingException, IOException,
+			URISyntaxException {
+		AbstractOutputDocument doc;
+		HashSet<String> processingUris = new HashSet<String>(uris); // snapshot to iterate over
+		unProcessedUris = new HashSet<String>(); // fresh queue; links found below go here
+		for (String strUri : processingUris) {
+			URI uri = new URI(strUri);
+			if (!(processedUris.contains(strUri))) {
+				log.debug("Processing: " + strUri);
+				doc = controller.getOutputDocument(uri);
+				unProcessedUris.addAll(doc.getLocalDocumentLinks());
+				System.out.println("\n Resulting document for request " + uri
+						+ " is:\n");
+				System.out.println(doc.getContentAsString());
+				processedUris.add(strUri); // mark done so later passes skip it
+			}
 		}
 	}
 

Modified: forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/AbstractOutputDocument.java
URL: http://svn.apache.org/viewvc/forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/AbstractOutputDocument.java?view=diff&rev=478001&r1=478000&r2=478001
==============================================================================
--- forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/AbstractOutputDocument.java (original)
+++ forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/AbstractOutputDocument.java Tue Nov 21 17:31:15 2006
@@ -17,6 +17,7 @@
 package org.apache.forrest.core.document;
 
 import java.net.URI;
+import java.util.Set;
 
 /**
  * An output document is a single document that has been processed by Forrest
@@ -34,5 +35,14 @@
 	public URI getRequestURI() {
 		return this.requestURI;
 	}
+
+	/**
+	 * Get the set of links to local documents within this
+	 * document. This is used to identify links that should
+	 * be crawled when generating content.
+	 * 
+	 * @return the links to local documents found in this document
+	 */
+	public abstract Set<String> getLocalDocumentLinks();
 
 }

Modified: forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/DefaultOutputDocument.java
URL: http://svn.apache.org/viewvc/forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/DefaultOutputDocument.java?view=diff&rev=478001&r1=478000&r2=478001
==============================================================================
--- forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/DefaultOutputDocument.java (original)
+++ forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/DefaultOutputDocument.java Tue Nov 21 17:31:15 2006
@@ -16,12 +16,24 @@
  */
 package org.apache.forrest.core.document;
 
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.log4j.Logger;
+
+import com.sun.org.apache.regexp.internal.RE;
+import com.sun.org.apache.regexp.internal.RESyntaxException;
+
 /**
  * The most basic of output documents. The document itself is nothing more than
  * a String.
  * 
  */
 public class DefaultOutputDocument extends AbstractOutputDocument {
+	
+	Logger log = Logger.getLogger(DefaultOutputDocument.class);
 
 	public DefaultOutputDocument(final String content) {
 		this.setContent(content);
@@ -30,6 +42,34 @@
 	@Override
 	public String getContentAsString() {
 		return this.content;
+	}
+
+	/**
+	 * Get the links that should be crawled from this document. The content is
+	 * only a string, so links are found heuristically: if it mentions "html",
+	 * the href of each simple anchor (href as sole attribute, no fragment) is
+	 * collected. Absolute http(s) URLs are non-local and are skipped.
+	 * @return the hrefs of local links; empty if none were found
+	 */
+	@Override
+	public Set<String> getLocalDocumentLinks() {
+		Set<String> results = new HashSet<String>();
+		String content = getContentAsString();
+		if (content != null && (content.contains("html") || content.contains("HTML"))) {
+			// [aA], not [a|A]: inside a character class '|' is a literal.
+			Matcher matcher = Pattern.compile(
+					"<[aA]\\s*href=\"([^\"#]+)\"\\s*>([^*<]+)</[aA]>").matcher(content);
+			while (matcher.find()) {
+				String href = matcher.group(1);
+				if (href.startsWith("http://") || href.startsWith("https://")) {
+					log.debug("Ignoring non-local href: " + href);
+				} else {
+					results.add(href);
+					log.debug("Added local href: " + href);
+				}
+			}
+		}
+		return results;
 	}
 
 }