You are viewing a plain text version of this content. The canonical link for it is here.
Posted to svn@forrest.apache.org by rg...@apache.org on 2006/11/22 02:31:16 UTC
svn commit: r478001 - in
/forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest:
cli/CLI.java core/document/AbstractOutputDocument.java
core/document/DefaultOutputDocument.java
Author: rgardler
Date: Tue Nov 21 17:31:15 2006
New Revision: 478001
URL: http://svn.apache.org/viewvc?view=rev&rev=478001
Log:
Add a (very basic) crawler.
Modified:
forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/cli/CLI.java
forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/AbstractOutputDocument.java
forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/DefaultOutputDocument.java
Modified: forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/cli/CLI.java
URL: http://svn.apache.org/viewvc/forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/cli/CLI.java?view=diff&rev=478001&r1=478000&r2=478001
==============================================================================
--- forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/cli/CLI.java (original)
+++ forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/cli/CLI.java Tue Nov 21 17:31:15 2006
@@ -16,13 +16,19 @@
*/
package org.apache.forrest.cli;
+import java.io.IOException;
+import java.net.MalformedURLException;
import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.HashSet;
+import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.forrest.core.Controller;
import org.apache.forrest.core.IController;
import org.apache.forrest.core.document.AbstractOutputDocument;
+import org.apache.forrest.core.exception.ProcessingException;
/**
* A command line interface for Forrest.
@@ -31,6 +37,12 @@
public class CLI {
private static final Log log = LogFactory.getLog(CLI.class);
+ /**
+ * URIs (as strings) that have already been processed. Consulted before
+ * each request so that the same URI is not processed twice.
+ */
+ private static Set<String> processedUris = new HashSet<String>();
+
+ /**
+ * URIs (as strings) waiting to be processed: the initial request plus
+ * the local links collected from documents processed so far.
+ */
+ private static Set<String> unProcessedUris = new HashSet<String>();
+
+ /**
+ * Controller used to obtain the output document for each URI. It is
+ * assigned a new Controller before the crawl loop starts.
+ */
+ private static IController controller;
+
/**
* @param args
*/
@@ -45,19 +57,47 @@
try {
AbstractOutputDocument doc = null;
+ controller = new Controller();
System.out.println("\n Processing request for " + args[0]);
- final URI requestURI = new URI(args[0]);
- final IController controller = new Controller();
- doc = controller.getOutputDocument(requestURI);
-
- System.out.println("\n Resulting document for request " + args[0]
- + " is:\n");
- System.out.println(doc.getContentAsString());
-
+ unProcessedUris.add(args[0]);
+ while (unProcessedUris.size() > 0) {
+ processURIs(unProcessedUris);
+ }
} catch (final Exception e) {
e.printStackTrace();
log.error(e);
System.exit(1);
+ }
+ }
+
+ /**
+ * Processes a URI to get the response document. Any local links found in
+ * the document are added to the list of documents to be processed.
+ *
+ * @param uri
+ * @param controller
+ * @throws MalformedURLException
+ * @throws ProcessingException
+ * @throws IOException
+ * @throws URISyntaxException
+ */
+ private static void processURIs(final Set<String> uris)
+ throws MalformedURLException, ProcessingException, IOException,
+ URISyntaxException {
+ AbstractOutputDocument doc;
+ HashSet<String> processingUris = new HashSet<String>(uris);
+ unProcessedUris = new HashSet<String>();
+ for (String strUri : processingUris) {
+ URI uri = new URI(strUri);
+ if (!(processedUris.contains(strUri))) {
+ log.debug("Processing: " + strUri);
+ doc = controller.getOutputDocument(uri);
+ unProcessedUris.addAll(doc.getLocalDocumentLinks());
+ System.out.println("\n Resulting document for request " + uri
+ + " is:\n");
+ System.out.println(doc.getContentAsString());
+ processedUris.add(strUri);
+ }
}
}
Modified: forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/AbstractOutputDocument.java
URL: http://svn.apache.org/viewvc/forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/AbstractOutputDocument.java?view=diff&rev=478001&r1=478000&r2=478001
==============================================================================
--- forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/AbstractOutputDocument.java (original)
+++ forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/AbstractOutputDocument.java Tue Nov 21 17:31:15 2006
@@ -17,6 +17,7 @@
package org.apache.forrest.core.document;
import java.net.URI;
+import java.util.Set;
/**
* An output document is a single document that has been processed by Forrest
@@ -34,5 +35,14 @@
public URI getRequestURI() {
return this.requestURI;
}
+
+ /**
+ * Get the set of links to local documents found within this
+ * document. This is used to identify links that should
+ * be crawled when generating content.
+ *
+ * @return the links to local documents, as strings
+ */
+ public abstract Set<String> getLocalDocumentLinks();
}
Modified: forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/DefaultOutputDocument.java
URL: http://svn.apache.org/viewvc/forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/DefaultOutputDocument.java?view=diff&rev=478001&r1=478000&r2=478001
==============================================================================
--- forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/DefaultOutputDocument.java (original)
+++ forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/DefaultOutputDocument.java Tue Nov 21 17:31:15 2006
@@ -16,12 +16,24 @@
*/
package org.apache.forrest.core.document;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.log4j.Logger;
+
+import com.sun.org.apache.regexp.internal.RE;
+import com.sun.org.apache.regexp.internal.RESyntaxException;
+
/**
* The most basic of output documents. The document itself is nothing more than
* a String.
*
*/
public class DefaultOutputDocument extends AbstractOutputDocument {
+
+ Logger log = Logger.getLogger(DefaultOutputDocument.class);
public DefaultOutputDocument(final String content) {
this.setContent(content);
@@ -30,6 +42,34 @@
@Override
public String getContentAsString() {
return this.content;
+ }
+
+    /**
+     * Matches a simple HTML anchor whose only attribute is a double-quoted
+     * href and whose body is plain text. Group 1 captures the link target
+     * without any trailing "#fragment"; a pure fragment link such as
+     * "#top" does not match at all.
+     */
+    private static final Pattern ANCHOR_HREF = Pattern.compile(
+            "<[aA]\\s+href=\"([^\"#]+)(?:#[^\"]*)?\"\\s*>([^*<]+)</[aA]>");
+
+    /**
+     * Get the links that should be crawled from this document. Since the
+     * type of this document is not known (it's a string) it can be
+     * difficult to identify links. However, if the document appears to be
+     * an HTML string (it contains "html" or "HTML") then the href
+     * attributes of simple anchors are retrieved. Only local links are
+     * returned in the results: targets starting with http:// or https://
+     * are ignored, and any "#fragment" suffix is stripped from the target.
+     *
+     * @return the local link targets found; empty if the content is null,
+     *         does not look like HTML, or contains no matching anchors
+     */
+    @Override
+    public Set<String> getLocalDocumentLinks() {
+        final Set<String> results = new HashSet<String>();
+        final String content = getContentAsString();
+        if (content == null) {
+            // Nothing to scan; avoid a NullPointerException below.
+            return results;
+        }
+        if (content.contains("html") || content.contains("HTML")) {
+            final Matcher matcher = ANCHOR_HREF.matcher(content);
+            while (matcher.find()) {
+                final String href = matcher.group(1);
+                // NOTE: only http/https are treated as external; other
+                // schemes (mailto:, ftp:, ...) are not filtered here.
+                final boolean external = href.regionMatches(true, 0,
+                        "http://", 0, 7)
+                        || href.regionMatches(true, 0, "https://", 0, 8);
+                if (external) {
+                    log.debug("Ignoring non-local href: " + href);
+                } else {
+                    results.add(href);
+                    log.debug("Added local href: " + href);
+                }
+            }
+        }
+        return results;
}
}