You are viewing a plain text version of this content. The canonical link for it is here.
Posted to cvs@cocoon.apache.org by up...@apache.org on 2003/10/06 14:40:14 UTC
cvs commit: cocoon-2.1/src/java/org/apache/cocoon/bean/helpers OutputStreamListener.java
upayavira 2003/10/06 05:40:14
Modified: src/java/org/apache/cocoon/bean CocoonBean.java
src/java/org/apache/cocoon/bean/helpers
OutputStreamListener.java
Log:
Moved crawling code into a separate crawler class. This has made it possible to implement all of my proposed reporting options. The CLI now reports:
* [no. of pages generated/no. of pages left] [new links in page/total links in page] page URI
Revision Changes Path
1.32 +52 -75 cocoon-2.1/src/java/org/apache/cocoon/bean/CocoonBean.java
Index: CocoonBean.java
===================================================================
RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/bean/CocoonBean.java,v
retrieving revision 1.31
retrieving revision 1.32
diff -u -r1.31 -r1.32
--- CocoonBean.java 1 Oct 2003 20:27:49 -0000 1.31
+++ CocoonBean.java 6 Oct 2003 12:40:14 -0000 1.32
@@ -53,6 +53,7 @@
import org.apache.cocoon.Constants;
import org.apache.cocoon.ResourceNotFoundException;
import org.apache.cocoon.ProcessingException;
+import org.apache.cocoon.bean.helpers.Crawler;
import org.apache.cocoon.bean.helpers.DelayedOutputStream;
import org.apache.cocoon.components.notification.SimpleNotifyingBean;
import org.apache.cocoon.components.notification.Notifier;
@@ -95,7 +96,6 @@
private boolean precompileOnly = false;
private boolean confirmExtension = true;
private String defaultFilename = Constants.INDEX_URI;
- private List targets = new ArrayList();
private boolean brokenLinkGenerate = false;
private String brokenLinkExtension = "";
private List excludePatterns = new ArrayList();
@@ -103,12 +103,15 @@
private List includeLinkExtensions = null;
// Internal Objects
- private Map allProcessedLinks;
- private Map allTranslatedLinks;
private boolean initialized;
private List listeners = new ArrayList();
private boolean verbose;
SourceResolver sourceResolver;
+ private Crawler crawler;
+
+ public CocoonBean() {
+ this.crawler = new Crawler();
+ }
//
// INITIALISATION METHOD
@@ -118,7 +121,7 @@
if (this.initialized == false) {
super.initialize();
- if (targets.size() == 0 && !precompileOnly) {
+ if (crawler.getRemainingCount() == 0 && !precompileOnly) {
String error = "Please, specify at least one starting URI.";
log.fatalError(error);
throw new ProcessingException(error);
@@ -196,7 +199,7 @@
target.setFollowLinks(this.followLinks);
target.setConfirmExtension(this.confirmExtension);
target.setLogger(this.logger);
- targets.add(target);
+ crawler.addTarget(target);
}
public void addTarget(String type, String sourceURI, String destURI)
@@ -206,7 +209,7 @@
target.setFollowLinks(this.followLinks);
target.setConfirmExtension(this.confirmExtension);
target.setLogger(this.logger);
- targets.add(target);
+ crawler.addTarget(target);
}
public void addTarget(String sourceURI, String destURI)
@@ -216,7 +219,7 @@
target.setFollowLinks(this.followLinks);
target.setConfirmExtension(this.confirmExtension);
target.setLogger(this.logger);
- targets.add(target);
+ crawler.addTarget(target);
}
public void addTargets(List uris, String destURI)
@@ -228,7 +231,7 @@
target.setFollowLinks(this.followLinks);
target.setConfirmExtension(this.confirmExtension);
target.setLogger(this.logger);
- targets.add(target);
+ crawler.addTarget(target);
}
}
@@ -254,7 +257,7 @@
target.setFollowLinks(followLinks);
target.setConfirmExtension(confirmExtension);
target.setLogger(logger);
- targets.add(target);
+ crawler.addTarget(target);
}
public void addExcludePattern(String pattern) {
@@ -351,60 +354,34 @@
this.initialize();
}
- allProcessedLinks = new HashMap();
- allTranslatedLinks = new HashMap();
-
- Map targetMap = new HashMap();
- Iterator i = targets.iterator();
- while (i.hasNext()) {
- Target target = (Target) i.next();
- targetMap.put(target, target);
- }
-
- int nCount = 0;
- while (targetMap.size() > 0) {
- Target target = (Target) targetMap.keySet().iterator().next();
- try {
- if (!allProcessedLinks.containsKey(target)) {
- if (precompileOnly) {
- processXSP(target.getSourceURI());
- } else if (this.followLinks) {
- i = processTarget(target).iterator();
- while (i.hasNext()) {
- Target link = (Target) i.next();
- targetMap.put(link, link);
- }
- } else {
- processTarget(target);
- }
+ if (crawler.getRemainingCount()==0) {
+ super.precompile();
+ } else {
+ Iterator iterator = crawler.iterator();
+ while (iterator.hasNext()) {
+ Target target = (Target) iterator.next();
+ if (precompileOnly) {
+ processXSP(target.getSourceURI());
+ } else {
+ processTarget(crawler, target);
}
- } catch (ResourceNotFoundException rnfe) {
- this.sendBrokenLinkWarning(target.getSourceURI(), rnfe.getMessage());
}
-
- targetMap.remove(target);
- nCount++;
-
- if (log.isInfoEnabled()) {
- log.info(
- " Memory used: "
- + (Runtime.getRuntime().totalMemory()
- - Runtime.getRuntime().freeMemory()));
- log.info(
- " Processed, Translated & Left: "
- + allProcessedLinks.size()
- + ", "
- + allTranslatedLinks.size()
- + ", "
- + targetMap.size());
- }
- }
-
- if (nCount == 0) {
- super.precompile();
}
+ if (log.isInfoEnabled()) {
+ log.info(
+ " Memory used: "
+ + (Runtime.getRuntime().totalMemory()
+ - Runtime.getRuntime().freeMemory()));
+ log.info(
+ " Processed, Translated & Left: "
+ + crawler.getProcessedCount()
+ + ", "
+ + crawler.getTranslatedCount()
+ + ", "
+ + crawler.getRemainingCount());
+ }
}
-
+
/**
* Processes the given Target and return all links.
*
@@ -439,25 +416,22 @@
* Target objects.
* @exception Exception if an error occurs
*/
- private Collection processTarget(Target target) throws Exception {
+ private void processTarget(Crawler crawler, Target target) throws Exception {
int status = 0;
int linkCount = 0;
int newLinkCount = 0;
int pageSize = 0;
-
long startTimeMillis = System.currentTimeMillis();
if (target.confirmExtensions()) {
- if (null == allTranslatedLinks.get(target.getSourceURI())) {
+ if (!crawler.hasTranslatedLink(target)) {
final String mimeType = getType(target.getDeparameterizedSourceURI(), target.getParameters());
target.setMimeType(mimeType);
- allTranslatedLinks.put(target.getSourceURI(), target.getDestinationURI());
+ crawler.addTranslatedLink(target);
}
}
- // Store processed URI list to avoid eternal loop
- allProcessedLinks.put(target, target);
// IS THIS STILL NEEDED?
//if ("".equals(destinationURI)) {
@@ -466,7 +440,6 @@
// Process links
final HashMap translatedLinks = new HashMap();
- final List targets = new ArrayList();
if (target.followLinks() && target.confirmExtensions() && isCrawlablePage(target)) {
final Iterator i =
this.getLinks(target.getDeparameterizedSourceURI(), target.getParameters()).iterator();
@@ -485,18 +458,22 @@
continue;
}
- if (null == allTranslatedLinks.get(linkTarget.getSourceURI())) {
+ if (!crawler.hasTranslatedLink(linkTarget)) {
try {
final String mimeType =
getType(linkTarget.getDeparameterizedSourceURI(), linkTarget.getParameters());
linkTarget.setMimeType(mimeType);
- allTranslatedLinks.put(linkTarget.getSourceURI(), linkTarget.getDestinationURI());
+ crawler.addTranslatedLink(linkTarget);
log.info(" Link translated: " + linkTarget.getSourceURI());
- targets.add(linkTarget);
+ if (crawler.addTarget(linkTarget)) {
+ newLinkCount++;
+ }
} catch (ProcessingException pe) {
this.sendBrokenLinkWarning(linkTarget.getSourceURI(), pe.getMessage());
if (this.brokenLinkGenerate) {
- targets.add(linkTarget);
+ if (crawler.addTarget(linkTarget)) {
+ newLinkCount++;
+ }
}
}
}
@@ -546,7 +523,9 @@
pageSkipped(linkTarget.getSourceURI(), "matched include/exclude rules");
continue;
}
- targets.add(linkTarget);
+ if (crawler.addTarget(linkTarget)) {
+ newLinkCount++;
+ }
}
linkCount = gatheredLinks.size();
}
@@ -573,8 +552,8 @@
pageSize,
linkCount,
newLinkCount,
- 0, //pagesRemaining, @TODO@ Implement this
- 0, //pagesComplete, @TODO@ Implement this
+ crawler.getRemainingCount(),
+ crawler.getProcessedCount(),
System.currentTimeMillis()- startTimeMillis);
} catch (IOException ioex) {
@@ -588,8 +567,6 @@
log.warn("Could not process URI: " + target.getSourceURI());
this.sendBrokenLinkWarning(target.getSourceURI(), "URI not found");
}
-
- return targets;
}
/**
1.5 +16 -5 cocoon-2.1/src/java/org/apache/cocoon/bean/helpers/OutputStreamListener.java
Index: OutputStreamListener.java
===================================================================
RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/bean/helpers/OutputStreamListener.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- OutputStreamListener.java 27 Sep 2003 09:50:29 -0000 1.4
+++ OutputStreamListener.java 6 Oct 2003 12:40:14 -0000 1.5
@@ -75,6 +75,8 @@
private final long startTimeMillis;
private String reportFile = null;
private String reportType = "text";
+ private long siteSize = 0L;
+ private int sitePages = 0;
public OutputStreamListener(OutputStream os) {
writer = new PrintWriter(os);
@@ -97,6 +99,9 @@
int pagesRemaining,
int pagesComplete,
long timeTaken) {
+ this.siteSize += pageSize;
+ this.sitePages++;
+
double time = (((double)timeTaken)/1000);
String size;
@@ -109,7 +114,8 @@
if (linksInPage == -1) {
this.print("* " + sourceURI);
} else {
- this.print(pad(8, "* ["+linksInPage + "] ") +
+ this.print(pad(12, "* [" + pagesComplete + "/" + pagesRemaining + "] ") +
+ pad(10, "[" + newLinksInPage + "/" + linksInPage + "] ") +
pad(7,time + "s ") +
pad(7, size) + " " +
sourceURI);
@@ -125,7 +131,7 @@
}
public void brokenLinkFound(String uri, String parentURI, String message, Throwable t) {
- this.print(pad(28,"X [0] ")+uri+"\tBROKEN: "+message);
+ this.print(pad(42,"X [0] ")+uri+"\tBROKEN: "+message);
brokenLinks.add(uri + "\t" + message);
// StringWriter sw = new StringWriter();
@@ -135,14 +141,19 @@
}
public void pageSkipped(String uri, String message) {
- this.print(pad(23, "^ ") + uri);
+ this.print(pad(37, "^ ") + uri);
}
public void complete() {
outputBrokenLinks();
long duration = System.currentTimeMillis() - startTimeMillis;
- this.print("Total time: " + (duration / 60000) + " minutes " + (duration % 60000)/1000 + " seconds");
+
+ this.print("Total time: " +
+ (duration / 60000) + " minutes " +
+ (duration % 60000)/1000 + " seconds, " +
+ " Site size: " + this.siteSize +
+ " Site pages: " + this.sitePages);
this.close();
}