You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@labs.apache.org by ry...@apache.org on 2008/10/02 22:52:34 UTC
svn commit: r701239 - in /labs/droids/branch/LABS-144/src:
core/java/org/apache/droids/ core/java/org/apache/droids/api/
plugins/java/org/apache/droids/parse/html/
robots/java/org/apache/droids/crawler/
Author: ryan
Date: Thu Oct 2 13:52:34 2008
New Revision: 701239
URL: http://svn.apache.org/viewvc?rev=701239&view=rev
Log:
Making Link and Outlink the same class; no need to overcomplicate things :)
Removed:
labs/droids/branch/LABS-144/src/core/java/org/apache/droids/OutlinkTask.java
labs/droids/branch/LABS-144/src/core/java/org/apache/droids/api/Outlink.java
Modified:
labs/droids/branch/LABS-144/src/core/java/org/apache/droids/LinkTask.java
labs/droids/branch/LABS-144/src/core/java/org/apache/droids/ParseData.java
labs/droids/branch/LABS-144/src/core/java/org/apache/droids/api/Link.java
labs/droids/branch/LABS-144/src/plugins/java/org/apache/droids/parse/html/HtmlParser.java
labs/droids/branch/LABS-144/src/robots/java/org/apache/droids/crawler/CrawlingWorker.java
Modified: labs/droids/branch/LABS-144/src/core/java/org/apache/droids/LinkTask.java
URL: http://svn.apache.org/viewvc/labs/droids/branch/LABS-144/src/core/java/org/apache/droids/LinkTask.java?rev=701239&r1=701238&r2=701239&view=diff
==============================================================================
--- labs/droids/branch/LABS-144/src/core/java/org/apache/droids/LinkTask.java (original)
+++ labs/droids/branch/LABS-144/src/core/java/org/apache/droids/LinkTask.java Thu Oct 2 13:52:34 2008
@@ -30,6 +30,7 @@
private Date lastModifedDate;
private String[] linksTo;
+ private String anchorText;
public LinkTask( Link from, URI uri, int depth )
{
@@ -78,4 +79,12 @@
public URI getURI() {
return uri;
}
+
+ public String getAnchorText() {
+ return anchorText;
+ }
+
+ public void setAnchorText(String anchorText) {
+ this.anchorText = anchorText;
+ }
}
\ No newline at end of file
Modified: labs/droids/branch/LABS-144/src/core/java/org/apache/droids/ParseData.java
URL: http://svn.apache.org/viewvc/labs/droids/branch/LABS-144/src/core/java/org/apache/droids/ParseData.java?rev=701239&r1=701238&r2=701239&view=diff
==============================================================================
--- labs/droids/branch/LABS-144/src/core/java/org/apache/droids/ParseData.java (original)
+++ labs/droids/branch/LABS-144/src/core/java/org/apache/droids/ParseData.java Thu Oct 2 13:52:34 2008
@@ -16,7 +16,7 @@
*/
package org.apache.droids;
-import org.apache.droids.api.Outlink;
+import org.apache.droids.api.Link;
/**
* The result object that are filled by a parser
@@ -25,7 +25,7 @@
*
*/
public class ParseData {
- private Outlink[] outlinks;
+ private Link[] outlinks;
/**
* Create a new instance of Parse data for the given outlinks
@@ -33,7 +33,7 @@
* @param outlinks
* the array of outgoing links
*/
- public ParseData(Outlink[] outlinks) {
+ public ParseData(Link[] outlinks) {
this.outlinks = outlinks.clone();
}
@@ -42,7 +42,7 @@
*
* @return all outlinks
*/
- public Outlink[] getOutlinks() {
+ public Link[] getOutlinks() {
return outlinks.clone();
}
}
Modified: labs/droids/branch/LABS-144/src/core/java/org/apache/droids/api/Link.java
URL: http://svn.apache.org/viewvc/labs/droids/branch/LABS-144/src/core/java/org/apache/droids/api/Link.java?rev=701239&r1=701238&r2=701239&view=diff
==============================================================================
--- labs/droids/branch/LABS-144/src/core/java/org/apache/droids/api/Link.java (original)
+++ labs/droids/branch/LABS-144/src/core/java/org/apache/droids/api/Link.java Thu Oct 2 13:52:34 2008
@@ -32,6 +32,11 @@
* @return the URI to this link
*/
URI getURI();
+
+ /**
+ * @return the Anchor text for this link
+ */
+ String getAnchorText();
/**
* From where the link was created
Modified: labs/droids/branch/LABS-144/src/plugins/java/org/apache/droids/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/labs/droids/branch/LABS-144/src/plugins/java/org/apache/droids/parse/html/HtmlParser.java?rev=701239&r1=701238&r2=701239&view=diff
==============================================================================
--- labs/droids/branch/LABS-144/src/plugins/java/org/apache/droids/parse/html/HtmlParser.java (original)
+++ labs/droids/branch/LABS-144/src/plugins/java/org/apache/droids/parse/html/HtmlParser.java Thu Oct 2 13:52:34 2008
@@ -29,8 +29,7 @@
import org.apache.droids.api.Parse;
import org.apache.droids.api.Parser;
import org.apache.droids.helper.Loggable;
-import org.apache.droids.api.Outlink;
-import org.apache.droids.OutlinkTask;
+import org.apache.droids.LinkTask;
import org.apache.droids.ParseData;
import org.apache.droids.parse.ParseImpl;
import org.apache.html.dom.HTMLDocumentImpl;
@@ -93,14 +92,13 @@
}
private ParseData extract(DocumentFragment node) {
- final ArrayList<Outlink> links = new ArrayList<Outlink>();
+ final ArrayList<Link> links = new ArrayList<Link>();
try {
extractLinks(node, links, new HashSet<String>());
} catch (MalformedURLException e) {
log.fatal(e);
}
- Outlink[] outlinks = new Outlink[0];
- outlinks = links.toArray(new Outlink[links.size()]);
+ Link[] outlinks = links.toArray(new Link[links.size()]);
return new ParseData(outlinks);
}
@@ -138,7 +136,7 @@
return remover;
}
- private void extractLinks(Node node, ArrayList<Outlink> links,
+ private void extractLinks(Node node, ArrayList<Link> links,
HashSet<String> set) throws MalformedURLException {
if (node.getNodeType() == Node.ELEMENT_NODE) {
String nodeName = node.getNodeName().toLowerCase();
@@ -166,7 +164,7 @@
// Link from, URI uri, int depth, String text
String url = target.contains(":/") ? target : newUrl;
URI uri = new URI( url );
- final OutlinkTask outlink = new OutlinkTask( link, uri, null );
+ final LinkTask outlink = new LinkTask( link, uri, link.getDepth()+1 );
log.debug("set size: "+set.size());
log.debug("outlink.getToUrl(): "+outlink.getURI());
log.debug("set.contains(outlink.getToUrl(): "+set.contains(url));
Modified: labs/droids/branch/LABS-144/src/robots/java/org/apache/droids/crawler/CrawlingWorker.java
URL: http://svn.apache.org/viewvc/labs/droids/branch/LABS-144/src/robots/java/org/apache/droids/crawler/CrawlingWorker.java?rev=701239&r1=701238&r2=701239&view=diff
==============================================================================
--- labs/droids/branch/LABS-144/src/robots/java/org/apache/droids/crawler/CrawlingWorker.java (original)
+++ labs/droids/branch/LABS-144/src/robots/java/org/apache/droids/crawler/CrawlingWorker.java Thu Oct 2 13:52:34 2008
@@ -60,7 +60,7 @@
else {
Parse parse = parser.getParse(openStream, link);
if( parse.getData() != null ) {
- Collection<Outlink> outlinks = getFilteredOutlinks( parse );
+ Collection<Link> outlinks = getFilteredOutlinks( parse );
droid.getQueue().merge( outlinks );
}
handle( parse, openStream, link );
@@ -97,14 +97,14 @@
}
}
- protected Collection<Outlink> getFilteredOutlinks( Parse parse )
+ protected Collection<Link> getFilteredOutlinks( Parse parse )
{
- Outlink[] links = parse.getData().getOutlinks();
+ Link[] links = parse.getData().getOutlinks();
// new cleaned list
URLFiltersFactory filters = droid.getFiltersFactory();
// TODO -- make the hashvalue for Outlink...
- Map<String,Outlink> filtered = new HashMap<String,Outlink>();
- for( Outlink outlink : links ) {
+ Map<String,Link> filtered = new HashMap<String,Link>();
+ for( Link outlink : links ) {
String id = outlink.getId();
if (filters.accept(outlink.getId()) && !filtered.containsKey(id)) {
filtered.put(id,outlink);
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@labs.apache.org
For additional commands, e-mail: commits-help@labs.apache.org