You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by ol...@apache.org on 2008/11/13 19:20:22 UTC
svn commit: r713797 -
/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
Author: olegk
Date: Thu Nov 13 11:20:21 2008
New Revision: 713797
URL: http://svn.apache.org/viewvc?rev=713797&view=rev
Log:
Minor code refactoring in the HtmlParser
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java?rev=713797&r1=713796&r2=713797&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java Thu Nov 13 11:20:21 2008
@@ -97,9 +97,9 @@
private ParseData extract(DocumentFragment node) throws InvalidLinkException {
final ArrayList<Link> links = new ArrayList<Link>();
try {
- extractLinks(node, links, new HashSet<String>());
+ extractLinks(node, links, new HashSet<URI>());
} catch (URISyntaxException ex) {
- throw new InvalidLinkException(ex.getMessage(), ex);
+ throw new InvalidLinkException("Invalid URI: " + ex.getInput(), ex);
}
return new ParseData(links);
}
@@ -139,38 +139,35 @@
}
private void extractLinks(Node node, ArrayList<Link> links,
- HashSet<String> set) throws URISyntaxException {
+ HashSet<URI> set) throws URISyntaxException {
if (node.getNodeType() == Node.ELEMENT_NODE) {
String nodeName = node.getNodeName().toLowerCase();
if (elements.containsKey(nodeName)) {
String value = elements.get(nodeName);
NamedNodeMap attrs = node.getAttributes();
- String target = null;
for (int i = 0; i < attrs.getLength(); i++) {
Node attr = attrs.item(i);
String attrName = attr.getNodeName();
if (attrName.equalsIgnoreCase(value)) {
- target = attr.getNodeValue();
- String newUrl = "";
- if(target.startsWith("/")){
- newUrl=base.getScheme() + "://"+base.getHost();
- if(base.getPort()>-1){
- newUrl+=":"+base.getPort();
- }
- newUrl += target;
- }else if(!target.toLowerCase().startsWith("javascript")){
- newUrl = base.resolve(target).toString();
+ String ref = attr.getNodeValue();
+ URI newUri = null;
+ if(ref.startsWith("/")){
+ newUri = new URI(
+ base.getScheme(), base.getUserInfo(), base.getHost(), base.getPort(),
+ ref, null, null);
+ }else if(!ref.toLowerCase().startsWith("javascript")){
+ newUri = base.resolve(new URI(ref));
}
- if (!newUrl.equals("")) {
+ if (newUri != null) {
// Link from, URI uri, int depth, String text
- String url = target.contains(":/") ? target : newUrl;
- URI uri = new URI( url );
- final LinkTask outlink = new LinkTask( link, uri, link.getDepth()+1 );
- log.debug("set size: "+set.size());
- log.debug("outlink.getToUrl(): "+outlink.getURI());
- log.debug("set.contains(outlink.getToUrl(): "+set.contains(url));
- if (!set.contains(url)) {
- set.add(url);
+ final LinkTask outlink = new LinkTask( link, newUri, link.getDepth()+1 );
+ if (log.isDebugEnabled()) {
+ log.debug("set size: "+set.size());
+ log.debug("outlink.getToUrl(): "+outlink.getURI());
+ log.debug("set.contains(outlink.getToUrl(): " + set.contains(newUri));
+ }
+ if (!set.contains(newUri)) {
+ set.add(newUri);
links.add(outlink);
}
}