You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by ol...@apache.org on 2008/11/13 19:20:22 UTC

svn commit: r713797 - /incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java

Author: olegk
Date: Thu Nov 13 11:20:21 2008
New Revision: 713797

URL: http://svn.apache.org/viewvc?rev=713797&view=rev
Log:
Minor code refactoring in the HtmlParser

Modified:
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java?rev=713797&r1=713796&r2=713797&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/HtmlParser.java Thu Nov 13 11:20:21 2008
@@ -97,9 +97,9 @@
   private ParseData extract(DocumentFragment node) throws InvalidLinkException {
     final ArrayList<Link> links = new ArrayList<Link>();
     try {
-      extractLinks(node, links, new HashSet<String>());
+      extractLinks(node, links, new HashSet<URI>());
     } catch (URISyntaxException ex) {
-      throw new InvalidLinkException(ex.getMessage(), ex);
+      throw new InvalidLinkException("Invalid URI: " + ex.getInput(), ex);
     }
     return new ParseData(links);
   }
@@ -139,38 +139,35 @@
   }
 
   private void extractLinks(Node node, ArrayList<Link> links,
-      HashSet<String> set) throws URISyntaxException {
+      HashSet<URI> set) throws URISyntaxException {
     if (node.getNodeType() == Node.ELEMENT_NODE) {
       String nodeName = node.getNodeName().toLowerCase();
       if (elements.containsKey(nodeName)) {
         String value = elements.get(nodeName);
         NamedNodeMap attrs = node.getAttributes();
-        String target = null;
         for (int i = 0; i < attrs.getLength(); i++) {
           Node attr = attrs.item(i);
           String attrName = attr.getNodeName();
           if (attrName.equalsIgnoreCase(value)) {
-            target = attr.getNodeValue();
-            String newUrl = "";
-            if(target.startsWith("/")){
-              newUrl=base.getScheme() + "://"+base.getHost();
-              if(base.getPort()>-1){
-                newUrl+=":"+base.getPort();
-              }
-              newUrl += target;
-            }else if(!target.toLowerCase().startsWith("javascript")){
-              newUrl = base.resolve(target).toString();
+            String ref = attr.getNodeValue();
+            URI newUri = null;
+            if(ref.startsWith("/")){
+              newUri = new URI(
+                  base.getScheme(), base.getUserInfo(), base.getHost(), base.getPort(), 
+                  ref, null, null);
+            }else if(!ref.toLowerCase().startsWith("javascript")){
+              newUri = base.resolve(new URI(ref));
             }
-            if (!newUrl.equals("")) {
+            if (newUri != null) {
               // Link from, URI uri, int depth, String text
-              String url = target.contains(":/") ? target : newUrl;
-              URI uri = new URI( url );
-              final LinkTask outlink = new LinkTask( link, uri, link.getDepth()+1 );
-              log.debug("set size: "+set.size());
-              log.debug("outlink.getToUrl(): "+outlink.getURI());
-              log.debug("set.contains(outlink.getToUrl(): "+set.contains(url));
-              if (!set.contains(url)) {
-                set.add(url);
+              final LinkTask outlink = new LinkTask( link, newUri, link.getDepth()+1 );
+              if (log.isDebugEnabled()) {
+                log.debug("set size: "+set.size());
+                log.debug("outlink.getToUrl(): "+outlink.getURI());
+                log.debug("set.contains(outlink.getToUrl(): " + set.contains(newUri));
+              }
+              if (!set.contains(newUri)) {
+                set.add(newUri);
                 links.add(outlink);
               }
             }