You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by th...@apache.org on 2010/04/30 12:00:39 UTC

svn commit: r939648 - /incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java

Author: thorsten
Date: Fri Apr 30 12:00:39 2010
New Revision: 939648

URL: http://svn.apache.org/viewvc?rev=939648&view=rev
Log:
DROIDS-72
Reporter: Richard Frovarp
Patch: Richard Frovarp
review: thorsten

Modified:
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java?rev=939648&r1=939647&r2=939648&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java Fri Apr 30 12:00:39 2010
@@ -17,6 +17,7 @@
 package org.apache.droids.parse.html;
 
 import java.net.URI;
+import java.net.URISyntaxException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashSet;
@@ -34,6 +35,17 @@ import org.xml.sax.helpers.DefaultHandle
 
 public class LinkExtractor extends DefaultHandler 
 {
+
+  /**
+   * Name of element that may contain base URI
+   */
+  private static final String BASE_ELEMENT = "base";
+
+  /**
+   * Name of attribute for base URI
+   */
+  private static final String BASE_ATTRIBUTE = "href";
+
   protected final Log log = LogFactory.getLog(this.getClass());
 
   /**
@@ -55,7 +67,17 @@ public class LinkExtractor extends Defau
    * Set of URIs visited yet
    */
   private Set<String> history = null;
-
+  
+  /**
+   * Base URI against which relative link targets are resolved
+   */
+  private URI baseUri = null;
+  
+  /**
+   * Whether to still look for a base element; cleared once a valid base URI has been found
+   */
+  private boolean checkBase = true;
+  
   /**
    * The parsed link
    */
@@ -65,6 +87,7 @@ public class LinkExtractor extends Defau
     super();
     this.base = base;
     this.elements = elements;
+    this.baseUri = base.getURI();
   }
   
   @Override
@@ -76,6 +99,17 @@ public class LinkExtractor extends Defau
   @Override
   public void startElement(String uri, String loc, String raw, Attributes att) throws SAXException 
   {
+    if(checkBase && BASE_ELEMENT.equalsIgnoreCase(loc) && att.getValue(BASE_ATTRIBUTE) != null) {
+      try {
+        baseUri = new URI(att.getValue(BASE_ATTRIBUTE));
+        log.debug("Found base URI: " + baseUri);
+        checkBase = false;
+      } 
+      catch ( URISyntaxException e) {
+        log.error("Base URI not valid: " + att.getValue(BASE_ATTRIBUTE));
+      }
+    }
+    
     Iterator<String> it = elements.keySet().iterator();
     String elem, linkAtt;
     while (it.hasNext()) {
@@ -132,7 +166,7 @@ public class LinkExtractor extends Defau
     try {
       if (!target.toLowerCase().startsWith("javascript")
           && !target.contains(":/")) {
-        return base.getURI().resolve(target.split("#")[0]);
+        return baseUri.resolve(target.split("#")[0]);
       } 
       else if (!target.toLowerCase().startsWith("javascript")) {
         return new URI(target.split("#")[0]);