You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by th...@apache.org on 2010/04/30 12:00:39 UTC
svn commit: r939648 -
/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java
Author: thorsten
Date: Fri Apr 30 12:00:39 2010
New Revision: 939648
URL: http://svn.apache.org/viewvc?rev=939648&view=rev
Log:
DROIDS-72
Reporter: Richard Frovarp
Patch: Richard Frovarp
review: thorsten
Modified:
incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java
Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java?rev=939648&r1=939647&r2=939648&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java (original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java Fri Apr 30 12:00:39 2010
@@ -17,6 +17,7 @@
package org.apache.droids.parse.html;
import java.net.URI;
+import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
@@ -34,6 +35,17 @@ import org.xml.sax.helpers.DefaultHandle
public class LinkExtractor extends DefaultHandler
{
+
+ /**
+ * Name of the HTML element that may declare a base URI for the document
+ */
+ private static final String BASE_ELEMENT = "base";
+
+ /**
+ * Name of the attribute on the base element that holds the base URI
+ */
+ private static final String BASE_ATTRIBUTE = "href";
+
protected final Log log = LogFactory.getLog(this.getClass());
/**
@@ -55,7 +67,17 @@ public class LinkExtractor extends Defau
* Set of URIs visited yet
*/
private Set<String> history = null;
-
+
+ /**
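+ * Base URI against which relative links are resolved; initialised from base.getURI() and replaced by the first valid base element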
+ */
+ private URI baseUri = null;
+
+ /**
+ * Whether to keep looking for a base element; set to false once a valid base URI has been found
+ */
+ private boolean checkBase = true;
+
/**
* The parsed link
*/
@@ -65,6 +87,7 @@ public class LinkExtractor extends Defau
super();
this.base = base;
this.elements = elements;
+ this.baseUri = base.getURI();
}
@Override
@@ -76,6 +99,17 @@ public class LinkExtractor extends Defau
@Override
public void startElement(String uri, String loc, String raw, Attributes att) throws SAXException
{
+ if(checkBase && BASE_ELEMENT.equalsIgnoreCase(loc) && att.getValue(BASE_ATTRIBUTE) != null) {
+ try {
+ baseUri = new URI(att.getValue(BASE_ATTRIBUTE));
+ log.debug("Found base URI: " + baseUri);
+ checkBase = false;
+ }
+ catch ( URISyntaxException e) {
+ log.error("Base URI not valid: " + att.getValue(BASE_ATTRIBUTE));
+ }
+ }
+
Iterator<String> it = elements.keySet().iterator();
String elem, linkAtt;
while (it.hasNext()) {
@@ -132,7 +166,7 @@ public class LinkExtractor extends Defau
try {
if (!target.toLowerCase().startsWith("javascript")
&& !target.contains(":/")) {
- return base.getURI().resolve(target.split("#")[0]);
+ return baseUri.resolve(target.split("#")[0]);
}
else if (!target.toLowerCase().startsWith("javascript")) {
return new URI(target.split("#")[0]);