You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by ry...@apache.org on 2008/11/13 18:42:49 UTC
svn commit: r713780 -
/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/LinkExtractor.java
Author: ryan
Date: Thu Nov 13 10:42:48 2008
New Revision: 713780
URL: http://svn.apache.org/viewvc?rev=713780&view=rev
Log:
DROIDS-8 -- adding Javier's LinkExtractor changes
Modified:
incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/LinkExtractor.java
Modified: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/LinkExtractor.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/LinkExtractor.java?rev=713780&r1=713779&r2=713780&view=diff
==============================================================================
--- incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/LinkExtractor.java (original)
+++ incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/LinkExtractor.java Thu Nov 13 10:42:48 2008
@@ -17,7 +17,6 @@
package org.apache.droids.tika;
import java.net.URI;
-import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
@@ -33,79 +32,87 @@
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-public class LinkExtractor extends DefaultHandler {
-
+public class LinkExtractor extends DefaultHandler
+{
protected final Log log = LogFactory.getLog(this.getClass());
-
- private Collection<Link> links = new ArrayList<Link>();;
-
- private Map<String,String> elements;
-
+
+ /**
+ * List of links
+ */
+ private Collection<Link> links = new ArrayList<Link>();
+
+ /**
+ * Map from each accepted element name to the attribute that holds its link
+ */
+ private Map<String, String> elements;
+
+ /**
+ * Base link; relative URIs are resolved against its URI
+ */
private Link base = null;
- private Set<String> history = new HashSet<String>();
-
+ /**
+ * Set of URIs already visited
+ */
+ private Set<String> history = null;
+
+ /**
+ * The parsed link
+ */
+ private URI link = null;
+
+ @Override
+ public void startDocument() throws SAXException {
+ history = new HashSet<String>();
+ history.add(base.getURI().toString());
+ }
+
@Override
- public void startElement(String uri, String loc, String raw, Attributes att)
- throws SAXException {
+ public void startElement(String uri, String loc, String raw, Attributes att) throws SAXException
+ {
Iterator<String> it = elements.keySet().iterator();
String elem, linkAtt;
- while(it.hasNext())
- {
+ while (it.hasNext()) {
elem = it.next();
linkAtt = elements.get(elem);
if (elem.equalsIgnoreCase(loc) && att.getValue(linkAtt) != null) {
-
- String link = att.getValue(linkAtt);
+ link = getURI(att.getValue(linkAtt));
log.debug("Found element: " + elem + " with link: " + link);
- //TODO: Buscar una forma genérica
- //Matcher match = Pattern.compile("^(http://|/|\\.)").matcher(link);
- if(!link.startsWith("#") && !link.equals("") && !link.startsWith("mailto"))
- addOutlinkURI(link);
}
}
- super.startElement(uri, loc, raw, att);
}
@Override
- public void endDocument() throws SAXException {
- super.endDocument();
+ public void characters(char[] chars, int start, int length) throws SAXException
+ {
+ if (link != null) {
+ addOutlinkURI(new StringBuilder().append(chars, start, length).toString());
+ link = null;
+ }
+ }
+
+ @Override
+ public void endDocument() throws SAXException
+ {
history = null;
+ log.debug("Found " + links.size() + " outliks");
}
- public void addOutlinkURI(String target) {
- URI uri = null;
- URI from = base.getURI();
- try {
- String newUrl = "";
- if (target.startsWith("/")) {
- newUrl = from.getScheme() + "://" + from.getHost();
- if (from.getPort() > -1) {
- newUrl += ":" + from.getPort();
- }
- newUrl += target;
- } else if (!target.toLowerCase().startsWith("javascript")) {
- newUrl = new URL(from.toURL(), target).toString();
- }
- if (!newUrl.equals("")) {
- String aux = target.contains(":/") ? target : newUrl;
- // TODO:
- aux = aux.split("#")[0];
- uri = new URI(aux);
- if(history == null)
- history = new HashSet<String>();
- if(links == null)
- links = new ArrayList<Link>();
- if (history.add(aux)) {
- // TODO? aux??
- links.add(new LinkTask(base, uri, base.getDepth()+1 ));
- }
- }
- } catch (Exception e) {
- e.printStackTrace();
+ /**
+ * Add the currently parsed {@code link} to the {@code links} list unless it is already in the history.
+ * @param value the character data reported for the link (currently unused).
+ */
+ public void addOutlinkURI(String value) {
+ if (history == null)
+ history = new HashSet<String>();
+ if (links == null)
+ links = new ArrayList<Link>();
+ if (history.add(link.toString())) {
+ links.add(new LinkTask(base, link, base.getDepth() + 1));
+ log.debug("Added outlink: " + link + " with depth: " + base.getDepth() + 1);
}
}
-
+
public void setBase(Link base) {
this.base = base;
}
@@ -121,4 +128,25 @@
public void setElements(Map<String, String> elements) {
this.elements = elements;
}
-}
+
+ /**
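+ * Transform a String into a URI, dropping any fragment and resolving relative targets against the base.
+ * @param target the URI in String format.
+ * @return the URI, or null if the target is a javascript link or is not a valid URI.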
+ */
+ private URI getURI(String target) {
+ try {
+ if (!target.toLowerCase().startsWith("javascript")
+ && !target.contains(":/")) {
+ return base.getURI().resolve(target.split("#")[0]);
+ }
+ else if (!target.toLowerCase().startsWith("javascript")) {
+ return new URI(target.split("#")[0]);
+ }
+ }
+ catch (Exception e) {
+ log.error("URI not valid: " + target);
+ }
+ return null;
+ }
+}
\ No newline at end of file