You are viewing a plain text version of this content. The canonical link for it is here.
Posted to droids-commits@incubator.apache.org by ry...@apache.org on 2008/11/13 18:42:49 UTC

svn commit: r713780 - /incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/LinkExtractor.java

Author: ryan
Date: Thu Nov 13 10:42:48 2008
New Revision: 713780

URL: http://svn.apache.org/viewvc?rev=713780&view=rev
Log:
DROIDS-8 -- adding Javier's LinkExtractor changes

Modified:
    incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/LinkExtractor.java

Modified: incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/LinkExtractor.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/LinkExtractor.java?rev=713780&r1=713779&r2=713780&view=diff
==============================================================================
--- incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/LinkExtractor.java (original)
+++ incubator/droids/trunk/droids-tika/src/main/java/org/apache/droids/tika/LinkExtractor.java Thu Nov 13 10:42:48 2008
@@ -17,7 +17,6 @@
 package org.apache.droids.tika;
 
 import java.net.URI;
-import java.net.URL;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashSet;
@@ -33,79 +32,87 @@
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
-public class LinkExtractor extends DefaultHandler {
-
+public class LinkExtractor extends DefaultHandler 
+{
   protected final Log log = LogFactory.getLog(this.getClass());
-  
-  private Collection<Link> links = new ArrayList<Link>();;
-  
-  private Map<String,String> elements;
-  
+
+  /**
+   * List of links
+   */
+  private Collection<Link> links = new ArrayList<Link>();
+
+  /**
+   * Map from each accepted element name to the attribute holding its link
+   */
+  private Map<String, String> elements;
+
+  /**
+   * Base link against which relative URIs are resolved
+   */
   private Link base = null;
 
-  private Set<String> history = new HashSet<String>();
-  
+  /**
+   * Set of URIs already visited
+   */
+  private Set<String> history = null;
+
+  /**
+   * The parsed link
+   */
+  private URI link = null;
+
+  @Override
+  public void startDocument() throws SAXException {
+    history = new HashSet<String>();
+    history.add(base.getURI().toString());
+  }
+
   @Override
-  public void startElement(String uri, String loc, String raw, Attributes att)
-      throws SAXException {
+  public void startElement(String uri, String loc, String raw, Attributes att) throws SAXException 
+  {
     Iterator<String> it = elements.keySet().iterator();
     String elem, linkAtt;
-    while(it.hasNext())
-    {
+    while (it.hasNext()) {
       elem = it.next();
       linkAtt = elements.get(elem);
       if (elem.equalsIgnoreCase(loc) && att.getValue(linkAtt) != null) {
-        
-        String link = att.getValue(linkAtt);
+        link = getURI(att.getValue(linkAtt));
         log.debug("Found element: " + elem + " with link: " + link);
-        //TODO: Buscar una forma genérica
-        //Matcher match = Pattern.compile("^(http://|/|\\.)").matcher(link);
-        if(!link.startsWith("#") && !link.equals("") && !link.startsWith("mailto"))
-          addOutlinkURI(link);
       }
     }
-    super.startElement(uri, loc, raw, att);
   }
 
   @Override
-  public void endDocument() throws SAXException {
-    super.endDocument();
+  public void characters(char[] chars, int start, int length) throws SAXException 
+  {
+    if (link != null) {
+      addOutlinkURI(new StringBuilder().append(chars, start, length).toString());
+      link = null;
+    }
+  }
+
+  @Override
+  public void endDocument() throws SAXException 
+  {
     history = null;
+    log.debug("Found " + links.size() + " outliks");
   }
 
-  public void addOutlinkURI(String target) {
-    URI uri = null;
-    URI from = base.getURI();
-    try {
-      String newUrl = "";
-      if (target.startsWith("/")) {
-        newUrl = from.getScheme() + "://" + from.getHost();
-        if (from.getPort() > -1) {
-          newUrl += ":" + from.getPort();
-        }
-        newUrl += target;
-      } else if (!target.toLowerCase().startsWith("javascript")) {
-        newUrl = new URL(from.toURL(), target).toString();
-      }
-      if (!newUrl.equals("")) {
-        String aux = target.contains(":/") ? target : newUrl;
-        // TODO:
-        aux = aux.split("#")[0];
-        uri = new URI(aux);
-        if(history == null)
-          history = new HashSet<String>();
-        if(links == null)
-          links = new ArrayList<Link>();
-        if (history.add(aux)) {
-          // TODO? aux??
-          links.add(new LinkTask(base, uri, base.getDepth()+1 ));
-        }
-      }
-    } catch (Exception e) {
-      e.printStackTrace();
+  /**
+   * Add the outlink to the {@code links} list if the value is a valid URI.
+   * @param value the outlink.
+   */
+  public void addOutlinkURI(String value) {
+    if (history == null)
+      history = new HashSet<String>();
+    if (links == null)
+      links = new ArrayList<Link>();
+    if (history.add(link.toString())) {
+      links.add(new LinkTask(base, link, base.getDepth() + 1));
+      log.debug("Added outlink: " + link + " with depth: " + base.getDepth() + 1);
     }
   }
-  
+
   public void setBase(Link base) {
     this.base = base;
   }
@@ -121,4 +128,25 @@
   public void setElements(Map<String, String> elements) {
     this.elements = elements;
   }
-}
+
+  /**
+   * Transform a String into a URI.
+   * @param target the URI in String format.
+   * @return the URI, or null if the target is a javascript link or is not a valid URI.
+   */
+  private URI getURI(String target) {
+    try {
+      if (!target.toLowerCase().startsWith("javascript")
+          && !target.contains(":/")) {
+        return base.getURI().resolve(target.split("#")[0]);
+      } 
+      else if (!target.toLowerCase().startsWith("javascript")) {
+        return new URI(target.split("#")[0]);
+      }
+    } 
+    catch (Exception e) {
+      log.error("URI not valid: " + target);
+    }
+    return null;
+  }
+}
\ No newline at end of file