You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/10/27 18:32:50 UTC

svn commit: r1028028 - in /tika/trunk/tika-core/src/main/java/org/apache/tika/sax: Link.java LinkBuilder.java LinkContentHandler.java

Author: jukka
Date: Wed Oct 27 16:32:50 2010
New Revision: 1028028

URL: http://svn.apache.org/viewvc?rev=1028028&view=rev
Log:
TIKA-503: Add a ContentHandler for collecting links from parser output

Add support for collecting image links as well. More to come...

Added:
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/Link.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java
Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java

Added: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/Link.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/Link.java?rev=1028028&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/Link.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/Link.java Wed Oct 27 16:32:50 2010
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+/**
+ * Immutable description of a link: the element type it came from, the
+ * target URI, an optional title and the associated text.
+ */
+public class Link {
+
+    /** Element type of the link, compared against "a" and "img". */
+    private final String type;
+
+    /** Link target; rendered as href or src by {@link #toString()}. */
+    private final String uri;
+
+    /** Value rendered as the title attribute; may be null or empty. */
+    private final String title;
+
+    /** Link text, or the alt text when the link is an image. */
+    private final String text;
+
+    /**
+     * Creates a link from the given values, which are stored as-is.
+     *
+     * @param type element type, e.g. "a" or "img"
+     * @param uri link target
+     * @param title link title
+     * @param text link text (alt text for images)
+     */
+    public Link(String type, String uri, String title, String text) {
+        this.type = type;
+        this.uri = uri;
+        this.title = title;
+        this.text = text;
+    }
+
+    /** @return true if the type of this link is "a" */
+    public boolean isAnchor() {
+        return "a".equals(type);
+    }
+
+    /** @return true if the type of this link is "img" */
+    public boolean isImage() {
+        return "img".equals(type);
+    }
+
+    /** @return the element type given at construction */
+    public String getType() {
+        return type;
+    }
+
+    /** @return the link target given at construction */
+    public String getUri() {
+        return uri;
+    }
+
+    /** @return the title given at construction */
+    public String getTitle() {
+        return title;
+    }
+
+    /** @return the text given at construction */
+    public String getText() {
+        return text;
+    }
+
+    /**
+     * Renders this link as an element string. Image links become
+     * <code>&lt;img src="..."/&gt;</code> with the text as the alt
+     * attribute; every other type becomes an element with an href
+     * attribute and the text as content. A null uri or text is rendered
+     * as an empty string instead of the literal "null". Values are not
+     * escaped.
+     *
+     * @return element-like string form of this link
+     */
+    @Override
+    public String toString() {
+        StringBuilder builder = new StringBuilder();
+        if (isImage()) {
+            builder.append("<img src=\"");
+            builder.append(emptyIfNull(uri));
+            if (hasContent(title)) {
+                builder.append("\" title=\"");
+                builder.append(title);
+            }
+            if (hasContent(text)) {
+                builder.append("\" alt=\"");
+                builder.append(text);
+            }
+            builder.append("\"/>");
+        } else {
+            builder.append("<");
+            builder.append(type);
+            builder.append(" href=\"");
+            builder.append(emptyIfNull(uri));
+            if (hasContent(title)) {
+                builder.append("\" title=\"");
+                builder.append(title);
+            }
+            builder.append("\">");
+            builder.append(emptyIfNull(text));
+            builder.append("</");
+            builder.append(type);
+            builder.append(">");
+        }
+        return builder.toString();
+    }
+
+    /** Tells whether the given value is neither null nor empty. */
+    private static boolean hasContent(String value) {
+        return value != null && value.length() > 0;
+    }
+
+    /** Maps null to the empty string so it never prints as "null". */
+    private static String emptyIfNull(String value) {
+        return value != null ? value : "";
+    }
+
+}

Added: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java?rev=1028028&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java Wed Oct 27 16:32:50 2010
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+/**
+ * Collects the type, target, title and character content of a single
+ * link and turns them into a {@link Link}.
+ */
+class LinkBuilder {
+
+    /** Element type handed on unchanged to the created link. */
+    private final String type;
+
+    /** Link target; never null, empty until set. */
+    private String uri = "";
+
+    /** Link title; never null, empty until set. */
+    private String title = "";
+
+    /** Character content received so far. */
+    private final StringBuilder text = new StringBuilder();
+
+    /**
+     * Starts a builder for a link of the given element type.
+     *
+     * @param type element type of the link being built
+     */
+    public LinkBuilder(String type) {
+        this.type = type;
+    }
+
+    /**
+     * Sets the link target.
+     *
+     * @param uri link target; null is stored as the empty string
+     */
+    public void setURI(String uri) {
+        this.uri = (uri == null) ? "" : uri;
+    }
+
+    /**
+     * Sets the link title.
+     *
+     * @param title link title; null is stored as the empty string
+     */
+    public void setTitle(String title) {
+        this.title = (title == null) ? "" : title;
+    }
+
+    /**
+     * Appends a run of characters to the link text.
+     *
+     * @param ch buffer holding the characters
+     * @param offset index of the first character to append
+     * @param length number of characters to append
+     */
+    public void characters(char[] ch, int offset, int length) {
+        text.append(ch, offset, length);
+    }
+
+    /**
+     * Creates a link from the values collected so far.
+     *
+     * @return new link carrying the current type, target, title and text
+     */
+    public Link getLink() {
+        String collected = text.toString();
+        return new Link(type, uri, title, collected);
+    }
+
+}

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java?rev=1028028&r1=1028027&r2=1028028&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java Wed Oct 27 16:32:50 2010
@@ -18,43 +18,70 @@ package org.apache.tika.sax;
 
 import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
 
-import java.util.HashMap;
-import java.util.Map;
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
 
 import org.xml.sax.Attributes;
 import org.xml.sax.helpers.DefaultHandler;
 
+/**
+ * Content handler that collects links from an XHTML document.
+ */
 public class LinkContentHandler extends DefaultHandler {
 
-    private final Map<String, String> links = new HashMap<String, String>();
-
-    private String href = null;
-
-    private final StringBuilder text = new StringBuilder();
-
-    public Map<String, String> getLinks() {
+    /**
+     * Stack of link builders, one for each level of nested links currently
+     * being processed. A usual case of a nested link would be a hyperlinked
+     * image (<code>&lt;a href="..."&gt;&lt;img src="..."&gt;&lt;/a&gt;</code>),
+     * but it's possible (though unlikely) for other kinds of nesting
+     * to occur as well.
+     */
+    private final LinkedList<LinkBuilder> builderStack =
+        new LinkedList<LinkBuilder>();
+
+    /** Collected links */
+    private final List<Link> links = new ArrayList<Link>();
+
+    /**
+     * Returns the list of collected links.
+     *
+     * @return collected links
+     */
+    public List<Link> getLinks() {
         return links;
     }
 
-    protected void addLink(String href, String text) {
-        links.put(href, text);
-    }
-
     //-------------------------------------------------------< ContentHandler>
 
     @Override
     public void startElement(
             String uri, String local, String name, Attributes attributes) {
-        if (XHTML.equals(uri) && "a".equals(local)) {
-            href = attributes.getValue("", "href");
-            text.setLength(0);
+        if (XHTML.equals(uri)) {
+            if ("a".equals(local)) {
+                LinkBuilder builder = new LinkBuilder("a");
+                builder.setURI(attributes.getValue("", "href"));
+                builder.setTitle(attributes.getValue("", "title"));
+                builderStack.push(builder);
+            } else if ("img".equals(local)) {
+                LinkBuilder builder = new LinkBuilder("img");
+                builder.setURI(attributes.getValue("", "src"));
+                builder.setTitle(attributes.getValue("", "title"));
+                builderStack.push(builder);
+
+                String alt = attributes.getValue("", "alt");
+                if (alt != null) {
+                    char[] ch = alt.toCharArray();
+                    characters(ch, 0, ch.length);
+                }
+            }
         }
     }
 
     @Override
     public void characters(char[] ch, int start, int length) {
-        if (href != null) {
-            text.append(ch, start, length);
+        for (LinkBuilder builder : builderStack) {
+            builder.characters(ch, start, length);
         }
     }
 
@@ -65,12 +92,11 @@ public class LinkContentHandler extends 
 
     @Override
     public void endElement(String uri, String local, String name) {
-        if (XHTML.equals(uri) && "a".equals(local) && href != null) {
-            addLink(href, text.toString());
-            href = null;
-            text.setLength(0);
+        if (XHTML.equals(uri)) {
+            if ("a".equals(local) || "img".equals(local)) {
+                links.add(builderStack.pop().getLink());
+            }
         }
     }
 
 }
-