You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/10/27 18:32:50 UTC
svn commit: r1028028 - in
/tika/trunk/tika-core/src/main/java/org/apache/tika/sax: Link.java
LinkBuilder.java LinkContentHandler.java
Author: jukka
Date: Wed Oct 27 16:32:50 2010
New Revision: 1028028
URL: http://svn.apache.org/viewvc?rev=1028028&view=rev
Log:
TIKA-503: Add a ContentHandler for collecting links from parser output
Add support collecting also image links. More to come...
Added:
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/Link.java
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/Link.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/Link.java?rev=1028028&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/Link.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/Link.java Wed Oct 27 16:32:50 2010
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+public class Link { // Immutable value object describing one collected link: type, URI, title and text
+
+ private final String type; // element type; "a" and "img" are the values tested by isAnchor()/isImage()
+
+ private final String uri; // link target; rendered as src for images and as href otherwise
+
+ private final String title; // title attribute value; left out of toString() when null or empty
+
+ private final String text; // link text; rendered as alt for images and as element content otherwise
+
+ public Link(String type, String uri, String title, String text) { // stores the values as given, no validation
+ this.type = type;
+ this.uri = uri;
+ this.title = title;
+ this.text = text;
+ }
+
+ public boolean isAnchor() { // true only when type is exactly "a"
+ return "a".equals(type);
+ }
+
+ public boolean isImage() { // true only when type is exactly "img"
+ return "img".equals(type);
+ }
+
+ public String getType() {
+ return type;
+ }
+
+ public String getUri() {
+ return uri;
+ }
+
+ public String getTitle() {
+ return title;
+ }
+
+ public String getText() {
+ return text;
+ }
+
+ public String toString() { // renders an HTML-like tag; values are appended verbatim, without escaping
+ StringBuilder builder = new StringBuilder();
+ if (isImage()) { // images: <img src="..." [title="..."] [alt="..."]/>
+ builder.append("<img src=\"");
+ builder.append(uri);
+ if (title != null && title.length() > 0) {
+ builder.append("\" title=\""); // closes the previous attribute value, then opens title
+ builder.append(title);
+ }
+ if (text != null && text.length() > 0) {
+ builder.append("\" alt=\""); // closes the previous attribute value, then opens alt
+ builder.append(text);
+ }
+ builder.append("\"/>"); // closes whichever attribute value is still open, then the tag
+ } else { // any other type: <type href="..." [title="..."]>text</type>
+ builder.append("<");
+ builder.append(type);
+ builder.append(" href=\"");
+ builder.append(uri);
+ if (title != null && title.length() > 0) {
+ builder.append("\" title=\"");
+ builder.append(title);
+ }
+ builder.append("\">");
+ builder.append(text); // NOTE: no null/empty check here, unlike the image branch; a null text is appended as "null"
+ builder.append("</");
+ builder.append(type);
+ builder.append(">");
+ }
+ return builder.toString();
+ }
+
+}
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java?rev=1028028&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java Wed Oct 27 16:32:50 2010
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+class LinkBuilder { // Package-private mutable accumulator that produces a Link through getLink()
+
+ private final String type; // passed unchanged to the Link constructor
+
+ private String uri = ""; // never null: starts as "" and setURI maps null to ""
+
+ private String title = ""; // never null: starts as "" and setTitle maps null to ""
+
+ private final StringBuilder text = new StringBuilder(); // text gathered through characters(...)
+
+ public LinkBuilder(String type) {
+ this.type = type;
+ }
+
+ public void setURI(String uri) { // a null argument is normalized to the empty string
+ if (uri != null) {
+ this.uri = uri;
+ } else {
+ this.uri = "";
+ }
+ }
+
+ public void setTitle(String title) { // a null argument is normalized to the empty string
+ if (title != null) {
+ this.title = title;
+ } else {
+ this.title = "";
+ }
+ }
+
+ public void characters(char[] ch, int offset, int length) { // appends length chars of ch, starting at offset, to the text
+ text.append(ch, offset, length);
+ }
+
+ public Link getLink() { // each call creates a new Link from the current type, uri, title and text
+ return new Link(type, uri, title, text.toString());
+ }
+
+}
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java?rev=1028028&r1=1028027&r2=1028028&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java Wed Oct 27 16:32:50 2010
@@ -18,43 +18,70 @@ package org.apache.tika.sax;
import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
-import java.util.HashMap;
-import java.util.Map;
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
+/**
+ * Content handler that collects links from an XHTML document.
+ */
public class LinkContentHandler extends DefaultHandler {
- private final Map<String, String> links = new HashMap<String, String>();
-
- private String href = null;
-
- private final StringBuilder text = new StringBuilder();
-
- public Map<String, String> getLinks() {
+ /**
+ * Stack of link builders, one for each level of nested links currently
+ * being processed. A usual case of a nested link would be a hyperlinked
+ * image (<code>&lt;a href="..."&gt;&lt;img src="..."&gt;&lt;/a&gt;</code>),
+ * but it's possible (though unlikely) for other kinds of nesting
+ * to occur as well.
+ */
+ private final LinkedList<LinkBuilder> builderStack =
+ new LinkedList<LinkBuilder>();
+
+ /** Collected links */
+ private final List<Link> links = new ArrayList<Link>();
+
+ /**
+ * Returns the list of collected links.
+ *
+ * @return collected links
+ */
+ public List<Link> getLinks() {
return links;
}
- protected void addLink(String href, String text) {
- links.put(href, text);
- }
-
//-------------------------------------------------------< ContentHandler>
@Override
public void startElement(
String uri, String local, String name, Attributes attributes) {
- if (XHTML.equals(uri) && "a".equals(local)) {
- href = attributes.getValue("", "href");
- text.setLength(0);
+ if (XHTML.equals(uri)) { // only elements in the XHTML namespace are considered
+ if ("a".equals(local)) { // anchor: the link URI comes from href
+ LinkBuilder builder = new LinkBuilder("a");
+ builder.setURI(attributes.getValue("", "href"));
+ builder.setTitle(attributes.getValue("", "title"));
+ builderStack.push(builder); // popped again by endElement for "a"
+ } else if ("img".equals(local)) { // image: the link URI comes from src
+ LinkBuilder builder = new LinkBuilder("img");
+ builder.setURI(attributes.getValue("", "src"));
+ builder.setTitle(attributes.getValue("", "title"));
+ builderStack.push(builder); // pushed before the alt text is fed in, so this builder receives it too
+
+ String alt = attributes.getValue("", "alt"); // alt text becomes the image's link text
+ if (alt != null) {
+ char[] ch = alt.toCharArray();
+ characters(ch, 0, ch.length); // routed through characters() so every builder on the stack gets the alt text
+ }
+ }
}
}
@Override
public void characters(char[] ch, int start, int length) {
- if (href != null) {
- text.append(ch, start, length);
+ for (LinkBuilder builder : builderStack) { // every link currently open receives the text, not just the innermost one
+ builder.characters(ch, start, length);
}
}
@@ -65,12 +92,11 @@ public class LinkContentHandler extends
@Override
public void endElement(String uri, String local, String name) {
- if (XHTML.equals(uri) && "a".equals(local) && href != null) {
- addLink(href, text.toString());
- href = null;
- text.setLength(0);
+ if (XHTML.equals(uri)) { // mirrors the namespace check in startElement
+ if ("a".equals(local) || "img".equals(local)) { // the same two element names for which startElement pushes a builder
+ links.add(builderStack.pop().getLink()); // finish the innermost open link and record it; assumes start/end events are balanced
+ }
}
}
}
-