You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by kk...@apache.org on 2012/08/22 20:51:06 UTC

svn commit: r1376190 - in /tika/trunk/tika-core/src: main/java/org/apache/tika/sax/LinkBuilder.java main/java/org/apache/tika/sax/LinkContentHandler.java test/java/org/apache/tika/sax/LinkContentHandlerTest.java

Author: kkrugler
Date: Wed Aug 22 18:51:05 2012
New Revision: 1376190

URL: http://svn.apache.org/viewvc?rev=1376190&view=rev
Log:
TIKA-975: LinkBuilder to optionally collapse anchor whitespace

Added:
    tika/trunk/tika-core/src/test/java/org/apache/tika/sax/LinkContentHandlerTest.java   (with props)
Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java?rev=1376190&r1=1376189&r2=1376190&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java Wed Aug 22 18:51:05 2012
@@ -61,7 +61,17 @@ class LinkBuilder {
     }
 
     public Link getLink() {
-        return new Link(type, uri, title, text.toString(), rel);
+        return getLink(false);
+    }
+    
+    public Link getLink(boolean collapseWhitespace) {
+        String anchor = text.toString();
+        
+        if (collapseWhitespace) {
+            anchor = anchor.replaceAll("\\s+", " ").trim();
+        }
+        
+        return new Link(type, uri, title, anchor, rel);
     }
 
 }

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java?rev=1376190&r1=1376189&r2=1376190&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java Wed Aug 22 18:51:05 2012
@@ -42,6 +42,27 @@ public class LinkContentHandler extends 
 
     /** Collected links */
     private final List<Link> links = new ArrayList<Link>();
+    
+    /** Whether to collapse whitespace in anchor text */
+    private boolean collapseWhitespaceInAnchor;
+    
+    /**
+     * Default constructor; whitespace in anchor text is not collapsed
+     */
+    public LinkContentHandler() { 
+        this(false);
+    }
+    
+    /**
+     * Constructor that controls whitespace handling in anchor text
+     *
+     * @param collapseWhitespaceInAnchor whether to collapse whitespace in anchor text
+     */
+    public LinkContentHandler(boolean collapseWhitespaceInAnchor) {
+      super();
+      
+      this.collapseWhitespaceInAnchor = collapseWhitespaceInAnchor;
+    }
 
     /**
      * Returns the list of collected links.
@@ -96,7 +117,7 @@ public class LinkContentHandler extends 
     public void endElement(String uri, String local, String name) {
         if (XHTML.equals(uri)) {
             if ("a".equals(local) || "img".equals(local)) {
-                links.add(builderStack.removeFirst().getLink());
+                links.add(builderStack.removeFirst().getLink(collapseWhitespaceInAnchor));
             }
         }
     }

Added: tika/trunk/tika-core/src/test/java/org/apache/tika/sax/LinkContentHandlerTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/LinkContentHandlerTest.java?rev=1376190&view=auto
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/sax/LinkContentHandlerTest.java (added)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/sax/LinkContentHandlerTest.java Wed Aug 22 18:51:05 2012
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import junit.framework.TestCase;
+
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Test cases for the {@link LinkContentHandler} class.
+ */
+public class LinkContentHandlerTest extends TestCase {
+
+    /**
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-975">TIKA-975</a>
+     */
+    public void testWhitespaceCollapsing() throws Exception {
+        LinkContentHandler linkContentHandler = new LinkContentHandler(true);
+        
+        linkContentHandler.startElement(XHTMLContentHandler.XHTML, "a", "", new AttributesImpl());
+        char[] anchorText = {'\n', 'N', 'o', ' ', 'w', 'h', 'i', 't', 'e', '\n', '\t', '\t', 's', 'p', 'a', 'c', 'e'};
+        linkContentHandler.characters(anchorText, 1, anchorText.length - 1);
+        linkContentHandler.endElement(XHTMLContentHandler.XHTML, "a", "");
+
+        assertEquals("No white space", linkContentHandler.getLinks().get(0).getText());
+    }
+
+    /**
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-975">TIKA-975</a>
+     */
+    public void testDefaultBehavior() throws Exception {
+        LinkContentHandler linkContentHandler = new LinkContentHandler();
+        
+        linkContentHandler.startElement(XHTMLContentHandler.XHTML, "a", "", new AttributesImpl());
+        char[] anchorText = {' ', 'a', 'n', 'c', 'h', 'o', 'r', ' '};
+        linkContentHandler.characters(anchorText, 0, anchorText.length);
+        linkContentHandler.endElement(XHTMLContentHandler.XHTML, "a", "");
+
+        assertEquals(" anchor ", linkContentHandler.getLinks().get(0).getText());
+    }
+
+}

Propchange: tika/trunk/tika-core/src/test/java/org/apache/tika/sax/LinkContentHandlerTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/test/java/org/apache/tika/sax/LinkContentHandlerTest.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain