You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by mo...@apache.org on 2012/04/21 16:04:52 UTC

svn commit: r1328663 [1/2] - in /incubator/any23/trunk/core/src: main/java/org/apache/any23/extractor/html/ test/java/org/apache/any23/extractor/html/ test/resources/microformats/hcard/

Author: mostarda
Date: Sat Apr 21 14:04:51 2012
New Revision: 1328663

URL: http://svn.apache.org/viewvc?rev=1328663&view=rev
Log:
Improved HCardExtractor performance. Related to issue ANY23-76.

Added:
    incubator/any23/trunk/core/src/test/resources/microformats/hcard/performance.html
Modified:
    incubator/any23/trunk/core/src/main/java/org/apache/any23/extractor/html/DomUtils.java
    incubator/any23/trunk/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java
    incubator/any23/trunk/core/src/test/java/org/apache/any23/extractor/html/HCardExtractorTest.java

Modified: incubator/any23/trunk/core/src/main/java/org/apache/any23/extractor/html/DomUtils.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/core/src/main/java/org/apache/any23/extractor/html/DomUtils.java?rev=1328663&r1=1328662&r2=1328663&view=diff
==============================================================================
--- incubator/any23/trunk/core/src/main/java/org/apache/any23/extractor/html/DomUtils.java (original)
+++ incubator/any23/trunk/core/src/main/java/org/apache/any23/extractor/html/DomUtils.java Sat Apr 21 14:04:51 2012
@@ -20,6 +20,9 @@ package org.apache.any23.extractor.html;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
+import org.w3c.dom.traversal.DocumentTraversal;
+import org.w3c.dom.traversal.NodeFilter;
+import org.w3c.dom.traversal.NodeIterator;
 
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.Transformer;
@@ -35,6 +38,7 @@ import java.io.IOException;
 import java.io.StringWriter;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.regex.Pattern;
 
 /**
  * This class provides utility methods for DOM manipulation.
@@ -188,7 +192,7 @@ public class DomUtils {
      * @return list of matching nodes or an empty list.
      */
     public static List<Node> findAllByClassName(Node root, String className) {
-        return findAllByTagAndClassName(root, "*", className.toLowerCase());
+        return findAllBy(root, null, "class", className.toLowerCase());
     }
 
     /**
@@ -200,35 +204,19 @@ public class DomUtils {
      * @return list of matching nodes or an empty list.
      */
     public static List<Node> findAllByAttributeName(Node root, String attrName) {
-        List<Node> result = new ArrayList<Node>();
-        for (Node node : findAll(root, String.format("./descendant-or-self::*[@%s]", attrName) ) ) {
-                result.add(node);
-        }
-        return result;
+        return findAllBy(root, null, attrName, null);
     }
+    
+   public static List<Node> findAllByAttributeContains(Node node, String attrName, String attrContains) {
+       return findAllBy(node, null, attrName, attrContains);
+   }
 
     public static List<Node> findAllByTag(Node root, String tagName) {
-        List<Node> result = new ArrayList<Node>();
-        for (Node node : findAll(root, "./descendant-or-self::" + tagName)) {
-            result.add(node);
-        }
-        return result;
+           return findAllBy(root, tagName, null, null);
     }
-
-    public static List<Node> findAllByTagAndClassName(Node root, String tagName, String className) {
-        List<Node> result = new ArrayList<Node>();
-        for (Node node : findAll(
-                root,
-                "./descendant-or-self::" +
-                tagName +
-                "[contains(translate(@class,'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'),'" +
-                className + "')]")
-        ) {
-            if (DomUtils.hasClassName(node, className)) {
-                result.add(node);
-            }
-        }
-        return result;
+    
+    public static List<Node> findAllByTagAndClassName(Node root, final String tagName, final String className) {
+       return findAllBy(root, tagName, "class", className);
     }
 
     /**
@@ -406,4 +394,70 @@ public class DomUtils {
         return sw.toString();
     }
 
+    /**
+     * High performance alternative to the XPath lookups of {@link #findAll(org.w3c.dom.Node, String)}:
+     * collects, in document order, every element under <code>root</code> (root included)
+     * that satisfies all the given filters.
+     *
+     * @param root root node to start search.
+     * @param tagName exact name of target tag, <code>null</code> or <code>"*"</code> to accept any tag.
+     * @param attrName name of an attribute the element must declare, <code>null</code> to skip
+     *        both attribute checks.
+     * @param attrContains whitespace separated token the attribute value must contain (literal,
+     *        case insensitive match), <code>null</code> or <code>"*"</code> to accept any value.
+     * @return list of matching nodes or an empty list.
+     */
+    private static List<Node> findAllBy(Node root, final String tagName, final String attrName, String attrContains) {
+        // A Document node has no owner document: in that case root itself provides the traversal.
+        DocumentTraversal documentTraversal = (DocumentTraversal) root.getOwnerDocument();
+        if (documentTraversal == null) {
+            documentTraversal = (DocumentTraversal) root;
+        }
+
+        final Pattern attrContainsPattern;
+        if (attrContains != null && !attrContains.equals("*")) {
+            // Quote the token: values such as "a.b" or "c++" must not be read as regex syntax.
+            final String token = Pattern.quote(attrContains);
+            attrContainsPattern = Pattern.compile("(^|\\s)" + token + "(\\s|$)", Pattern.CASE_INSENSITIVE);
+        } else {
+            attrContainsPattern = null;
+        }
+
+        NodeIterator nodeIterator = documentTraversal.createNodeIterator(
+                root,
+                NodeFilter.SHOW_ELEMENT,
+                new NodeFilter() {
+                    @Override
+                    public short acceptNode(Node node) {
+                        // SHOW_ELEMENT lets only elements in; FILTER_SKIP still visits descendants.
+                        if (tagName != null && !tagName.equals("*") && !tagName.equals(node.getNodeName())) {
+                            return FILTER_SKIP;
+                        }
+                        if (attrName == null) {
+                            return FILTER_ACCEPT;
+                        }
+                        Node attrNameNode = node.getAttributes().getNamedItem(attrName);
+                        if (attrNameNode == null) {
+                            return FILTER_SKIP;
+                        }
+                        if (attrContainsPattern != null
+                                && !attrContainsPattern.matcher(attrNameNode.getNodeValue()).find()) {
+                            return FILTER_SKIP;
+                        }
+                        return FILTER_ACCEPT;
+                    }
+                }, false);
+
+        final List<Node> result = new ArrayList<Node>();
+        try {
+            for (Node match = nodeIterator.nextNode(); match != null; match = nodeIterator.nextNode()) {
+                result.add(match);
+            }
+        } finally {
+            // We have to explicitly declare we are done with this nodeIterator to free its resources.
+            nodeIterator.detach();
+        }
+        return result;
+    }
+
 }

Modified: incubator/any23/trunk/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java?rev=1328663&r1=1328662&r2=1328663&view=diff
==============================================================================
--- incubator/any23/trunk/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java (original)
+++ incubator/any23/trunk/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java Sat Apr 21 14:04:51 2012
@@ -94,7 +94,7 @@ public class HCardExtractor extends Enti
         }
 
         // include pattern, test 31
-        for (Node current : document.findAll("//*[@class]")) {
+        for (Node current : DomUtils.findAllByAttributeName(document.getDocument(), "class")) {
             if (!DomUtils.hasClassName(current, "include")) continue;
             // we have to remove the field soon to avoid infinite loops
             // no null check, we know it's there or we won't be in the loop
@@ -164,7 +164,7 @@ public class HCardExtractor extends Enti
 
     private boolean addTelephones(Resource card) {
         boolean found = false;
-        for (Node node : fragment.findAll(".//*[contains(@class,'tel')]")) {
+        for (Node node : DomUtils.findAllByAttributeContains(fragment.getDocument(), "class", "tel")) {
             HTMLDocument telFragment = new HTMLDocument(node);
             TextField[] values = telFragment.getPluralUrlField("value");
             if (values.length == 0) {
@@ -237,7 +237,6 @@ public class HCardExtractor extends Enti
     private boolean addStringMultiProperty(String className, Resource resource, URI property) {
         HTMLDocument.TextField[] fields = fragment.getPluralTextField(className);
         boolean found = false;
-        final String extractorName = getDescription().getExtractorName();
         for(HTMLDocument.TextField field : fields) {
             found |= conditionallyAddStringProperty(
                     field.source(),
@@ -394,7 +393,6 @@ public class HCardExtractor extends Enti
     private boolean addOrganizationName(Resource card) {
         if (name.getOrganization() == null) return false;
         BNode org = valueFactory.createBNode();
-        final String extractorName =  getDescription().getExtractorName();
         addBNodeProperty(
                 this.fragment.getDocument(),
                 card, vCARD.org, org

Modified: incubator/any23/trunk/core/src/test/java/org/apache/any23/extractor/html/HCardExtractorTest.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/core/src/test/java/org/apache/any23/extractor/html/HCardExtractorTest.java?rev=1328663&r1=1328662&r2=1328663&view=diff
==============================================================================
--- incubator/any23/trunk/core/src/test/java/org/apache/any23/extractor/html/HCardExtractorTest.java (original)
+++ incubator/any23/trunk/core/src/test/java/org/apache/any23/extractor/html/HCardExtractorTest.java Sat Apr 21 14:04:51 2012
@@ -23,6 +23,7 @@ import org.apache.any23.extractor.Extrac
 import org.apache.any23.extractor.IssueReport;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.vocab.VCARD;
+import org.junit.Ignore;
 import org.junit.Test;
 import org.openrdf.model.Resource;
 import org.openrdf.model.Statement;
@@ -955,7 +956,7 @@ public class HCardExtractorTest extends 
 
     /**
      * Tests the detection and prevention of the inclusion of an ancestor by a sibling node.
-         * This test is related to issue <a href="https://issues.apache.org/jira/browse/ANY23-58">ANY23-58</a>.
+     * This test is related to issue <a href="https://issues.apache.org/jira/browse/ANY23-58">ANY23-58</a>.
      *
      * @throws IOException
      * @throws ExtractionException
@@ -966,6 +967,16 @@ public class HCardExtractorTest extends 
         assertIssue(IssueReport.IssueLevel.Warning, ".*Current node tries to include an ancestor node.*");
     }
 
+    /**
+     * Tests extractor performance: extraction of <i>performance.html</i> must end within the 30 seconds timeout.
+     * Currently disabled through {@link Ignore}. Related to issue <a href="https://issues.apache.org/jira/browse/ANY23-76">ANY23-76</a>.
+     */
+    @Ignore
+    @Test(timeout = 30 * 1000)
+    public void testExtractionPerformance() {
+        assertExtract("microformats/hcard/performance.html");
+    }
+
     private void assertDefaultVCard() throws RepositoryException {
         assertModelNotEmpty();
         assertStatementsSize(RDF.TYPE, vVCARD.VCard, 1);