You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@any23.apache.org by mo...@apache.org on 2012/04/21 16:04:52 UTC
svn commit: r1328663 [1/2] - in /incubator/any23/trunk/core/src:
main/java/org/apache/any23/extractor/html/
test/java/org/apache/any23/extractor/html/ test/resources/microformats/hcard/
Author: mostarda
Date: Sat Apr 21 14:04:51 2012
New Revision: 1328663
URL: http://svn.apache.org/viewvc?rev=1328663&view=rev
Log:
Improved HCardExtractor performance. Related to issue #ANY23-76.
Added:
incubator/any23/trunk/core/src/test/resources/microformats/hcard/performance.html
Modified:
incubator/any23/trunk/core/src/main/java/org/apache/any23/extractor/html/DomUtils.java
incubator/any23/trunk/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java
incubator/any23/trunk/core/src/test/java/org/apache/any23/extractor/html/HCardExtractorTest.java
Modified: incubator/any23/trunk/core/src/main/java/org/apache/any23/extractor/html/DomUtils.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/core/src/main/java/org/apache/any23/extractor/html/DomUtils.java?rev=1328663&r1=1328662&r2=1328663&view=diff
==============================================================================
--- incubator/any23/trunk/core/src/main/java/org/apache/any23/extractor/html/DomUtils.java (original)
+++ incubator/any23/trunk/core/src/main/java/org/apache/any23/extractor/html/DomUtils.java Sat Apr 21 14:04:51 2012
@@ -20,6 +20,9 @@ package org.apache.any23.extractor.html;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
+import org.w3c.dom.traversal.DocumentTraversal;
+import org.w3c.dom.traversal.NodeFilter;
+import org.w3c.dom.traversal.NodeIterator;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
@@ -35,6 +38,7 @@ import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
+import java.util.regex.Pattern;
/**
* This class provides utility methods for DOM manipulation.
@@ -188,7 +192,7 @@ public class DomUtils {
* @return list of matching nodes or an empty list.
*/
public static List<Node> findAllByClassName(Node root, String className) {
- return findAllByTagAndClassName(root, "*", className.toLowerCase());
+ return findAllBy(root, null, "class", className.toLowerCase());
}
/**
@@ -200,35 +204,19 @@ public class DomUtils {
* @return list of matching nodes or an empty list.
*/
public static List<Node> findAllByAttributeName(Node root, String attrName) {
- List<Node> result = new ArrayList<Node>();
- for (Node node : findAll(root, String.format("./descendant-or-self::*[@%s]", attrName) ) ) {
- result.add(node);
- }
- return result;
+ return findAllBy(root, null, attrName, null);
}
+ /**
+ * Finds all the elements, starting from <code>node</code>, declaring the attribute
+ * <code>attrName</code> whose value contains <code>attrContains</code> as a
+ * whitespace delimited token, ignoring case. The search is delegated to
+ * <code>findAllBy</code> with no tag filter.
+ *
+ * @param node root node to start search.
+ * @param attrName name of the attribute to look for.
+ * @param attrContains token expected within the attribute value; <code>null</code>
+ * or <code>"*"</code> accepts any value.
+ * @return list of matching nodes or an empty list.
+ */
+ public static List<Node> findAllByAttributeContains(Node node, String attrName, String attrContains) {
+ return findAllBy(node, null, attrName, attrContains);
+ }
public static List<Node> findAllByTag(Node root, String tagName) {
- List<Node> result = new ArrayList<Node>();
- for (Node node : findAll(root, "./descendant-or-self::" + tagName)) {
- result.add(node);
- }
- return result;
+ return findAllBy(root, tagName, null, null);
}
-
- public static List<Node> findAllByTagAndClassName(Node root, String tagName, String className) {
- List<Node> result = new ArrayList<Node>();
- for (Node node : findAll(
- root,
- "./descendant-or-self::" +
- tagName +
- "[contains(translate(@class,'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'),'" +
- className + "')]")
- ) {
- if (DomUtils.hasClassName(node, className)) {
- result.add(node);
- }
- }
- return result;
+
+ public static List<Node> findAllByTagAndClassName(Node root, final String tagName, final String className) {
+ return findAllBy(root, tagName, "class", className);
}
/**
@@ -406,4 +394,70 @@ public class DomUtils {
return sw.toString();
}
+ /**
+ * High performance implementation of {@link #findAll(org.w3c.dom.Node, String)}.
+ * Visits the elements reachable from <code>root</code> through a DOM
+ * {@link NodeIterator} and collects the ones that satisfy all the given filters.
+ *
+ * @param root root node to start search. Its owner document (or <code>root</code> itself
+ * when it has no owner document) is cast to {@link DocumentTraversal}.
+ * @param tagName name of target tag, compared with {@link Node#getNodeName()} using
+ * <code>equals</code>; <code>null</code> or <code>"*"</code> accepts any tag.
+ * @param attrName name of attribute filter; when not <code>null</code> only elements
+ * declaring this attribute are collected.
+ * @param attrContains expected content for attribute: it must appear in the attribute value
+ * as a whitespace delimited token, ignoring case; <code>null</code> or <code>"*"</code>
+ * accepts any value. It is evaluated only when <code>attrName</code> is given.
+ * NOTE(review): the value is concatenated into a regular expression without being
+ * quoted, so regex metacharacters in it are interpreted as pattern syntax - confirm
+ * whether <code>Pattern.quote()</code> should be applied.
+ * @return list of matching nodes, in the order they are visited, or an empty list.
+ */
+ private static List<Node> findAllBy(Node root, final String tagName, final String attrName, String attrContains) {
+ DocumentTraversal documentTraversal = (DocumentTraversal) root.getOwnerDocument();
+ if (documentTraversal == null) {
+ documentTraversal = (DocumentTraversal) root;
+ }
+
+ final Pattern attrContainsPattern;
+ if (attrContains != null && !attrContains.equals("*")) {
+ attrContainsPattern = Pattern.compile("(^|\\s)" + attrContains + "(\\s|$)", Pattern.CASE_INSENSITIVE);
+ } else {
+ attrContainsPattern = null;
+ }
+
+ final List<Node> result = new ArrayList<Node>();
+ NodeIterator nodeIterator = documentTraversal.createNodeIterator(
+ root,
+ NodeFilter.SHOW_ELEMENT,
+ new NodeFilter() {
+ @Override
+ public short acceptNode(Node node) {
+ if (node.getNodeType() == Node.ELEMENT_NODE) {
+ if (tagName != null && !tagName.equals("*") && !tagName.equals(node.getNodeName())) {
+ // tagName given but doesn't match: the node is not collected.
+ return FILTER_ACCEPT;
+ }
+
+ if (attrName != null) {
+ Node attrNameNode = node.getAttributes().getNamedItem(attrName);
+ if (attrNameNode == null) {
+ // attrName given but doesn't match: the node is not collected.
+ return FILTER_ACCEPT;
+ }
+
+ if (
+ attrContainsPattern != null
+ &&
+ !attrContainsPattern.matcher(attrNameNode.getNodeValue()).find()
+ ) {
+ // attrContains given but doesn't match: the node is not collected.
+ return FILTER_ACCEPT;
+ }
+ }
+ result.add(node);
+ }
+ return FILTER_ACCEPT;
+ }
+ }, false);
+
+ // To populate result we only need to iterate: the filter adds the matching nodes as a side effect.
+ while (nodeIterator.nextNode() != null) ;
+
+ // We have to explicitly declare we are done with this nodeIterator to free its resources.
+ nodeIterator.detach();
+
+ return result;
+ }
+
}
Modified: incubator/any23/trunk/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java?rev=1328663&r1=1328662&r2=1328663&view=diff
==============================================================================
--- incubator/any23/trunk/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java (original)
+++ incubator/any23/trunk/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java Sat Apr 21 14:04:51 2012
@@ -94,7 +94,7 @@ public class HCardExtractor extends Enti
}
// include pattern, test 31
- for (Node current : document.findAll("//*[@class]")) {
+ for (Node current : DomUtils.findAllByAttributeName(document.getDocument(), "class")) {
if (!DomUtils.hasClassName(current, "include")) continue;
// we have to remove the field soon to avoid infinite loops
// no null check, we know it's there or we won't be in the loop
@@ -164,7 +164,7 @@ public class HCardExtractor extends Enti
private boolean addTelephones(Resource card) {
boolean found = false;
- for (Node node : fragment.findAll(".//*[contains(@class,'tel')]")) {
+ for (Node node : DomUtils.findAllByAttributeContains(fragment.getDocument(), "class", "tel")) {
HTMLDocument telFragment = new HTMLDocument(node);
TextField[] values = telFragment.getPluralUrlField("value");
if (values.length == 0) {
@@ -237,7 +237,6 @@ public class HCardExtractor extends Enti
private boolean addStringMultiProperty(String className, Resource resource, URI property) {
HTMLDocument.TextField[] fields = fragment.getPluralTextField(className);
boolean found = false;
- final String extractorName = getDescription().getExtractorName();
for(HTMLDocument.TextField field : fields) {
found |= conditionallyAddStringProperty(
field.source(),
@@ -394,7 +393,6 @@ public class HCardExtractor extends Enti
private boolean addOrganizationName(Resource card) {
if (name.getOrganization() == null) return false;
BNode org = valueFactory.createBNode();
- final String extractorName = getDescription().getExtractorName();
addBNodeProperty(
this.fragment.getDocument(),
card, vCARD.org, org
Modified: incubator/any23/trunk/core/src/test/java/org/apache/any23/extractor/html/HCardExtractorTest.java
URL: http://svn.apache.org/viewvc/incubator/any23/trunk/core/src/test/java/org/apache/any23/extractor/html/HCardExtractorTest.java?rev=1328663&r1=1328662&r2=1328663&view=diff
==============================================================================
--- incubator/any23/trunk/core/src/test/java/org/apache/any23/extractor/html/HCardExtractorTest.java (original)
+++ incubator/any23/trunk/core/src/test/java/org/apache/any23/extractor/html/HCardExtractorTest.java Sat Apr 21 14:04:51 2012
@@ -23,6 +23,7 @@ import org.apache.any23.extractor.Extrac
import org.apache.any23.extractor.IssueReport;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.vocab.VCARD;
+import org.junit.Ignore;
import org.junit.Test;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
@@ -955,7 +956,7 @@ public class HCardExtractorTest extends
/**
* Tests the detection and prevention of the inclusion of an ancestor by a sibling node.
- * This test is related to issue <a href="https://issues.apache.org/jira/browse/ANY23-58">ANY23-58</a>.
+ * This test is related to issue <a href="https://issues.apache.org/jira/browse/ANY23-58">ANY23-58</a>.
*
* @throws IOException
* @throws ExtractionException
@@ -966,6 +967,16 @@ public class HCardExtractorTest extends
assertIssue(IssueReport.IssueLevel.Warning, ".*Current node tries to include an ancestor node.*");
}
+ /**
+ * Tests extractor performance: the extraction of <i>microformats/hcard/performance.html</i>
+ * is expected to complete within the 30 seconds timeout. The test is currently
+ * disabled through {@link Ignore}.
+ * This test is related to issue <a href="https://issues.apache.org/jira/browse/ANY23-76">ANY23-76</a>.
+ */
+ @Ignore
+ @Test(timeout = 30 * 1000)
+ public void testExtractionPerformance() {
+ assertExtract("microformats/hcard/performance.html");
+ }
+
private void assertDefaultVCard() throws RepositoryException {
assertModelNotEmpty();
assertStatementsSize(RDF.TYPE, vVCARD.VCard, 1);