You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@labs.apache.org by th...@apache.org on 2008/08/27 12:47:37 UTC

svn commit: r689441 - /labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java

Author: thorsten
Date: Wed Aug 27 03:47:37 2008
New Revision: 689441

URL: http://svn.apache.org/viewvc?rev=689441&view=rev
Log:
Make the HTML parser configurable: the accepted elements, and the attribute read from each one, are now supplied through an element-to-attribute map instead of being hard-coded.

Modified:
    labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java

Modified: labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java?rev=689441&r1=689440&r2=689441&view=diff
==============================================================================
--- labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java (original)
+++ labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java Wed Aug 27 03:47:37 2008
@@ -20,6 +20,7 @@
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.HashSet;
 
 import org.apache.droids.api.Parse;
@@ -47,6 +48,16 @@
  */
 public class HtmlParser extends Loggable implements Parser {
 
+  private HashMap<String, String> elements= null;
+
+  public HashMap<String, String> getElements() {
+    return elements;
+  }
+
+  public void setElements(HashMap<String, String> elements) {
+    this.elements = elements;
+  }
+
   private URL base = null;
 
   private Task link = null;
@@ -112,31 +123,32 @@
     return parser;
   }
 
-  private static ElementRemover getRemover() {
+  private ElementRemover getRemover() {
     // create element remover filter
     final ElementRemover remover = new ElementRemover();
     // set which elements to accept
-    remover.acceptElement("a", new String[] { "href" });
-    remover.acceptElement("link", new String[] { "href" });
-    remover.acceptElement("img", new String[] { "src" });
-    remover.acceptElement("script", new String[] { "src" });
+    for (String key : elements.keySet()) {
+      String value = elements.get(key);
+      remover.acceptElement(key, new String[] { value });
+    }
     // completely remove some elements
-    remover.removeElement("script");
-    remover.removeElement("head");
+    //remover.removeElement("head");
     return remover;
   }
 
   private void extractLinks(Node node, ArrayList<Outlink> links,
       HashSet<String> set) throws MalformedURLException {
     if (node.getNodeType() == Node.ELEMENT_NODE) {
-      if ("a".equalsIgnoreCase(node.getNodeName())||"img".equalsIgnoreCase(node.getNodeName())
-          ||"link".equalsIgnoreCase(node.getNodeName())||"script".equalsIgnoreCase(node.getNodeName()) ) {
+      String nodeName = node.getNodeName().toLowerCase();
+      if (elements.containsKey(nodeName)) {
+        String value = elements.get(nodeName);
+        System.out.println("key "+nodeName+" value "+value);
         NamedNodeMap attrs = node.getAttributes();
         String target = null;
         for (int i = 0; i < attrs.getLength(); i++) {
           Node attr = attrs.item(i);
           String attrName = attr.getNodeName();
-          if (attrName.equalsIgnoreCase("href")||attrName.equalsIgnoreCase("src")) {
+          if (attrName.equalsIgnoreCase(value)) {
             target = attr.getNodeValue();
             try {
               String newUrl = "";



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@labs.apache.org
For additional commands, e-mail: commits-help@labs.apache.org