You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@labs.apache.org by th...@apache.org on 2008/08/27 12:47:37 UTC
svn commit: r689441 -
/labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java
Author: thorsten
Date: Wed Aug 27 03:47:37 2008
New Revision: 689441
URL: http://svn.apache.org/viewvc?rev=689441&view=rev
Log:
Make the HTML parser configurable: the elements to accept, and the attribute to read from each, now come from a settable map instead of being hard-coded.
Modified:
labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java
Modified: labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java?rev=689441&r1=689440&r2=689441&view=diff
==============================================================================
--- labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java (original)
+++ labs/droids/trunk/src/core/java/org/apache/droids/parse/html/HtmlParser.java Wed Aug 27 03:47:37 2008
@@ -20,6 +20,7 @@
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.HashSet;
import org.apache.droids.api.Parse;
@@ -47,6 +48,16 @@
*/
public class HtmlParser extends Loggable implements Parser {
+ private HashMap<String, String> elements= null;
+
+ public HashMap<String, String> getElements() {
+ return elements;
+ }
+
+ public void setElements(HashMap<String, String> elements) {
+ this.elements = elements;
+ }
+
private URL base = null;
private Task link = null;
@@ -112,31 +123,32 @@
return parser;
}
- private static ElementRemover getRemover() {
+ private ElementRemover getRemover() {
// create element remover filter
final ElementRemover remover = new ElementRemover();
// set which elements to accept
- remover.acceptElement("a", new String[] { "href" });
- remover.acceptElement("link", new String[] { "href" });
- remover.acceptElement("img", new String[] { "src" });
- remover.acceptElement("script", new String[] { "src" });
+ for (String key : elements.keySet()) {
+ String value = elements.get(key);
+ remover.acceptElement(key, new String[] { value });
+ }
// completely remove some elements
- remover.removeElement("script");
- remover.removeElement("head");
+ //remover.removeElement("head");
return remover;
}
private void extractLinks(Node node, ArrayList<Outlink> links,
HashSet<String> set) throws MalformedURLException {
if (node.getNodeType() == Node.ELEMENT_NODE) {
- if ("a".equalsIgnoreCase(node.getNodeName())||"img".equalsIgnoreCase(node.getNodeName())
- ||"link".equalsIgnoreCase(node.getNodeName())||"script".equalsIgnoreCase(node.getNodeName()) ) {
+ String nodeName = node.getNodeName().toLowerCase();
+ if (elements.containsKey(nodeName)) {
+ String value = elements.get(nodeName);
+ System.out.println("key "+nodeName+" value "+value);
NamedNodeMap attrs = node.getAttributes();
String target = null;
for (int i = 0; i < attrs.getLength(); i++) {
Node attr = attrs.item(i);
String attrName = attr.getNodeName();
- if (attrName.equalsIgnoreCase("href")||attrName.equalsIgnoreCase("src")) {
+ if (attrName.equalsIgnoreCase(value)) {
target = attr.getNodeValue();
try {
String newUrl = "";
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@labs.apache.org
For additional commands, e-mail: commits-help@labs.apache.org