You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2013/06/19 05:19:26 UTC

svn commit: r1494428 - /nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java

Author: lewismc
Date: Wed Jun 19 03:19:26 2013
New Revision: 1494428

URL: http://svn.apache.org/r1494428
Log:
Reformat the microformats-reltag plugin (RelTagParser.java) and change tag logging from INFO to DEBUG

Modified:
    nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java

Modified: nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java?rev=1494428&r1=1494427&r2=1494428&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java (original)
+++ nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java Wed Jun 19 03:19:26 2013
@@ -62,56 +62,55 @@ public class RelTagParser implements Par
     Set<String> tags = null;
 
     Parser(Node node) {
-	  tags = new TreeSet<String>();
-	  parse(node);
+      tags = new TreeSet<String>();
+      parse(node);
     }
 
     Set<String> getRelTags() {
-	  return tags;
+      return tags;
     }
 
     void parse(Node node) {
-
       if (node.getNodeType() == Node.ELEMENT_NODE) {
-	    // Look for <a> tag
-	    if ("a".equalsIgnoreCase(node.getNodeName())) {
-		  NamedNodeMap attrs = node.getAttributes();
-		  Node hrefNode = attrs.getNamedItem("href");
-		  // Checks that it contains a href attribute
-		  if (hrefNode != null) {
-		    Node relNode = attrs.getNamedItem("rel");
-		    // Checks that it contains a rel attribute too
-		    if (relNode != null) {
-		      // Finaly checks that rel=tag
-			  if ("tag".equalsIgnoreCase(relNode.getNodeValue())) {
-			    String tag = parseTag(hrefNode.getNodeValue());
-			    if (!StringUtil.isEmpty(tag)) {
-				  tags.add(tag);
-				  LOG.info("Adding tag: " + tag + " to tag set.");
-			    }
-			  }
-		    }
-		  }
+        // Look for <a> tag
+        if ("a".equalsIgnoreCase(node.getNodeName())) {
+	  NamedNodeMap attrs = node.getAttributes();
+	  Node hrefNode = attrs.getNamedItem("href");
+	  // Checks that it contains a href attribute
+	  if (hrefNode != null) {
+	    Node relNode = attrs.getNamedItem("rel");
+	    // Checks that it contains a rel attribute too
+	    if (relNode != null) {
+	      // Finaly checks that rel=tag
+	      if ("tag".equalsIgnoreCase(relNode.getNodeValue())) {
+	        String tag = parseTag(hrefNode.getNodeValue());
+	        if (!StringUtil.isEmpty(tag)) {
+	          tags.add(tag);
+		  LOG.debug("Adding tag: " + tag + " to tag set.");
+	        }
+	      }
 	    }
+	  }
+	}
       }
 
-	  // Recurse
-	  NodeList children = node.getChildNodes();
-	  for (int i = 0; children != null && i < children.getLength(); i++) {
-	    parse(children.item(i));
-	  }
+      // Recurse
+      NodeList children = node.getChildNodes();
+      for (int i = 0; children != null && i < children.getLength(); i++) {
+        parse(children.item(i));
+      }
     }
 
     private final static String parseTag(String url) {
-	  String tag = null;
-	  try {
-	    URL u = new URL(url);
-	    String path = u.getPath();
-	    tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1), "UTF-8");
-	  } catch (Exception e) {
-	    // Malformed tag...
-	    tag = null;
-	  } return tag;
+      String tag = null;
+      try {
+        URL u = new URL(url);
+        String path = u.getPath();
+        tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1), "UTF-8");
+      } catch (Exception e) {
+        // Malformed tag...
+        tag = null;
+      } return tag;
     }
   }
 
@@ -119,21 +118,21 @@ public class RelTagParser implements Par
    * Set the {@link Configuration} object
    */
   public void setConf(Configuration conf) {
-	this.conf = conf;
+    this.conf = conf;
   }
 
   /**
    * Get the {@link Configuration} object
    */
   public Configuration getConf() {
-	return this.conf;
+    return this.conf;
   }
 
   private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
 
   static {
-	FIELDS.add(WebPage.Field.BASE_URL);
-	FIELDS.add(WebPage.Field.METADATA);
+    FIELDS.add(WebPage.Field.BASE_URL);
+    FIELDS.add(WebPage.Field.METADATA);
   }
   
   /**
@@ -144,7 +143,7 @@ public class RelTagParser implements Par
    */
   @Override
   public Collection<Field> getFields() {
-	return FIELDS;
+    return FIELDS;
   }
 
   @Override
@@ -158,19 +157,19 @@ public class RelTagParser implements Par
    * @return parse the actual {@link Parse} object
    */
   public Parse filter(String url, WebPage page, Parse parse,
-    HTMLMetaTags metaTags, DocumentFragment doc) {
-	// Trying to find the document's rel-tags
-	Parser parser = new Parser(doc);
-	Set<String> tags = parser.getRelTags();
-	// can't store multiple values in page metadata -> separate by tabs
-	StringBuffer sb = new StringBuffer();
-	Iterator<String> iter = tags.iterator();
-	while (iter.hasNext()) {
-	  sb.append(iter.next());
-	  sb.append("\t");
-	}
-	ByteBuffer bb = ByteBuffer.wrap(sb.toString().getBytes());
-	page.putToMetadata(new Utf8(REL_TAG), bb);
-	return parse;
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+    // Trying to find the document's rel-tags
+    Parser parser = new Parser(doc);
+    Set<String> tags = parser.getRelTags();
+    // can't store multiple values in page metadata -> separate by tabs
+    StringBuffer sb = new StringBuffer();
+    Iterator<String> iter = tags.iterator();
+    while (iter.hasNext()) {
+      sb.append(iter.next());
+      sb.append("\t");
+    }
+    ByteBuffer bb = ByteBuffer.wrap(sb.toString().getBytes());
+    page.putToMetadata(new Utf8(REL_TAG), bb);
+    return parse;
   }
 }