You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2013/06/19 05:19:26 UTC
svn commit: r1494428 -
/nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
Author: lewismc
Date: Wed Jun 19 03:19:26 2013
New Revision: 1494428
URL: http://svn.apache.org/r1494428
Log:
format microformat-reltag and change tag logging to debug
Modified:
nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
Modified: nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java?rev=1494428&r1=1494427&r2=1494428&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java (original)
+++ nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java Wed Jun 19 03:19:26 2013
@@ -62,56 +62,55 @@ public class RelTagParser implements Par
Set<String> tags = null;
Parser(Node node) {
- tags = new TreeSet<String>();
- parse(node);
+ tags = new TreeSet<String>();
+ parse(node);
}
Set<String> getRelTags() {
- return tags;
+ return tags;
}
void parse(Node node) {
-
if (node.getNodeType() == Node.ELEMENT_NODE) {
- // Look for <a> tag
- if ("a".equalsIgnoreCase(node.getNodeName())) {
- NamedNodeMap attrs = node.getAttributes();
- Node hrefNode = attrs.getNamedItem("href");
- // Checks that it contains a href attribute
- if (hrefNode != null) {
- Node relNode = attrs.getNamedItem("rel");
- // Checks that it contains a rel attribute too
- if (relNode != null) {
- // Finaly checks that rel=tag
- if ("tag".equalsIgnoreCase(relNode.getNodeValue())) {
- String tag = parseTag(hrefNode.getNodeValue());
- if (!StringUtil.isEmpty(tag)) {
- tags.add(tag);
- LOG.info("Adding tag: " + tag + " to tag set.");
- }
- }
- }
- }
+ // Look for <a> tag
+ if ("a".equalsIgnoreCase(node.getNodeName())) {
+ NamedNodeMap attrs = node.getAttributes();
+ Node hrefNode = attrs.getNamedItem("href");
+ // Checks that it contains a href attribute
+ if (hrefNode != null) {
+ Node relNode = attrs.getNamedItem("rel");
+ // Checks that it contains a rel attribute too
+ if (relNode != null) {
+ // Finaly checks that rel=tag
+ if ("tag".equalsIgnoreCase(relNode.getNodeValue())) {
+ String tag = parseTag(hrefNode.getNodeValue());
+ if (!StringUtil.isEmpty(tag)) {
+ tags.add(tag);
+ LOG.debug("Adding tag: " + tag + " to tag set.");
+ }
+ }
}
+ }
+ }
}
- // Recurse
- NodeList children = node.getChildNodes();
- for (int i = 0; children != null && i < children.getLength(); i++) {
- parse(children.item(i));
- }
+ // Recurse
+ NodeList children = node.getChildNodes();
+ for (int i = 0; children != null && i < children.getLength(); i++) {
+ parse(children.item(i));
+ }
}
private final static String parseTag(String url) {
- String tag = null;
- try {
- URL u = new URL(url);
- String path = u.getPath();
- tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1), "UTF-8");
- } catch (Exception e) {
- // Malformed tag...
- tag = null;
- } return tag;
+ String tag = null;
+ try {
+ URL u = new URL(url);
+ String path = u.getPath();
+ tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1), "UTF-8");
+ } catch (Exception e) {
+ // Malformed tag...
+ tag = null;
+ } return tag;
}
}
@@ -119,21 +118,21 @@ public class RelTagParser implements Par
* Set the {@link Configuration} object
*/
public void setConf(Configuration conf) {
- this.conf = conf;
+ this.conf = conf;
}
/**
* Get the {@link Configuration} object
*/
public Configuration getConf() {
- return this.conf;
+ return this.conf;
}
private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
static {
- FIELDS.add(WebPage.Field.BASE_URL);
- FIELDS.add(WebPage.Field.METADATA);
+ FIELDS.add(WebPage.Field.BASE_URL);
+ FIELDS.add(WebPage.Field.METADATA);
}
/**
@@ -144,7 +143,7 @@ public class RelTagParser implements Par
*/
@Override
public Collection<Field> getFields() {
- return FIELDS;
+ return FIELDS;
}
@Override
@@ -158,19 +157,19 @@ public class RelTagParser implements Par
* @return parse the actual {@link Parse} object
*/
public Parse filter(String url, WebPage page, Parse parse,
- HTMLMetaTags metaTags, DocumentFragment doc) {
- // Trying to find the document's rel-tags
- Parser parser = new Parser(doc);
- Set<String> tags = parser.getRelTags();
- // can't store multiple values in page metadata -> separate by tabs
- StringBuffer sb = new StringBuffer();
- Iterator<String> iter = tags.iterator();
- while (iter.hasNext()) {
- sb.append(iter.next());
- sb.append("\t");
- }
- ByteBuffer bb = ByteBuffer.wrap(sb.toString().getBytes());
- page.putToMetadata(new Utf8(REL_TAG), bb);
- return parse;
+ HTMLMetaTags metaTags, DocumentFragment doc) {
+ // Trying to find the document's rel-tags
+ Parser parser = new Parser(doc);
+ Set<String> tags = parser.getRelTags();
+ // can't store multiple values in page metadata -> separate by tabs
+ StringBuffer sb = new StringBuffer();
+ Iterator<String> iter = tags.iterator();
+ while (iter.hasNext()) {
+ sb.append(iter.next());
+ sb.append("\t");
+ }
+ ByteBuffer bb = ByteBuffer.wrap(sb.toString().getBytes());
+ page.putToMetadata(new Utf8(REL_TAG), bb);
+ return parse;
}
}