You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2006/03/03 23:33:31 UTC
svn commit: r382948 - in /lucene/nutch/trunk: ./ src/plugin/
src/plugin/microformats-reltag/ src/plugin/microformats-reltag/src/
src/plugin/microformats-reltag/src/java/
src/plugin/microformats-reltag/src/java/org/
src/plugin/microformats-reltag/src/ja...
Author: jerome
Date: Fri Mar 3 14:33:29 2006
New Revision: 382948
URL: http://svn.apache.org/viewcvs?rev=382948&view=rev
Log:
Add a microformats rel-tag parser/indexer/searcher plugin (a la technorati)
Added:
lucene/nutch/trunk/src/plugin/microformats-reltag/
lucene/nutch/trunk/src/plugin/microformats-reltag/build.xml (with props)
lucene/nutch/trunk/src/plugin/microformats-reltag/plugin.xml (with props)
lucene/nutch/trunk/src/plugin/microformats-reltag/src/
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java (with props)
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java (with props)
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagQueryFilter.java (with props)
lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html (with props)
Modified:
lucene/nutch/trunk/build.xml
lucene/nutch/trunk/default.properties
lucene/nutch/trunk/src/plugin/build.xml
Modified: lucene/nutch/trunk/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=382948&r1=382947&r2=382948&view=diff
==============================================================================
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Fri Mar 3 14:33:29 2006
@@ -249,6 +249,7 @@
<packageset dir="${src.dir}"/>
<packageset dir="${plugins.dir}/lib-http/src/java"/>
<packageset dir="${plugins.dir}/lib-parsems/src/java"/>
+ <packageset dir="${plugins.dir}/microformats-reltag/src/java"/>
<packageset dir="${plugins.dir}/ontology/src/java"/>
<packageset dir="${plugins.dir}/protocol-file/src/java"/>
<packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
Modified: lucene/nutch/trunk/default.properties
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/default.properties?rev=382948&r1=382947&r2=382948&view=diff
==============================================================================
--- lucene/nutch/trunk/default.properties (original)
+++ lucene/nutch/trunk/default.properties Fri Mar 3 14:33:29 2006
@@ -70,6 +70,7 @@
# plugin.ontology=org.apache.nutch.ontology*
plugin.parsems=org.apache.nutch.parse.ms*
plugin.pdf=org.apache.nutch.parse.pdf*
+plugin.reltag=org.apache.nutch.microformats.reltag*
plugin.rss=org.apache.nutch.parse.rss*
plugin.rtf=org.apache.nutch.parse.rtf*
plugin.site=org.apache.nutch.searcher.site*
@@ -98,6 +99,7 @@
${plugin.msword}:\
${plugin.parsems}:\
${plugin.pdf}:\
+ ${plugin.reltag}:\
${plugin.rss}:\
${plugin.rtf}:\
${plugin.site}:\
Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=382948&r1=382947&r2=382948&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Fri Mar 3 14:33:29 2006
@@ -18,6 +18,7 @@
<ant dir="lib-lucene-analyzers" target="deploy"/>
<ant dir="lib-nekohtml" target="deploy"/>
<ant dir="lib-parsems" target="deploy"/>
+ <ant dir="microformats-reltag" target="deploy"/>
<ant dir="nutch-extensionpoints" target="deploy"/>
<ant dir="ontology" target="deploy"/>
<ant dir="protocol-file" target="deploy"/>
@@ -86,6 +87,7 @@
<ant dir="lib-lucene-analyzers" target="clean"/>
<ant dir="lib-nekohtml" target="clean"/>
<ant dir="lib-parsems" target="clean"/>
+ <ant dir="microformats-reltag" target="clean"/>
<ant dir="nutch-extensionpoints" target="clean"/>
<ant dir="ontology" target="clean"/>
<ant dir="protocol-file" target="clean"/>
Added: lucene/nutch/trunk/src/plugin/microformats-reltag/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/microformats-reltag/build.xml?rev=382948&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/microformats-reltag/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/microformats-reltag/build.xml Fri Mar 3 14:33:29 2006
@@ -0,0 +1,17 @@
+<?xml version="1.0"?>
+
+<project name="microformats-reltag" default="jar">
+
+ <import file="../build-plugin.xml"/>
+
+ <!-- Build compilation dependencies -->
+ <target name="deps-jar">
+ <ant target="compile-core" inheritall="false" dir="${nutch.root}"/>
+ </target>
+
+ <!-- Deploy Unit test dependencies -->
+ <target name="deps-test">
+ <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+ </target>
+
+</project>
Propchange: lucene/nutch/trunk/src/plugin/microformats-reltag/build.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/microformats-reltag/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/microformats-reltag/plugin.xml?rev=382948&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/microformats-reltag/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/microformats-reltag/plugin.xml Fri Mar 3 14:33:29 2006
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+ id="microformats-reltag"
+ name="Rel-Tag microformat Parser/Indexer/Querier"
+ version="1.0.0"
+ provider-name="nutch.org">
+ <!-- Declares three extensions: an HTML parse filter, an indexing filter and a query filter -->
+ <runtime>
+ <library name="microformats-reltag.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.microformats.reltag.RelTagParser"
+ name="Rel-Tag parser"
+ point="org.apache.nutch.parse.HtmlParseFilter">
+ <implementation id="RelTagParser"
+ class="org.apache.nutch.microformats.reltag.RelTagParser"/>
+ </extension>
+
+ <extension id="org.apache.nutch.microformats.reltag.RelTagIndexingFilter"
+ name="Rel-Tag indexing filter"
+ point="org.apache.nutch.indexer.IndexingFilter">
+ <implementation id="RelTagIndexingFilter"
+ class="org.apache.nutch.microformats.reltag.RelTagIndexingFilter"/>
+ </extension>
+
+ <!-- NOTE(review): raw-fields="tag" presumably names the query field served by RelTagQueryFilter; confirm -->
+ <extension id="org.apache.nutch.microformats.reltag.RelTagQueryFilter"
+ name="Rel-Tag query filter"
+ point="org.apache.nutch.searcher.QueryFilter">
+ <implementation id="RelTagQueryFilter"
+ class="org.apache.nutch.microformats.reltag.RelTagQueryFilter"
+ raw-fields="tag"/>
+ </extension>
+
+
+</plugin>
+
Propchange: lucene/nutch/trunk/src/plugin/microformats-reltag/plugin.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java?rev=382948&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java Fri Mar 3 14:33:29 2006
@@ -0,0 +1,82 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.microformats.reltag;
+
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.hadoop.io.UTF8;
+import org.apache.nutch.parse.Parse;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Lucene imports
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Document;
+
+
+/**
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that adds one
+ * <code>tag</code> field to the document per Rel-Tag parse metadata value.
+ *
+ * @see <a href="http://www.microformats.org/wiki/rel-tag">
+ * http://www.microformats.org/wiki/rel-tag</a>
+ * @author Jérôme Charron
+ */
+public class RelTagIndexingFilter implements IndexingFilter {
+
+  private Configuration conf;
+
+  /**
+   * Copies the {@link RelTagParser#REL_TAG} values of the parse metadata,
+   * if any, into stored, untokenized <code>tag</code> fields of <code>doc</code>.
+   */
+  public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
+    throws IndexingException {
+    String[] relTags = parse.getData().getParseMeta().getValues(RelTagParser.REL_TAG);
+    if (relTags == null) {
+      // No Rel-Tag found (they are possibly put there by RelTagParser)
+      return doc;
+    }
+    for (int idx = 0; idx < relTags.length; idx++) {
+      Field tagField = new Field("tag", relTags[idx],
+                                 Field.Store.YES, Field.Index.UN_TOKENIZED);
+      doc.add(tagField);
+    }
+    return doc;
+  }
+
+  /* ----------------------------- *
+   * <implementation:Configurable> *
+   * ----------------------------- */
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /* ------------------------------ *
+   * </implementation:Configurable> *
+   * ------------------------------ */
+
+}
Propchange: lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java?rev=382948&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java (added)
+++ lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java Fri Mar 3 14:33:29 2006
@@ -0,0 +1,197 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.microformats.reltag;
+
+// JDK imports
+import java.net.URL;
+import java.net.URLDecoder;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.logging.Logger;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+// Nutch imports
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.StringUtil;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.LogFormatter;
+
+
+/**
+ * Adds microformat rel-tags of document if found.
+ *
+ * <p>A rel-tag is an <code>&lt;a&gt;</code> element that has a
+ * <code>href</code> attribute and whose <code>rel</code> attribute lists the
+ * <code>tag</code> link type. The tag itself is the URL-decoded last path
+ * segment of the <code>href</code>. Every distinct tag found is added to the
+ * parse metadata under the {@link #REL_TAG} name.</p>
+ *
+ * @see <a href="http://www.microformats.org/wiki/rel-tag">
+ * http://www.microformats.org/wiki/rel-tag</a>
+ * @author Jérôme Charron
+ */
+public class RelTagParser implements HtmlParseFilter {
+
+  public final static Logger LOG =
+    LogFormatter.getLogger(RelTagParser.class.getName());
+
+  /** Name of the parse metadata entry that holds the extracted tags. */
+  public final static String REL_TAG = "Rel-Tag";
+
+
+  private Configuration conf = null;
+
+
+  /**
+   * Scan the HTML document looking at possible rel-tags
+   *
+   * @param content not used by this filter
+   * @param parse the parse whose metadata receives the tags
+   * @param metaTags not used by this filter
+   * @param doc the DOM tree to scan
+   * @return the given <code>parse</code>
+   */
+  public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    // Trying to find the document's rel-tags
+    Parser parser = new Parser(doc);
+    Set tags = parser.getRelTags();
+    Iterator iter = tags.iterator();
+    Metadata metadata = parse.getData().getParseMeta();
+    while (iter.hasNext()) {
+      metadata.add(REL_TAG, (String) iter.next());
+    }
+    return parse;
+  }
+
+  /** Walks a DOM tree and collects the distinct rel-tags it contains. */
+  private static class Parser {
+
+    // A set, so that a tag linked several times is only reported once
+    Set tags = null;
+
+    Parser(Node node) {
+      tags = new TreeSet();
+      parse(node);
+    }
+
+    Set getRelTags() {
+      return tags;
+    }
+
+    /** Records the rel-tag carried by <code>node</code>, if any, then recurses. */
+    void parse(Node node) {
+
+      // Look for <a> tag
+      if (node.getNodeType() == Node.ELEMENT_NODE
+          && "a".equalsIgnoreCase(node.getNodeName())) {
+        NamedNodeMap attrs = node.getAttributes();
+        Node hrefNode = attrs.getNamedItem("href");
+        Node relNode = attrs.getNamedItem("rel");
+        // Checks that it contains both a href and a rel attribute,
+        // and finally that "tag" is one of the rel link types
+        if (hrefNode != null && relNode != null
+            && hasTagLinkType(relNode.getNodeValue())) {
+          String tag = parseTag(hrefNode.getNodeValue());
+          if (!StringUtil.isEmpty(tag)) {
+            tags.add(tag);
+          }
+        }
+      }
+
+      // Recurse
+      NodeList children = node.getChildNodes();
+      for (int i=0; children != null && i<children.getLength(); i++) {
+        parse(children.item(i));
+      }
+    }
+
+    /**
+     * Tells whether "tag" is one of the link types of a <code>rel</code>
+     * attribute value. The value is a whitespace separated list, so that
+     * <code>rel="tag"</code> as well as <code>rel="nofollow tag"</code>
+     * designate a rel-tag.
+     */
+    private final static boolean hasTagLinkType(String rel) {
+      if (rel == null) {
+        return false;
+      }
+      String[] types = rel.trim().split("\\s+");
+      for (int i=0; i<types.length; i++) {
+        if ("tag".equalsIgnoreCase(types[i])) {
+          return true;
+        }
+      }
+      return false;
+    }
+
+    /**
+     * Extracts the tag from a rel-tag URL: the URL-decoded last segment of
+     * its path, trailing slashes being ignored.
+     *
+     * @return the tag, possibly empty, or <code>null</code> if the URL
+     *         cannot be parsed or decoded (relative URLs cannot be parsed
+     *         since no base URL is available here)
+     */
+    private final static String parseTag(String url) {
+      String tag = null;
+      try {
+        URL u = new URL(url);
+        String path = u.getPath();
+        // Ignore trailing slashes: ".../tag/foo/" is the tag "foo"
+        int end = path.length();
+        while (end > 0 && path.charAt(end - 1) == '/') {
+          end--;
+        }
+        path = path.substring(0, end);
+        tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1), "UTF-8");
+      } catch (Exception e) {
+        // Malformed tag...
+        tag = null;
+      }
+      return tag;
+    }
+
+  }
+
+
+  /* ----------------------------- *
+   * <implementation:Configurable> *
+   * ----------------------------- */
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /* ------------------------------ *
+   * </implementation:Configurable> *
+   * ------------------------------ */
+
+}
Propchange: lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagQueryFilter.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagQueryFilter.java?rev=382948&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagQueryFilter.java (added)
+++ lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagQueryFilter.java Fri Mar 3 14:33:29 2006
@@ -0,0 +1,57 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.microformats.reltag;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.searcher.RawFieldQueryFilter;
+
+
+/**
+ * Handles <code>"tag:"</code> query clauses.
+ *
+ * @see <a href="http://www.microformats.org/wiki/rel-tag">
+ * http://www.microformats.org/wiki/rel-tag</a>
+ * @author Jérôme Charron
+ */
+public class RelTagQueryFilter extends RawFieldQueryFilter {
+
+  private Configuration conf;
+
+  // NOTE(review): the arguments are presumably (field, lowerCase, boost). If so,
+  // queries are lower-cased whereas RelTagIndexingFilter indexes tags as-is: confirm.
+  public RelTagQueryFilter() {
+    super("tag", true, 1.0f);
+  }
+
+  /* ----------------------------- *
+   * <implementation:Configurable> *
+   * ----------------------------- */
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /* ------------------------------ *
+   * </implementation:Configurable> *
+   * ------------------------------ */
+}
Propchange: lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagQueryFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html?rev=382948&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html (added)
+++ lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html Fri Mar 3 14:33:29 2006
@@ -0,0 +1,8 @@
+<html>
+<body>
+<p>
+A microformats <a href="http://www.microformats.org/wiki/Rel-Tag">Rel-Tag</a>
+Parser/Indexer/Querier plugin.
+</p>
+</body>
+</html>
Propchange: lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html
------------------------------------------------------------------------------
svn:eol-style = native