You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2010/07/20 12:34:18 UTC
svn commit: r965787 - in /nutch/branches/nutchbase/src/plugin: ./
creativecommons/src/java/org/creativecommons/nutch/
creativecommons/src/test/org/creativecommons/nutch/
scoring-link/src/java/org/apache/nutch/scoring/link/
Author: jnioche
Date: Tue Jul 20 10:34:18 2010
New Revision: 965787
URL: http://svn.apache.org/viewvc?rev=965787&view=rev
Log:
Ported Scoring-link + creative commons to new API
Modified:
nutch/branches/nutchbase/src/plugin/build.xml
nutch/branches/nutchbase/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
nutch/branches/nutchbase/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
nutch/branches/nutchbase/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
nutch/branches/nutchbase/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
Modified: nutch/branches/nutchbase/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/build.xml?rev=965787&r1=965786&r2=965787&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/plugin/build.xml (original)
+++ nutch/branches/nutchbase/src/plugin/build.xml Tue Jul 20 10:34:18 2010
@@ -26,6 +26,7 @@
<!-- Build & deploy all the plugin jars. -->
<!-- ====================================================== -->
<target name="deploy">
+ <ant dir="creativecommons" target="deploy"/>
<ant dir="index-anchor" target="deploy"/>
<ant dir="index-basic" target="deploy"/>
<ant dir="index-more" target="deploy"/>
@@ -41,6 +42,7 @@
<ant dir="parse-js" target="deploy"/>
<ant dir="parse-rss" target="deploy"/>
<ant dir="parse-tika" target="deploy"/>
+ <ant dir="scoring-link" target="deploy"/>
<ant dir="scoring-opic" target="deploy"/>
<ant dir="subcollection" target="deploy"/>
<ant dir="tld" target="deploy"/>
@@ -54,14 +56,12 @@
<ant dir="urlnormalizer-pass" target="deploy"/>
<ant dir="urlnormalizer-regex" target="deploy"/>
<!--
- <ant dir="creativecommons" target="deploy"/>
<ant dir="feed" target="deploy"/>
<ant dir="microformats-reltag" target="deploy"/>
<ant dir="parse-ext" target="deploy"/>
<ant dir="parse-swf" target="deploy"/>
<ant dir="parse-zip" target="deploy"/>
<ant dir="protocol-httpclient" target="deploy"/>
- <ant dir="scoring-link" target="deploy"/>
-->
</target>
@@ -69,7 +69,7 @@
<!-- Test all of the plugins. -->
<!-- ====================================================== -->
<target name="test">
- <parallel threadCount="2">
+ <ant dir="creativecommons" target="test"/>
<ant dir="parse-rss" target="test"/>
<ant dir="parse-tika" target="test"/>
<ant dir="protocol-file" target="test"/>
@@ -87,14 +87,12 @@
<ant dir="lib-http" target="test"/>
<ant dir="subcollection" target="test"/>
<!--
- <ant dir="creativecommons" target="test"/>
<ant dir="feed" target="test"/>
<ant dir="parse-ext" target="test"/>
<ant dir="parse-swf" target="test"/>
<ant dir="parse-zip" target="test"/>
<ant dir="protocol-httpclient" target="test"/>
-->
- </parallel>
</target>
<!-- ====================================================== -->
Modified: nutch/branches/nutchbase/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java?rev=965787&r1=965786&r2=965787&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java (original)
+++ nutch/branches/nutchbase/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java Tue Jul 20 10:34:18 2010
@@ -17,105 +17,121 @@
package org.creativecommons.nutch;
-import org.apache.nutch.metadata.CreativeCommons;
-
-import org.apache.nutch.parse.Parse;
-
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.hadoop.io.Text;
-
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.metadata.Metadata;
-
-import org.apache.hadoop.conf.Configuration;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.StringTokenizer;
+import org.apache.avro.util.Utf8;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-
-import java.util.*;
-import java.net.URL;
-import java.net.MalformedURLException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.CreativeCommons;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.storage.WebPage.Field;
/** Adds basic searchable fields to a document. */
public class CCIndexingFilter implements IndexingFilter {
- public static final Log LOG = LogFactory.getLog(CCIndexingFilter.class);
+ public static final Log LOG = LogFactory.getLog(CCIndexingFilter.class);
+
+ /** The name of the document field we use. */
+ public static String FIELD = "cc";
+
+ private Configuration conf;
- /** The name of the document field we use. */
- public static String FIELD = "cc";
+ private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
- private Configuration conf;
+ static {
+ FIELDS.add(WebPage.Field.BASE_URL);
+ FIELDS.add(WebPage.Field.METADATA);
+ }
+
+ /**
+ * Add the features represented by a license URL. Urls are of the form
+ * "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a
+ * license feature.
+ */
+ public void addUrlFeatures(NutchDocument doc, String urlString) {
+ try {
+ URL url = new URL(urlString);
+
+ // tokenize the path of the url, breaking at slashes and dashes
+ StringTokenizer names = new StringTokenizer(url.getPath(), "/-");
+
+ if (names.hasMoreTokens())
+ names.nextToken(); // throw away "licenses"
+
+ // add a feature per component after "licenses"
+ while (names.hasMoreTokens()) {
+ String feature = names.nextToken();
+ addFeature(doc, feature);
+ }
+ } catch (MalformedURLException e) {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("CC: failed to parse url: " + urlString + " : " + e);
+ }
+ }
+ }
+
+ private void addFeature(NutchDocument doc, String feature) {
+ doc.add(FIELD, feature);
+ }
+
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ }
+
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+ @Override
+ public Collection<Field> getFields() {
+ return FIELDS;
+ }
+
+ @Override
+ public NutchDocument filter(NutchDocument doc, String url, WebPage page)
+ throws IndexingException {
+
+ ByteBuffer blicense = page.getFromMetadata(new Utf8(
+ CreativeCommons.LICENSE_URL));
+ if (blicense != null) {
+ String licenseUrl = new String(blicense.array());
+ if (LOG.isInfoEnabled()) {
+ LOG.info("CC: indexing " + licenseUrl + " for: "
+ + url.toString());
+ }
+
+ // add the entire license as cc:license=xxx
+ addFeature(doc, "license=" + licenseUrl);
+
+ // index license attributes extracted from the license url
+ addUrlFeatures(doc, licenseUrl);
+ }
+
+ // index the license location as cc:meta=xxx
+ ByteBuffer blicenseloc = page.getFromMetadata(new Utf8(
+ CreativeCommons.LICENSE_LOCATION));
+ if (blicenseloc != null) {
+ String licenseLocation = new String(blicenseloc.array());
+ addFeature(doc, "meta=" + licenseLocation);
+ }
+
+ // index the work type cc:type=xxx
+ ByteBuffer bworkType = page.getFromMetadata(new Utf8(
+ CreativeCommons.WORK_TYPE));
+ if (bworkType != null) {
+ String workType = new String(bworkType.array());
+ addFeature(doc, workType);
+ }
- public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
- throws IndexingException {
-
- Metadata metadata = parse.getData().getParseMeta();
- // index the license
- String licenseUrl = metadata.get(CreativeCommons.LICENSE_URL);
- if (licenseUrl != null) {
- if (LOG.isInfoEnabled()) {
- LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString());
- }
-
- // add the entire license as cc:license=xxx
- addFeature(doc, "license=" + licenseUrl);
-
- // index license attributes extracted of the license url
- addUrlFeatures(doc, licenseUrl);
- }
-
- // index the license location as cc:meta=xxx
- String licenseLocation = metadata.get(CreativeCommons.LICENSE_LOCATION);
- if (licenseLocation != null) {
- addFeature(doc, "meta=" + licenseLocation);
- }
-
- // index the work type cc:type=xxx
- String workType = metadata.get(CreativeCommons.WORK_TYPE);
- if (workType != null) {
- addFeature(doc, workType);
- }
-
- return doc;
- }
-
- /** Add the features represented by a license URL. Urls are of the form
- * "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a
- * license feature. */
- public void addUrlFeatures(NutchDocument doc, String urlString) {
- try {
- URL url = new URL(urlString);
-
- // tokenize the path of the url, breaking at slashes and dashes
- StringTokenizer names = new StringTokenizer(url.getPath(), "/-");
-
- if (names.hasMoreTokens())
- names.nextToken(); // throw away "licenses"
-
- // add a feature per component after "licenses"
- while (names.hasMoreTokens()) {
- String feature = names.nextToken();
- addFeature(doc, feature);
- }
- } catch (MalformedURLException e) {
- if (LOG.isWarnEnabled()) {
- LOG.warn("CC: failed to parse url: " + urlString + " : " + e);
- }
- }
- }
-
- private void addFeature(NutchDocument doc, String feature) {
- doc.add(FIELD, feature);
- }
-
- public void setConf(Configuration conf) {
- this.conf = conf;
- }
-
- public Configuration getConf() {
- return this.conf;
- }
+ return doc;
+ }
}
Modified: nutch/branches/nutchbase/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java?rev=965787&r1=965786&r2=965787&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java (original)
+++ nutch/branches/nutchbase/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java Tue Jul 20 10:34:18 2010
@@ -17,289 +17,315 @@
package org.creativecommons.nutch;
-import org.apache.nutch.metadata.CreativeCommons;
-import org.apache.nutch.parse.*;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
+import java.io.StringReader;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import org.apache.avro.util.Utf8;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-
-import java.util.*;
-import java.io.*;
-import java.net.*;
-import javax.xml.parsers.*;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.CreativeCommons;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseStatusUtils;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.storage.WebPage.Field;
+import org.w3c.dom.Comment;
+import org.w3c.dom.Document;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
-import org.w3c.dom.*;
-
/** Adds metadata identifying the Creative Commons license used, if any. */
public class CCParseFilter implements HtmlParseFilter {
- public static final Log LOG = LogFactory.getLog(CCParseFilter.class);
+ public static final Log LOG = LogFactory.getLog(CCParseFilter.class);
+ /** Walks DOM tree, looking for RDF in comments and licenses in anchors. */
+ public static class Walker {
+ private URL base; // base url of page
+ private String rdfLicense; // subject url found, if any
+ private URL relLicense; // license url found, if any
+ private URL anchorLicense; // anchor url found, if any
+ private String workType; // work type URI
+
+ private Walker(URL base) {
+ this.base = base;
+ }
+
+ /** Scan the document adding attributes to metadata. */
+ public static void walk(Node doc, URL base, WebPage page,
+ Configuration conf) throws ParseException {
+
+ // walk the DOM tree, scanning for license data
+ Walker walker = new Walker(base);
+ walker.walk(doc);
+
+ // interpret results of walk
+ String licenseUrl = null;
+ String licenseLocation = null;
+ if (walker.rdfLicense != null) { // 1st choice: subject in RDF
+ licenseLocation = "rdf";
+ licenseUrl = walker.rdfLicense;
+ } else if (walker.relLicense != null) { // 2nd: anchor w/
+ // rel=license
+ licenseLocation = "rel";
+ licenseUrl = walker.relLicense.toString();
+ } else if (walker.anchorLicense != null) { // 3rd: anchor w/ CC
+ // license
+ licenseLocation = "a";
+ licenseUrl = walker.anchorLicense.toString();
+ } else if (conf.getBoolean("creativecommons.exclude.unlicensed",
+ false)) {
+ throw new ParseException("No CC license. Excluding.");
+ }
+
+ // add license to metadata
+ if (licenseUrl != null) {
+ if (LOG.isInfoEnabled()) {
+ LOG.info("CC: found " + licenseUrl + " in "
+ + licenseLocation + " of " + base);
+ }
+ page.putToMetadata(new Utf8(CreativeCommons.LICENSE_URL),
+ ByteBuffer.wrap(licenseUrl.getBytes()));
+ page.putToMetadata(new Utf8(CreativeCommons.LICENSE_LOCATION),
+ ByteBuffer.wrap(licenseLocation.getBytes()));
+ }
+
+ if (walker.workType != null) {
+ if (LOG.isInfoEnabled()) {
+ LOG.info("CC: found " + walker.workType + " in " + base);
+ }
+ page.putToMetadata(new Utf8(CreativeCommons.WORK_TYPE),
+ ByteBuffer.wrap(walker.workType.getBytes()));
+ }
+
+ }
+
+ /** Scan the document looking for RDF in comments and license elements. */
+ private void walk(Node node) {
+
+ // check element nodes for license URL
+ if (node instanceof Element) {
+ findLicenseUrl((Element) node);
+ }
+
+ // check comment nodes for license RDF
+ if (node instanceof Comment) {
+ findRdf(((Comment) node).getData());
+ }
+
+ // recursively walk child nodes
+ NodeList children = node.getChildNodes();
+ for (int i = 0; children != null && i < children.getLength(); i++) {
+ walk(children.item(i));
+ }
+ }
+
+ /**
+ * Extract license url from element, if any. These are the href attribute
+ * of anchor elements with rel="license". These must also point to
+ * http://creativecommons.org/licenses/.
+ */
+ private void findLicenseUrl(Element element) {
+ // only look in Anchor elements
+ if (!"a".equalsIgnoreCase(element.getTagName()))
+ return;
+
+ // require an href
+ String href = element.getAttribute("href");
+ if (href == null)
+ return;
+
+ try {
+ URL url = new URL(base, href); // resolve the url
+
+ // check that it's a CC license URL
+ if ("http".equalsIgnoreCase(url.getProtocol())
+ && "creativecommons.org"
+ .equalsIgnoreCase(url.getHost())
+ && url.getPath() != null
+ && url.getPath().startsWith("/licenses/")
+ && url.getPath().length() > "/licenses/".length()) {
+
+ // check rel="license"
+ String rel = element.getAttribute("rel");
+ if (rel != null && "license".equals(rel)
+ && this.relLicense == null) {
+ this.relLicense = url; // found rel license
+ } else if (this.anchorLicense == null) {
+ this.anchorLicense = url; // found anchor license
+ }
+ }
+ } catch (MalformedURLException e) { // ignore malformed urls
+ }
+ }
+
+ /** Configure a namespace aware XML parser. */
+ private static final DocumentBuilderFactory FACTORY = DocumentBuilderFactory
+ .newInstance();
+ static {
+ FACTORY.setNamespaceAware(true);
+ }
+
+ /** Creative Commons' namespace URI. */
+ private static final String CC_NS = "http://web.resource.org/cc/";
+
+ /** Dublin Core namespace URI. */
+ private static final String DC_NS = "http://purl.org/dc/elements/1.1/";
+
+ /** RDF syntax namespace URI. */
+ private static final String RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
+
+ private void findRdf(String comment) {
+ // first check for likely RDF in comment
+ int rdfPosition = comment.indexOf("RDF");
+ if (rdfPosition < 0)
+ return; // no RDF, abort
+ int nsPosition = comment.indexOf(CC_NS);
+ if (nsPosition < 0)
+ return; // no RDF, abort
+
+ // try to parse the XML
+ Document doc;
+ try {
+ DocumentBuilder parser = FACTORY.newDocumentBuilder();
+ doc = parser.parse(new InputSource(new StringReader(comment)));
+ } catch (Exception e) {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("CC: Failed to parse RDF in " + base + ": " + e);
+ }
+ // e.printStackTrace();
+ return;
+ }
+
+ // check that root is rdf:RDF
+ NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF");
+ if (roots.getLength() != 1) {
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("CC: No RDF root in " + base);
+ }
+ return;
+ }
+ Element rdf = (Element) roots.item(0);
+
+ // get cc:License nodes inside rdf:RDF
+ NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License");
+ for (int i = 0; i < licenses.getLength(); i++) {
+
+ Element l = (Element) licenses.item(i);
+
+ // license is rdf:about= attribute from cc:License
+ this.rdfLicense = l.getAttributeNodeNS(RDF_NS, "about")
+ .getValue();
+
+ // walk predicates of cc:License
+ NodeList predicates = l.getChildNodes();
+ for (int j = 0; j < predicates.getLength(); j++) {
+ Node predicateNode = predicates.item(j);
+ if (!(predicateNode instanceof Element))
+ continue;
+ Element predicateElement = (Element) predicateNode;
+
+ // extract predicates of cc:xxx predicates
+ if (!CC_NS.equals(predicateElement.getNamespaceURI())) {
+ continue;
+ }
+ String predicate = predicateElement.getLocalName();
+
+ // object is rdf:resource from cc:xxx predicates
+ String object = predicateElement.getAttributeNodeNS(RDF_NS,
+ "resource").getValue();
+
+ // add object and predicate to metadata
+ // metadata.put(object, predicate);
+ // if (LOG.isInfoEnabled()) {
+ // LOG.info("CC: found: "+predicate+"="+object);
+ // }
+ }
+ }
+
+ // get cc:Work nodes from rdf:RDF
+ NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work");
+ for (int i = 0; i < works.getLength(); i++) {
+ Element l = (Element) works.item(i);
+
+ // get dc:type nodes from cc:Work
+ NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type");
+ for (int j = 0; j < types.getLength(); j++) {
+ Element type = (Element) types.item(j);
+ String workUri = type
+ .getAttributeNodeNS(RDF_NS, "resource").getValue();
+ this.workType = (String) WORK_TYPE_NAMES.get(workUri);
+ break;
+ }
+ }
+ }
+ }
+
+ private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
+
+ static {
+ FIELDS.add(WebPage.Field.BASE_URL);
+ FIELDS.add(WebPage.Field.METADATA);
+ }
+
+ private static final HashMap<String,String> WORK_TYPE_NAMES = new HashMap<String,String>();
+ static {
+ WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video");
+ WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image");
+ WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio");
+ WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text");
+ WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive",
+ "interactive");
+ WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software");
+ WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
+ }
+
+ private Configuration conf;
+
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ }
+
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+ @Override
+ public Collection<Field> getFields() {
+ return FIELDS;
+ }
+
+ /**
+ * Adds metadata or otherwise modifies a parse of an HTML document, given
+ * the DOM tree of a page.
+ */
+ @Override
+ public Parse filter(String url, WebPage page, Parse parse,
+ HTMLMetaTags metaTags, DocumentFragment doc) {
+ // construct base url
+ URL base;
+ try {
+ base = new URL(page.getBaseUrl().toString());
+ // extract license metadata
+ Walker.walk(doc, base, page, getConf());
+ } catch (Exception e) {
+ LOG.error("Error parsing " + url, e);
+ return ParseStatusUtils.getEmptyParse(e, getConf());
+ }
- /** Walks DOM tree, looking for RDF in comments and licenses in anchors.*/
- public static class Walker {
- private URL base; // base url of page
- private String rdfLicense; // subject url found, if any
- private URL relLicense; // license url found, if any
- private URL anchorLicense; // anchor url found, if any
- private String workType; // work type URI
-
- private Walker(URL base) {
- this.base = base;
- }
-
- /** Scan the document adding attributes to metadata.*/
- public static void walk(Node doc, URL base, Metadata metadata, Configuration conf)
- throws ParseException {
-
- // walk the DOM tree, scanning for license data
- Walker walker = new Walker(base);
- walker.walk(doc);
-
- // interpret results of walk
- String licenseUrl = null;
- String licenseLocation = null;
- if (walker.rdfLicense != null) { // 1st choice: subject in RDF
- licenseLocation = "rdf";
- licenseUrl = walker.rdfLicense;
- } else if (walker.relLicense != null) { // 2nd: anchor w/ rel=license
- licenseLocation = "rel";
- licenseUrl = walker.relLicense.toString();
- } else if (walker.anchorLicense != null) { // 3rd: anchor w/ CC license
- licenseLocation = "a";
- licenseUrl = walker.anchorLicense.toString();
- } else if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) {
- throw new ParseException("No CC license. Excluding.");
- }
-
- // add license to metadata
- if (licenseUrl != null) {
- if (LOG.isInfoEnabled()) {
- LOG.info("CC: found "+licenseUrl+" in "+licenseLocation+" of "+base);
- }
- metadata.add(CreativeCommons.LICENSE_URL, licenseUrl);
- metadata.add(CreativeCommons.LICENSE_LOCATION, licenseLocation);
- }
-
- if (walker.workType != null) {
- if (LOG.isInfoEnabled()) {
- LOG.info("CC: found "+walker.workType+" in "+base);
- }
- metadata.add(CreativeCommons.WORK_TYPE, walker.workType);
- }
-
- }
-
- /** Scan the document looking for RDF in comments and license elements.*/
- private void walk(Node node) {
-
- // check element nodes for license URL
- if (node instanceof Element) {
- findLicenseUrl((Element)node);
- }
-
- // check comment nodes for license RDF
- if (node instanceof Comment) {
- findRdf(((Comment)node).getData());
- }
-
- // recursively walk child nodes
- NodeList children = node.getChildNodes();
- for (int i = 0; children != null && i < children.getLength(); i++ ) {
- walk(children.item(i));
- }
- }
-
- /** Extract license url from element, if any. Thse are the href attribute
- * of anchor elements with rel="license". These must also point to
- * http://creativecommons.org/licenses/. */
- private void findLicenseUrl(Element element) {
- // only look in Anchor elements
- if (!"a".equalsIgnoreCase(element.getTagName()))
- return;
-
- // require an href
- String href = element.getAttribute("href");
- if (href == null)
- return;
-
- try {
- URL url = new URL(base, href); // resolve the url
-
- // check that it's a CC license URL
- if ("http".equalsIgnoreCase(url.getProtocol()) &&
- "creativecommons.org".equalsIgnoreCase(url.getHost()) &&
- url.getPath() != null &&
- url.getPath().startsWith("/licenses/") &&
- url.getPath().length() > "/licenses/".length()) {
-
- // check rel="license"
- String rel = element.getAttribute("rel");
- if (rel != null && "license".equals(rel) && this.relLicense == null) {
- this.relLicense = url; // found rel license
- } else if (this.anchorLicense == null) {
- this.anchorLicense = url; // found anchor license
- }
- }
- } catch (MalformedURLException e) { // ignore malformed urls
- }
- }
-
- /** Configure a namespace aware XML parser. */
- private static final DocumentBuilderFactory FACTORY
- = DocumentBuilderFactory.newInstance();
- static {
- FACTORY.setNamespaceAware(true);
- }
-
- /** Creative Commons' namespace URI. */
- private static final String CC_NS = "http://web.resource.org/cc/";
-
- /** Dublin Core namespace URI. */
- private static final String DC_NS = "http://purl.org/dc/elements/1.1/";
-
- /** RDF syntax namespace URI. */
- private static final String RDF_NS
- = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
-
- private void findRdf(String comment) {
- // first check for likely RDF in comment
- int rdfPosition = comment.indexOf("RDF");
- if (rdfPosition < 0)
- return; // no RDF, abort
- int nsPosition = comment.indexOf(CC_NS);
- if (nsPosition < 0)
- return; // no RDF, abort
-
- // try to parse the XML
- Document doc;
- try {
- DocumentBuilder parser = FACTORY.newDocumentBuilder();
- doc = parser.parse(new InputSource(new StringReader(comment)));
- } catch (Exception e) {
- if (LOG.isWarnEnabled()) {
- LOG.warn("CC: Failed to parse RDF in "+base+": "+e);
- }
- //e.printStackTrace();
- return;
- }
-
- // check that root is rdf:RDF
- NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF");
- if (roots.getLength() != 1) {
- if (LOG.isWarnEnabled()) { LOG.warn("CC: No RDF root in "+base); }
- return;
- }
- Element rdf = (Element)roots.item(0);
-
- // get cc:License nodes inside rdf:RDF
- NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License");
- for (int i = 0; i < licenses.getLength(); i++) {
-
- Element l = (Element)licenses.item(i);
-
- // license is rdf:about= attribute from cc:License
- this.rdfLicense = l.getAttributeNodeNS(RDF_NS,"about").getValue();
-
- // walk predicates of cc:License
- NodeList predicates = l.getChildNodes();
- for (int j = 0; j < predicates.getLength(); j++) {
- Node predicateNode = predicates.item(j);
- if (!(predicateNode instanceof Element))
- continue;
- Element predicateElement = (Element)predicateNode;
-
- // extract predicates of cc:xxx predicates
- if (!CC_NS.equals(predicateElement.getNamespaceURI())) {
- continue;
- }
- String predicate = predicateElement.getLocalName();
-
- // object is rdf:resource from cc:xxx predicates
- String object =
- predicateElement.getAttributeNodeNS(RDF_NS, "resource").getValue();
-
- // add object and predicate to metadata
- // metadata.put(object, predicate);
- // if (LOG.isInfoEnabled()) {
- // LOG.info("CC: found: "+predicate+"="+object);
- // }
- }
- }
-
- // get cc:Work nodes from rdf:RDF
- NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work");
- for (int i = 0; i < works.getLength(); i++) {
- Element l = (Element)works.item(i);
-
- // get dc:type nodes from cc:Work
- NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type");
- for (int j = 0; j < types.getLength(); j++) {
- Element type = (Element)types.item(j);
- String workUri =
- type.getAttributeNodeNS(RDF_NS, "resource").getValue();
- this.workType = (String)WORK_TYPE_NAMES.get(workUri);
- break;
- }
- }
- }
- }
-
- private static final HashMap WORK_TYPE_NAMES = new HashMap();
- static {
- WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video");
- WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image");
- WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio");
- WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text");
- WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive", "interactive");
- WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software");
- WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
- }
-
- private Configuration conf;
-
- /** Adds metadata or otherwise modifies a parse of an HTML document, given
- * the DOM tree of a page. */
- public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
-
- // get parse obj
- Parse parse = parseResult.get(content.getUrl());
-
- // construct base url
- URL base;
- try {
- base = new URL(content.getBaseUrl());
- } catch (MalformedURLException e) {
- Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf());
- parseResult.put(content.getUrl(),
- new ParseText(emptyParse.getText()),
- emptyParse.getData());
- return parseResult;
- }
-
- try {
- // extract license metadata
- Walker.walk(doc, base, parse.getData().getParseMeta(), getConf());
- } catch (ParseException e) {
- Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf());
- parseResult.put(content.getUrl(),
- new ParseText(emptyParse.getText()),
- emptyParse.getData());
- return parseResult;
- }
-
- return parseResult;
- }
-
- public void setConf(Configuration conf) {
- this.conf = conf;
- }
-
- public Configuration getConf() {
- return this.conf;
- }
+ return parse;
+ }
}
Modified: nutch/branches/nutchbase/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=965787&r1=965786&r2=965787&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original)
+++ nutch/branches/nutchbase/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Tue Jul 20 10:34:18 2010
@@ -21,57 +21,71 @@ import org.apache.nutch.metadata.Metadat
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
+import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.util.NutchConfiguration;
+import org.apache.tika.mime.MimeType;
import java.util.Properties;
import java.io.*;
import java.net.URL;
+import java.nio.ByteBuffer;
import junit.framework.TestCase;
public class TestCCParseFilter extends TestCase {
- private static final File testDir =
- new File(System.getProperty("test.input"));
+ /** Directory the HTML fixtures are loaded from; its path is read from the "test.input" system property. */
+ private static final File testDir = new File(System.getProperty("test.input"));
- public void testPages() throws Exception {
- pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
- "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
- // Tika returns <a> whereas parse-html returns <rel>
- // check later
- pageTest(new File(testDir, "rel.html"), "http://foo.com/",
- "http://creativecommons.org/licenses/by-nc/2.0", "a", null);
- // Tika returns <a> whereas parse-html returns <rdf>
- // check later
- pageTest(new File(testDir, "rdf.html"), "http://foo.com/",
- "http://creativecommons.org/licenses/by-nc/1.0", "a", null);
- }
-
- public void pageTest(File file, String url,
- String license, String location, String type)
- throws Exception {
-
- String contentType = "text/html";
- InputStream in = new FileInputStream(file);
- ByteArrayOutputStream out = new ByteArrayOutputStream((int)file.length());
- byte[] buffer = new byte[1024];
- int i;
- while ((i = in.read(buffer)) != -1) {
- out.write(buffer, 0, i);
- }
- in.close();
- byte[] bytes = out.toByteArray();
- Configuration conf = NutchConfiguration.create();
-
- Content content =
- new Content(url, url, bytes, contentType, new Metadata(), conf);
- Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
-
- Metadata metadata = parse.getData().getParseMeta();
- assertEquals(license, metadata.get("License-Url"));
- assertEquals(location, metadata.get("License-Location"));
- assertEquals(type, metadata.get("Work-Type"));
- }
+ public void testPages() throws Exception {
+ pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
+ "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
+ // Tika returns <a> whereas parse-html returns <rel>
+ // check later
+ pageTest(new File(testDir, "rel.html"), "http://foo.com/",
+ "http://creativecommons.org/licenses/by-nc/2.0", "a", null);
+ // Tika returns <a> whereas parse-html returns <rdf>
+ // check later
+ pageTest(new File(testDir, "rdf.html"), "http://foo.com/",
+ "http://creativecommons.org/licenses/by-nc/1.0", "a", null);
+ }
+
+ public void pageTest(File file, String url, String license,
+ String location, String type) throws Exception {
+
+ String contentType = "text/html";
+ InputStream in = new FileInputStream(file);
+ ByteArrayOutputStream out = new ByteArrayOutputStream(
+ (int) file.length());
+ byte[] buffer = new byte[1024];
+ int i;
+ while ((i = in.read(buffer)) != -1) {
+ out.write(buffer, 0, i);
+ }
+ in.close();
+ byte[] bytes = out.toByteArray();
+ Configuration conf = NutchConfiguration.create();
+
+ WebPage page = new WebPage();
+ page.setBaseUrl(new Utf8(url));
+ page.setContent(ByteBuffer.wrap(bytes));
+ MimeUtil mimeutil = new MimeUtil(conf);
+ MimeType mtype = mimeutil.getMimeType(file);
+ page.setContentType(new Utf8(mtype.getName()));
+
+ new ParseUtil(conf).parse(url, page);
+
+ ByteBuffer bb = page.getFromMetadata(new Utf8("License-Url"));
+ assertEquals(license, new String(bb.array()));
+ bb = page.getFromMetadata(new Utf8("License-Location"));
+ assertEquals(location, new String(bb.array()));
+ bb = page.getFromMetadata(new Utf8("Work-Type"));
+ if (bb == null)
+ assertEquals(type, null);
+ else
+ assertEquals(type, new String(bb.array()));
+ }
}
-
Modified: nutch/branches/nutchbase/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java?rev=965787&r1=965786&r2=965787&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java (original)
+++ nutch/branches/nutchbase/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java Tue Jul 20 10:34:18 2010
@@ -17,85 +17,81 @@
package org.apache.nutch.scoring.link;
import java.util.Collection;
+import java.util.HashSet;
import java.util.List;
-import java.util.Map.Entry;
+import java.util.Set;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoreDatum;
import org.apache.nutch.scoring.ScoringFilter;
import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.storage.WebPage;
-public class LinkAnalysisScoringFilter
- implements ScoringFilter {
+public class LinkAnalysisScoringFilter implements ScoringFilter {
- private Configuration conf;
- private float scoreInjected = 0.001f;
- private float normalizedScore = 1.00f;
-
- public LinkAnalysisScoringFilter() {
-
- }
-
- public Configuration getConf() {
- return conf;
- }
-
- public void setConf(Configuration conf) {
- this.conf = conf;
- normalizedScore = conf.getFloat("link.analyze.normalize.score", 1.00f);
- scoreInjected = conf.getFloat("link.analyze.injected.score", 1.00f);
- }
-
- public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
- ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
- CrawlDatum adjust, int allCount)
- throws ScoringFilterException {
- return adjust;
- }
-
- public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
- throws ScoringFilterException {
- return datum.getScore() * initSort;
- }
-
- public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
- CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
- throws ScoringFilterException {
- return (normalizedScore * dbDatum.getScore());
- }
-
- public void initialScore(Text url, CrawlDatum datum)
- throws ScoringFilterException {
- datum.setScore(0.0f);
- }
-
- public void injectedScore(Text url, CrawlDatum datum)
- throws ScoringFilterException {
- datum.setScore(scoreInjected);
- }
-
- public void passScoreAfterParsing(Text url, Content content, Parse parse)
- throws ScoringFilterException {
- parse.getData().getContentMeta().set(Nutch.SCORE_KEY,
- content.getMetadata().get(Nutch.SCORE_KEY));
- }
-
- public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
- throws ScoringFilterException {
- content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore());
- }
-
- public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
- List<CrawlDatum> inlinked)
- throws ScoringFilterException {
- // nothing to do
- }
+ /** Configuration most recently passed to setConf. */
+ private Configuration conf;
+
+ // Score given to injected pages.
+ // NOTE(review): this initial value (0.001f) differs from the 1.00f fallback
+ // that setConf uses for "link.analyze.injected.score" — confirm which one
+ // is intended.
+ private float scoreInjected = 0.001f;
+
+ /** Multiplier applied to the page score in indexerScore. */
+ private float normalizedScore = 1.00f;
+
+ /** WebPage fields reported by getFields. */
+ private static final Set<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
+
+ static {
+   FIELDS.add(WebPage.Field.METADATA);
+   FIELDS.add(WebPage.Field.SCORE);
+ }
+
+ /** Creates a filter with the field defaults; configured values are applied by setConf. */
+ public LinkAnalysisScoringFilter() {
+   // No construction-time work.
+ }
+
+ /**
+  * @return the configuration last passed to setConf, or {@code null} if
+  *         none has been set
+  */
+ public Configuration getConf() {
+   return this.conf;
+ }
+
+ /**
+  * Stores the configuration and reads the two score settings from it.
+  * Both "link.analyze.normalize.score" and "link.analyze.injected.score"
+  * fall back to 1.00f when not configured.
+  *
+  * @param conf configuration to read the settings from
+  */
+ public void setConf(Configuration conf) {
+   this.conf = conf;
+   this.normalizedScore = conf.getFloat("link.analyze.normalize.score", 1.00f);
+   this.scoreInjected = conf.getFloat("link.analyze.injected.score", 1.00f);
+ }
+
+ /**
+  * @return the shared set of WebPage fields declared by this filter
+  *         (METADATA and SCORE); the same set instance is returned on every call
+  */
+ @Override
+ public Collection<WebPage.Field> getFields() {
+   final Collection<WebPage.Field> fields = FIELDS;
+   return fields;
+ }
+
+ /**
+  * Sets the page's score to the injected-score value held by this filter.
+  *
+  * @param url  URL of the page (not used here)
+  * @param page page whose score is set
+  */
+ @Override
+ public void injectedScore(String url, WebPage page) throws ScoringFilterException {
+   final float injected = this.scoreInjected;
+   page.setScore(injected);
+ }
+
+ /**
+  * Sets the page's score to zero.
+  *
+  * @param url  URL of the page (not used here)
+  * @param page page whose score is reset
+  */
+ @Override
+ public void initialScore(String url, WebPage page) throws ScoringFilterException {
+   final float zeroScore = 0.0f;
+   page.setScore(zeroScore);
+ }
+
+ /**
+  * Computes the sort value as the page's score multiplied by the incoming
+  * sort value.
+  *
+  * @param url      URL of the page (not used here)
+  * @param page     page whose score is read
+  * @param initSort incoming sort value
+  * @return {@code page.getScore() * initSort}
+  */
+ @Override
+ public float generatorSortValue(String url, WebPage page, float initSort)
+     throws ScoringFilterException {
+   final float pageScore = page.getScore();
+   return pageScore * initSort;
+ }
+
+ /**
+  * Does nothing: neither the page nor the supplied score data is modified.
+  *
+  * @param fromUrl   URL of the source page (not used)
+  * @param page      source page (not used)
+  * @param scoreData score data for the outlinks (not used)
+  * @param allCount  outlink count supplied by the caller (not used)
+  */
+ @Override
+ public void distributeScoreToOutlinks(String fromUrl, WebPage page,
+     Collection<ScoreDatum> scoreData, int allCount) throws ScoringFilterException {
+   // nothing to do
+ }
+
+ @Override
+ public void updateScore(String url, WebPage page,
+ List<ScoreDatum> inlinkedScoreData) throws ScoringFilterException {
+ }
+
+ @Override
+ public float indexerScore(String url, NutchDocument doc, WebPage page,
+ float initScore) throws ScoringFilterException {
+ return (normalizedScore * page.getScore());
+ }
}