You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2010/07/20 12:34:18 UTC
svn commit: r965787 - in /nutch/branches/nutchbase/src/plugin: ./ creativecommons/src/java/org/creativecommons/nutch/ creativecommons/src/test/org/creativecommons/nutch/ scoring-link/src/java/org/apache/nutch/scoring/link/

Author: jnioche
Date: Tue Jul 20 10:34:18 2010
New Revision: 965787

URL: http://svn.apache.org/viewvc?rev=965787&view=rev
Log:
Ported Scoring-link + creative commons to new API

Modified:
    nutch/branches/nutchbase/src/plugin/build.xml
    nutch/branches/nutchbase/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
    nutch/branches/nutchbase/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
    nutch/branches/nutchbase/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
    nutch/branches/nutchbase/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java

Modified: nutch/branches/nutchbase/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/build.xml?rev=965787&r1=965786&r2=965787&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/plugin/build.xml (original)
+++ nutch/branches/nutchbase/src/plugin/build.xml Tue Jul 20 10:34:18 2010
@@ -26,6 +26,7 @@
   <!-- Build & deploy all the plugin jars.                    -->
   <!-- ====================================================== -->
   <target name="deploy">
+     <ant dir="creativecommons" target="deploy"/>
      <ant dir="index-anchor" target="deploy"/>
      <ant dir="index-basic" target="deploy"/>
      <ant dir="index-more" target="deploy"/>
@@ -41,6 +42,7 @@
      <ant dir="parse-js" target="deploy"/>
      <ant dir="parse-rss" target="deploy"/>
      <ant dir="parse-tika" target="deploy"/>
+     <ant dir="scoring-link" target="deploy"/>
      <ant dir="scoring-opic" target="deploy"/>
      <ant dir="subcollection" target="deploy"/>
      <ant dir="tld" target="deploy"/>
@@ -54,14 +56,12 @@
      <ant dir="urlnormalizer-pass" target="deploy"/>
      <ant dir="urlnormalizer-regex" target="deploy"/>
      <!--
-     <ant dir="creativecommons" target="deploy"/>
      <ant dir="feed" target="deploy"/>
      <ant dir="microformats-reltag" target="deploy"/>
      <ant dir="parse-ext" target="deploy"/>
      <ant dir="parse-swf" target="deploy"/>
      <ant dir="parse-zip" target="deploy"/>
      <ant dir="protocol-httpclient" target="deploy"/>
-     <ant dir="scoring-link" target="deploy"/>
      -->
   </target>
 
@@ -69,7 +69,7 @@
   <!-- Test all of the plugins.                               -->
   <!-- ====================================================== -->
   <target name="test">
-    <parallel threadCount="2">
+     <ant dir="creativecommons" target="test"/>
      <ant dir="parse-rss" target="test"/>
      <ant dir="parse-tika" target="test"/>
      <ant dir="protocol-file" target="test"/>
@@ -87,14 +87,12 @@
      <ant dir="lib-http" target="test"/>
      <ant dir="subcollection" target="test"/>
      <!--
-     <ant dir="creativecommons" target="test"/>
      <ant dir="feed" target="test"/>
      <ant dir="parse-ext" target="test"/>
      <ant dir="parse-swf" target="test"/>
      <ant dir="parse-zip" target="test"/>
      <ant dir="protocol-httpclient" target="test"/>
       -->
-    </parallel>
   </target>
 
   <!-- ====================================================== -->

Modified: nutch/branches/nutchbase/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java?rev=965787&r1=965786&r2=965787&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java (original)
+++ nutch/branches/nutchbase/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java Tue Jul 20 10:34:18 2010
@@ -17,105 +17,121 @@
 
 package org.creativecommons.nutch;
 
-import org.apache.nutch.metadata.CreativeCommons;
-
-import org.apache.nutch.parse.Parse;
-
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.hadoop.io.Text;
-
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.metadata.Metadata;
-
-import org.apache.hadoop.conf.Configuration;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.StringTokenizer;
 
+import org.apache.avro.util.Utf8;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-
-import java.util.*;
-import java.net.URL;
-import java.net.MalformedURLException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.CreativeCommons;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.storage.WebPage.Field;
 
 /** Adds basic searchable fields to a document. */
 public class CCIndexingFilter implements IndexingFilter {
-  public static final Log LOG = LogFactory.getLog(CCIndexingFilter.class);
+	public static final Log LOG = LogFactory.getLog(CCIndexingFilter.class);
+
+	/** The name of the document field we use. */
+	public static String FIELD = "cc";
+
+	private Configuration conf;
 
-  /** The name of the document field we use. */
-  public static String FIELD = "cc";
+	private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
 
-  private Configuration conf;
+	static {
+		FIELDS.add(WebPage.Field.BASE_URL);
+		FIELDS.add(WebPage.Field.METADATA);
+	}
+
+	/**
+	 * Add the features represented by a license URL. Urls are of the form
+	 * "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a
+	 * license feature.
+	 */
+	public void addUrlFeatures(NutchDocument doc, String urlString) {
+		try {
+			URL url = new URL(urlString);
+
+			// tokenize the path of the url, breaking at slashes and dashes
+			StringTokenizer names = new StringTokenizer(url.getPath(), "/-");
+
+			if (names.hasMoreTokens())
+				names.nextToken(); // throw away "licenses"
+
+			// add a feature per component after "licenses"
+			while (names.hasMoreTokens()) {
+				String feature = names.nextToken();
+				addFeature(doc, feature);
+			}
+		} catch (MalformedURLException e) {
+			if (LOG.isWarnEnabled()) {
+				LOG.warn("CC: failed to parse url: " + urlString + " : " + e);
+			}
+		}
+	}
+
+	private void addFeature(NutchDocument doc, String feature) {
+		doc.add(FIELD, feature);
+	}
+
+	public void setConf(Configuration conf) {
+		this.conf = conf;
+	}
+
+	public Configuration getConf() {
+		return this.conf;
+	}
+
+	@Override
+	public Collection<Field> getFields() {
+		return FIELDS;
+	}
+
+	/**
+	 * Adds the Creative Commons data stored in the page metadata to the
+	 * document, all under the {@link #FIELD} field: the license URL as
+	 * "license=xxx" plus one feature per component of its path, the license
+	 * location as "meta=xxx", and the work type as a bare value.
+	 *
+	 * @param doc
+	 *            the document being indexed
+	 * @param url
+	 *            the URL of the page; only used in the log message
+	 * @param page
+	 *            the page whose metadata is read
+	 * @return the document that was passed in
+	 */
+	@Override
+	public NutchDocument filter(NutchDocument doc, String url, WebPage page)
+			throws IndexingException {
+
+		String licenseUrl = metadataValue(page, CreativeCommons.LICENSE_URL);
+		if (licenseUrl != null) {
+			if (LOG.isInfoEnabled()) {
+				LOG.info("CC: indexing " + licenseUrl + " for: " + url);
+			}
+
+			// add the entire license as cc:license=xxx
+			addFeature(doc, "license=" + licenseUrl);
+
+			// index license attributes extracted from the license url
+			addUrlFeatures(doc, licenseUrl);
+		}
+
+		// index the license location as cc:meta=xxx
+		String licenseLocation = metadataValue(page,
+				CreativeCommons.LICENSE_LOCATION);
+		if (licenseLocation != null) {
+			addFeature(doc, "meta=" + licenseLocation);
+		}
+
+		// index the work type as a bare cc value (no "type=" prefix is added)
+		String workType = metadataValue(page, CreativeCommons.WORK_TYPE);
+		if (workType != null) {
+			addFeature(doc, workType);
+		}
 
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
-    throws IndexingException {
-    
-    Metadata metadata = parse.getData().getParseMeta();
-    // index the license
-    String licenseUrl = metadata.get(CreativeCommons.LICENSE_URL);
-    if (licenseUrl != null) {
-      if (LOG.isInfoEnabled()) {
-        LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString());
-      }
-
-      // add the entire license as cc:license=xxx
-      addFeature(doc, "license=" + licenseUrl);
-
-      // index license attributes extracted of the license url
-      addUrlFeatures(doc, licenseUrl);
-    }
-
-    // index the license location as cc:meta=xxx
-    String licenseLocation = metadata.get(CreativeCommons.LICENSE_LOCATION);
-    if (licenseLocation != null) {
-      addFeature(doc, "meta=" + licenseLocation);
-    }
-
-    // index the work type cc:type=xxx
-    String workType = metadata.get(CreativeCommons.WORK_TYPE);
-    if (workType != null) {
-      addFeature(doc, workType);
-    }
-
-    return doc;
-  }
-
-  /** Add the features represented by a license URL.  Urls are of the form
-   * "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a
-   * license feature. */
-  public void addUrlFeatures(NutchDocument doc, String urlString) {
-    try {
-      URL url = new URL(urlString);
-
-      // tokenize the path of the url, breaking at slashes and dashes
-      StringTokenizer names = new StringTokenizer(url.getPath(), "/-");
-
-      if (names.hasMoreTokens())
-        names.nextToken();                        // throw away "licenses"
-
-      // add a feature per component after "licenses"
-      while (names.hasMoreTokens()) {
-        String feature = names.nextToken();
-        addFeature(doc, feature);
-      }
-    } catch (MalformedURLException e) {
-      if (LOG.isWarnEnabled()) {
-        LOG.warn("CC: failed to parse url: " + urlString + " : " + e);
-      }
-    }
-  }
-  
-  private void addFeature(NutchDocument doc, String feature) {
-    doc.add(FIELD, feature);
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
+		return doc;
+	}
+
+	/**
+	 * Reads one metadata entry of the page and decodes it as UTF-8 text.
+	 * <p>
+	 * Only the bytes between the buffer's position and limit are decoded;
+	 * ByteBuffer.array() would return the whole backing array, which may be
+	 * larger than the value. A duplicate is decoded so that the position of
+	 * the stored buffer is left untouched.
+	 *
+	 * @return the decoded value, or null when the page has no such entry
+	 */
+	private static String metadataValue(WebPage page, String key) {
+		ByteBuffer value = page.getFromMetadata(new Utf8(key));
+		if (value == null) {
+			return null;
+		}
+		return java.nio.charset.Charset.forName("UTF-8")
+				.decode(value.duplicate()).toString();
+	}
 
 }

Modified: nutch/branches/nutchbase/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java?rev=965787&r1=965786&r2=965787&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java (original)
+++ nutch/branches/nutchbase/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java Tue Jul 20 10:34:18 2010
@@ -17,289 +17,315 @@
 
 package org.creativecommons.nutch;
 
-import org.apache.nutch.metadata.CreativeCommons;
-import org.apache.nutch.parse.*;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
+import java.io.StringReader;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
 
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import org.apache.avro.util.Utf8;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-
-import java.util.*;
-import java.io.*;
-import java.net.*;
-import javax.xml.parsers.*;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.CreativeCommons;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseStatusUtils;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.storage.WebPage.Field;
+import org.w3c.dom.Comment;
+import org.w3c.dom.Document;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
 import org.xml.sax.InputSource;
-import org.w3c.dom.*;
-
 
 /** Adds metadata identifying the Creative Commons license used, if any. */
 public class CCParseFilter implements HtmlParseFilter {
-  public static final Log LOG = LogFactory.getLog(CCParseFilter.class);
+	public static final Log LOG = LogFactory.getLog(CCParseFilter.class);
 
+	/** Walks DOM tree, looking for RDF in comments and licenses in anchors. */
+	public static class Walker {
+		private URL base; // base url of page
+		private String rdfLicense; // subject url found, if any
+		private URL relLicense; // license url found, if any
+		private URL anchorLicense; // anchor url found, if any
+		private String workType; // work type URI
+
+		private Walker(URL base) {
+			this.base = base;
+		}
+
+		/**
+		 * Scans the document for license data and stores what was found in the
+		 * metadata of the page.
+		 * <p>
+		 * The license is taken, in order of preference, from RDF found in a
+		 * comment ("rdf"), from an anchor with rel="license" ("rel"), or from
+		 * any other anchor pointing at a Creative Commons license ("a"). The
+		 * license URL, the place it was found in and the work type are each
+		 * stored only when present, encoded as UTF-8.
+		 *
+		 * @param doc
+		 *            root of the DOM tree to scan
+		 * @param base
+		 *            base URL of the page, used to resolve relative links
+		 * @param page
+		 *            page whose metadata receives the results
+		 * @param conf
+		 *            configuration; "creativecommons.exclude.unlicensed" is
+		 *            read from it
+		 * @throws ParseException
+		 *             if no license was found and
+		 *             "creativecommons.exclude.unlicensed" is true
+		 */
+		public static void walk(Node doc, URL base, WebPage page,
+				Configuration conf) throws ParseException {
+
+			// walk the DOM tree, scanning for license data
+			Walker walker = new Walker(base);
+			walker.walk(doc);
+
+			// interpret results of walk
+			String licenseUrl = null;
+			String licenseLocation = null;
+			if (walker.rdfLicense != null) { // 1st choice: subject in RDF
+				licenseLocation = "rdf";
+				licenseUrl = walker.rdfLicense;
+			} else if (walker.relLicense != null) { // 2nd: anchor w/
+													// rel=license
+				licenseLocation = "rel";
+				licenseUrl = walker.relLicense.toString();
+			} else if (walker.anchorLicense != null) { // 3rd: anchor w/ CC
+														// license
+				licenseLocation = "a";
+				licenseUrl = walker.anchorLicense.toString();
+			} else if (conf.getBoolean("creativecommons.exclude.unlicensed",
+					false)) {
+				throw new ParseException("No CC license.  Excluding.");
+			}
+
+			// String.getBytes() without a charset uses the platform default,
+			// which would make the stored bytes depend on the machine that
+			// parsed the page; always encode as UTF-8 instead.
+			java.nio.charset.Charset utf8 = java.nio.charset.Charset
+					.forName("UTF-8");
+
+			// add license to metadata
+			if (licenseUrl != null) {
+				if (LOG.isInfoEnabled()) {
+					LOG.info("CC: found " + licenseUrl + " in "
+							+ licenseLocation + " of " + base);
+				}
+				page.putToMetadata(new Utf8(CreativeCommons.LICENSE_URL),
+						ByteBuffer.wrap(licenseUrl.getBytes(utf8)));
+				page.putToMetadata(new Utf8(CreativeCommons.LICENSE_LOCATION),
+						ByteBuffer.wrap(licenseLocation.getBytes(utf8)));
+			}
+
+			if (walker.workType != null) {
+				if (LOG.isInfoEnabled()) {
+					LOG.info("CC: found " + walker.workType + " in " + base);
+				}
+				page.putToMetadata(new Utf8(CreativeCommons.WORK_TYPE),
+						ByteBuffer.wrap(walker.workType.getBytes(utf8)));
+			}
+
+		}
+
+		/** Scan the document looking for RDF in comments and license elements. */
+		private void walk(Node node) {
+
+			// check element nodes for license URL
+			if (node instanceof Element) {
+				findLicenseUrl((Element) node);
+			}
+
+			// check comment nodes for license RDF
+			if (node instanceof Comment) {
+				findRdf(((Comment) node).getData());
+			}
+
+			// recursively walk child nodes
+			NodeList children = node.getChildNodes();
+			for (int i = 0; children != null && i < children.getLength(); i++) {
+				walk(children.item(i));
+			}
+		}
+
+		/**
+		 * Extracts a license url from an element, if any. Only anchor elements
+		 * whose href resolves to a url under
+		 * http://creativecommons.org/licenses/ are considered. The first such
+		 * anchor with rel="license" is remembered as the rel license; the
+		 * first of the others is remembered as the anchor license.
+		 *
+		 * @param element
+		 *            the element to inspect
+		 */
+		private void findLicenseUrl(Element element) {
+			// only look in Anchor elements
+			if (!"a".equalsIgnoreCase(element.getTagName()))
+				return;
+
+			// require an href; DOM getAttribute returns an empty string, not
+			// null, for a missing attribute, and an empty href would resolve
+			// to the base url of the page itself
+			String href = element.getAttribute("href");
+			if (href == null || href.length() == 0)
+				return;
+
+			try {
+				URL url = new URL(base, href); // resolve the url
+
+				// check that it's a CC license URL
+				if ("http".equalsIgnoreCase(url.getProtocol())
+						&& "creativecommons.org"
+								.equalsIgnoreCase(url.getHost())
+						&& url.getPath() != null
+						&& url.getPath().startsWith("/licenses/")
+						&& url.getPath().length() > "/licenses/".length()) {
+
+					// check rel="license"
+					String rel = element.getAttribute("rel");
+					if ("license".equals(rel) && this.relLicense == null) {
+						this.relLicense = url; // found rel license
+					} else if (this.anchorLicense == null) {
+						this.anchorLicense = url; // found anchor license
+					}
+				}
+			} catch (MalformedURLException e) { // ignore malformed urls
+			}
+		}
+
+		/**
+		 * Factory for namespace aware XML parsers. The XML it parses comes
+		 * from comments of crawled pages, so DOCTYPE declarations are refused
+		 * where the parser supports it: without a DOCTYPE no external entity
+		 * can be declared or expanded.
+		 */
+		private static final DocumentBuilderFactory FACTORY = DocumentBuilderFactory
+				.newInstance();
+		static {
+			FACTORY.setNamespaceAware(true);
+			try {
+				FACTORY.setFeature(
+						"http://apache.org/xml/features/disallow-doctype-decl",
+						true);
+			} catch (javax.xml.parsers.ParserConfigurationException e) {
+				// the parser in use does not know this feature; keep going
+				// with the default settings rather than disabling the plugin
+				if (LOG.isWarnEnabled()) {
+					LOG.warn("CC: XML parser cannot disallow DOCTYPE: " + e);
+				}
+			}
+		}
+
+		/** Creative Commons' namespace URI. */
+		private static final String CC_NS = "http://web.resource.org/cc/";
+
+		/** Dublin Core namespace URI. */
+		private static final String DC_NS = "http://purl.org/dc/elements/1.1/";
+
+		/** RDF syntax namespace URI. */
+		private static final String RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
+
+		/**
+		 * Looks for Creative Commons RDF in the text of a comment and, when
+		 * found, records the license url (rdf:about of a cc:License) and the
+		 * work type (looked up from the rdf:resource of the first dc:type of
+		 * a cc:Work). When several cc:License or cc:Work elements are present
+		 * the last one wins.
+		 * <p>
+		 * Text that is not well-formed XML, or that has no single rdf:RDF
+		 * element, is logged and ignored. Elements lacking the expected
+		 * attribute are skipped.
+		 *
+		 * @param comment
+		 *            the text of the comment node
+		 */
+		private void findRdf(String comment) {
+			// first check for likely RDF in comment, to avoid parsing every
+			// comment of the page as XML
+			if (comment.indexOf("RDF") < 0)
+				return; // no RDF, abort
+			if (comment.indexOf(CC_NS) < 0)
+				return; // no RDF, abort
+
+			// try to parse the XML
+			Document doc;
+			try {
+				DocumentBuilder parser = FACTORY.newDocumentBuilder();
+				doc = parser.parse(new InputSource(new StringReader(comment)));
+			} catch (Exception e) {
+				if (LOG.isWarnEnabled()) {
+					LOG.warn("CC: Failed to parse RDF in " + base + ": " + e);
+				}
+				return;
+			}
+
+			// check that root is rdf:RDF
+			NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF");
+			if (roots.getLength() != 1) {
+				if (LOG.isWarnEnabled()) {
+					LOG.warn("CC: No RDF root in " + base);
+				}
+				return;
+			}
+			Element rdf = (Element) roots.item(0);
+
+			// get cc:License nodes inside rdf:RDF
+			NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License");
+			for (int i = 0; i < licenses.getLength(); i++) {
+				Element license = (Element) licenses.item(i);
+
+				// license is rdf:about= attribute from cc:License; a
+				// cc:License without it carries no url and is skipped
+				if (license.hasAttributeNS(RDF_NS, "about")) {
+					this.rdfLicense = license.getAttributeNS(RDF_NS, "about");
+				}
+			}
+
+			// get cc:Work nodes from rdf:RDF
+			NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work");
+			for (int i = 0; i < works.getLength(); i++) {
+				Element work = (Element) works.item(i);
+
+				// only the first dc:type of the cc:Work is considered
+				NodeList types = work.getElementsByTagNameNS(DC_NS, "type");
+				if (types.getLength() == 0)
+					continue;
+				Element type = (Element) types.item(0);
+				if (!type.hasAttributeNS(RDF_NS, "resource"))
+					continue;
+
+				// unknown work type URIs map to null
+				String workUri = type.getAttributeNS(RDF_NS, "resource");
+				this.workType = WORK_TYPE_NAMES.get(workUri);
+			}
+		}
+	}
+
+	private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
+
+	static {
+		FIELDS.add(WebPage.Field.BASE_URL);
+		FIELDS.add(WebPage.Field.METADATA);
+	}
+
+	private static final HashMap<String,String> WORK_TYPE_NAMES = new HashMap<String,String>();
+	static {
+		WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video");
+		WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image");
+		WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio");
+		WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text");
+		WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive",
+				"interactive");
+		WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software");
+		WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
+	}
+
+	private Configuration conf;
+
+	public void setConf(Configuration conf) {
+		this.conf = conf;
+	}
+
+	public Configuration getConf() {
+		return this.conf;
+	}
+
+	@Override
+	public Collection<Field> getFields() {
+		return FIELDS;
+	}
+
+	/**
+	 * Adds metadata or otherwise modifies a parse of an HTML document, given
+	 * the DOM tree of a page.
+	 */
+	@Override
+	public Parse filter(String url, WebPage page, Parse parse,
+			HTMLMetaTags metaTags, DocumentFragment doc) {
+		// construct base url
+		URL base;
+		try {
+			base = new URL(page.getBaseUrl().toString());
+			// extract license metadata
+			Walker.walk(doc, base, page, getConf());
+		} catch (Exception e) {
+			LOG.error("Error parsing " + url, e);
+			return ParseStatusUtils.getEmptyParse(e, getConf());
+		}
 
-  /** Walks DOM tree, looking for RDF in comments and licenses in anchors.*/
-  public static class Walker {
-    private URL base;                             // base url of page
-    private String rdfLicense;                    // subject url found, if any
-    private URL relLicense;                       // license url found, if any
-    private URL anchorLicense;                    // anchor url found, if any
-    private String workType;                      // work type URI
-
-    private Walker(URL base) {
-      this.base = base;
-    }
-
-    /** Scan the document adding attributes to metadata.*/
-    public static void walk(Node doc, URL base, Metadata metadata, Configuration conf)
-      throws ParseException {
-
-      // walk the DOM tree, scanning for license data
-      Walker walker = new Walker(base);
-      walker.walk(doc);
-
-      // interpret results of walk
-      String licenseUrl = null;
-      String licenseLocation = null;
-      if (walker.rdfLicense != null) {            // 1st choice: subject in RDF
-        licenseLocation = "rdf";
-        licenseUrl = walker.rdfLicense;
-      } else if (walker.relLicense != null) {     // 2nd: anchor w/ rel=license
-        licenseLocation = "rel";
-        licenseUrl = walker.relLicense.toString();
-      } else if (walker.anchorLicense != null) {  // 3rd: anchor w/ CC license
-        licenseLocation = "a";
-        licenseUrl = walker.anchorLicense.toString();
-      } else if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) {
-        throw new ParseException("No CC license.  Excluding.");
-      }
-
-      // add license to metadata
-      if (licenseUrl != null) {
-        if (LOG.isInfoEnabled()) {
-          LOG.info("CC: found "+licenseUrl+" in "+licenseLocation+" of "+base);
-        }
-        metadata.add(CreativeCommons.LICENSE_URL, licenseUrl);
-        metadata.add(CreativeCommons.LICENSE_LOCATION, licenseLocation);
-      }
-
-      if (walker.workType != null) {
-        if (LOG.isInfoEnabled()) {
-          LOG.info("CC: found "+walker.workType+" in "+base);
-        }
-        metadata.add(CreativeCommons.WORK_TYPE, walker.workType);
-      }
-
-    }
-
-    /** Scan the document looking for RDF in comments and license elements.*/
-    private void walk(Node node) {
-      
-      // check element nodes for license URL
-      if (node instanceof Element) {
-        findLicenseUrl((Element)node);
-      }
-
-      // check comment nodes for license RDF
-      if (node instanceof Comment) {
-        findRdf(((Comment)node).getData());
-      }
-
-      // recursively walk child nodes
-      NodeList children = node.getChildNodes();
-      for (int i = 0; children != null && i < children.getLength(); i++ ) {
-        walk(children.item(i));
-      }
-    }
-
-    /** Extract license url from element, if any.  Thse are the href attribute
-     * of anchor elements with rel="license".  These must also point to
-     * http://creativecommons.org/licenses/. */
-    private void findLicenseUrl(Element element) {
-      // only look in Anchor elements
-      if (!"a".equalsIgnoreCase(element.getTagName()))
-        return;
-
-      // require an href
-      String href = element.getAttribute("href");
-      if (href == null)
-        return;
-      
-      try {
-        URL url = new URL(base, href);            // resolve the url
-
-        // check that it's a CC license URL
-        if ("http".equalsIgnoreCase(url.getProtocol()) &&
-            "creativecommons.org".equalsIgnoreCase(url.getHost()) &&
-            url.getPath() != null &&
-            url.getPath().startsWith("/licenses/") &&
-            url.getPath().length() > "/licenses/".length()) {
-
-          // check rel="license"
-          String rel = element.getAttribute("rel");
-          if (rel != null && "license".equals(rel) && this.relLicense == null) {
-            this.relLicense = url;                   // found rel license
-          } else if (this.anchorLicense == null) {
-            this.anchorLicense = url;             // found anchor license
-          }
-        }
-      } catch (MalformedURLException e) {         // ignore malformed urls
-      }
-    }
-
-   /** Configure a namespace aware XML parser. */
-    private static final DocumentBuilderFactory FACTORY
-      = DocumentBuilderFactory.newInstance();
-    static {
-      FACTORY.setNamespaceAware(true);
-    }
-
-    /** Creative Commons' namespace URI. */
-    private static final String CC_NS = "http://web.resource.org/cc/";
-    
-    /** Dublin Core namespace URI. */
-    private static final String DC_NS = "http://purl.org/dc/elements/1.1/";
-    
-    /** RDF syntax namespace URI. */
-    private static final String RDF_NS
-      = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
-
-    private void findRdf(String comment) {
-      // first check for likely RDF in comment
-      int rdfPosition = comment.indexOf("RDF");
-      if (rdfPosition < 0)
-        return;                                   // no RDF, abort
-      int nsPosition = comment.indexOf(CC_NS);
-      if (nsPosition < 0)
-        return;                                   // no RDF, abort
-
-      // try to parse the XML
-      Document doc;
-      try {
-        DocumentBuilder parser = FACTORY.newDocumentBuilder();
-        doc = parser.parse(new InputSource(new StringReader(comment)));
-      } catch (Exception e) {
-        if (LOG.isWarnEnabled()) {
-          LOG.warn("CC: Failed to parse RDF in "+base+": "+e);
-        }
-        //e.printStackTrace();
-        return;
-      }
-
-      // check that root is rdf:RDF
-      NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF");
-      if (roots.getLength() != 1) {
-        if (LOG.isWarnEnabled()) { LOG.warn("CC: No RDF root in "+base); }
-        return;
-      }
-      Element rdf = (Element)roots.item(0);
-
-      // get cc:License nodes inside rdf:RDF
-      NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License");
-      for (int i = 0; i < licenses.getLength(); i++) {
-
-        Element l = (Element)licenses.item(i);
-
-        // license is rdf:about= attribute from cc:License
-        this.rdfLicense = l.getAttributeNodeNS(RDF_NS,"about").getValue();
-
-        // walk predicates of cc:License
-        NodeList predicates = l.getChildNodes();
-        for (int j = 0; j < predicates.getLength(); j++) {
-          Node predicateNode = predicates.item(j);
-          if (!(predicateNode instanceof Element))
-            continue;
-          Element predicateElement = (Element)predicateNode;
-
-          // extract predicates of cc:xxx predicates
-          if (!CC_NS.equals(predicateElement.getNamespaceURI())) {
-            continue;
-          }
-          String predicate = predicateElement.getLocalName();
-
-          // object is rdf:resource from cc:xxx predicates
-          String object =
-            predicateElement.getAttributeNodeNS(RDF_NS, "resource").getValue();
-        
-          // add object and predicate to metadata
-          // metadata.put(object, predicate);
-          // if (LOG.isInfoEnabled()) {
-          //   LOG.info("CC: found: "+predicate+"="+object);
-          // }
-        }
-      }
-
-      // get cc:Work nodes from rdf:RDF
-      NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work");
-      for (int i = 0; i < works.getLength(); i++) {
-        Element l = (Element)works.item(i);
-        
-        // get dc:type nodes from cc:Work
-        NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type");
-        for (int j = 0; j < types.getLength(); j++) {
-          Element type = (Element)types.item(j);
-          String workUri = 
-            type.getAttributeNodeNS(RDF_NS, "resource").getValue();
-          this.workType = (String)WORK_TYPE_NAMES.get(workUri);
-          break;
-        }
-      }
-    }
-  }
-
-  private static final HashMap WORK_TYPE_NAMES = new HashMap();
-  static {
-    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video");
-    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image");
-    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio");
-    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text");
-    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive", "interactive");
-    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software");
-    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
-  }
-
-  private Configuration conf;
-
-  /** Adds metadata or otherwise modifies a parse of an HTML document, given
-   * the DOM tree of a page. */
-  public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
-
-    // get parse obj
-    Parse parse = parseResult.get(content.getUrl());
-
-    // construct base url
-    URL base;
-    try {
-      base = new URL(content.getBaseUrl());
-    } catch (MalformedURLException e) {
-      Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf());
-      parseResult.put(content.getUrl(), 
-                      new ParseText(emptyParse.getText()), 
-                      emptyParse.getData());
-      return parseResult;
-    }
-
-    try {
-      // extract license metadata
-      Walker.walk(doc, base, parse.getData().getParseMeta(), getConf());
-    } catch (ParseException e) {
-      Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf());
-      parseResult.put(content.getUrl(), 
-                      new ParseText(emptyParse.getText()), 
-                      emptyParse.getData());
-      return parseResult;
-    }
-
-    return parseResult;
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
+		return parse;
+	}
 }

Modified: nutch/branches/nutchbase/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=965787&r1=965786&r2=965787&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original)
+++ nutch/branches/nutchbase/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Tue Jul 20 10:34:18 2010
@@ -21,57 +21,71 @@ import org.apache.nutch.metadata.Metadat
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.protocol.Content;
+import org.apache.avro.util.Utf8;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.MimeUtil;
 import org.apache.nutch.util.NutchConfiguration;
+import org.apache.tika.mime.MimeType;
 
 import java.util.Properties;
 import java.io.*;
 import java.net.URL;
+import java.nio.ByteBuffer;
 
 import junit.framework.TestCase;
 
 public class TestCCParseFilter extends TestCase {
 
-  private static final File testDir =
-    new File(System.getProperty("test.input"));
+	private static final File testDir = new File(
+			System.getProperty("test.input"));
 
-  public void testPages() throws Exception {
-    pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
-             "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
-    // Tika returns <a> whereas parse-html returns <rel>
-    // check later
-    pageTest(new File(testDir, "rel.html"), "http://foo.com/",
-             "http://creativecommons.org/licenses/by-nc/2.0", "a", null);
-    // Tika returns <a> whereas parse-html returns <rdf>
-    // check later
-    pageTest(new File(testDir, "rdf.html"), "http://foo.com/",
-             "http://creativecommons.org/licenses/by-nc/1.0", "a", null);
-  }
-
-  public void pageTest(File file, String url,
-                       String license, String location, String type)
-    throws Exception {
-
-    String contentType = "text/html";
-    InputStream in = new FileInputStream(file);
-    ByteArrayOutputStream out = new ByteArrayOutputStream((int)file.length());
-    byte[] buffer = new byte[1024];
-    int i;
-    while ((i = in.read(buffer)) != -1) {
-      out.write(buffer, 0, i);
-    }
-    in.close();
-    byte[] bytes = out.toByteArray();
-    Configuration conf = NutchConfiguration.create();
-
-    Content content =
-      new Content(url, url, bytes, contentType, new Metadata(), conf);
-    Parse parse =  new ParseUtil(conf).parse(content).get(content.getUrl());
-    
-    Metadata metadata = parse.getData().getParseMeta();
-    assertEquals(license, metadata.get("License-Url"));
-    assertEquals(location, metadata.get("License-Location"));
-    assertEquals(type, metadata.get("Work-Type"));
-  }
+	/** Checks the license URL and location extracted from each fixture page. */
+	public void testPages() throws Exception {
+		// Tika returns <a> for rel.html and rdf.html, whereas parse-html
+		// returns <rel> and <rdf> respectively; check later
+		String[][] fixtures = {
+				{ "anchor.html", "http://creativecommons.org/licenses/by-nc-sa/1.0" },
+				{ "rel.html", "http://creativecommons.org/licenses/by-nc/2.0" },
+				{ "rdf.html", "http://creativecommons.org/licenses/by-nc/1.0" } };
+		for (String[] fixture : fixtures) {
+			pageTest(new File(testDir, fixture[0]), "http://foo.com/", fixture[1], "a", null);
+		}
+	}
+
+	/** Parses file as the page at url, then checks its CC metadata (null = key absent). */
+	public void pageTest(File file, String url, String license,
+			String location, String type) throws Exception {
+		ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
+		InputStream in = new FileInputStream(file);
+		try {
+			byte[] buffer = new byte[1024];
+			int i;
+			while ((i = in.read(buffer)) != -1) {
+				out.write(buffer, 0, i);
+			}
+		} finally {
+			in.close(); // release the file even when a read fails
+		}
+		Configuration conf = NutchConfiguration.create();
+		WebPage page = new WebPage();
+		page.setBaseUrl(new Utf8(url));
+		page.setContent(ByteBuffer.wrap(out.toByteArray()));
+		MimeType mtype = new MimeUtil(conf).getMimeType(file);
+		page.setContentType(new Utf8(mtype.getName()));
+		new ParseUtil(conf).parse(url, page);
+
+		assertEquals(license, meta(page, "License-Url"));
+		assertEquals(location, meta(page, "License-Location"));
+		assertEquals(type, meta(page, "Work-Type"));
+	}
+
+	/** Returns the metadata value stored under key as text, or null if unset. */
+	private static String meta(WebPage page, String key) {
+		ByteBuffer bb = page.getFromMetadata(new Utf8(key));
+		if (bb == null) return null;
+		byte[] raw = new byte[bb.remaining()]; // readable window only, not the whole backing array
+		bb.duplicate().get(raw); // read via a duplicate so the page's buffer position is untouched
+		return new String(raw); // platform default charset, as before
+	}
 }
-

Modified: nutch/branches/nutchbase/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java?rev=965787&r1=965786&r2=965787&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java (original)
+++ nutch/branches/nutchbase/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java Tue Jul 20 10:34:18 2010
@@ -17,85 +17,81 @@
 package org.apache.nutch.scoring.link;
 
 import java.util.Collection;
+import java.util.HashSet;
 import java.util.List;
-import java.util.Map.Entry;
+import java.util.Set;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoreDatum;
 import org.apache.nutch.scoring.ScoringFilter;
 import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.storage.WebPage;
 
-public class LinkAnalysisScoringFilter
-  implements ScoringFilter {
+public class LinkAnalysisScoringFilter implements ScoringFilter {
 
-  private Configuration conf;
-  private float scoreInjected = 0.001f;
-  private float normalizedScore = 1.00f;
-
-  public LinkAnalysisScoringFilter() {
-
-  }
-
-  public Configuration getConf() {
-    return conf;
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    normalizedScore = conf.getFloat("link.analyze.normalize.score", 1.00f);
-    scoreInjected = conf.getFloat("link.analyze.injected.score", 1.00f);
-  }
-
-  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
-    ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
-    CrawlDatum adjust, int allCount)
-    throws ScoringFilterException {
-    return adjust;
-  }
-
-  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
-    throws ScoringFilterException {
-    return datum.getScore() * initSort;
-  }
-
-  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
-    CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
-    throws ScoringFilterException {
-    return (normalizedScore * dbDatum.getScore());
-  }
-
-  public void initialScore(Text url, CrawlDatum datum)
-    throws ScoringFilterException {
-    datum.setScore(0.0f);
-  }
-
-  public void injectedScore(Text url, CrawlDatum datum)
-    throws ScoringFilterException {
-    datum.setScore(scoreInjected);
-  }
-
-  public void passScoreAfterParsing(Text url, Content content, Parse parse)
-    throws ScoringFilterException {
-    parse.getData().getContentMeta().set(Nutch.SCORE_KEY,
-      content.getMetadata().get(Nutch.SCORE_KEY));
-  }
-
-  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
-    throws ScoringFilterException {
-    content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore());
-  }
-
-  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
-    List<CrawlDatum> inlinked)
-    throws ScoringFilterException {
-    // nothing to do
-  }
+	private Configuration conf; // assigned in setConf()
+	private float scoreInjected = 0.001f; // NOTE(review): setConf() falls back to 1.00f, not 0.001f - confirm which is intended
+	private float normalizedScore = 1.00f; // multiplier applied in indexerScore()
+
+	private final static Set<WebPage.Field> FIELDS = new HashSet<WebPage.Field>(); // handed out by getFields()
+
+	static {
+		FIELDS.add(WebPage.Field.METADATA);
+		FIELDS.add(WebPage.Field.SCORE);
+	}
+
+	public LinkAnalysisScoringFilter() {
+	}
+
+	public Configuration getConf() {
+		return conf;
+	}
+
+	public void setConf(Configuration conf) { // stores conf and reads both score settings from it
+		this.conf = conf;
+		normalizedScore = conf.getFloat("link.analyze.normalize.score", 1.00f);
+		scoreInjected = conf.getFloat("link.analyze.injected.score", 1.00f);
+	}
+
+	@Override
+	public Collection<WebPage.Field> getFields() {
+		return FIELDS; // the shared static set itself (METADATA, SCORE), not a copy
+	}
+
+	@Override
+	public void injectedScore(String url, WebPage page)
+			throws ScoringFilterException {
+		page.setScore(scoreInjected); // url is not consulted
+	}
+
+	@Override
+	public void initialScore(String url, WebPage page)
+			throws ScoringFilterException {
+		page.setScore(0.0f); // sets the page score to zero; url is not consulted
+	}
+
+	@Override
+	public float generatorSortValue(String url, WebPage page, float initSort)
+			throws ScoringFilterException {
+		return page.getScore() * initSort; // scales initSort by the page's current score
+	}
+
+	@Override
+	public void distributeScoreToOutlinks(String fromUrl, WebPage page,
+			Collection<ScoreDatum> scoreData, int allCount)
+			throws ScoringFilterException { // empty body: scoreData and page are left unmodified
+	}
+
+	@Override
+	public void updateScore(String url, WebPage page,
+			List<ScoreDatum> inlinkedScoreData) throws ScoringFilterException { // empty body: page is left unmodified
+	}
+
+	@Override
+	public float indexerScore(String url, NutchDocument doc, WebPage page,
+			float initScore) throws ScoringFilterException {
+		return (normalizedScore * page.getScore()); // initScore and doc are not used
+	}
 
 }