You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2010/07/28 00:26:08 UTC

svn commit: r979893 - in /nutch/branches/nutchbase/src/plugin/parse-tika/src: java/org/apache/nutch/parse/tika/ test/org/apache/nutch/parse/ test/org/apache/nutch/parse/tika/ test/org/apache/nutch/tika/

Author: jnioche
Date: Tue Jul 27 22:26:07 2010
New Revision: 979893

URL: http://svn.apache.org/viewvc?rev=979893&view=rev
Log:
NUTCH-840 : moved tests to parse/tika + added TestDOMContentUtils, which currently fails but will help us track the progress of the Tika processing of HTML

Added:
    nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/
    nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/
    nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java
    nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
    nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
    nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
    nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
Removed:
    nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/tika/
Modified:
    nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
    nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
    nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

Modified: nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java?rev=979893&r1=979892&r2=979893&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java (original)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java Tue Jul 27 22:26:07 2010
@@ -37,7 +37,7 @@ import org.w3c.dom.NodeList;
  * DOM nodes, such as getOutlinks, getText, etc.
  *
  */
-class DOMContentUtils {
+public class DOMContentUtils {
 
   private static class LinkParams {
 	private String elName;
@@ -58,11 +58,11 @@ class DOMContentUtils {
   private HashMap linkParams = new HashMap();
   private Configuration conf;
   
-  DOMContentUtils(Configuration conf) {
+  public DOMContentUtils(Configuration conf) {
     setConf(conf);
   }
   
-  private void setConf(Configuration conf) {
+  public void setConf(Configuration conf) {
     // forceTags is used to override configurable tag ignoring, later on
     Collection<String> forceTags = new ArrayList<String>(1);
 
@@ -118,7 +118,7 @@ class DOMContentUtils {
    * #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
    * 
    */
-  void getText(StringBuffer sb, Node node) {
+  public void getText(StringBuffer sb, Node node) {
     getText(sb, node, false);
   }
 
@@ -174,7 +174,7 @@ class DOMContentUtils {
    *
    * @return true if a title node was found, false otherwise
    */
-  boolean getTitle(StringBuffer sb, Node node) {
+  public boolean getTitle(StringBuffer sb, Node node) {
     
     NodeWalker walker = new NodeWalker(node);
     
@@ -358,7 +358,7 @@ class DOMContentUtils {
    * nodes (this is a common DOM-fixup artifact, at least with
    * nekohtml).
    */
-  void getOutlinks(URL base, ArrayList outlinks, 
+  public void getOutlinks(URL base, ArrayList outlinks, 
                                        Node node) {
     
     NodeWalker walker = new NodeWalker(node);

Modified: nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java?rev=979893&r1=979892&r2=979893&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java (original)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java Tue Jul 27 22:26:07 2010
@@ -28,7 +28,7 @@ import org.w3c.dom.*;
  * noindex), finding BASE HREF tags, and HTTP-EQUIV no-cache
  * instructions. All meta directives are stored in a HTMLMetaTags instance.
  */
-class HTMLMetaProcessor {
+public class HTMLMetaProcessor {
 
   /**
    * Utility class with indicators for the robots directives "noindex"
@@ -40,7 +40,7 @@ class HTMLMetaProcessor {
    * values, based on any META tags found under the given
    * <code>node</code>.
    */
-  static final void getMetaTags (
+  public static final void getMetaTags (
     HTMLMetaTags metaTags, Node node, URL currURL) {
 
     metaTags.reset();

Modified: nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=979893&r1=979892&r2=979893&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Tue Jul 27 22:26:07 2010
@@ -53,6 +53,8 @@ import org.apache.tika.metadata.Metadata
 import org.apache.tika.mime.MimeType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.HtmlMapper;
+import org.apache.tika.parser.html.IdentityHtmlMapper;
 import org.w3c.dom.DocumentFragment;
 
 /**
@@ -110,6 +112,8 @@ public class TikaParser implements org.a
     DocumentFragment root = doc.createDocumentFragment();
     DOMBuilder domhandler = new DOMBuilder(doc, root);
     ParseContext context = new ParseContext();
+    // to add once available in Tika
+    // context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
     try {
       parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd, context);
     } catch (Exception e) {
@@ -211,6 +215,10 @@ public class TikaParser implements org.a
         Nutch.CACHING_FORBIDDEN_CONTENT);
   }
 
+  public TikaConfig getTikaConfig(){ // accessor for the tikaConfig field, e.g. to look up a Tika parser by MIME type
+	  return this.tikaConfig;
+  }
+  
   public Configuration getConf() {
     return this.conf;
   }

Added: nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java?rev=979893&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java Tue Jul 27 22:26:07 2010
@@ -0,0 +1,416 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.tika;
+
+import junit.framework.TestCase;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.tika.DOMBuilder;
+import org.apache.nutch.parse.tika.DOMContentUtils;
+import org.apache.nutch.parse.tika.TikaParser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.html.dom.HTMLDocumentImpl;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.HtmlMapper;
+import org.apache.tika.parser.html.IdentityHtmlMapper;
+import org.apache.xml.serialize.DOMSerializerImpl;
+import org.mortbay.log.Log;
+import org.w3c.dom.DocumentFragment;
+
+import java.io.ByteArrayInputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.StringTokenizer;
+
+/**
+ * Unit tests for DOMContentUtils.
+ */
+public class TestDOMContentUtils extends TestCase {
+
+	private static final String[] testPages = {
+			// 0.
+			new String(
+					"<html><head><title> title </title><script> script </script>"
+							+ "</head><body> body <a href=\"http://www.nutch.org\">"
+							+ " anchor </a><!--comment-->" + "</body></html>"),
+			// 1.
+			new String(
+					"<html><head><title> title </title><script> script </script>"
+							+ "</head><body> body <a href=\"/\">"
+							+ " home </a><!--comment-->"
+							+ "<style> style </style>"
+							+ " <a href=\"bot.html\">" + " bots </a>"
+							+ "</body></html>"),
+			// 2.
+			new String("<html><head><title> </title>" + "</head><body> "
+					+ "<a href=\"/\"> separate this "
+					+ "<a href=\"ok\"> from this" + "</a></a>"
+					+ "</body></html>"),
+			// 3.
+			// this one relies on certain neko fixup behavior, possibly
+			// distributing the anchors into the LI's-but not the other
+			// anchors (outside of them, instead)! So you get a tree that
+			// looks like:
+			// ... <li> <a href=/> home </a> </li>
+			// <li> <a href=/> <a href="1"> 1 </a> </a> </li>
+			// <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
+			new String("<html><head><title> my title </title>"
+					+ "</head><body> body " + "<ul>"
+					+ "<li> <a href=\"/\"> home" + "<li> <a href=\"1\"> 1"
+					+ "<li> <a href=\"2\"> 2" + "</ul>" + "</body></html>"),
+			// 4.
+			// test frameset link extraction. The invalid frame in the middle
+			// will be
+			// fixed to a third standalone frame.
+			new String("<html><head><title> my title </title>"
+					+ "</head><frameset rows=\"20,*\"> "
+					+ "<frame src=\"top.html\">" + "</frame>"
+					+ "<frameset cols=\"20,*\">" + "<frame src=\"left.html\">"
+					+ "</frame>" + "<frame src=\"invalid.html\"/>" + "</frame>"
+					+ "<frame src=\"right.html\">" + "</frame>" + "</frameset>"
+					+ "</frameset>" + "</body></html>"),
+			// 5.
+			// test <area> and <iframe> link extraction + url normalization
+			new String(
+					"<html><head><title> my title </title>"
+							+ "</head><body>"
+							+ "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
+							+ "<map name=\"green\">"
+							+ "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
+							+ "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
+							+ "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
+							+ "</map>"
+							+ "<a name=\"bottom\"/><h1> the bottom </h1> "
+							+ "<iframe src=\"../docs/index.html\"/>"
+							+ "</body></html>"),
+			// 6.
+			// test whitespace processing for plain text extraction
+			new String(
+					"<html><head>\n <title> my\t\n  title\r\n </title>\n"
+							+ " </head>\n"
+							+ " <body>\n"
+							+ "    <h1> Whitespace\ttest  </h1> \n"
+							+ "\t<a href=\"../index.html\">\n  \twhitespace  test\r\n\t</a>  \t\n"
+							+ "    <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
+							+ "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
+							+ "    This\t<b>is a</b> break -&gt;<br>and the line after<i> break</i>.<br>\n"
+							+ "<table>"
+							+ "    <tr><td>one</td><td>two</td><td>three</td></tr>\n"
+							+ "    <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
+							+ "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
+							+ "</table>put some text here<Br>and there."
+							+ "<h2>End\tthis\rmadness\n!</h2>\r\n"
+							+ "         .        .        .         ."
+							+ "</body>  </html>"),
+			// 7.
+			// test that <a rel=nofollow> links are not returned
+			new String(
+					"<html><head></head><body>"
+							+ "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+							+ "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+							+ "</body></html>"),
+			// 8.
+			// test that POST form actions are skipped
+			new String(
+					"<html><head></head><body>"
+							+ "<form method='POST' action='/search.jsp'><input type=text>"
+							+ "<input type=submit><p>test1</p></form>"
+							+ "<form method='GET' action='/dummy.jsp'><input type=text>"
+							+ "<input type=submit><p>test2</p></form></body></html>"),
+			// 9.
+			// test that all form actions are skipped
+			new String(
+					"<html><head></head><body>"
+							+ "<form method='POST' action='/search.jsp'><input type=text>"
+							+ "<input type=submit><p>test1</p></form>"
+							+ "<form method='GET' action='/dummy.jsp'><input type=text>"
+							+ "<input type=submit><p>test2</p></form></body></html>"),
+			// 10.
+			new String("<html><head><title> title </title>" + "</head><body>"
+					+ "<a href=\";x\">anchor1</a>"
+					+ "<a href=\"g;x\">anchor2</a>"
+					+ "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"),
+			// 11.
+			new String("<html><head><title> title </title>" + "</head><body>"
+					+ "<a href=\"g\">anchor1</a>"
+					+ "<a href=\"g?y#s\">anchor2</a>"
+					+ "<a href=\"?y=1\">anchor3</a>"
+					+ "<a href=\"?y=1#s\">anchor4</a>"
+					+ "<a href=\"?y=1;somethingelse\">anchor5</a>"
+					+ "</body></html>"), };
+
+	private static int SKIP = 9;
+
+	private static String[] testBaseHrefs = { "http://www.nutch.org",
+			"http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
+			"http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
+			"http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
+			"http://www.nutch.org//", "http://www.nutch.org/",
+			"http://www.nutch.org/", "http://www.nutch.org/",
+			"http://www.nutch.org/;something" };
+
+	private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];
+
+	private static URL[] testBaseHrefURLs = new URL[testPages.length];
+
+	private static final String[] answerText = {
+			"body anchor",
+			"body home bots",
+			"separate this from this",
+			"body home 1 2",
+			"",
+			"the bottom",
+			"Whitespace test whitespace test "
+					+ "This is a whitespace test . Newlines should appear as space too. "
+					+ "Tabs are spaces too. This is a break -> and the line after break . "
+					+ "one two three space here space there no space "
+					+ "one two two three three four put some text here and there. "
+					+ "End this madness ! . . . .", "ignore ignore",
+			"test1 test2", "test1 test2", "anchor1 anchor2 anchor3",
+			"anchor1 anchor2 anchor3 anchor4 anchor5" };
+
+	private static final String[] answerTitle = { "title", "title", "",
+			"my title", "my title", "my title", "my title", "", "", "",
+			"title", "title" };
+
+	// note: should be in page-order
+	private static Outlink[][] answerOutlinks;
+
+	private static Configuration conf;
+	private static DOMContentUtils utils = null;
+
+	public TestDOMContentUtils(String name) {
+		super(name);
+	}
+
+	private static void setup() throws Exception {
+		conf = NutchConfiguration.create();
+		conf.setBoolean("parser.html.form.use_action", true);
+		utils = new DOMContentUtils(conf);
+		TikaParser tikaParser = new TikaParser();
+		tikaParser.setConf(conf);
+		Parser parser = tikaParser.getTikaConfig().getParser("text/html");
+		for (int i = 0; i < testPages.length; i++) {
+			Metadata tikamd = new Metadata();
+
+			HTMLDocumentImpl doc = new HTMLDocumentImpl();
+			doc.setErrorChecking(false);
+			DocumentFragment root = doc.createDocumentFragment();
+			DOMBuilder domhandler = new DOMBuilder(doc, root);
+			ParseContext context = new ParseContext();
+			// to add once available in Tika
+			//context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
+			try {
+				parser.parse(new ByteArrayInputStream(testPages[i].getBytes()),
+						domhandler, tikamd, context);
+				testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
+			} catch (Exception e) {
+				e.printStackTrace();
+				fail("caught exception: " + e);
+			}
+			testDOMs[i] = root;
+			DOMSerializerImpl ds = new DOMSerializerImpl();
+			System.out.println("input " + i + ": '" + testPages[i] + "'");
+			System.out.println("output " + i + ": '" + ds.writeToString(root)
+					+ "'");
+
+		}
+		answerOutlinks = new Outlink[][] {
+				// 0
+				{ new Outlink("http://www.nutch.org", "anchor"), },
+				// 1
+				{
+				  new Outlink("http://www.nutch.org/", "home"),
+				  new Outlink("http://www.nutch.org/docs/bot.html",
+								"bots"), },
+				// 2
+				{
+					new Outlink("http://www.nutch.org/", "separate this"),
+					new Outlink("http://www.nutch.org/docs/ok", "from this"), },
+				
+				// 3	
+				{   new Outlink("http://www.nutch.org/", "home"),
+					new Outlink("http://www.nutch.org/docs/1", "1"),
+					new Outlink("http://www.nutch.org/docs/2", "2"), },
+				// 4	
+				{
+					new Outlink("http://www.nutch.org/frames/top.html", ""),
+					new Outlink("http://www.nutch.org/frames/left.html", ""),
+					new Outlink("http://www.nutch.org/frames/invalid.html",""),
+					new Outlink("http://www.nutch.org/frames/right.html",""), 
+				},
+				// 5
+				{ 
+					new Outlink("http://www.nutch.org/maps/logo.gif", ""),
+					new Outlink("http://www.nutch.org/index.html", ""),
+					new Outlink("http://www.nutch.org/maps/#bottom", ""),
+					new Outlink("http://www.nutch.org/bot.html", ""),
+					new Outlink("http://www.nutch.org/docs/index.html", "") 
+				},
+				// 6
+				{ new Outlink("http://www.nutch.org/index.html",
+						"whitespace test"), 
+				},
+				// 7
+				{},
+				// 8
+				{ new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
+				// 9
+				{},
+				// 10 
+				{ 
+				 new Outlink("http://www.nutch.org/;x", "anchor1"),
+				 new Outlink("http://www.nutch.org/g;x", "anchor2"),
+				 new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") 
+				},
+				// 11
+				{
+				 new Outlink("http://www.nutch.org/g;something","anchor1"),
+				 new Outlink("http://www.nutch.org/g;something?y#s", "anchor2"),
+				 new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
+				 new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
+				 new Outlink("http://www.nutch.org/?y=1;somethingelse", "anchor5") }
+				};
+
+	}
+
+	/**
+	 * Compares two strings token by token (StringTokenizer default delimiters),
+	 * ignoring whitespace differences; the first mismatch is sent to Log.info.
+	 */
+	private static boolean equalsIgnoreWhitespace(String s1, String s2) {
+		StringTokenizer st1 = new StringTokenizer(s1);
+		StringTokenizer st2 = new StringTokenizer(s2);
+		while (st1.hasMoreTokens() && st2.hasMoreTokens()) {
+			String st1Token = st1.nextToken();
+			String st2Token = st2.nextToken();
+			if (!st1Token.equals(st2Token)) {
+				Log.info("st1:'" + st1Token + "' != st2:'" + st2Token + "'");
+				return false;
+			}
+		}
+		if (st1.hasMoreTokens() || st2.hasMoreTokens()) { // one side has leftover tokens
+			Log.info(st1.hasMoreTokens() ? "st1+ '" + st1.nextToken() + "'"
+					: "st2+ '" + st2.nextToken() + "'");
+			return false;
+		}
+		return true;
+	}
+
+	/** Checks the text extracted from every parsed test page against answerText, ignoring whitespace. */
+	public void testGetText() throws Exception {
+		if (testDOMs[0] == null) {
+			setup();
+		}
+		final String nl = System.getProperty("line.separator");
+		for (int page = 0; page < testPages.length; page++) {
+			StringBuffer extracted = new StringBuffer();
+			utils.getText(extracted, testDOMs[page]);
+			String text = extracted.toString();
+			String message = "example " + page + " : expecting text: "
+					+ answerText[page] + nl + nl + "got text: " + text;
+			assertTrue(message, equalsIgnoreWhitespace(answerText[page], text));
+		}
+	}
+
+	// Expected to fail with Tika for now: the title is stored in the
+	// metadata but is not put in the XHTML representation.
+	public void testGetTitle() throws Exception {
+		if (testDOMs[0] == null) // the shared DOMs are built once, by whichever test runs first
+			setup();
+		for (int i = 0; i < testPages.length; i++) {
+			StringBuffer sb = new StringBuffer();
+			utils.getTitle(sb, testDOMs[i]); // the boolean result is ignored; only the text appended to sb is checked
+			String title = sb.toString();
+			assertTrue(
+					"example " + i + " : expecting title: " + answerTitle[i]
+							+ System.getProperty("line.separator")
+							+ System.getProperty("line.separator")
+							+ "got title: " + title,
+					equalsIgnoreWhitespace(answerTitle[i], title));
+		}
+	}
+
+	/**
+	 * Extracts the outlinks of every parsed test page and compares them, in
+	 * order, with answerOutlinks. "parser.html.form.use_action" is switched
+	 * on for every page except page SKIP, for which it is switched off.
+	 */
+	public void testGetOutlinks() throws Exception {
+		if (testDOMs[0] == null)
+			setup();
+		for (int page = 0; page < testPages.length; page++) {
+			boolean useFormAction = (page != SKIP);
+			conf.setBoolean("parser.html.form.use_action", useFormAction);
+			utils.setConf(conf);
+			ArrayList<Outlink> found = new ArrayList<Outlink>();
+			utils.getOutlinks(testBaseHrefURLs[page], found, testDOMs[page]);
+			Outlink[] foundArr = found.toArray(new Outlink[found.size()]);
+			compareOutlinks(page, answerOutlinks[page], foundArr);
+		}
+	}
+
+	/** Appends each outlink's toString() to sb, each followed by the platform line separator. */
+	private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
+		final String nl = System.getProperty("line.separator");
+		for (Outlink link : o)
+			sb.append(link.toString()).append(nl);
+	}
+
+	private static final String outlinksString(Outlink[] o) {
+		StringBuffer sb = new StringBuffer();
+		appendOutlinks(sb, o);
+		return sb.toString();
+	}
+
+	/**
+	 * Fails the current test, with a message listing both sides, when the
+	 * outlinks found for test page <code>test</code> differ in number or in
+	 * content from the expected ones.
+	 *
+	 * @param test index of the test page, used in the failure message
+	 * @param o1 the expected outlinks
+	 * @param o2 the outlinks actually extracted
+	 */
+	private static final void compareOutlinks(int test, Outlink[] o1,
+			Outlink[] o2) {
+		final String nl = System.getProperty("line.separator");
+		if (o1.length != o2.length) {
+			fail("test " + test
+					+ ", got wrong number of outlinks (expecting "
+					+ o1.length + ", got " + o2.length + ")"
+					+ nl + "answer: " + nl + outlinksString(o1)
+					+ nl + "got: " + nl + outlinksString(o2) + nl);
+		}
+
+		// same length from here on: compare position by position
+		for (int pos = 0; pos < o1.length; pos++) {
+			Outlink expected = o1[pos];
+			Outlink actual = o2[pos];
+			if (expected.equals(actual)) {
+				continue;
+			}
+			fail("test " + test + ", got wrong outlinks at position "
+					+ pos + nl + "answer: " + nl + expected.toString()
+					+ nl + "got: " + nl + actual.toString());
+		}
+	}
+}

Added: nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java?rev=979893&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java Tue Jul 27 22:26:07 2010
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.tika;
+
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import junit.framework.TestCase;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.tika.mime.MimeType;
+
+/**
+ * Unit tests for MSWordParser.
+ * 
+ * @author John Xing
+ */
+public class TestMSWordParser extends TestCase {
+
+    private String fileSeparator = System.getProperty("file.separator");
+    // This system property is defined in ./src/plugin/build-plugin.xml
+    private String sampleDir = System.getProperty("test.data", ".");
+    // Make sure sample files are copied to "test.data" as specified in
+    // ./src/plugin/parse-msword/build.xml during plugin compilation.
+    // Check ./src/plugin/parse-msword/sample/README.txt for what they are.
+    private String[] sampleFiles = { "word97.doc" };
+
+    private String expectedText = "This is a sample doc file prepared for nutch.";
+
+    private Configuration conf;
+
+    public TestMSWordParser(String name) {
+	super(name);
+    }
+
+    protected void setUp() {
+	conf = NutchConfiguration.create();
+	conf.set("file.content.limit", "-1");
+    }
+
+    protected void tearDown() {
+    }
+
+    /** Reads a sample file, parses it through ParseUtil and returns the extracted text. */
+    public String getTextContent(String fileName) throws ProtocolException,
+	    ParseException, IOException {
+	String urlString = sampleDir + fileSeparator + fileName;
+	File file = new File(urlString);
+	byte[] bytes = new byte[(int) file.length()];
+	DataInputStream in = new DataInputStream(new FileInputStream(file));
+	try {
+	    in.readFully(bytes);
+	} finally {
+	    in.close(); // release the file handle even when the read fails
+	}
+	WebPage page = new WebPage();
+	page.setBaseUrl(new Utf8("file:"+urlString));
+	page.setContent(ByteBuffer.wrap(bytes));
+	// content type as reported by MimeUtil for the sample file
+	MimeType mtype = new MimeUtil(conf).getMimeType(file);
+	page.setContentType(new Utf8(mtype.getName()));
+	Parse parse = new ParseUtil(conf).parse("file:"+urlString, page);
+	return parse.getText();
+    }
+
+    public void testIt() throws ProtocolException, ParseException, IOException {
+	for (int i = 0; i < sampleFiles.length; i++) {
+	    String found = getTextContent(sampleFiles[i]);
+	    assertTrue("text found : '" + found + "'", found
+		    .startsWith(expectedText));
+	}
+    }
+
+    /** Checks that every .doc file in the sample directory yields non-empty text. */
+    public void testOpeningDocs() throws ProtocolException, ParseException, IOException {
+	String[] filenames = new File(sampleDir).list();
+	assertNotNull("can't list sample directory " + sampleDir, filenames); // list() returns null for a missing dir
+	for (int i = 0; i < filenames.length; i++) {
+	    if (filenames[i].endsWith(".doc"))
+		assertTrue("cann't read content of " + filenames[i], getTextContent(filenames[i]).length() > 0);
+	}
+    }
+}

Added: nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java?rev=979893&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java Tue Jul 27 22:26:07 2010
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.tika;
+
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.ByteBuffer;
+
+import junit.framework.TestCase;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.tika.mime.MimeType;
+
+/**
+ * Unit tests for OOParser.
+ * 
+ * @author Andrzej Bialecki
+ */
+public class TestOOParser extends TestCase {
+
+    private String fileSeparator = System.getProperty("file.separator");
+    // This system property is defined in ./src/plugin/build-plugin.xml
+    private String sampleDir = System.getProperty("test.data", ".");
+    // Make sure sample files are copied to "test.data" as specified in
+    // ./src/plugin/parse-oo/build.xml during plugin compilation.
+    private String[] sampleFiles = { "ootest.odt", "ootest.sxw" };
+
+    private String sampleText = "ootest.txt";
+
+    private String expectedText;
+
+    /** Loads the reference text (UTF-8) from the sample directory; on failure expectedText stays null. */
+    public TestOOParser(String name) {
+	super(name);
+	try {
+	    FileInputStream fis = new FileInputStream(sampleDir + fileSeparator + sampleText);
+	    StringBuffer sb = new StringBuffer();
+	    try {
+		InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
+		char[] buf = new char[1024];
+		int len;
+		while ((len = isr.read(buf)) > 0) {
+		    sb.append(buf, 0, len);
+		}
+	    } finally {
+		fis.close(); // close the file even when decoding or reading fails
+	    }
+	    expectedText = sb.toString().replaceAll("[ \t\r\n]+", " "); // normalize space
+	} catch (Exception e) {
+	    e.printStackTrace();
+	}
+    }
+
+    protected void setUp() {
+    }
+
+    protected void tearDown() {
+    }
+
+    /**
+     * Parses each OpenOffice sample through ParseUtil and checks that some
+     * text comes back. The text is not compared with expectedText because
+     * the ordering of the elements may differ from what was expected.
+     */
+    public void testIt() throws ProtocolException, ParseException, IOException {
+	Configuration conf = NutchConfiguration.create();
+	MimeUtil mimeutil = new MimeUtil(conf);
+	System.out.println("Expected : " + expectedText);
+
+	for (int i = 0; i < sampleFiles.length; i++) {
+	    if (sampleFiles[i].startsWith("ootest") == false)
+		continue;
+
+	    String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+	    File file = new File(sampleDir + fileSeparator + sampleFiles[i]);
+	    byte[] bytes = new byte[(int) file.length()];
+	    DataInputStream in = new DataInputStream(new FileInputStream(file));
+	    try {
+		in.readFully(bytes);
+	    } finally {
+		in.close(); // release the file handle even when the read fails
+	    }
+	    WebPage page = new WebPage();
+	    page.setBaseUrl(new Utf8(urlString));
+	    page.setContent(ByteBuffer.wrap(bytes));
+	    MimeType mtype = mimeutil.getMimeType(file);
+	    page.setContentType(new Utf8(mtype.getName()));
+
+	    Parse parse = new ParseUtil(conf).parse(urlString, page);
+	    // check for null before normalizing, so a missing text is reported
+	    // as a test failure instead of a NullPointerException
+	    String rawText = parse.getText();
+	    assertNotNull("no text extracted from " + sampleFiles[i], rawText);
+	    String text = rawText.replaceAll("[ \t\r\n]+", " ").trim();
+	    assertTrue(text.length() > 0);
+	    System.out.println("Found " + sampleFiles[i] + ": " + text);
+	}
+    }
+
+}

Added: nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java?rev=979893&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java Tue Jul 27 22:26:07 2010
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.tika;
+
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import junit.framework.TestCase;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.tika.mime.MimeType;
+
+/**
+ * Unit tests for PdfParser.
+ * 
+ * @author John Xing
+ */
+public class TestPdfParser extends TestCase {
+
+    private String fileSeparator = System.getProperty("file.separator");
+    // This system property is defined in ./src/plugin/build-plugin.xml
+    private String sampleDir = System.getProperty("test.data", ".");
+    // Make sure sample files are copied to "test.data" as specified in
+    // ./src/plugin/parse-pdf/build.xml during plugin compilation.
+    // Check ./src/plugin/parse-pdf/sample/README.txt for what they are.
+    private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" };
+
+    private String expectedText = "A VERY SMALL PDF FILE";
+
+    public TestPdfParser(String name) {
+	super(name);
+    }
+
+    protected void setUp() {
+    }
+
+    protected void tearDown() {
+    }
+
+    public void testIt() throws ProtocolException, ParseException, IOException {
+	String urlString;
+	Parse parse;
+	Configuration conf = NutchConfiguration.create();
+	MimeUtil mimeutil = new MimeUtil(conf);
+
+	for (int i = 0; i < sampleFiles.length; i++) {
+	    urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+	    File file = new File(sampleDir + fileSeparator + sampleFiles[i]);
+	    byte[] bytes = new byte[(int) file.length()];
+	    DataInputStream in = new DataInputStream(new FileInputStream(file));
+	    in.readFully(bytes);
+	    in.close();
+
+	    WebPage page = new WebPage();
+	    page.setBaseUrl(new Utf8(urlString));
+	    page.setContent(ByteBuffer.wrap(bytes));
+	    MimeType mtype = mimeutil.getMimeType(file);
+	    page.setContentType(new Utf8(mtype.getName()));
+
+	    parse = new ParseUtil(conf).parse(urlString, page);
+
+	    int index = parse.getText().indexOf(expectedText);
+	    assertTrue(index > 0);
+	}
+    }
+
+}

Added: nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java?rev=979893&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java Tue Jul 27 22:26:07 2010
@@ -0,0 +1,100 @@
+package org.apache.nutch.parse.tika;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// JUnit imports
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import junit.framework.TestCase;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.tika.mime.MimeType;
+
+/**
+ * Unit tests for the RTF parser (adapted from John Xing's msword unit tests).
+ * 
+ * @author Andy Hedges
+ */
+public class TestRTFParser extends TestCase {
+
+    private String fileSeparator = System.getProperty("file.separator");
+    // This system property is defined in ./src/plugin/build-plugin.xml
+    private String sampleDir = System.getProperty("test.data", ".");
+    // Make sure sample files are copied to "test.data" as specified in
+    // ./src/plugin/parse-rtf/build.xml during plugin compilation.
+    // Check ./src/plugin/parse-rtf/sample/README.txt for what they are.
+    private String rtfFile = "test.rtf";
+
+    public TestRTFParser(String name) {
+	super(name);
+    }
+
+    protected void setUp() {
+    }
+
+    protected void tearDown() {
+    }
+
+    /**
+     * Parses the sample RTF file and checks that the extracted text, once
+     * trimmed, is exactly the expected sentence.
+     * 
+     * The file is read fully into memory, wrapped in a WebPage whose content
+     * type is the MIME type reported by MimeUtil, and handed to ParseUtil.
+     * 
+     * @throws IOException
+     *             if the sample file cannot be opened or read completely
+     */
+    public void testIt() throws ProtocolException, ParseException, IOException {
+
+	String urlString;
+	Parse parse;
+	Configuration conf = NutchConfiguration.create();
+	MimeUtil mimeutil = new MimeUtil(conf);
+
+	urlString = "file:" + sampleDir + fileSeparator + rtfFile;
+
+	File file = new File(sampleDir + fileSeparator + rtfFile);
+	byte[] bytes = new byte[(int) file.length()];
+	DataInputStream in = new DataInputStream(new FileInputStream(file));
+	try {
+	    in.readFully(bytes);
+	} finally {
+	    // release the file handle even when the read fails
+	    in.close();
+	}
+
+	WebPage page = new WebPage();
+	page.setBaseUrl(new Utf8(urlString));
+	page.setContent(ByteBuffer.wrap(bytes));
+	MimeType mtype = mimeutil.getMimeType(file);
+	page.setContentType(new Utf8(mtype.getName()));
+
+	parse = new ParseUtil(conf).parse(urlString, page);
+
+	String text = parse.getText();
+	assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
+
+	// The title/metadata checks below are disabled; the title lookup is
+	// kept with them because nothing else uses it.
+	// String title = parse.getTitle();
+	// HOW DO WE GET THE PARSE METADATA?
+	// Metadata meta = parse();
+
+	// METADATA extraction is not yet supported in Tika
+	// assertEquals("test rft document", title);
+	// assertEquals("tests", meta.get(DublinCore.SUBJECT));
+    }
+
+}