You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2010/07/28 00:26:08 UTC
svn commit: r979893 - in /nutch/branches/nutchbase/src/plugin/parse-tika/src:
java/org/apache/nutch/parse/tika/ test/org/apache/nutch/parse/
test/org/apache/nutch/parse/tika/ test/org/apache/nutch/tika/
Author: jnioche
Date: Tue Jul 27 22:26:07 2010
New Revision: 979893
URL: http://svn.apache.org/viewvc?rev=979893&view=rev
Log:
NUTCH-840 : moved tests to parse/tika + added TestDOMContentUtils, which currently fails but will help us track the progress on the Tika processing of HTML
Added:
nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/
nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/
nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java
nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
Removed:
nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/tika/
Modified:
nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
Modified: nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java?rev=979893&r1=979892&r2=979893&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java (original)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java Tue Jul 27 22:26:07 2010
@@ -37,7 +37,7 @@ import org.w3c.dom.NodeList;
* DOM nodes, such as getOutlinks, getText, etc.
*
*/
-class DOMContentUtils {
+public class DOMContentUtils {
private static class LinkParams {
private String elName;
@@ -58,11 +58,11 @@ class DOMContentUtils {
private HashMap linkParams = new HashMap();
private Configuration conf;
- DOMContentUtils(Configuration conf) {
+ public DOMContentUtils(Configuration conf) {
setConf(conf);
}
- private void setConf(Configuration conf) {
+ public void setConf(Configuration conf) {
// forceTags is used to override configurable tag ignoring, later on
Collection<String> forceTags = new ArrayList<String>(1);
@@ -118,7 +118,7 @@ class DOMContentUtils {
* #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
*
*/
- void getText(StringBuffer sb, Node node) {
+ public void getText(StringBuffer sb, Node node) {
getText(sb, node, false);
}
@@ -174,7 +174,7 @@ class DOMContentUtils {
*
* @return true if a title node was found, false otherwise
*/
- boolean getTitle(StringBuffer sb, Node node) {
+ public boolean getTitle(StringBuffer sb, Node node) {
NodeWalker walker = new NodeWalker(node);
@@ -358,7 +358,7 @@ class DOMContentUtils {
* nodes (this is a common DOM-fixup artifact, at least with
* nekohtml).
*/
- void getOutlinks(URL base, ArrayList outlinks,
+ public void getOutlinks(URL base, ArrayList outlinks,
Node node) {
NodeWalker walker = new NodeWalker(node);
Modified: nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java?rev=979893&r1=979892&r2=979893&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java (original)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java Tue Jul 27 22:26:07 2010
@@ -28,7 +28,7 @@ import org.w3c.dom.*;
* noindex), finding BASE HREF tags, and HTTP-EQUIV no-cache
* instructions. All meta directives are stored in a HTMLMetaTags instance.
*/
-class HTMLMetaProcessor {
+public class HTMLMetaProcessor {
/**
* Utility class with indicators for the robots directives "noindex"
@@ -40,7 +40,7 @@ class HTMLMetaProcessor {
* values, based on any META tags found under the given
* <code>node</code>.
*/
- static final void getMetaTags (
+ public static final void getMetaTags (
HTMLMetaTags metaTags, Node node, URL currURL) {
metaTags.reset();
Modified: nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=979893&r1=979892&r2=979893&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Tue Jul 27 22:26:07 2010
@@ -53,6 +53,8 @@ import org.apache.tika.metadata.Metadata
import org.apache.tika.mime.MimeType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.HtmlMapper;
+import org.apache.tika.parser.html.IdentityHtmlMapper;
import org.w3c.dom.DocumentFragment;
/**
@@ -110,6 +112,8 @@ public class TikaParser implements org.a
DocumentFragment root = doc.createDocumentFragment();
DOMBuilder domhandler = new DOMBuilder(doc, root);
ParseContext context = new ParseContext();
+ // to add once available in Tika
+ // context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
try {
parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd, context);
} catch (Exception e) {
@@ -211,6 +215,10 @@ public class TikaParser implements org.a
Nutch.CACHING_FORBIDDEN_CONTENT);
}
+ public TikaConfig getTikaConfig(){
+ return this.tikaConfig;
+ }
+
public Configuration getConf() {
return this.conf;
}
Added: nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java?rev=979893&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java Tue Jul 27 22:26:07 2010
@@ -0,0 +1,416 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.tika;
+
+import junit.framework.TestCase;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.tika.DOMBuilder;
+import org.apache.nutch.parse.tika.DOMContentUtils;
+import org.apache.nutch.parse.tika.TikaParser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.html.dom.HTMLDocumentImpl;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.HtmlMapper;
+import org.apache.tika.parser.html.IdentityHtmlMapper;
+import org.apache.xml.serialize.DOMSerializerImpl;
+import org.mortbay.log.Log;
+import org.w3c.dom.DocumentFragment;
+
+import java.io.ByteArrayInputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.StringTokenizer;
+
+/**
+ * Unit tests for DOMContentUtils.
+ */
+public class TestDOMContentUtils extends TestCase {
+
+ private static final String[] testPages = {
+ // 0.
+ new String(
+ "<html><head><title> title </title><script> script </script>"
+ + "</head><body> body <a href=\"http://www.nutch.org\">"
+ + " anchor </a><!--comment-->" + "</body></html>"),
+ // 1.
+ new String(
+ "<html><head><title> title </title><script> script </script>"
+ + "</head><body> body <a href=\"/\">"
+ + " home </a><!--comment-->"
+ + "<style> style </style>"
+ + " <a href=\"bot.html\">" + " bots </a>"
+ + "</body></html>"),
+ // 2.
+ new String("<html><head><title> </title>" + "</head><body> "
+ + "<a href=\"/\"> separate this "
+ + "<a href=\"ok\"> from this" + "</a></a>"
+ + "</body></html>"),
+ // 3.
+ // this one relies on certain neko fixup behavior, possibly
+ // distributing the anchors into the LI's-but not the other
+ // anchors (outside of them, instead)! So you get a tree that
+ // looks like:
+ // ... <li> <a href=/> home </a> </li>
+ // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
+ // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
+ new String("<html><head><title> my title </title>"
+ + "</head><body> body " + "<ul>"
+ + "<li> <a href=\"/\"> home" + "<li> <a href=\"1\"> 1"
+ + "<li> <a href=\"2\"> 2" + "</ul>" + "</body></html>"),
+ // 4.
+ // test frameset link extraction. The invalid frame in the middle
+ // will be
+ // fixed to a third standalone frame.
+ new String("<html><head><title> my title </title>"
+ + "</head><frameset rows=\"20,*\"> "
+ + "<frame src=\"top.html\">" + "</frame>"
+ + "<frameset cols=\"20,*\">" + "<frame src=\"left.html\">"
+ + "</frame>" + "<frame src=\"invalid.html\"/>" + "</frame>"
+ + "<frame src=\"right.html\">" + "</frame>" + "</frameset>"
+ + "</frameset>" + "</body></html>"),
+ // 5.
+ // test <area> and <iframe> link extraction + url normalization
+ new String(
+ "<html><head><title> my title </title>"
+ + "</head><body>"
+ + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
+ + "<map name=\"green\">"
+ + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
+ + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
+ + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
+ + "</map>"
+ + "<a name=\"bottom\"/><h1> the bottom </h1> "
+ + "<iframe src=\"../docs/index.html\"/>"
+ + "</body></html>"),
+ // 6.
+ // test whitespace processing for plain text extraction
+ new String(
+ "<html><head>\n <title> my\t\n title\r\n </title>\n"
+ + " </head>\n"
+ + " <body>\n"
+ + " <h1> Whitespace\ttest </h1> \n"
+ + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n"
+ + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
+ + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
+ + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n"
+ + "<table>"
+ + " <tr><td>one</td><td>two</td><td>three</td></tr>\n"
+ + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
+ + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
+ + "</table>put some text here<Br>and there."
+ + "<h2>End\tthis\rmadness\n!</h2>\r\n"
+ + " . . . ."
+ + "</body> </html>"),
+ // 7.
+ // test that <a rel=nofollow> links are not returned
+ new String(
+ "<html><head></head><body>"
+ + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+ + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+ + "</body></html>"),
+ // 8.
+ // test that POST form actions are skipped
+ new String(
+ "<html><head></head><body>"
+ + "<form method='POST' action='/search.jsp'><input type=text>"
+ + "<input type=submit><p>test1</p></form>"
+ + "<form method='GET' action='/dummy.jsp'><input type=text>"
+ + "<input type=submit><p>test2</p></form></body></html>"),
+ // 9.
+ // test that all form actions are skipped
+ new String(
+ "<html><head></head><body>"
+ + "<form method='POST' action='/search.jsp'><input type=text>"
+ + "<input type=submit><p>test1</p></form>"
+ + "<form method='GET' action='/dummy.jsp'><input type=text>"
+ + "<input type=submit><p>test2</p></form></body></html>"),
+ // 10.
+ new String("<html><head><title> title </title>" + "</head><body>"
+ + "<a href=\";x\">anchor1</a>"
+ + "<a href=\"g;x\">anchor2</a>"
+ + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"),
+ // 11.
+ new String("<html><head><title> title </title>" + "</head><body>"
+ + "<a href=\"g\">anchor1</a>"
+ + "<a href=\"g?y#s\">anchor2</a>"
+ + "<a href=\"?y=1\">anchor3</a>"
+ + "<a href=\"?y=1#s\">anchor4</a>"
+ + "<a href=\"?y=1;somethingelse\">anchor5</a>"
+ + "</body></html>"), };
+
+ private static int SKIP = 9;
+
+ private static String[] testBaseHrefs = { "http://www.nutch.org",
+ "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
+ "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
+ "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
+ "http://www.nutch.org//", "http://www.nutch.org/",
+ "http://www.nutch.org/", "http://www.nutch.org/",
+ "http://www.nutch.org/;something" };
+
+ private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];
+
+ private static URL[] testBaseHrefURLs = new URL[testPages.length];
+
+ private static final String[] answerText = {
+ "body anchor",
+ "body home bots",
+ "separate this from this",
+ "body home 1 2",
+ "",
+ "the bottom",
+ "Whitespace test whitespace test "
+ + "This is a whitespace test . Newlines should appear as space too. "
+ + "Tabs are spaces too. This is a break -> and the line after break . "
+ + "one two three space here space there no space "
+ + "one two two three three four put some text here and there. "
+ + "End this madness ! . . . .", "ignore ignore",
+ "test1 test2", "test1 test2", "anchor1 anchor2 anchor3",
+ "anchor1 anchor2 anchor3 anchor4 anchor5" };
+
+ private static final String[] answerTitle = { "title", "title", "",
+ "my title", "my title", "my title", "my title", "", "", "",
+ "title", "title" };
+
+ // note: should be in page-order
+ private static Outlink[][] answerOutlinks;
+
+ private static Configuration conf;
+ private static DOMContentUtils utils = null;
+
+ public TestDOMContentUtils(String name) {
+ super(name);
+ }
+
+ private static void setup() throws Exception {
+ conf = NutchConfiguration.create();
+ conf.setBoolean("parser.html.form.use_action", true);
+ utils = new DOMContentUtils(conf);
+ TikaParser tikaParser = new TikaParser();
+ tikaParser.setConf(conf);
+ Parser parser = tikaParser.getTikaConfig().getParser("text/html");
+ for (int i = 0; i < testPages.length; i++) {
+ Metadata tikamd = new Metadata();
+
+ HTMLDocumentImpl doc = new HTMLDocumentImpl();
+ doc.setErrorChecking(false);
+ DocumentFragment root = doc.createDocumentFragment();
+ DOMBuilder domhandler = new DOMBuilder(doc, root);
+ ParseContext context = new ParseContext();
+ // to add once available in Tika
+ //context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
+ try {
+ parser.parse(new ByteArrayInputStream(testPages[i].getBytes()),
+ domhandler, tikamd, context);
+ testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail("caught exception: " + e);
+ }
+ testDOMs[i] = root;
+ DOMSerializerImpl ds = new DOMSerializerImpl();
+ System.out.println("input " + i + ": '" + testPages[i] + "'");
+ System.out.println("output " + i + ": '" + ds.writeToString(root)
+ + "'");
+
+ }
+ answerOutlinks = new Outlink[][] {
+ // 0
+ { new Outlink("http://www.nutch.org", "anchor"), },
+ // 1
+ {
+ new Outlink("http://www.nutch.org/", "home"),
+ new Outlink("http://www.nutch.org/docs/bot.html",
+ "bots"), },
+ // 2
+ {
+ new Outlink("http://www.nutch.org/", "separate this"),
+ new Outlink("http://www.nutch.org/docs/ok", "from this"), },
+
+ // 3
+ { new Outlink("http://www.nutch.org/", "home"),
+ new Outlink("http://www.nutch.org/docs/1", "1"),
+ new Outlink("http://www.nutch.org/docs/2", "2"), },
+ // 4
+ {
+ new Outlink("http://www.nutch.org/frames/top.html", ""),
+ new Outlink("http://www.nutch.org/frames/left.html", ""),
+ new Outlink("http://www.nutch.org/frames/invalid.html",""),
+ new Outlink("http://www.nutch.org/frames/right.html",""),
+ },
+ // 5
+ {
+ new Outlink("http://www.nutch.org/maps/logo.gif", ""),
+ new Outlink("http://www.nutch.org/index.html", ""),
+ new Outlink("http://www.nutch.org/maps/#bottom", ""),
+ new Outlink("http://www.nutch.org/bot.html", ""),
+ new Outlink("http://www.nutch.org/docs/index.html", "")
+ },
+ // 6
+ { new Outlink("http://www.nutch.org/index.html",
+ "whitespace test"),
+ },
+ // 7
+ {},
+ // 8
+ { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
+ // 9
+ {},
+ // 10
+ {
+ new Outlink("http://www.nutch.org/;x", "anchor1"),
+ new Outlink("http://www.nutch.org/g;x", "anchor2"),
+ new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
+ },
+ // 11
+ {
+ new Outlink("http://www.nutch.org/g;something","anchor1"),
+ new Outlink("http://www.nutch.org/g;something?y#s", "anchor2"),
+ new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
+ new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
+ new Outlink("http://www.nutch.org/?y=1;somethingelse", "anchor5") }
+ };
+
+ }
+
+ private static boolean equalsIgnoreWhitespace(String s1, String s2) {
+ StringTokenizer st1 = new StringTokenizer(s1);
+ StringTokenizer st2 = new StringTokenizer(s2);
+
+ while (st1.hasMoreTokens()) {
+ if (!st2.hasMoreTokens()) {
+ Log.info("st1+ '" + st1.nextToken() + "'");
+ return false;
+ }
+ String st1Token = st1.nextToken();
+ String st2Token = st2.nextToken();
+ if (!st1Token.equals(st2Token)) {
+ Log.info("st1:'" + st1Token + "' != st2:'" + st2Token + "'");
+ return false;
+ }
+ }
+ if (st2.hasMoreTokens()) {
+ System.err.println("st2+ '" + st2.nextToken() + "'");
+ return false;
+ }
+ return true;
+ }
+
+ public void testGetText() throws Exception {
+ if (testDOMs[0] == null)
+ setup();
+ for (int i = 0; i < testPages.length; i++) {
+ StringBuffer sb = new StringBuffer();
+ utils.getText(sb, testDOMs[i]);
+ String text = sb.toString();
+ assertTrue(
+ "example " + i + " : expecting text: " + answerText[i]
+ + System.getProperty("line.separator")
+ + System.getProperty("line.separator")
+ + "got text: " + text,
+ equalsIgnoreWhitespace(answerText[i], text));
+ }
+ }
+
+ // won't work with Tika - the title is stored in the metadata but
+ // not put in the XHTML representation
+ public void testGetTitle() throws Exception {
+ if (testDOMs[0] == null)
+ setup();
+ for (int i = 0; i < testPages.length; i++) {
+ StringBuffer sb = new StringBuffer();
+ utils.getTitle(sb, testDOMs[i]);
+ String title = sb.toString();
+ assertTrue(
+ "example " + i + " : expecting title: " + answerTitle[i]
+ + System.getProperty("line.separator")
+ + System.getProperty("line.separator")
+ + "got title: " + title,
+ equalsIgnoreWhitespace(answerTitle[i], title));
+ }
+ }
+
+ public void testGetOutlinks() throws Exception {
+ if (testDOMs[0] == null)
+ setup();
+ for (int i = 0; i < testPages.length; i++) {
+ ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
+ if (i == SKIP) {
+ conf.setBoolean("parser.html.form.use_action", false);
+ utils.setConf(conf);
+ } else {
+ conf.setBoolean("parser.html.form.use_action", true);
+ utils.setConf(conf);
+ }
+ utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
+ Outlink[] outlinkArr = new Outlink[outlinks.size()];
+ outlinkArr = outlinks.toArray(outlinkArr);
+ compareOutlinks(i, answerOutlinks[i], outlinkArr);
+ }
+ }
+
+ private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
+ for (int i = 0; i < o.length; i++) {
+ sb.append(o[i].toString());
+ sb.append(System.getProperty("line.separator"));
+ }
+ }
+
+ private static final String outlinksString(Outlink[] o) {
+ StringBuffer sb = new StringBuffer();
+ appendOutlinks(sb, o);
+ return sb.toString();
+ }
+
+ private static final void compareOutlinks(int test, Outlink[] o1,
+ Outlink[] o2) {
+ if (o1.length != o2.length) {
+ assertTrue(
+ "test " + test
+ + ", got wrong number of outlinks (expecting "
+ + o1.length + ", got " + o2.length + ")"
+ + System.getProperty("line.separator") + "answer: "
+ + System.getProperty("line.separator")
+ + outlinksString(o1)
+ + System.getProperty("line.separator") + "got: "
+ + System.getProperty("line.separator")
+ + outlinksString(o2)
+ + System.getProperty("line.separator"), false);
+ }
+
+ for (int i = 0; i < o1.length; i++) {
+ if (!o1[i].equals(o2[i])) {
+ assertTrue(
+ "test " + test + ", got wrong outlinks at position "
+ + i + System.getProperty("line.separator")
+ + "answer: "
+ + System.getProperty("line.separator")
+ + o1[i].toString()
+ + System.getProperty("line.separator")
+ + "got: "
+ + System.getProperty("line.separator")
+ + o2[i].toString(), false);
+
+ }
+ }
+ }
+}
Added: nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java?rev=979893&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java Tue Jul 27 22:26:07 2010
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.tika;
+
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import junit.framework.TestCase;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.tika.mime.MimeType;
+
+/**
+ * Unit tests for MSWordParser.
+ *
+ * @author John Xing
+ */
+public class TestMSWordParser extends TestCase {
+
+ private String fileSeparator = System.getProperty("file.separator");
+ // This system property is defined in ./src/plugin/build-plugin.xml
+ private String sampleDir = System.getProperty("test.data", ".");
+ // Make sure sample files are copied to "test.data" as specified in
+ // ./src/plugin/parse-msword/build.xml during plugin compilation.
+ // Check ./src/plugin/parse-msword/sample/README.txt for what they are.
+ private String[] sampleFiles = { "word97.doc" };
+
+ private String expectedText = "This is a sample doc file prepared for nutch.";
+
+ private Configuration conf;
+
+ public TestMSWordParser(String name) {
+ super(name);
+ }
+
+ protected void setUp() {
+ conf = NutchConfiguration.create();
+ conf.set("file.content.limit", "-1");
+ }
+
+ protected void tearDown() {
+ }
+
+ public String getTextContent(String fileName) throws ProtocolException,
+ ParseException, IOException {
+ String urlString = sampleDir + fileSeparator + fileName;
+
+ File file = new File(urlString);
+ byte[] bytes = new byte[(int) file.length()];
+ DataInputStream in = new DataInputStream(new FileInputStream(file));
+ in.readFully(bytes);
+ in.close();
+ Parse parse;
+ WebPage page = new WebPage();
+ page.setBaseUrl(new Utf8("file:"+urlString));
+ page.setContent(ByteBuffer.wrap(bytes));
+ // set the content type?
+ MimeUtil mimeutil = new MimeUtil(conf);
+ MimeType mtype = mimeutil.getMimeType(file);
+ page.setContentType(new Utf8(mtype.getName()));
+
+ parse = new ParseUtil(conf).parse("file:"+urlString, page);
+ return parse.getText();
+ }
+
+ public void testIt() throws ProtocolException, ParseException, IOException {
+ for (int i = 0; i < sampleFiles.length; i++) {
+ String found = getTextContent(sampleFiles[i]);
+ assertTrue("text found : '" + found + "'", found
+ .startsWith(expectedText));
+ }
+ }
+
+ public void testOpeningDocs() throws ProtocolException, ParseException, IOException {
+ String[] filenames = new File(sampleDir).list();
+ for (int i = 0; i < filenames.length; i++) {
+ if (filenames[i].endsWith(".doc") == false)
+ continue;
+ assertTrue("cann't read content of " + filenames[i],
+ getTextContent(filenames[i]).length() > 0);
+ }
+ }
+}
Added: nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java?rev=979893&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java Tue Jul 27 22:26:07 2010
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.tika;
+
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.ByteBuffer;
+
+import junit.framework.TestCase;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.tika.mime.MimeType;
+
+/**
+ * Unit tests for OOParser.
+ *
+ * @author Andrzej Bialecki
+ */
+public class TestOOParser extends TestCase {
+
+ private String fileSeparator = System.getProperty("file.separator");
+ // This system property is defined in ./src/plugin/build-plugin.xml
+ private String sampleDir = System.getProperty("test.data", ".");
+ // Make sure sample files are copied to "test.data" as specified in
+ // ./src/plugin/parse-oo/build.xml during plugin compilation.
+ private String[] sampleFiles = { "ootest.odt", "ootest.sxw" };
+
+ private String sampleText = "ootest.txt";
+
+ private String expectedText;
+
+ public TestOOParser(String name) {
+ super(name);
+ try {
+ // read the test string
+ FileInputStream fis = new FileInputStream(sampleDir + fileSeparator
+ + sampleText);
+ StringBuffer sb = new StringBuffer();
+ int len = 0;
+ InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
+ char[] buf = new char[1024];
+ while ((len = isr.read(buf)) > 0) {
+ sb.append(buf, 0, len);
+ }
+ isr.close();
+ expectedText = sb.toString();
+ // normalize space
+ expectedText = expectedText.replaceAll("[ \t\r\n]+", " ");
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+
+ protected void setUp() {
+ }
+
+ protected void tearDown() {
+ }
+
+ public void testIt() throws ProtocolException, ParseException, IOException {
+ String urlString;
+ Parse parse;
+ Configuration conf = NutchConfiguration.create();
+ MimeUtil mimeutil = new MimeUtil(conf);
+
+ System.out.println("Expected : " + expectedText);
+
+ for (int i = 0; i < sampleFiles.length; i++) {
+ urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+ if (sampleFiles[i].startsWith("ootest") == false)
+ continue;
+
+ File file = new File(sampleDir + fileSeparator + sampleFiles[i]);
+ byte[] bytes = new byte[(int) file.length()];
+ DataInputStream in = new DataInputStream(new FileInputStream(file));
+ in.readFully(bytes);
+ in.close();
+
+ WebPage page = new WebPage();
+ page.setBaseUrl(new Utf8(urlString));
+ page.setContent(ByteBuffer.wrap(bytes));
+ MimeType mtype = mimeutil.getMimeType(file);
+ page.setContentType(new Utf8(mtype.getName()));
+
+ parse = new ParseUtil(conf).parse(urlString, page);
+
+ String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
+
+ // simply test for the presence of a text - the ordering of the
+ // elements
+ // may differ from what was expected
+ // in the previous tests
+ assertTrue(text != null && text.length() > 0);
+
+ System.out.println("Found " + sampleFiles[i] + ": " + text);
+ }
+ }
+
+}
Added: nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java?rev=979893&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java Tue Jul 27 22:26:07 2010
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.tika;
+
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import junit.framework.TestCase;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.tika.mime.MimeType;
+
+/**
+ * Unit tests for PDF parsing: each sample PDF is handed to
+ * {@link ParseUtil} and the extracted text must contain a known sentence.
+ *
+ * @author John Xing
+ */
+public class TestPdfParser extends TestCase {
+
+  private final String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private final String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-pdf/build.xml during plugin compilation.
+  // Check ./src/plugin/parse-pdf/sample/README.txt for what they are.
+  // NOTE(review): the parse-pdf paths above may be stale now that this test
+  // lives in the parse-tika plugin - confirm.
+  private final String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" };
+
+  // Sentence that must appear somewhere in the text of every sample file.
+  private final String expectedText = "A VERY SMALL PDF FILE";
+
+  public TestPdfParser(String name) {
+    super(name);
+  }
+
+  /** Empty: no fixture state is initialised. */
+  protected void setUp() {
+  }
+
+  /** Empty: nothing is released or reset. */
+  protected void tearDown() {
+  }
+
+  /**
+   * Parses every sample file and asserts that the extracted text contains
+   * {@code expectedText}.
+   *
+   * @throws ProtocolException declared for compatibility with callers
+   * @throws ParseException declared for compatibility with callers
+   * @throws IOException if a sample file cannot be read
+   */
+  public void testIt() throws ProtocolException, ParseException, IOException {
+    Configuration conf = NutchConfiguration.create();
+    MimeUtil mimeutil = new MimeUtil(conf);
+
+    for (int i = 0; i < sampleFiles.length; i++) {
+      String path = sampleDir + fileSeparator + sampleFiles[i];
+      String urlString = "file:" + path;
+      File file = new File(path);
+
+      WebPage page = new WebPage();
+      page.setBaseUrl(new Utf8(urlString));
+      page.setContent(ByteBuffer.wrap(readFully(file)));
+      MimeType mtype = mimeutil.getMimeType(file);
+      page.setContentType(new Utf8(mtype.getName()));
+
+      Parse parse = new ParseUtil(conf).parse(urlString, page);
+      assertNotNull("no parse produced for " + sampleFiles[i], parse);
+
+      String text = parse.getText();
+      assertNotNull("no text extracted from " + sampleFiles[i], text);
+
+      // indexOf returns 0 when the text STARTS with the expected sentence,
+      // which is a valid match; only -1 means "not found"
+      assertTrue("expected text not found in " + sampleFiles[i],
+          text.indexOf(expectedText) >= 0);
+    }
+  }
+
+  /**
+   * Reads the whole content of a file into memory, closing the stream even
+   * when the read fails.
+   */
+  private static byte[] readFully(File file) throws IOException {
+    byte[] bytes = new byte[(int) file.length()];
+    DataInputStream in = new DataInputStream(new FileInputStream(file));
+    try {
+      in.readFully(bytes);
+    } finally {
+      in.close();
+    }
+    return bytes;
+  }
+
+}
Added: nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java?rev=979893&view=auto
==============================================================================
--- nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java (added)
+++ nutch/branches/nutchbase/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java Tue Jul 27 22:26:07 2010
@@ -0,0 +1,100 @@
+package org.apache.nutch.parse.tika;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// JDK, JUnit and project imports
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import junit.framework.TestCase;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.tika.mime.MimeType;
+
+/**
+ * Unit tests for RTF parsing. (Adapted from John Xing msword unit tests).
+ *
+ * @author Andy Hedges
+ */
+public class TestRTFParser extends TestCase {
+
+  private final String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private final String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-rtf/build.xml during plugin compilation.
+  // Check ./src/plugin/parse-rtf/sample/README.txt for what they are.
+  // NOTE(review): the parse-rtf paths above may be stale now that this test
+  // lives in the parse-tika plugin - confirm.
+  private final String rtfFile = "test.rtf";
+
+  public TestRTFParser(String name) {
+    super(name);
+  }
+
+  /** Empty: no fixture state is initialised. */
+  protected void setUp() {
+  }
+
+  /** Empty: nothing is released or reset. */
+  protected void tearDown() {
+  }
+
+  /**
+   * Parses the sample RTF file and asserts that the trimmed extracted text
+   * equals the sentence stored in the document.
+   *
+   * @throws ProtocolException declared for compatibility with callers
+   * @throws ParseException declared for compatibility with callers
+   * @throws IOException if the sample file cannot be read
+   */
+  public void testIt() throws ProtocolException, ParseException, IOException {
+    Configuration conf = NutchConfiguration.create();
+    MimeUtil mimeutil = new MimeUtil(conf);
+
+    String path = sampleDir + fileSeparator + rtfFile;
+    String urlString = "file:" + path;
+    File file = new File(path);
+
+    WebPage page = new WebPage();
+    page.setBaseUrl(new Utf8(urlString));
+    page.setContent(ByteBuffer.wrap(readFully(file)));
+    MimeType mtype = mimeutil.getMimeType(file);
+    page.setContentType(new Utf8(mtype.getName()));
+
+    Parse parse = new ParseUtil(conf).parse(urlString, page);
+    assertNotNull("no parse produced for " + rtfFile, parse);
+
+    String text = parse.getText();
+    assertNotNull("no text extracted from " + rtfFile, text);
+    assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
+
+    // TODO: METADATA extraction is not yet supported in Tika, and it is not
+    // clear how to get at the parse metadata from here. Once it is, assert:
+    //   assertEquals("test rft document", parse.getTitle());
+    //   assertEquals("tests", meta.get(DublinCore.SUBJECT));
+  }
+
+  /**
+   * Reads the whole content of a file into memory, closing the stream even
+   * when the read fails.
+   */
+  private static byte[] readFully(File file) throws IOException {
+    byte[] bytes = new byte[(int) file.length()];
+    DataInputStream in = new DataInputStream(new FileInputStream(file));
+    try {
+      in.readFully(bytes);
+    } finally {
+      in.close();
+    }
+    return bytes;
+  }
+
+}