You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:49:39 UTC
[55/69] [abbrv] nutch git commit: Moved test sources to maven
standard directory
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java
new file mode 100644
index 0000000..96029a6
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestDOMContentUtils.java
@@ -0,0 +1,337 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.tika.DOMContentUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+import java.io.ByteArrayInputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.StringTokenizer;
+
+import org.xml.sax.*;
+import org.w3c.dom.*;
+import org.apache.html.dom.*;
+import org.cyberneko.html.parsers.DOMFragmentParser;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Unit tests for DOMContentUtils.
+ */
+public class TestDOMContentUtils {
+
+ private static final String[] testPages = {
+
+ new String("<html><head><title> title </title><script> script </script>"
+ + "</head><body> body <a href=\"http://www.nutch.org\">"
+ + " anchor </a><!--comment-->" + "</body></html>"),
+
+ new String("<html><head><title> title </title><script> script </script>"
+ + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->"
+ + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>"
+ + "</body></html>"),
+
+ new String("<html><head><title> </title>" + "</head><body> "
+ + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
+ + "</a></a>" + "</body></html>"),
+
+ // this one relies on certain neko fixup behavior, possibly
+ // distributing the anchors into the LI's-but not the other
+ // anchors (outside of them, instead)! So you get a tree that
+ // looks like:
+ // ... <li> <a href=/> home </a> </li>
+ // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
+ // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
+ new String("<html><head><title> my title </title>"
+ + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home"
+ + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>"
+ + "</body></html>"),
+
+ // test frameset link extraction. The invalid frame in the middle will be
+ // fixed to a third standalone frame.
+ new String("<html><head><title> my title </title>"
+ + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">"
+ + "</frame>" + "<frameset cols=\"20,*\">"
+ + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>"
+ + "</frame>" + "<frame src=\"right.html\">" + "</frame>"
+ + "</frameset>" + "</frameset>" + "</body></html>"),
+
+ // test <area> and <iframe> link extraction + url normalization
+ new String(
+ "<html><head><title> my title </title>"
+ + "</head><body>"
+ + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
+ + "<map name=\"green\">"
+ + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
+ + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
+ + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
+ + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> "
+ + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"),
+
+ // test whitespace processing for plain text extraction
+ new String(
+ "<html><head>\n <title> my\t\n title\r\n </title>\n"
+ + " </head>\n"
+ + " <body>\n"
+ + " <h1> Whitespace\ttest </h1> \n"
+ + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n"
+ + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
+ + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
+ + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n"
+ + "<table>"
+ + " <tr><td>one</td><td>two</td><td>three</td></tr>\n"
+ + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
+ + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
+ + "</table>put some text here<Br>and there."
+ + "<h2>End\tthis\rmadness\n!</h2>\r\n"
+ + " . . . ." + "</body> </html>"),
+
+ // test that <a rel=nofollow> links are not returned
+ new String("<html><head></head><body>"
+ + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
+ + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
+ + "</body></html>"),
+ // test that POST form actions are skipped
+ new String("<html><head></head><body>"
+ + "<form method='POST' action='/search.jsp'><input type=text>"
+ + "<input type=submit><p>test1</p></form>"
+ + "<form method='GET' action='/dummy.jsp'><input type=text>"
+ + "<input type=submit><p>test2</p></form></body></html>"),
+ // test that all form actions are skipped
+ new String("<html><head></head><body>"
+ + "<form method='POST' action='/search.jsp'><input type=text>"
+ + "<input type=submit><p>test1</p></form>"
+ + "<form method='GET' action='/dummy.jsp'><input type=text>"
+ + "<input type=submit><p>test2</p></form></body></html>"),
+ new String("<html><head><title> title </title>" + "</head><body>"
+ + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>"
+ + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"),
+ new String("<html><head><title> title </title>" + "</head><body>"
+ + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
+ + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
+ + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), };
+
+ private static int SKIP = 9;
+
+ private static String[] testBaseHrefs = { "http://www.nutch.org",
+ "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
+ "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
+ "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
+ "http://www.nutch.org//", "http://www.nutch.org/",
+ "http://www.nutch.org/", "http://www.nutch.org/",
+ "http://www.nutch.org/;something" };
+
+ private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];
+
+ private static URL[] testBaseHrefURLs = new URL[testPages.length];
+
+ private static final String[] answerText = {
+ "title body anchor",
+ "title body home bots",
+ "separate this from this",
+ "my title body home 1 2",
+ "my title",
+ "my title the bottom",
+ "my title Whitespace test whitespace test "
+ + "This is a whitespace test . Newlines should appear as space too. "
+ + "Tabs are spaces too. This is a break -> and the line after break . "
+ + "one two three space here space there no space "
+ + "one two two three three four put some text here and there. "
+ + "End this madness ! . . . .", "ignore ignore", "test1 test2",
+ "test1 test2", "title anchor1 anchor2 anchor3",
+ "title anchor1 anchor2 anchor3 anchor4 anchor5" };
+
+ private static final String[] answerTitle = { "title", "title", "",
+ "my title", "my title", "my title", "my title", "", "", "", "title",
+ "title" };
+
+ // note: should be in page-order
+ private static Outlink[][] answerOutlinks;
+
+ private static Configuration conf;
+ private static DOMContentUtils utils = null;
+
+ /**
+  * Builds the shared fixtures before each test: a {@link DOMContentUtils}
+  * configured with "parser.html.form.use_action" enabled, one parsed DOM
+  * fragment and one base URL per entry of {@code testPages}, and the expected
+  * outlinks for every page (in page order).
+  *
+  * @throws Exception
+  *           if the configuration, the parser feature or the expected
+  *           outlinks cannot be set up
+  */
+ @Before
+ public void setup() throws Exception {
+   conf = NutchConfiguration.create();
+   conf.setBoolean("parser.html.form.use_action", true);
+   utils = new DOMContentUtils(conf);
+   DOMFragmentParser parser = new DOMFragmentParser();
+   parser.setFeature(
+       "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
+       true);
+   for (int i = 0; i < testPages.length; i++) {
+     DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+     try {
+       // Encode explicitly so the bytes handed to the parser do not depend on
+       // the platform default charset.
+       parser.parse(new InputSource(new ByteArrayInputStream(
+           testPages[i].getBytes("UTF-8"))), node);
+       testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
+     } catch (Exception e) {
+       Assert.fail("caught exception: " + e);
+     }
+     testDOMs[i] = node;
+   }
+   answerOutlinks = new Outlink[][] {
+       { new Outlink("http://www.nutch.org", "anchor"), },
+       { new Outlink("http://www.nutch.org/", "home"),
+           new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
+       { new Outlink("http://www.nutch.org/", "separate this"),
+           new Outlink("http://www.nutch.org/docs/ok", "from this"), },
+       { new Outlink("http://www.nutch.org/", "home"),
+           new Outlink("http://www.nutch.org/docs/1", "1"),
+           new Outlink("http://www.nutch.org/docs/2", "2"), },
+       { new Outlink("http://www.nutch.org/frames/top.html", ""),
+           new Outlink("http://www.nutch.org/frames/left.html", ""),
+           new Outlink("http://www.nutch.org/frames/invalid.html", ""),
+           new Outlink("http://www.nutch.org/frames/right.html", ""), },
+       { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
+           new Outlink("http://www.nutch.org/index.html", ""),
+           new Outlink("http://www.nutch.org/maps/#bottom", ""),
+           new Outlink("http://www.nutch.org/bot.html", ""),
+           new Outlink("http://www.nutch.org/docs/index.html", ""), },
+       { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
+       {},
+       { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
+       {},
+       { new Outlink("http://www.nutch.org/;x", "anchor1"),
+           new Outlink("http://www.nutch.org/g;x", "anchor2"),
+           new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
+       {
+           // this is tricky - see RFC3986 section 5.4.1 example 7
+           new Outlink("http://www.nutch.org/g", "anchor1"),
+           new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
+           new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
+           new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
+           new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
+               "anchor5") } };
+
+ }
+
+ /**
+  * Compares two strings token by token, where tokens are separated by the
+  * default {@link StringTokenizer} delimiters (whitespace).
+  *
+  * @param s1
+  *          first string
+  * @param s2
+  *          second string
+  * @return true if both strings contain the same tokens in the same order
+  */
+ private static boolean equalsIgnoreWhitespace(String s1, String s2) {
+   StringTokenizer left = new StringTokenizer(s1);
+   StringTokenizer right = new StringTokenizer(s2);
+
+   // Walk both token streams in lock-step until one of them runs dry.
+   while (left.hasMoreTokens() && right.hasMoreTokens()) {
+     if (!left.nextToken().equals(right.nextToken())) {
+       return false;
+     }
+   }
+   // Equal only if neither side has tokens left over.
+   return !left.hasMoreTokens() && !right.hasMoreTokens();
+ }
+
+ /**
+  * Extracts the text of every test DOM and checks it against the matching
+  * entry of {@code answerText}, ignoring whitespace differences.
+  */
+ @Test
+ public void testGetText() throws Exception {
+   if (testDOMs[0] == null) {
+     setup();
+   }
+   for (int page = 0; page < testPages.length; page++) {
+     StringBuffer extracted = new StringBuffer();
+     utils.getText(extracted, testDOMs[page]);
+     String text = extracted.toString();
+     String nl = System.getProperty("line.separator");
+     String message = "expecting text: " + answerText[page] + nl + nl
+         + "got text: " + text;
+     Assert.assertTrue(message,
+         equalsIgnoreWhitespace(answerText[page], text));
+   }
+ }
+
+ /**
+  * Extracts the title of every test DOM and checks it against the matching
+  * entry of {@code answerTitle}, ignoring whitespace differences.
+  */
+ @Test
+ public void testGetTitle() throws Exception {
+   if (testDOMs[0] == null)
+     setup();
+   for (int i = 0; i < testPages.length; i++) {
+     StringBuffer sb = new StringBuffer();
+     utils.getTitle(sb, testDOMs[i]);
+     String text = sb.toString();
+     // Report the expected title (answerTitle), which is the value actually
+     // compared below, rather than the expected page text.
+     Assert.assertTrue(
+         "expecting title: " + answerTitle[i]
+             + System.getProperty("line.separator")
+             + System.getProperty("line.separator") + "got title: " + text,
+         equalsIgnoreWhitespace(answerTitle[i], text));
+   }
+ }
+
+ /**
+  * Collects the outlinks of every test DOM and compares them with the
+  * matching entry of {@code answerOutlinks}.
+  */
+ @Test
+ public void testGetOutlinks() throws Exception {
+   if (testDOMs[0] == null) {
+     setup();
+   }
+   for (int page = 0; page < testPages.length; page++) {
+     ArrayList<Outlink> found = new ArrayList<Outlink>();
+     // "parser.html.form.use_action" is switched off only for the page at
+     // index SKIP and on for every other page.
+     conf.setBoolean("parser.html.form.use_action", page != SKIP);
+     utils.setConf(conf);
+     utils.getOutlinks(testBaseHrefURLs[page], found, testDOMs[page]);
+     Outlink[] actual = found.toArray(new Outlink[found.size()]);
+     compareOutlinks(answerOutlinks[page], actual);
+   }
+ }
+
+ /**
+  * Appends the string form of each outlink to the buffer, one per line.
+  *
+  * @param sb
+  *          buffer to append to
+  * @param o
+  *          outlinks to render
+  */
+ private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
+   for (Outlink link : o) {
+     sb.append(link.toString());
+     sb.append(System.getProperty("line.separator"));
+   }
+ }
+
+ /**
+  * Renders the given outlinks as a single string, one outlink per line.
+  *
+  * @param o
+  *          outlinks to render
+  * @return the concatenated string form of all outlinks
+  */
+ private static final String outlinksString(Outlink[] o) {
+   StringBuffer rendered = new StringBuffer();
+   appendOutlinks(rendered, o);
+   return rendered.toString();
+ }
+
+ private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
+ if (o1.length != o2.length) {
+ Assert.assertTrue(
+ "got wrong number of outlinks (expecting " + o1.length + ", got "
+ + o2.length + ")" + System.getProperty("line.separator")
+ + "answer: " + System.getProperty("line.separator")
+ + outlinksString(o1) + System.getProperty("line.separator")
+ + "got: " + System.getProperty("line.separator")
+ + outlinksString(o2) + System.getProperty("line.separator"),
+ false);
+ }
+
+ for (int i = 0; i < o1.length; i++) {
+ if (!o1[i].equals(o2[i])) {
+ Assert.assertTrue(
+ "got wrong outlinks at position " + i
+ + System.getProperty("line.separator") + "answer: "
+ + System.getProperty("line.separator") + "'" + o1[i].getToUrl()
+ + "', anchor: '" + o1[i].getAnchor() + "'"
+ + System.getProperty("line.separator") + "got: "
+ + System.getProperty("line.separator") + "'" + o2[i].getToUrl()
+ + "', anchor: '" + o2[i].getAnchor() + "'", false);
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java
new file mode 100644
index 0000000..c9394dc
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestFeedParser.java
@@ -0,0 +1,121 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.tika.TikaParser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ *
+ * @author mattmann / jnioche
+ *
+ * Test Suite for the RSS feeds with the {@link TikaParser}.
+ *
+ */
+public class TestFeedParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+
+  private String[] sampleFiles = { "rsstest.rss" };
+
+  public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class
+      .getName());
+
+  /**
+   * <p>
+   * The test method: tests out the following 2 asserts:
+   * </p>
+   *
+   * <ul>
+   * <li>There are 2 outlinks read from the sample rss file</li>
+   * <li>The 2 outlinks read are in fact the correct outlinks from the sample
+   * file</li>
+   * </ul>
+   *
+   * <p>
+   * Unlike the original parse-rss, tika ignores the URL and description of
+   * the channel, so the channel link is not expected among the outlinks.
+   * </p>
+   *
+   * @throws ProtocolException
+   *           declared for the protocol lookup
+   * @throws ParseException
+   *           declared for the parse step
+   */
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    String urlString;
+    Protocol protocol;
+    Content content;
+    Parse parse;
+
+    Configuration conf = NutchConfiguration.create();
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+          .get(content.getUrl());
+
+      // The sample file mentions three URLs:
+      //   http://test.channel.com (channel URL, ignored by tika)
+      //   http://www-scf.usc.edu/~mattmann/
+      //   http://www.nutch.org
+      // so exactly 2 outlinks are expected.
+
+      ParseData theParseData = parse.getData();
+
+      Outlink[] theOutlinks = theParseData.getOutlinks();
+
+      // assertEquals reports the actual count when the check fails
+      Assert.assertEquals("There aren't 2 outlinks read!", 2,
+          theOutlinks.length);
+
+      // now check to make sure that those are the two outlinks
+      boolean hasLink1 = false, hasLink2 = false;
+
+      for (int j = 0; j < theOutlinks.length; j++) {
+        if (theOutlinks[j].getToUrl().equals(
+            "http://www-scf.usc.edu/~mattmann/")) {
+          hasLink1 = true;
+        }
+
+        if (theOutlinks[j].getToUrl().equals("http://www.nutch.org/")) {
+          hasLink2 = true;
+        }
+      }
+
+      Assert.assertTrue("Outlinks read from sample rss file are not correct!",
+          hasLink1 && hasLink2);
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java
new file mode 100644
index 0000000..b1762e6
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestImageMetadata.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Test extraction of image metadata
+ */
+public class TestImageMetadata {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Sample files are looked up in the directory named by "test.data".
+  private String[] sampleFiles = { "nutch_logo_tm.gif", };
+
+  /**
+   * Fetches and parses every sample image with the "parse-tika" extension and
+   * checks the "width" and "height" values of the parse metadata.
+   */
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    for (String sampleFile : sampleFiles) {
+      String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
+
+      Configuration conf = NutchConfiguration.create();
+      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika",
+          content).get(content.getUrl());
+
+      Assert.assertEquals("121", parse.getData().getMeta("width"));
+      Assert.assertEquals("48", parse.getData().getMeta("height"));
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java
new file mode 100644
index 0000000..576b3df
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestMSWordParser.java
@@ -0,0 +1,92 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+
+/**
+ * Unit tests for MSWordParser.
+ *
+ * @author John Xing
+ */
+public class TestMSWordParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-msword/build.xml during plugin compilation.
+  // Check ./src/plugin/parse-msword/sample/README.txt for what they are.
+  private String[] sampleFiles = { "word97.doc" };
+
+  private String expectedText = "This is a sample doc file prepared for nutch.";
+
+  private Configuration conf;
+
+  /** Creates a fresh configuration with "file.content.limit" set to -1. */
+  @Before
+  public void setUp() {
+    conf = NutchConfiguration.create();
+    conf.set("file.content.limit", "-1");
+  }
+
+  /**
+   * Fetches the named file from the sample directory and parses it with the
+   * "parse-tika" extension.
+   *
+   * @param fileName
+   *          name of a file inside the sample directory
+   * @return the text of the resulting parse
+   */
+  public String getTextContent(String fileName) throws ProtocolException,
+      ParseException {
+    String urlString = "file:" + sampleDir + fileSeparator + fileName;
+    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+    Content content = protocol.getProtocolOutput(new Text(urlString),
+        new CrawlDatum()).getContent();
+    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+        .get(content.getUrl());
+    return parse.getText();
+  }
+
+  /** Checks that every sample file's text starts with the expected text. */
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    for (int i = 0; i < sampleFiles.length; i++) {
+      String found = getTextContent(sampleFiles[i]);
+      Assert.assertTrue("text found : '" + found + "'",
+          found.startsWith(expectedText));
+    }
+  }
+
+  /**
+   * Checks that every ".doc" file in the sample directory yields non-empty
+   * text.
+   */
+  @Test
+  public void testOpeningDocs() throws ProtocolException, ParseException {
+    String[] filenames = new File(sampleDir).list();
+    // File.list() returns null when the path is not a readable directory;
+    // fail with a clear message instead of a NullPointerException.
+    Assert.assertNotNull("cannot list sample directory " + sampleDir,
+        filenames);
+    for (int i = 0; i < filenames.length; i++) {
+      if (!filenames[i].endsWith(".doc"))
+        continue;
+      Assert.assertTrue("can't read content of " + filenames[i],
+          getTextContent(filenames[i]).length() > 0);
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java
new file mode 100644
index 0000000..6960bad
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestOOParser.java
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.protocol.*;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for OOParser.
+ *
+ * @author Andrzej Bialecki
+ */
+public class TestOOParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-oo/build.xml during plugin compilation.
+  private String[] sampleFiles = { "ootest.odt", "ootest.sxw" };
+
+  // Whitespace-normalized content of the reference text file; stays null if
+  // the file could not be read. It is only printed, never asserted on.
+  private String expectedText;
+
+  private String sampleText = "ootest.txt";
+
+  /**
+   * Parses every sample file whose name starts with "ootest" using the
+   * "parse-tika" extension and checks that some text was extracted.
+   */
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    String urlString;
+    Content content;
+    Parse parse;
+    Configuration conf = NutchConfiguration.create();
+    Protocol protocol;
+    ProtocolFactory factory = new ProtocolFactory(conf);
+
+    System.out.println("Expected : " + expectedText);
+
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      if (!sampleFiles[i].startsWith("ootest"))
+        continue;
+
+      protocol = factory.getProtocol(urlString);
+      content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+          .get(content.getUrl());
+
+      // Check for null before normalizing, so a missing text is reported as
+      // an assertion failure rather than a NullPointerException.
+      String rawText = parse.getText();
+      Assert.assertNotNull("no text extracted from " + sampleFiles[i],
+          rawText);
+      String text = rawText.replaceAll("[ \t\r\n]+", " ").trim();
+
+      // simply test for the presence of a text - the ordering of the elements
+      // may differ from what was expected
+      // in the previous tests
+      Assert.assertTrue("empty text extracted from " + sampleFiles[i],
+          text.length() > 0);
+
+      System.out.println("Found " + sampleFiles[i] + ": " + text);
+    }
+  }
+
+  /**
+   * Reads the reference text file from the sample directory as UTF-8 and
+   * stores its whitespace-normalized content in {@code expectedText}. Reading
+   * is best-effort: any failure is printed and leaves {@code expectedText}
+   * unset.
+   */
+  public TestOOParser() {
+    try {
+      // read the test string
+      FileInputStream fis = new FileInputStream(sampleDir + fileSeparator
+          + sampleText);
+      try {
+        StringBuffer sb = new StringBuffer();
+        int len = 0;
+        InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
+        char[] buf = new char[1024];
+        while ((len = isr.read(buf)) > 0) {
+          sb.append(buf, 0, len);
+        }
+        expectedText = sb.toString();
+        // normalize space
+        expectedText = expectedText.replaceAll("[ \t\r\n]+", " ");
+      } finally {
+        // always release the file handle, even if reading fails
+        fis.close();
+      }
+    } catch (Exception e) {
+      e.printStackTrace();
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java
new file mode 100644
index 0000000..9884f0c
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestPdfParser.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for PdfParser.
+ *
+ * @author John Xing
+ */
+public class TestPdfParser {
+
+ private String fileSeparator = System.getProperty("file.separator");
+ // This system property is defined in ./src/plugin/build-plugin.xml
+ private String sampleDir = System.getProperty("test.data", ".");
+ // Make sure sample files are copied to "test.data" as specified in
+ // ./src/plugin/parse-pdf/build.xml during plugin compilation.
+ // Check ./src/plugin/parse-pdf/sample/README.txt for what they are.
+ private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" };
+
+ private String expectedText = "A VERY SMALL PDF FILE";
+
+ @Test
+ public void testIt() throws ProtocolException, ParseException {
+ String urlString;
+ Protocol protocol;
+ Content content;
+ Parse parse;
+
+ for (int i = 0; i < sampleFiles.length; i++) {
+ urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+ Configuration conf = NutchConfiguration.create();
+ protocol = new ProtocolFactory(conf).getProtocol(urlString);
+ content = protocol.getProtocolOutput(new Text(urlString),
+ new CrawlDatum()).getContent();
+ parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+ .get(content.getUrl());
+
+ int index = parse.getText().indexOf(expectedText);
+ Assert.assertTrue(index > 0);
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java
new file mode 100644
index 0000000..f15d821
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRTFParser.java
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.tika;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.DublinCore;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.junit.Assert;
+import org.junit.Ignore;
+import org.junit.Test;
+
+/**
+ * Unit tests for the RTF parser. (Adapted from John Xing msword unit tests).
+ *
+ * @author Andy Hedges
+ */
+public class TestRTFParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-rtf/build.xml during plugin compilation.
+  // Check ./src/plugin/parse-rtf/sample/README.txt for what they are.
+  private String rtfFile = "test.rtf";
+
+  /**
+   * Parses the sample RTF file with the "parse-tika" extension and checks
+   * the extracted text, the title and the Dublin Core subject. The ignore
+   * reason names the failing statement rather than a line number, because
+   * a line number goes stale as soon as this file is edited.
+   */
+  @Ignore("There seems to be an issue with the text assertion e.g. text.trim()")
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    Configuration conf = NutchConfiguration.create();
+    String urlString = "file:" + sampleDir + fileSeparator + rtfFile;
+    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+    Content content = protocol.getProtocolOutput(new Text(urlString),
+        new CrawlDatum()).getContent();
+    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika",
+        content).get(content.getUrl());
+
+    String text = parse.getText();
+    Assert.assertEquals("The quick brown fox jumps over the lazy dog",
+        text.trim());
+
+    String title = parse.getData().getTitle();
+    Metadata meta = parse.getData().getParseMeta();
+    // "rft" is left as is; presumably it mirrors the sample file's title.
+    Assert.assertEquals("test rft document", title);
+    Assert.assertEquals("tests", meta.get(DublinCore.SUBJECT));
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java
new file mode 100644
index 0000000..4224f93
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/java/org/apache/nutch/tika/TestRobotsMetaProcessor.java
@@ -0,0 +1,156 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.tika.HTMLMetaProcessor;
+
+import java.io.ByteArrayInputStream;
+import java.net.URL;
+
+import org.xml.sax.*;
+import org.w3c.dom.*;
+import org.apache.html.dom.*;
+import org.cyberneko.html.parsers.DOMFragmentParser;
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Unit tests for HTMLMetaProcessor. */
+public class TestRobotsMetaProcessor {
+
+  /*
+   *
+   * some sample tags:
+   *
+   * <meta name="robots" content="index,follow"> <meta name="robots"
+   * content="noindex,follow"> <meta name="robots" content="index,nofollow">
+   * <meta name="robots" content="noindex,nofollow">
+   *
+   * <META HTTP-EQUIV="Pragma" CONTENT="no-cache">
+   */
+
+  /** Test pages; page i is checked against answers[i]. */
+  public static String[] tests = {
+      "<html><head><title>test page</title>"
+          + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
+          + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"all\"> "
+          + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
+          + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"none\"> " + "</head><body>"
+          + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"noindex,follow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"index,nofollow\"> "
+          + "</head><body>" + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>"
+          + "<meta name=\"robots\" content=\"index,follow\"> "
+          + "<base href=\"http://www.nutch.org/\">" + "</head><body>"
+          + " some text" + "</body></html>",
+
+      "<html><head><title>test page</title>" + "<meta name=\"robots\"> "
+          + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>"
+          + " some text" + "</body></html>",
+
+  };
+
+  /** Expected noIndex, noFollow, noCache flags; row i belongs to tests[i]. */
+  public static final boolean[][] answers = { { true, true, true }, // NONE
+      { false, false, true }, // all
+      { true, true, true }, // nOnE
+      { true, true, false }, // none
+      { true, true, false }, // noindex,nofollow
+      { true, false, false }, // noindex,follow
+      { false, true, false }, // index,nofollow
+      { false, false, false }, // index,follow
+      { false, false, false }, // missing!
+  };
+
+  /** Row i: page URL given to getMetaTags, then the expected base href. */
+  private URL[][] currURLsAndAnswers;
+
+  /** Parses every test page and checks the flags and base href it yields. */
+  @Test
+  public void testRobotsMetaProcessor() {
+    DOMFragmentParser parser = new DOMFragmentParser();
+
+    try {
+      currURLsAndAnswers = new URL[][] {
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org"), null },
+          { new URL("http://www.nutch.org/foo/"),
+              new URL("http://www.nutch.org/") },
+          { new URL("http://www.nutch.org"),
+              new URL("http://www.nutch.org/base/") } };
+    } catch (Exception e) {
+      Assert.fail("couldn't make test URLs!");
+    }
+
+    for (int i = 0; i < tests.length; i++) {
+      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+      try {
+        parser.parse(new InputSource(new ByteArrayInputStream(
+            tests[i].getBytes())), node);
+      } catch (Exception e) {
+        // A page that cannot be parsed must fail the test instead of being
+        // silently checked as an empty fragment.
+        Assert.fail("couldn't parse test page " + i + ": " + e);
+      }
+
+      HTMLMetaTags robotsMeta = new HTMLMetaTags();
+      HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]);
+
+      Assert.assertTrue("got index wrong on test " + i,
+          robotsMeta.getNoIndex() == answers[i][0]);
+      Assert.assertTrue("got follow wrong on test " + i,
+          robotsMeta.getNoFollow() == answers[i][1]);
+      Assert.assertTrue("got cache wrong on test " + i,
+          robotsMeta.getNoCache() == answers[i][2]);
+      Assert.assertTrue(
+          "got base href wrong on test " + i + " (got "
+              + robotsMeta.getBaseHref() + ")",
+          ((robotsMeta.getBaseHref() == null) && (currURLsAndAnswers[i][1] == null))
+              || ((robotsMeta.getBaseHref() != null) && robotsMeta
+                  .getBaseHref().equals(currURLsAndAnswers[i][1])));
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
deleted file mode 100644
index 96029a6..0000000
--- a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
+++ /dev/null
@@ -1,337 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.tika;
-
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.tika.DOMContentUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-
-import java.io.ByteArrayInputStream;
-import java.net.URL;
-import java.util.ArrayList;
-import java.util.StringTokenizer;
-
-import org.xml.sax.*;
-import org.w3c.dom.*;
-import org.apache.html.dom.*;
-import org.cyberneko.html.parsers.DOMFragmentParser;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- * Unit tests for DOMContentUtils.
- */
-public class TestDOMContentUtils {
-
- private static final String[] testPages = {
-
- new String("<html><head><title> title </title><script> script </script>"
- + "</head><body> body <a href=\"http://www.nutch.org\">"
- + " anchor </a><!--comment-->" + "</body></html>"),
-
- new String("<html><head><title> title </title><script> script </script>"
- + "</head><body> body <a href=\"/\">" + " home </a><!--comment-->"
- + "<style> style </style>" + " <a href=\"bot.html\">" + " bots </a>"
- + "</body></html>"),
-
- new String("<html><head><title> </title>" + "</head><body> "
- + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
- + "</a></a>" + "</body></html>"),
-
- // this one relies on certain neko fixup behavior, possibly
- // distributing the anchors into the LI's-but not the other
- // anchors (outside of them, instead)! So you get a tree that
- // looks like:
- // ... <li> <a href=/> home </a> </li>
- // <li> <a href=/> <a href="1"> 1 </a> </a> </li>
- // <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
- new String("<html><head><title> my title </title>"
- + "</head><body> body " + "<ul>" + "<li> <a href=\"/\"> home"
- + "<li> <a href=\"1\"> 1" + "<li> <a href=\"2\"> 2" + "</ul>"
- + "</body></html>"),
-
- // test frameset link extraction. The invalid frame in the middle
- // will be
- // fixed to a third standalone frame.
- new String("<html><head><title> my title </title>"
- + "</head><frameset rows=\"20,*\"> " + "<frame src=\"top.html\">"
- + "</frame>" + "<frameset cols=\"20,*\">"
- + "<frame src=\"left.html\">" + "<frame src=\"invalid.html\"/>"
- + "</frame>" + "<frame src=\"right.html\">" + "</frame>"
- + "</frameset>" + "</frameset>" + "</body></html>"),
-
- // test <area> and <iframe> link extraction + url normalization
- new String(
- "<html><head><title> my title </title>"
- + "</head><body>"
- + "<img src=\"logo.gif\" usemap=\"#green\" border=\"0\">"
- + "<map name=\"green\">"
- + "<area shape=\"polygon\" coords=\"19,44,45,11,87\" href=\"../index.html\">"
- + "<area shape=\"rect\" coords=\"128,132,241,179\" href=\"#bottom\">"
- + "<area shape=\"circle\" coords=\"68,211,35\" href=\"../bot.html\">"
- + "</map>" + "<a name=\"bottom\"/><h1> the bottom </h1> "
- + "<iframe src=\"../docs/index.html\"/>" + "</body></html>"),
-
- // test whitespace processing for plain text extraction
- new String(
- "<html><head>\n <title> my\t\n title\r\n </title>\n"
- + " </head>\n"
- + " <body>\n"
- + " <h1> Whitespace\ttest </h1> \n"
- + "\t<a href=\"../index.html\">\n \twhitespace test\r\n\t</a> \t\n"
- + " <p> This is<span> a whitespace<span></span> test</span>. Newlines\n"
- + "should appear as space too.</p><p>Tabs\tare spaces too.\n</p>"
- + " This\t<b>is a</b> break -><br>and the line after<i> break</i>.<br>\n"
- + "<table>"
- + " <tr><td>one</td><td>two</td><td>three</td></tr>\n"
- + " <tr><td>space here </td><td> space there</td><td>no space</td></tr>"
- + "\t<tr><td>one\r\ntwo</td><td>two\tthree</td><td>three\r\tfour</td></tr>\n"
- + "</table>put some text here<Br>and there."
- + "<h2>End\tthis\rmadness\n!</h2>\r\n"
- + " . . . ." + "</body> </html>"),
-
- // test that <a rel=nofollow> links are not returned
- new String("<html><head></head><body>"
- + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
- + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
- + "</body></html>"),
- // test that POST form actions are skipped
- new String("<html><head></head><body>"
- + "<form method='POST' action='/search.jsp'><input type=text>"
- + "<input type=submit><p>test1</p></form>"
- + "<form method='GET' action='/dummy.jsp'><input type=text>"
- + "<input type=submit><p>test2</p></form></body></html>"),
- // test that all form actions are skipped
- new String("<html><head></head><body>"
- + "<form method='POST' action='/search.jsp'><input type=text>"
- + "<input type=submit><p>test1</p></form>"
- + "<form method='GET' action='/dummy.jsp'><input type=text>"
- + "<input type=submit><p>test2</p></form></body></html>"),
- new String("<html><head><title> title </title>" + "</head><body>"
- + "<a href=\";x\">anchor1</a>" + "<a href=\"g;x\">anchor2</a>"
- + "<a href=\"g;x?y#s\">anchor3</a>" + "</body></html>"),
- new String("<html><head><title> title </title>" + "</head><body>"
- + "<a href=\"g\">anchor1</a>" + "<a href=\"g?y#s\">anchor2</a>"
- + "<a href=\"?y=1\">anchor3</a>" + "<a href=\"?y=1#s\">anchor4</a>"
- + "<a href=\"?y=1;somethingelse\">anchor5</a>" + "</body></html>"), };
-
- private static int SKIP = 9;
-
- private static String[] testBaseHrefs = { "http://www.nutch.org",
- "http://www.nutch.org/docs/foo.html", "http://www.nutch.org/docs/",
- "http://www.nutch.org/docs/", "http://www.nutch.org/frames/",
- "http://www.nutch.org/maps/", "http://www.nutch.org/whitespace/",
- "http://www.nutch.org//", "http://www.nutch.org/",
- "http://www.nutch.org/", "http://www.nutch.org/",
- "http://www.nutch.org/;something" };
-
- private static final DocumentFragment testDOMs[] = new DocumentFragment[testPages.length];
-
- private static URL[] testBaseHrefURLs = new URL[testPages.length];
-
- private static final String[] answerText = {
- "title body anchor",
- "title body home bots",
- "separate this from this",
- "my title body home 1 2",
- "my title",
- "my title the bottom",
- "my title Whitespace test whitespace test "
- + "This is a whitespace test . Newlines should appear as space too. "
- + "Tabs are spaces too. This is a break -> and the line after break . "
- + "one two three space here space there no space "
- + "one two two three three four put some text here and there. "
- + "End this madness ! . . . .", "ignore ignore", "test1 test2",
- "test1 test2", "title anchor1 anchor2 anchor3",
- "title anchor1 anchor2 anchor3 anchor4 anchor5" };
-
- private static final String[] answerTitle = { "title", "title", "",
- "my title", "my title", "my title", "my title", "", "", "", "title",
- "title" };
-
- // note: should be in page-order
- private static Outlink[][] answerOutlinks;
-
- private static Configuration conf;
- private static DOMContentUtils utils = null;
-
- @Before
- public void setup() throws Exception {
- conf = NutchConfiguration.create();
- conf.setBoolean("parser.html.form.use_action", true);
- utils = new DOMContentUtils(conf);
- DOMFragmentParser parser = new DOMFragmentParser();
- parser.setFeature(
- "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
- true);
- for (int i = 0; i < testPages.length; i++) {
- DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
- try {
- parser.parse(
- new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
- node);
- testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
- } catch (Exception e) {
- Assert.assertTrue("caught exception: " + e, false);
- }
- testDOMs[i] = node;
- }
- answerOutlinks = new Outlink[][] {
- { new Outlink("http://www.nutch.org", "anchor"), },
- { new Outlink("http://www.nutch.org/", "home"),
- new Outlink("http://www.nutch.org/docs/bot.html", "bots"), },
- { new Outlink("http://www.nutch.org/", "separate this"),
- new Outlink("http://www.nutch.org/docs/ok", "from this"), },
- { new Outlink("http://www.nutch.org/", "home"),
- new Outlink("http://www.nutch.org/docs/1", "1"),
- new Outlink("http://www.nutch.org/docs/2", "2"), },
- { new Outlink("http://www.nutch.org/frames/top.html", ""),
- new Outlink("http://www.nutch.org/frames/left.html", ""),
- new Outlink("http://www.nutch.org/frames/invalid.html", ""),
- new Outlink("http://www.nutch.org/frames/right.html", ""), },
- { new Outlink("http://www.nutch.org/maps/logo.gif", ""),
- new Outlink("http://www.nutch.org/index.html", ""),
- new Outlink("http://www.nutch.org/maps/#bottom", ""),
- new Outlink("http://www.nutch.org/bot.html", ""),
- new Outlink("http://www.nutch.org/docs/index.html", ""), },
- { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
- {},
- { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
- {},
- { new Outlink("http://www.nutch.org/;x", "anchor1"),
- new Outlink("http://www.nutch.org/g;x", "anchor2"),
- new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") },
- {
- // this is tricky - see RFC3986 section 5.4.1 example 7
- new Outlink("http://www.nutch.org/g", "anchor1"),
- new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
- new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
- new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
- new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
- "anchor5") } };
-
- }
-
- private static boolean equalsIgnoreWhitespace(String s1, String s2) {
- StringTokenizer st1 = new StringTokenizer(s1);
- StringTokenizer st2 = new StringTokenizer(s2);
-
- while (st1.hasMoreTokens()) {
- if (!st2.hasMoreTokens())
- return false;
- if (!st1.nextToken().equals(st2.nextToken()))
- return false;
- }
- if (st2.hasMoreTokens())
- return false;
- return true;
- }
-
- @Test
- public void testGetText() throws Exception {
- if (testDOMs[0] == null)
- setup();
- for (int i = 0; i < testPages.length; i++) {
- StringBuffer sb = new StringBuffer();
- utils.getText(sb, testDOMs[i]);
- String text = sb.toString();
- Assert.assertTrue(
- "expecting text: " + answerText[i]
- + System.getProperty("line.separator")
- + System.getProperty("line.separator") + "got text: " + text,
- equalsIgnoreWhitespace(answerText[i], text));
- }
- }
-
- @Test
- public void testGetTitle() throws Exception {
- if (testDOMs[0] == null)
- setup();
- for (int i = 0; i < testPages.length; i++) {
- StringBuffer sb = new StringBuffer();
- utils.getTitle(sb, testDOMs[i]);
- String text = sb.toString();
- Assert.assertTrue(
- "expecting text: " + answerText[i]
- + System.getProperty("line.separator")
- + System.getProperty("line.separator") + "got text: " + text,
- equalsIgnoreWhitespace(answerTitle[i], text));
- }
- }
-
- @Test
- public void testGetOutlinks() throws Exception {
- if (testDOMs[0] == null)
- setup();
- for (int i = 0; i < testPages.length; i++) {
- ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
- if (i == SKIP) {
- conf.setBoolean("parser.html.form.use_action", false);
- utils.setConf(conf);
- } else {
- conf.setBoolean("parser.html.form.use_action", true);
- utils.setConf(conf);
- }
- utils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
- Outlink[] outlinkArr = new Outlink[outlinks.size()];
- outlinkArr = outlinks.toArray(outlinkArr);
- compareOutlinks(answerOutlinks[i], outlinkArr);
- }
- }
-
- private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
- for (int i = 0; i < o.length; i++) {
- sb.append(o[i].toString());
- sb.append(System.getProperty("line.separator"));
- }
- }
-
- private static final String outlinksString(Outlink[] o) {
- StringBuffer sb = new StringBuffer();
- appendOutlinks(sb, o);
- return sb.toString();
- }
-
- private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
- if (o1.length != o2.length) {
- Assert.assertTrue(
- "got wrong number of outlinks (expecting " + o1.length + ", got "
- + o2.length + ")" + System.getProperty("line.separator")
- + "answer: " + System.getProperty("line.separator")
- + outlinksString(o1) + System.getProperty("line.separator")
- + "got: " + System.getProperty("line.separator")
- + outlinksString(o2) + System.getProperty("line.separator"),
- false);
- }
-
- for (int i = 0; i < o1.length; i++) {
- if (!o1[i].equals(o2[i])) {
- Assert.assertTrue(
- "got wrong outlinks at position " + i
- + System.getProperty("line.separator") + "answer: "
- + System.getProperty("line.separator") + "'" + o1[i].getToUrl()
- + "', anchor: '" + o1[i].getAnchor() + "'"
- + System.getProperty("line.separator") + "got: "
- + System.getProperty("line.separator") + "'" + o2[i].getToUrl()
- + "', anchor: '" + o2[i].getAnchor() + "'", false);
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java
deleted file mode 100644
index c9394dc..0000000
--- a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java
+++ /dev/null
@@ -1,121 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.tika;
-
-import org.junit.Assert;
-import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.tika.TikaParser;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.util.NutchConfiguration;
-
-/**
- *
- * @author mattmann / jnioche
- *
- * Test Suite for the RSS feeds with the {@link TikaParser}.
- *
- */
-public class TestFeedParser {
-
- private String fileSeparator = System.getProperty("file.separator");
-
- // This system property is defined in ./src/plugin/build-plugin.xml
- private String sampleDir = System.getProperty("test.data", ".");
-
- private String[] sampleFiles = { "rsstest.rss" };
-
- public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class
- .getName());
-
- /**
- * <p>
- * The test method: tests out the following 2 asserts:
- * </p>
- *
- * <ul>
- * <li>There are 3 outlinks read from the sample rss file</li>
- * <li>The 3 outlinks read are in fact the correct outlinks from the sample
- * file</li>
- * </ul>
- */
- @Test
- public void testIt() throws ProtocolException, ParseException {
- String urlString;
- Protocol protocol;
- Content content;
- Parse parse;
-
- Configuration conf = NutchConfiguration.create();
- for (int i = 0; i < sampleFiles.length; i++) {
- urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
- protocol = new ProtocolFactory(conf).getProtocol(urlString);
- content = protocol.getProtocolOutput(new Text(urlString),
- new CrawlDatum()).getContent();
- parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
- .get(content.getUrl());
-
- // check that there are 2 outlinks:
- // unlike the original parse-rss
- // tika ignores the URL and description of the channel
-
- // http://test.channel.com
- // http://www-scf.usc.edu/~mattmann/
- // http://www.nutch.org
-
- ParseData theParseData = parse.getData();
-
- Outlink[] theOutlinks = theParseData.getOutlinks();
-
- Assert.assertTrue("There aren't 2 outlinks read!",
- theOutlinks.length == 2);
-
- // now check to make sure that those are the two outlinks
- boolean hasLink1 = false, hasLink2 = false;
-
- for (int j = 0; j < theOutlinks.length; j++) {
- if (theOutlinks[j].getToUrl().equals(
- "http://www-scf.usc.edu/~mattmann/")) {
- hasLink1 = true;
- }
-
- if (theOutlinks[j].getToUrl().equals("http://www.nutch.org/")) {
- hasLink2 = true;
- }
- }
-
- if (!hasLink1 || !hasLink2) {
- Assert.fail("Outlinks read from sample rss file are not correct!");
- }
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
deleted file mode 100644
index b1762e6..0000000
--- a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.tika;
-
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * Test extraction of image metadata
- */
-public class TestImageMetadata {
-
- private String fileSeparator = System.getProperty("file.separator");
- // This system property is defined in ./src/plugin/build-plugin.xml
- private String sampleDir = System.getProperty("test.data", ".");
- // Make sure sample files are copied to "test.data" as specified in
- private String[] sampleFiles = { "nutch_logo_tm.gif", };
-
- @Test
- public void testIt() throws ProtocolException, ParseException {
- String urlString;
- Protocol protocol;
- Content content;
- Parse parse;
-
- for (int i = 0; i < sampleFiles.length; i++) {
- urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
- Configuration conf = NutchConfiguration.create();
- protocol = new ProtocolFactory(conf).getProtocol(urlString);
- content = protocol.getProtocolOutput(new Text(urlString),
- new CrawlDatum()).getContent();
- parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
- .get(content.getUrl());
-
- Assert.assertEquals("121", parse.getData().getMeta("width"));
- Assert.assertEquals("48", parse.getData().getMeta("height"));
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
deleted file mode 100644
index 576b3df..0000000
--- a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
+++ /dev/null
@@ -1,92 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.tika;
-
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-import java.io.File;
-
-/**
- * Unit tests for MSWordParser.
- *
- * @author John Xing
- */
-public class TestMSWordParser {
-
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
-  // Make sure sample files are copied to "test.data" as specified in
-  // ./src/plugin/parse-msword/build.xml during plugin compilation.
-  // Check ./src/plugin/parse-msword/sample/README.txt for what they are.
-  private String[] sampleFiles = { "word97.doc" };
-
-  private String expectedText = "This is a sample doc file prepared for nutch.";
-
-  private Configuration conf;
-
-  /** Creates a fresh configuration with "file.content.limit" set to "-1". */
-  @Before
-  public void setUp() {
-    conf = NutchConfiguration.create();
-    conf.set("file.content.limit", "-1");
-  }
-
-  /**
-   * Fetches a file from the sample directory through its protocol plugin and
-   * parses it with the "parse-tika" extension.
-   *
-   * @param fileName
-   *          name of a file inside the sample directory
-   * @return the text of the parse result
-   */
-  public String getTextContent(String fileName) throws ProtocolException,
-      ParseException {
-    String urlString = "file:" + sampleDir + fileSeparator + fileName;
-    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
-    Content content = protocol.getProtocolOutput(new Text(urlString),
-        new CrawlDatum()).getContent();
-    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
-        .get(content.getUrl());
-    return parse.getText();
-  }
-
-  /** Checks that the text of every known sample starts with expectedText. */
-  @Test
-  public void testIt() throws ProtocolException, ParseException {
-    for (String sampleFile : sampleFiles) {
-      String found = getTextContent(sampleFile);
-      Assert.assertTrue("text found : '" + found + "'",
-          found.startsWith(expectedText));
-    }
-  }
-
-  /**
-   * Checks that every ".doc" file found in the sample directory yields
-   * non-empty text.
-   */
-  @Test
-  public void testOpeningDocs() throws ProtocolException, ParseException {
-    // File.list() returns null when sampleDir is not a readable directory;
-    // fail with a clear message instead of a NullPointerException.
-    String[] filenames = new File(sampleDir).list();
-    Assert.assertNotNull("cannot list sample directory " + sampleDir,
-        filenames);
-    for (String filename : filenames) {
-      if (!filename.endsWith(".doc")) {
-        continue;
-      }
-      Assert.assertTrue("cann't read content of " + filename,
-          getTextContent(filename).length() > 0);
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
deleted file mode 100644
index 6960bad..0000000
--- a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
+++ /dev/null
@@ -1,107 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.tika;
-
-import java.io.FileInputStream;
-import java.io.InputStreamReader;
-
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.protocol.*;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * Unit tests for OOParser.
- *
- * @author Andrzej Bialecki
- */
-public class TestOOParser {
-
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
-  // Make sure sample files are copied to "test.data" as specified in
-  // ./src/plugin/parse-oo/build.xml during plugin compilation.
-  private String[] sampleFiles = { "ootest.odt", "ootest.sxw" };
-
-  // Whitespace-normalized content of the sampleText file, loaded by the
-  // constructor. Stays null if loading fails. It is only printed by testIt(),
-  // never asserted against.
-  private String expectedText;
-
-  private String sampleText = "ootest.txt";
-
-  /**
-   * Fetches each sample document through its protocol plugin, parses it with
-   * the "parse-tika" extension and checks that non-empty text was extracted.
-   */
-  @Test
-  public void testIt() throws ProtocolException, ParseException {
-    Configuration conf = NutchConfiguration.create();
-    ProtocolFactory factory = new ProtocolFactory(conf);
-
-    System.out.println("Expected : " + expectedText);
-
-    for (String sampleFile : sampleFiles) {
-      if (!sampleFile.startsWith("ootest")) {
-        continue;
-      }
-      String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
-
-      Protocol protocol = factory.getProtocol(urlString);
-      Content content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-      Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
-          .get(content.getUrl());
-
-      // Check for null before normalizing: replaceAll() and trim() never
-      // return null, so a null check after them could not fail.
-      String rawText = parse.getText();
-      Assert.assertNotNull("no text extracted from " + sampleFile, rawText);
-      String text = rawText.replaceAll("[ \t\r\n]+", " ").trim();
-
-      // simply test for the presence of a text - the ordering of the elements
-      // may differ from what was expected in the previous tests
-      Assert.assertTrue("empty text extracted from " + sampleFile,
-          text.length() > 0);
-
-      System.out.println("Found " + sampleFile + ": " + text);
-    }
-  }
-
-  /**
-   * Loads the reference text from the sampleText file in the sample directory
-   * into expectedText, collapsing runs of whitespace into single spaces. Any
-   * exception is printed with printStackTrace() and otherwise ignored.
-   */
-  public TestOOParser() {
-    try {
-      // read the test string
-      FileInputStream fis = new FileInputStream(sampleDir + fileSeparator
-          + sampleText);
-      StringBuffer sb = new StringBuffer();
-      try {
-        InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
-        char[] buf = new char[1024];
-        int len = 0;
-        while ((len = isr.read(buf)) > 0) {
-          sb.append(buf, 0, len);
-        }
-      } finally {
-        // release the file handle even when reading fails
-        fis.close();
-      }
-      // normalize space
-      expectedText = sb.toString().replaceAll("[ \t\r\n]+", " ");
-    } catch (Exception e) {
-      e.printStackTrace();
-    }
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
deleted file mode 100644
index 9884f0c..0000000
--- a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.tika;
-
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * Unit tests for PdfParser.
- *
- * @author John Xing
- */
-public class TestPdfParser {
-
- private String fileSeparator = System.getProperty("file.separator");
- // This system property is defined in ./src/plugin/build-plugin.xml
- private String sampleDir = System.getProperty("test.data", ".");
- // Make sure sample files are copied to "test.data" as specified in
- // ./src/plugin/parse-pdf/build.xml during plugin compilation.
- // Check ./src/plugin/parse-pdf/sample/README.txt for what they are.
- private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" };
-
- private String expectedText = "A VERY SMALL PDF FILE";
-
- @Test
- public void testIt() throws ProtocolException, ParseException {
- String urlString;
- Protocol protocol;
- Content content;
- Parse parse;
-
- for (int i = 0; i < sampleFiles.length; i++) {
- urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
- Configuration conf = NutchConfiguration.create();
- protocol = new ProtocolFactory(conf).getProtocol(urlString);
- content = protocol.getProtocolOutput(new Text(urlString),
- new CrawlDatum()).getContent();
- parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
- .get(content.getUrl());
-
- int index = parse.getText().indexOf(expectedText);
- Assert.assertTrue(index > 0);
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/20d28406/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java b/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
deleted file mode 100644
index f15d821..0000000
--- a/nutch-plugins/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.tika;
-
-// Nutch imports
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.DublinCore;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.util.NutchConfiguration;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.junit.Assert;
-import org.junit.Ignore;
-import org.junit.Test;
-
-/**
- * Unit tests for the RTF parser. (Adapted from John Xing's msword unit tests.)
- *
- * @author Andy Hedges
- */
-public class TestRTFParser {
-
-  // Platform-specific path separator, used to build the sample file URL.
-  private String fileSeparator = System.getProperty("file.separator");
-  // Sample directory taken from the "test.data" system property, which is
-  // defined in ./src/plugin/build-plugin.xml; defaults to ".".
-  private String sampleDir = System.getProperty("test.data", ".");
-  // The sample file must be copied to "test.data" as specified in
-  // ./src/plugin/parse-rtf/build.xml during plugin compilation.
-  // See ./src/plugin/parse-rtf/sample/README.txt for a description.
-  private String rtfFile = "test.rtf";
-
-  /**
-   * Fetches the sample RTF file through its protocol plugin, parses it with
-   * the "parse-tika" extension and checks the trimmed text, the title and the
-   * DublinCore.SUBJECT parse metadata value. The test is currently ignored.
-   */
-  @Ignore("There seems to be an issue with line 71 e.g. text.trim()")
-  @Test
-  public void testIt() throws ProtocolException, ParseException {
-    Configuration conf = NutchConfiguration.create();
-    String urlString = "file:" + sampleDir + fileSeparator + rtfFile;
-
-    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
-    Text url = new Text(urlString);
-    Content content = protocol.getProtocolOutput(url, new CrawlDatum())
-        .getContent();
-    ParseUtil parseUtil = new ParseUtil(conf);
-    Parse parse = parseUtil.parseByExtensionId("parse-tika", content).get(
-        content.getUrl());
-
-    // This is the text.trim() check that the @Ignore reason refers to.
-    String text = parse.getText();
-    Assert.assertEquals("The quick brown fox jumps over the lazy dog",
-        text.trim());
-
-    String title = parse.getData().getTitle();
-    Metadata meta = parse.getData().getParseMeta();
-
-    Assert.assertEquals("test rft document", title);
-    Assert.assertEquals("tests", meta.get(DublinCore.SUBJECT));
-  }
-}