You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2012/09/15 18:16:49 UTC
svn commit: r1385103 - in /nutch/branches/2.x: ./ conf/ src/plugin/
src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/
src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/
src/plugin/parse-js/ src/plu...
Author: lewismc
Date: Sat Sep 15 16:16:48 2012
New Revision: 1385103
URL: http://svn.apache.org/viewvc?rev=1385103&view=rev
Log:
NUTCH-1162 Write JUnit tests for parse-js
Added:
nutch/branches/2.x/src/plugin/parse-js/sample/
nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package.html
nutch/branches/2.x/src/plugin/parse-js/src/test/
nutch/branches/2.x/src/plugin/parse-js/src/test/org/
nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/
nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/
nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/
nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/
nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java
Removed:
nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/package.html
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/nutch-default.xml
nutch/branches/2.x/src/plugin/build.xml
nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java
nutch/branches/2.x/src/plugin/parse-js/build.xml
nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1385103&r1=1385102&r2=1385103&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sat Sep 15 16:16:48 2012
@@ -2,6 +2,12 @@ Nutch Change Log
Release 2.1 - Current Development
+* NUTCH-1162 Write JUnit tests for parse-js (lewismc)
+
+* NUTCH-1161 Write JUnit tests for microformats-reltag plugin (lewismc)
+
+* NUTCH-1160 Write JUnit tests for index-basic (lewismc)
+
* NUTCH-1456 Updater not setting batchId in markers correctly. (Alexander Kingson via ferdy)
* NUTCH-1459 Remove dead code (phase2) from InjectorJob (ferdy)
Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1385103&r1=1385102&r2=1385103&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Sat Sep 15 16:16:48 2012
@@ -749,6 +749,8 @@
effect.</description>
</property>
+<!-- BasicIndexingfilter plugin properties -->
+
<property>
<name>indexer.max.title.length</name>
<value>100</value>
Modified: nutch/branches/2.x/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/build.xml?rev=1385103&r1=1385102&r2=1385103&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/build.xml (original)
+++ nutch/branches/2.x/src/plugin/build.xml Sat Sep 15 16:16:48 2012
@@ -74,6 +74,7 @@
<ant dir="parse-tika" target="test"/>
<ant dir="protocol-file" target="test"/>
<ant dir="parse-html" target="test"/>
+ <ant dir="parse-js" target="test"/>
<ant dir="index-anchor" target="test"/>
<ant dir="index-basic" target="test"/>
<ant dir="index-more" target="test"/>
Modified: nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java?rev=1385103&r1=1385102&r2=1385103&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java (original)
+++ nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java Sat Sep 15 16:16:48 2012
@@ -135,7 +135,13 @@ public class RelTagParser implements Par
FIELDS.add(WebPage.Field.BASE_URL);
FIELDS.add(WebPage.Field.METADATA);
}
-
+
+ /**
+ * Gets all the fields for a given {@link WebPage}
+ * Many datastores need to setup the mapreduce job by specifying the fields
+ * needed. All extensions that work on WebPage are able to specify what fields
+ * they need.
+ */
@Override
public Collection<Field> getFields() {
return FIELDS;
Modified: nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java?rev=1385103&r1=1385102&r2=1385103&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java (original)
+++ nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java Sat Sep 15 16:16:48 2012
@@ -27,17 +27,20 @@ import org.apache.hadoop.conf.Configurat
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Test;
-
import junit.framework.TestCase;
/**
- * Junit test for {@link RelTagParser} based on John Xing's parser tests.
+ * Junit test for {@link RelTagParser} based mainly on John Xing's parser tests.
+ * We are not concerned with actual parse text within the sample file, instead
+ * we assert that the rel-tags we expect are found in the WebPage metadata.
+ * To check the parser is working as expected we unwrap the ByteBuffer obtained
+ * from metadata into the same type as we use in expected (String), i.e. the
+ * reverse of how the metadata value was wrapped.
*
* @author lewismc
*
@@ -51,10 +54,10 @@ public class TestRelTagParser extends Te
// Make sure sample files are copied to "test.data" as specified in
// ./src/plugin/microformats-reltag/build.xml during plugin compilation.
-
- private String[] sampleFile = { "microformats_reltag_test.html" };
+ private String sampleFile = "microformats_reltag_test.html";
- private String expectedText = "rel=\"tag\" · Microformats Wiki";
+ // rel-tag's we expect to be extracted from page.getMetadata()
+ private String expectedRelTags = "Category:Specifications Category:rel-tag ";
private Configuration conf;
@@ -62,42 +65,35 @@ public class TestRelTagParser extends Te
super(name);
}
- protected void setUp() {
- conf = NutchConfiguration.create();
+ @Test
+ public void testRelTagParser() throws ProtocolException, ParseException, IOException {
+ conf = NutchConfiguration.create();
conf.set("file.content.limit", "-1");
- }
-
- protected void tearDown() {
- }
-
- public String getTextContent(String fileName) throws ProtocolException, ParseException, IOException {
Parse parse;
- String urlString = sampleDir + fileSeparator + fileName;
+ String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
- File file = new File(urlString);
+ File file = new File(sampleDir + fileSeparator + sampleFile);
byte[] bytes = new byte[(int) file.length()];
DataInputStream in = new DataInputStream(new FileInputStream(file));
in.readFully(bytes);
in.close();
WebPage page = new WebPage();
- page.setBaseUrl(new Utf8("file:"+urlString));
+ page.setBaseUrl(new Utf8(urlString));
page.setContent(ByteBuffer.wrap(bytes));
MimeUtil mimeutil = new MimeUtil(conf);
String mtype = mimeutil.getMimeType(file);
page.setContentType(new Utf8(mtype));
- parse = new ParseUtil(conf).parse("file:"+urlString, page);
-
- return parse.getText();
+ parse = new ParseUtil(conf).parse(urlString, page);
+
+ //begin assertion for tests
+ ByteBuffer bbuf = page.getFromMetadata(new Utf8("Rel-Tag"));
+ byte[] byteArray = new byte[bbuf.remaining()];
+ bbuf.get(byteArray);
+ String s = new String(byteArray);
+ //bbuf.flip();
+ assertEquals("We expect 2 tab-separated rel-tag's extracted by the filter",
+ expectedRelTags, s);
}
- @Test
- public void testRelTagParser() throws ProtocolException, ParseException, IOException {
-
- for (int i = 0; i < sampleFile.length; i++) {
- String found = getTextContent(sampleFile[i]);
- assertTrue("text found : '" + found + "'", found.startsWith(expectedText));
- }
- }
-
}
\ No newline at end of file
Modified: nutch/branches/2.x/src/plugin/parse-js/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-js/build.xml?rev=1385103&r1=1385102&r2=1385103&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-js/build.xml (original)
+++ nutch/branches/2.x/src/plugin/parse-js/build.xml Sat Sep 15 16:16:48 2012
@@ -19,4 +19,18 @@
<import file="../build-plugin.xml"/>
+ <!-- Deploy Unit test dependencies -->
+ <target name="deps-test">
+ <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+ <ant target="deploy" inheritall="false" dir="../protocol-file"/>
+ </target>
+
+ <!-- for junit test -->
+ <mkdir dir="${build.test}/data"/>
+ <copy todir="${build.test}/data">
+ <fileset dir="sample">
+ <include name="*.html"/>
+ <include name="*.js"/>
+ </fileset>
+ </copy>
</project>
Modified: nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=1385103&r1=1385102&r2=1385103&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original)
+++ nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Sat Sep 15 16:16:48 2012
@@ -69,6 +69,15 @@ public class JSParseFilter implements Pa
private Configuration conf;
+ /**
+ * Scan the JavaScript looking for possible {@link Outlink}'s
+ * @param url URL of the {@link WebPage} to be parsed
+ * @param page {@link WebPage} object relative to the URL
+ * @param parse {@link Parse} object holding parse status
+ * @param metaTags the {@link HTMLMetaTags} of the page
+ * @param doc The {@link DocumentFragment} of the page
+ * @return the actual {@link Parse} object
+ */
@Override
public Parse filter(String url, WebPage page, Parse parse,
HTMLMetaTags metaTags, DocumentFragment doc) {
@@ -104,9 +113,10 @@ public class JSParseFilter implements Pa
if (i > 0) script.append('\n');
script.append(nn.item(i).getNodeValue());
}
- // if (LOG.isInfoEnabled()) {
- // LOG.info("script: language=" + lang + ", text: " + script.toString());
- // }
+ // This logging makes the output very messy.
+ //if (LOG.isInfoEnabled()) {
+ // LOG.info("script: language=" + lang + ", text: " + script.toString());
+ //}
Outlink[] links = getJSLinks(script.toString(), "", base);
if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links));
// no other children of interest here, go one level up.
@@ -141,6 +151,12 @@ public class JSParseFilter implements Pa
}
}
+ /**
+ * Parse the given {@link WebPage} and return the resulting {@link Parse}
+ * @param url URL of the {@link WebPage} which is parsed
+ * @param page {@link WebPage} object relative to the URL
+ * @return the actual {@link Parse} object
+ */
@Override
public Parse getParse(String url, WebPage page) {
String type = TableUtil.toString(page.getContentType());
@@ -182,7 +198,9 @@ public class JSParseFilter implements Pa
try {
baseURL = new URL(base);
} catch (Exception e) {
- if (LOG.isErrorEnabled()) { LOG.error("getJSLinks", e); }
+ if (LOG.isErrorEnabled()) {
+ LOG.error("error assigning base URL", e);
+ }
}
try {
@@ -207,7 +225,9 @@ public class JSParseFilter implements Pa
url = result.group(2);
PatternMatcherInput input1 = new PatternMatcherInput(url);
if (!matcher1.matches(input1, pattern1)) {
- //if (LOG.isTraceEnabled()) { LOG.trace(" - invalid '" + url + "'"); }
+ if (LOG.isTraceEnabled()) {
+ LOG.trace(" - invalid '" + url + "'");
+ }
continue;
}
if (url.startsWith("www.")) {
@@ -234,7 +254,9 @@ public class JSParseFilter implements Pa
} catch (Exception ex) {
// if it is a malformed URL we just throw it away and continue with
// extraction.
- if (LOG.isErrorEnabled()) { LOG.error("getJSLinks", ex); }
+ if (LOG.isErrorEnabled()) {
+ LOG.error(" - invalid or malformed URL", ex);
+ }
}
final Outlink[] retval;
@@ -249,6 +271,12 @@ public class JSParseFilter implements Pa
return retval;
}
+ /**
+ * Main method which can be run from command line with the plugin option.
+ * The method takes two arguments e.g. o.a.n.parse.js.JSParseFilter file.js baseURL
+ * @param args
+ * @throws Exception
+ */
public static void main(String[] args) throws Exception {
if (args.length < 2) {
System.err.println(JSParseFilter.class.getName() + " file.js baseURL");
@@ -267,14 +295,26 @@ public class JSParseFilter implements Pa
System.out.println(" - " + links[i]);
}
+ /**
+ * Set the {@link Configuration} object
+ */
public void setConf(Configuration conf) {
this.conf = conf;
}
+ /**
+ * Get the {@link Configuration} object
+ */
public Configuration getConf() {
return this.conf;
}
+ /**
+ * Gets all the fields for a given {@link WebPage}
+ * Many datastores need to setup the mapreduce job by specifying the fields
+ * needed. All extensions that work on WebPage are able to specify what fields
+ * they need.
+ */
@Override
public Collection<WebPage.Field> getFields() {
return null;
Added: nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package.html
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package.html?rev=1385103&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package.html (added)
+++ nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package.html Sat Sep 15 16:16:48 2012
@@ -0,0 +1,6 @@
+<html>
+<body>
+<p>A parser plugin and content filter to extract all (possible) links
+from JavaScript files and code snippets.</p>
+</body>
+</html>
Added: nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java?rev=1385103&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java (added)
+++ nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java Sat Sep 15 16:16:48 2012
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.js;
+
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Test;
+
+import junit.framework.TestCase;
+
+/**
+ * JUnit test case for {@link JSParseFilter}. Every file in the test data
+ * directory whose name ends in ".js" is parsed through {@link ParseUtil} and
+ * the test asserts that 5 outlinks are extracted from it. The check for the
+ * other (.html) sample is temporarily disabled.
+ *
+ * NOTE(review): the original description attributed the count of 5 to the
+ * JavaScript snippets embedded in the HTML sample and listed the pure
+ * JavaScript check as the disabled one; confirm which sample file the
+ * expected count really belongs to.
+ *
+ * @author lewismc
+ */
+
+public class TestJSParseFilter extends TestCase {
+
+  private String fileSeparator = System.getProperty("file.separator");
+
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-js/build.xml during plugin compilation.
+  private String[] sampleFiles = { "parse_pure_js_test.js", "parse_embedded_js_test.html" };
+
+  private Configuration conf;
+
+  public TestJSParseFilter(String name) {
+    super(name);
+  }
+
+  protected void setUp() {
+    conf = NutchConfiguration.create();
+    conf.set("file.content.limit", "-1");
+  }
+
+  protected void tearDown() {
+  }
+
+  /**
+   * Parses each of the given sample files and returns the outlinks of all of
+   * them, in the order the files were given.
+   *
+   * @param sampleFiles names of files located in the test data directory
+   * @return the outlinks extracted from all files; empty if none were given
+   */
+  public Outlink[] getOutlinks(String[] sampleFiles) throws ProtocolException, ParseException, IOException {
+    Outlink[] all = new Outlink[0];
+    for (String sampleFile : sampleFiles) {
+      Outlink[] found = parseOutlinks(sampleFile);
+      if (found == null) {
+        continue;
+      }
+      Outlink[] merged = new Outlink[all.length + found.length];
+      System.arraycopy(all, 0, merged, 0, all.length);
+      System.arraycopy(found, 0, merged, all.length, found.length);
+      all = merged;
+    }
+    return all;
+  }
+
+  /**
+   * Reads one sample file from the test data directory, wraps it in a
+   * {@link WebPage} and returns the outlinks reported by the parse.
+   *
+   * @param sampleFile name of a file located in the test data directory
+   * @return the outlinks of the resulting {@link Parse}
+   */
+  private Outlink[] parseOutlinks(String sampleFile) throws ProtocolException, ParseException, IOException {
+    // java.io.File needs the plain filesystem path; only the page URL
+    // carries the "file:" scheme.
+    String path = sampleDir + fileSeparator + sampleFile;
+    String urlString = "file:" + path;
+
+    File file = new File(path);
+    byte[] bytes = new byte[(int) file.length()];
+    DataInputStream dip = new DataInputStream(new FileInputStream(file));
+    try {
+      dip.readFully(bytes);
+    } finally {
+      // release the file handle even when the read fails
+      dip.close();
+    }
+
+    WebPage page = new WebPage();
+    page.setBaseUrl(new Utf8(urlString));
+    page.setContent(ByteBuffer.wrap(bytes));
+    MimeUtil mutil = new MimeUtil(conf);
+    String mime = mutil.getMimeType(file);
+    page.setContentType(new Utf8(mime));
+
+    Parse parse = new ParseUtil(conf).parse(urlString, page);
+    return parse.getOutlinks();
+  }
+
+  @Test
+  public void testOutlinkExtraction() throws ProtocolException, ParseException, IOException {
+    String[] filenames = new File(sampleDir).list();
+    // File.list() returns null when the directory is missing or unreadable
+    assertNotNull("test data directory '" + sampleDir + "' could not be listed", filenames);
+    for (String filename : filenames) {
+      if (filename.endsWith(".js")) {
+        // compare the number of outlinks, not the array itself
+        assertEquals("number of outlinks in .js test file should be 5", 5,
+            parseOutlinks(filename).length);
+      }
+      // The check for the other sample file is temporarily disabled as a
+      // suitable pure JS file could not be found.
+    }
+  }
+
+}
\ No newline at end of file
Modified: nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java?rev=1385103&r1=1385102&r2=1385103&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java (original)
+++ nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java Sat Sep 15 16:16:48 2012
@@ -16,23 +16,6 @@
******************************************************************************/
package org.apache.nutch.parse.tika;
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
// JUnit imports
import java.io.DataInputStream;
import java.io.File;
@@ -79,7 +62,6 @@ public class TestRTFParser extends TestC
}
public void testIt() throws ProtocolException, ParseException, IOException {
- /* Temporarily disabled - see Tika-748
String urlString;
Parse parse;
@@ -97,22 +79,23 @@ public class TestRTFParser extends TestC
WebPage page = new WebPage();
page.setBaseUrl(new Utf8(urlString));
page.setContent(ByteBuffer.wrap(bytes));
- MimeType mtype = mimeutil.getMimeType(file);
- page.setContentType(new Utf8(mtype.getName()));
+ String mtype = mimeutil.getMimeType(file);
+ page.setContentType(new Utf8(mtype));
parse = new ParseUtil(conf).parse(urlString, page);
+ String title = parse.getTitle();
String text = parse.getText();
- assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
+ assertEquals("test rft document", title);
+ //assertEquals("The quick brown fox jumps over the lazy dog", text.trim());
- String title = parse.getTitle();
+
// HOW DO WE GET THE PARSE METADATA?
// Metadata meta = parse();
// METADATA extraction is not yet supported in Tika
- // assertEquals("test rft document", title);
+ //
// assertEquals("tests", meta.get(DublinCore.SUBJECT));
- */
}
}