You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2013/05/23 00:44:12 UTC
svn commit: r1485475 - in /nutch/branches/2.x: ./ ivy/
src/java/org/apache/nutch/api/ src/java/org/apache/nutch/crawl/
src/java/org/apache/nutch/host/ src/test/org/apache/nutch/crawl/
src/test/org/apache/nutch/fetcher/ src/test/org/apache/nutch/storage/ src/test/org/apache/nutch/util/
Author: lewismc
Date: Wed May 22 22:44:12 2013
New Revision: 1485475
URL: http://svn.apache.org/r1485475
Log:
revert NUTCH-1569 Upgrade 2.x to Gora 0.3
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/build.xml
nutch/branches/2.x/ivy/ivy.xml
nutch/branches/2.x/src/java/org/apache/nutch/api/DbReader.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java
nutch/branches/2.x/src/java/org/apache/nutch/host/HostDb.java
nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbReader.java
nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java
nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java
nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java
nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java
nutch/branches/2.x/src/test/org/apache/nutch/util/AbstractNutchTest.java
nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1485475&r1=1485474&r2=1485475&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed May 22 22:44:12 2013
@@ -4,8 +4,6 @@ Release 2.2 - Current Development
* NUTCH-1249 and NUTCH-1275 : Resolve all issues flagged up by adding javac -Xlint argument (tejasp)
-* NUTCH-1569 Upgrade 2.x to Gora 0.3 (lewismc)
-
* NUTCH-1513 Support Robots.txt for Ftp urls (tejasp)
* NUTCH-1053 Parsing of RSS feeds fails (tejasp)
Modified: nutch/branches/2.x/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/build.xml?rev=1485475&r1=1485474&r2=1485475&view=diff
==============================================================================
--- nutch/branches/2.x/build.xml (original)
+++ nutch/branches/2.x/build.xml Wed May 22 22:44:12 2013
@@ -538,7 +538,7 @@
<target name="generate-gora-src" depends="init" description="--> compile the avro schema(s) in src/gora/*.avsc">
<java classname="org.apache.gora.compiler.GoraCompiler">
<classpath refid="classpath"/>
- <arg value="src/gora/"/>
+ <arg value="src/gora/webpage.avsc"/>
<arg value="${src.dir}"/>
</java>
</target>
Modified: nutch/branches/2.x/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/ivy/ivy.xml?rev=1485475&r1=1485474&r2=1485475&view=diff
==============================================================================
--- nutch/branches/2.x/ivy/ivy.xml (original)
+++ nutch/branches/2.x/ivy/ivy.xml Wed May 22 22:44:12 2013
@@ -99,28 +99,23 @@
<!--================-->
<!-- Gora artifacts -->
<!--================-->
- <dependency org="org.apache.gora" name="gora-core" rev="0.3" conf="*->compile"/>
- <!-- Uncomment this to use SQL as Gora backend. It should be noted that the
- gora-sql 0.1.1-incubating artifact is NOT compatable with gora-core 0.3. Users should
- downgrade to gora-core 0.2.1 in order to use SQL as a backend. -->
- <!--
+ <dependency org="org.apache.gora" name="gora-core" rev="0.2.1" conf="*->compile"/>
<dependency org="org.apache.gora" name="gora-sql" rev="0.1.1-incubating" conf="*->default" />
- -->
<!-- Uncomment this to use MySQL as database with SQL as Gora store. -->
<!--
<dependency org="mysql" name="mysql-connector-java" rev="5.1.18" conf="*->default"/>
-->
<!-- Uncomment this to use HBase as Gora backend. -->
<!--
- <dependency org="org.apache.gora" name="gora-hbase" rev="0.3" conf="*->default" />
+ <dependency org="org.apache.gora" name="gora-hbase" rev="0.2.1" conf="*->default" />
-->
<!-- Uncomment this to use Accumulo as Gora backend. -->
<!--
- <dependency org="org.apache.gora" name="gora-accumulo" rev="0.3" conf="*->default" />
+ <dependency org="org.apache.gora" name="gora-accumulo" rev="0.2.1" conf="*->default" />
-->
<!-- Uncomment this to use Cassandra as Gora backend. -->
<!--
- <dependency org="org.apache.gora" name="gora-cassandra" rev="0.3" conf="*->default" />
+ <dependency org="org.apache.gora" name="gora-cassandra" rev="0.2" conf="*->default" />
-->
<!--global exclusion -->
Modified: nutch/branches/2.x/src/java/org/apache/nutch/api/DbReader.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/api/DbReader.java?rev=1485475&r1=1485474&r2=1485475&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/api/DbReader.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/api/DbReader.java Wed May 22 22:44:12 2013
@@ -113,14 +113,10 @@ public class DbReader {
if (fields != null) {
this.fields = new TreeSet<String>(Arrays.asList(fields));
}
- try {
- advance();
- } catch (Exception e){
- e.printStackTrace();
- }
+ advance();
}
- private void advance() throws Exception, IOException {
+ private void advance() throws IOException {
hasNext = res.next();
if (hasNext && batchId != null) {
do {
@@ -155,10 +151,6 @@ public class DbReader {
e.printStackTrace();
hasNext = false;
return null;
- } catch (Exception e) {
- e.printStackTrace();
- hasNext = false;
- return null;
}
return pageAsMap(url, page);
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java?rev=1485475&r1=1485474&r2=1485475&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java Wed May 22 22:44:12 2013
@@ -223,7 +223,7 @@ public class WebTableReader extends Nutc
/** Prints out the entry to the standard out **/
private void read(String key, boolean dumpContent, boolean dumpHeaders,
- boolean dumpLinks, boolean dumpText) throws ClassNotFoundException, IOException, Exception {
+ boolean dumpLinks, boolean dumpText) throws ClassNotFoundException, IOException {
DataStore<String, WebPage> datastore = StorageUtils.createWebStore(getConf(),
String.class, WebPage.class);
@@ -235,19 +235,15 @@ public class WebTableReader extends Nutc
boolean found = false;
// should happen only once
while (result.next()) {
- try {
- WebPage page = result.get();
- String skey = result.getKey();
- // we should not get to this point but nevermind
- if (page == null || skey == null)
- break;
- found = true;
- String url = TableUtil.unreverseUrl(skey);
- System.out.println(getPageRepresentation(url, page, dumpContent,
- dumpHeaders, dumpLinks, dumpText));
- }catch (Exception e) {
- e.printStackTrace();
- }
+ WebPage page = result.get();
+ String skey = result.getKey();
+ // we should not get to this point but nevermind
+ if (page == null || skey == null)
+ break;
+ found = true;
+ String url = TableUtil.unreverseUrl(skey);
+ System.out.println(getPageRepresentation(url, page, dumpContent,
+ dumpHeaders, dumpLinks, dumpText));
}
if (!found)
System.out.println(key + " not found");
Modified: nutch/branches/2.x/src/java/org/apache/nutch/host/HostDb.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/host/HostDb.java?rev=1485475&r1=1485474&r2=1485475&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/host/HostDb.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/host/HostDb.java Wed May 22 22:44:12 2013
@@ -25,7 +25,6 @@ import java.util.concurrent.atomic.Atomi
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.gora.store.DataStore;
-import org.apache.gora.util.GoraException;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.storage.Host;
import org.apache.nutch.storage.StorageUtils;
@@ -64,7 +63,7 @@ public class HostDb implements Closeable
private AtomicLong lastFlush;
- public HostDb(Configuration conf) throws GoraException {
+ public HostDb(Configuration conf) throws IOException {
try {
hostStore = StorageUtils.createWebStore(conf, String.class, Host.class);
} catch (ClassNotFoundException e) {
@@ -87,7 +86,11 @@ public class HostDb implements Closeable
CacheHost removeFromCacheHost = notification.getValue();
if (removeFromCacheHost != NULL_HOST) {
if (removeFromCacheHost.timestamp < lastFlush.get()) {
- hostStore.flush();
+ try {
+ hostStore.flush();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
lastFlush.set(System.currentTimeMillis());
}
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbReader.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbReader.java?rev=1485475&r1=1485474&r2=1485475&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbReader.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbReader.java Wed May 22 22:44:12 2013
@@ -39,7 +39,7 @@ import org.apache.nutch.util.TableUtil;
public class HostDbReader extends Configured implements Tool {
public static final Log LOG = LogFactory.getLog(HostDbReader.class);
- private void read(String key) throws ClassNotFoundException, IOException, Exception {
+ private void read(String key) throws ClassNotFoundException, IOException {
DataStore<String, Host> datastore = StorageUtils.createWebStore(getConf(),
String.class, Host.class);
@@ -53,14 +53,10 @@ public class HostDbReader extends Config
Result<String, Host> result = datastore.execute(query);
while (result.next()) {
- try {
- String hostName = TableUtil.unreverseUrl(result.getKey());
- Host host = result.get();
- System.out.println(hostName);
- System.out.println(host);
- } catch (Exception e) {
- e.printStackTrace();
- }
+ String hostName = TableUtil.unreverseUrl(result.getKey());
+ Host host = result.get();
+ System.out.println(hostName);
+ System.out.println(host);
}
result.close();
datastore.close();
Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java?rev=1485475&r1=1485474&r2=1485475&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java Wed May 22 22:44:12 2013
@@ -20,7 +20,6 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
-
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
@@ -31,7 +30,6 @@ import org.apache.nutch.util.CrawlTestUt
import org.apache.nutch.util.TableUtil;
import org.junit.After;
import org.junit.Before;
-import org.junit.Ignore;
import org.junit.Test;
import static org.junit.Assert.*;
@@ -68,7 +66,6 @@ public class TestGenerator extends Abstr
* @throws Exception
*/
@Test
- @Ignore("Temporarily diable until NUTCH-1572 is addressed.")
public void testGenerateHighest() throws Exception {
final int NUM_RESULTS = 2;
@@ -129,7 +126,6 @@ public class TestGenerator extends Abstr
* @throws Exception
*/
@Test
- @Ignore("Temporarily diable until NUTCH-1572 is addressed.")
public void testGenerateHostLimit() throws Exception {
ArrayList<URLWebPage> list = new ArrayList<URLWebPage>();
@@ -178,7 +174,6 @@ public class TestGenerator extends Abstr
* @throws Exception
*/
@Test
- @Ignore("Temporarily diable until NUTCH-1572 is addressed.")
public void testGenerateDomainLimit() throws Exception {
ArrayList<URLWebPage> list = new ArrayList<URLWebPage>();
@@ -231,7 +226,6 @@ public class TestGenerator extends Abstr
* @throws IOException
*/
@Test
- @Ignore("Temporarily diable until NUTCH-1572 is addressed.")
public void testFilter() throws IOException, Exception {
ArrayList<URLWebPage> list = new ArrayList<URLWebPage>();
Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java?rev=1485475&r1=1485474&r2=1485475&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java Wed May 22 22:44:12 2013
@@ -28,7 +28,6 @@ import org.apache.nutch.util.AbstractNut
import org.apache.nutch.util.CrawlTestUtil;
import org.apache.gora.util.ByteUtils;
import org.junit.Before;
-import org.junit.Ignore;
import org.junit.Test;
import static org.junit.Assert.*;
@@ -37,6 +36,7 @@ import static org.junit.Assert.*;
* crawldb 3. Reads crawldb entries and verifies contents 4. Injects more urls
* into webdb 5. Reads crawldb entries and verifies contents
*
+ * @author nutch-dev <nutch-dev at lucene.apache.org>
*/
public class TestInjector extends AbstractNutchTest {
Path urlPath;
@@ -49,7 +49,6 @@ public class TestInjector extends Abstra
}
@Test
- @Ignore("Temporarily diable until NUTCH-1572 is addressed.")
public void testInject() throws Exception {
ArrayList<String> urls = new ArrayList<String>();
for (int i = 0; i < 100; i++) {
Modified: nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java?rev=1485475&r1=1485474&r2=1485475&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java Wed May 22 22:44:12 2013
@@ -32,7 +32,6 @@ import org.mortbay.jetty.Server;
import org.junit.After;
import org.junit.Before;
-import org.junit.Ignore;
import org.junit.Test;
import static org.junit.Assert.*;
@@ -68,7 +67,6 @@ public class TestFetcher extends Abstrac
}
@Test
- @Ignore("Temporarily diable until NUTCH-1572 is addressed.")
public void testFetch() throws Exception {
//generate seedlist
Modified: nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java?rev=1485475&r1=1485474&r2=1485475&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java Wed May 22 22:44:12 2013
@@ -34,7 +34,6 @@ import org.apache.hadoop.conf.Configurat
import org.apache.nutch.util.AbstractNutchTest;
import org.apache.nutch.util.CrawlTestUtil;
import org.hsqldb.Server;
-import org.junit.Ignore;
import org.junit.After;
import org.junit.Before;
@@ -70,7 +69,7 @@ public class TestGoraStorage extends Abs
}
private static void readWrite(String id, DataStore<String, WebPage> store)
- throws IOException, Exception {
+ throws IOException {
WebPage page = new WebPage();
int max = 1000;
for (int i = 0; i < max; i++) {
@@ -91,13 +90,9 @@ public class TestGoraStorage extends Abs
Result<String, WebPage> result = store.execute(store.newQuery());
int count = 0;
while (result.next()) {
- try {
- // only count keys in the store for the current id
- if (result.getKey().contains(id))
- count++;
- } catch (Exception e) {
- e.printStackTrace();
- }
+ // only count keys in the store for the current id
+ if (result.getKey().contains(id))
+ count++;
}
// check amount
assertEquals(max, count);
@@ -111,7 +106,6 @@ public class TestGoraStorage extends Abs
* @throws Exception
*/
@Test
- @Ignore("Temporarily diable until NUTCH-1572 is addressed.")
public void testMultithreaded() throws Exception {
// create a fixed thread pool
int numThreads = 8;
@@ -225,7 +219,7 @@ public class TestGoraStorage extends Abs
System.out.println("Starting!");
Configuration localConf = CrawlTestUtil.createConfiguration();
- localConf.set("storage.data.store.class", "org.apache.gora.memory.store.MemStore");
+ localConf.set("storage.data.store.class", "org.apache.gora.sql.store.SqlStore");
DataStore<String, WebPage> store = StorageUtils.createWebStore(localConf,
String.class, WebPage.class);
Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/AbstractNutchTest.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/AbstractNutchTest.java?rev=1485475&r1=1485474&r2=1485475&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/util/AbstractNutchTest.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/util/AbstractNutchTest.java Wed May 22 22:44:12 2013
@@ -37,7 +37,7 @@ public class AbstractNutchTest {
public void setUp() throws Exception {
conf = CrawlTestUtil.createConfiguration();
- conf.set("storage.data.store.class", "org.apache.gora.memory.store.MemStore");
+ conf.set("storage.data.store.class", "org.apache.gora.sql.store.SqlStore");
fs = FileSystem.get(conf);
webPageStore = StorageUtils.createWebStore(conf, String.class,
WebPage.class);
Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java?rev=1485475&r1=1485474&r2=1485475&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java Wed May 22 22:44:12 2013
@@ -98,10 +98,10 @@ public class CrawlTestUtil {
* Read entries from a data store
*
* @return list of matching {@link URLWebPage} objects
- * @throws Exception
+ * @throws IOException
*/
public static ArrayList<URLWebPage> readContents(DataStore<String,WebPage> store,
- Mark requiredMark, String... fields) throws Exception {
+ Mark requiredMark, String... fields) throws IOException {
ArrayList<URLWebPage> l = new ArrayList<URLWebPage>();
Query<String, WebPage> query = store.newQuery();
@@ -111,21 +111,18 @@ public class CrawlTestUtil {
Result<String, WebPage> results = store.execute(query);
while (results.next()) {
- try {
- WebPage page = results.get();
- String url = results.getKey();
-
- if (page == null)
- continue;
-
- if (requiredMark != null && requiredMark.checkMark(page) == null)
- continue;
-
- l.add(new URLWebPage(TableUtil.unreverseUrl(url), (WebPage)page.clone()));
- } catch (Exception e) {
- e.printStackTrace();
- }
+ WebPage page = results.get();
+ String url = results.getKey();
+
+ if (page == null)
+ continue;
+
+ if (requiredMark != null && requiredMark.checkMark(page) == null)
+ continue;
+
+ l.add(new URLWebPage(TableUtil.unreverseUrl(url), (WebPage)page.clone()));
}
+
return l;
}