You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2010/06/30 12:36:29 UTC
svn commit: r959259 [12/12] - in /nutch/branches/nutchbase: ./ bin/ conf/
contrib/ docs/ ivy/ lib/ lib/jetty-ext/ src/engines/ src/gora/ src/java/
src/java/org/apache/nutch/analysis/ src/java/org/apache/nutch/clustering/
src/java/org/apache/nutch/crawl...
Modified: nutch/branches/nutchbase/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java?rev=959259&r1=959258&r2=959259&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java (original)
+++ nutch/branches/nutchbase/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java Wed Jun 30 10:36:20 2010
@@ -16,9 +16,7 @@
*/
package org.apache.nutch.crawl;
-import java.io.File;
import java.io.IOException;
-import java.net.UnknownHostException;
import java.util.Iterator;
import java.util.List;
@@ -28,12 +26,8 @@ import org.apache.hadoop.conf.Configurat
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
-import org.mortbay.http.HttpContext;
-import org.mortbay.http.SocketListener;
-import org.mortbay.http.handler.ResourceHandler;
-import org.mortbay.jetty.Server;
+import org.apache.nutch.storage.WebPage;
public class CrawlDBTestUtil {
@@ -49,97 +43,91 @@ public class CrawlDBTestUtil {
* @param init
* urls to be inserted, objects are of type URLCrawlDatum
* @throws Exception
- */
- public static void createCrawlDb(Configuration conf, FileSystem fs, Path crawldb, List<URLCrawlDatum> init)
- throws Exception {
- LOG.trace("* creating crawldb: " + crawldb);
- Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
- MapFile.Writer writer = new MapFile.Writer(conf, fs, new Path(dir, "part-00000")
- .toString(), Text.class, CrawlDatum.class);
- Iterator<URLCrawlDatum> it = init.iterator();
- while (it.hasNext()) {
- URLCrawlDatum row = it.next();
- LOG.info("adding:" + row.url.toString());
- writer.append(new Text(row.url), row.datum);
- }
- writer.close();
- }
+ * @see TestGenerator
+ **/
+ // public static void oldcreateCrawlDb(Configuration conf, FileSystem fs,
+ // Path crawldb, List<URLWebPage> init) throws Exception {
+ // LOG.trace("* creating crawldb: " + crawldb);
+ // Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);
+ // MapFile.Writer writer = new MapFile.Writer(conf, fs, new Path(dir,
+ // "part-00000").toString(), Text.class, CrawlDatum.class);
+ // Iterator<URLWebPage> it = init.iterator();
+ // while (it.hasNext()) {
+ // URLWebPage row = it.next();
+ // LOG.info("adding:" + row.url.toString());
+ // writer.append(new Text(row.url), row.datum);
+ // }
+ // writer.close();
+ // }
/**
* For now we need to manually construct our Configuration, because we need to
- * override the default one and it is currently not possible to use dynamically
- * set values.
+ * override the default one and it is currently not possible to use
+ * dynamically set values.
*
* @return
* @deprecated Use {@link #createConfiguration()} instead
*/
- public static Configuration create(){
+ public static Configuration create() {
return createConfiguration();
}
/**
* For now we need to manually construct our Configuration, because we need to
- * override the default one and it is currently not possible to use dynamically
- * set values.
+ * override the default one and it is currently not possible to use
+ * dynamically set values.
*
* @return
*/
- public static Configuration createConfiguration(){
+ public static Configuration createConfiguration() {
Configuration conf = new Configuration();
conf.addResource("nutch-default.xml");
conf.addResource("crawl-tests.xml");
+ conf.addResource("hbase-site.xml");
return conf;
}
- public static class URLCrawlDatum {
-
- Text url;
-
- CrawlDatum datum;
-
- public URLCrawlDatum(Text url, CrawlDatum datum) {
- this.url = url;
- this.datum = datum;
- }
- }
-
/**
* Generate seedlist
- * @throws IOException
+ *
+ * @see TestInjector
+ * @throws IOException
*/
- public static void generateSeedList(FileSystem fs, Path urlPath, List<String> contents) throws IOException{
+ public static void generateSeedList(FileSystem fs, Path urlPath,
+ List<String> contents) throws IOException {
FSDataOutputStream out;
- Path file=new Path(urlPath,"urls.txt");
+ Path file = new Path(urlPath, "urls.txt");
fs.mkdirs(urlPath);
- out=fs.create(file);
- Iterator<String> iterator=contents.iterator();
- while(iterator.hasNext()){
- String url=iterator.next();
+ out = fs.create(file);
+ Iterator<String> iterator = contents.iterator();
+ while (iterator.hasNext()) {
+ String url = iterator.next();
out.writeBytes(url);
out.writeBytes("\n");
}
out.flush();
out.close();
}
-
- /**
- * Creates a new JettyServer with one static root context
- *
- * @param port port to listen to
- * @param staticContent folder where static content lives
- * @throws UnknownHostException
- */
- public static Server getServer(int port, String staticContent) throws UnknownHostException{
- Server webServer = new org.mortbay.jetty.Server();
- SocketListener listener = new SocketListener();
- listener.setPort(port);
- listener.setHost("127.0.0.1");
- webServer.addListener(listener);
- HttpContext staticContext = new HttpContext();
- staticContext.setContextPath("/");
- staticContext.setResourceBase(staticContent);
- staticContext.addHandler(new ResourceHandler());
- webServer.addContext(staticContext);
- return webServer;
- }
+
+ // /**
+ // * Creates a new JettyServer with one static root context
+ // *
+ // * @param port port to listen to
+ // * @param staticContent folder where static content lives
+ // * @throws UnknownHostException
+ // */
+ // public static Server getServer(int port, String staticContent) throws
+ // UnknownHostException{
+ // Server webServer = new org.mortbay.jetty.Server();
+ // SocketListener listener = new SocketListener();
+ // listener.setPort(port);
+ // listener.setHost("127.0.0.1");
+ // webServer.addListener(listener);
+ // HttpContext staticContext = new HttpContext();
+ // staticContext.setContextPath("/");
+ // staticContext.setResourceBase(staticContent);
+ // staticContext.addHandler(new ResourceHandler());
+ // webServer.addContext(staticContext);
+ // return webServer;
+ // }
}
Modified: nutch/branches/nutchbase/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/test/org/apache/nutch/crawl/TestGenerator.java?rev=959259&r1=959258&r2=959259&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/test/org/apache/nutch/crawl/TestGenerator.java (original)
+++ nutch/branches/nutchbase/src/test/org/apache/nutch/crawl/TestGenerator.java Wed Jun 30 10:36:20 2010
@@ -21,50 +21,51 @@ import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
-
-import junit.framework.TestCase;
+import org.apache.hadoop.hbase.HBaseClusterTestCase;
+import org.apache.hadoop.hbase.HBaseConfiguration;
+import org.apache.nutch.storage.Mark;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.TableUtil;
+import org.gora.hbase.store.HBaseStore;
+import org.gora.query.Query;
+import org.gora.query.Result;
+import org.gora.store.DataStore;
+import org.gora.store.DataStoreFactory;
/**
- * Basic generator test. 1. Insert entries in crawldb 2. Generates entries to
+ * Basic generator test. 1. Insert entries in webtable 2. Generates entries to
* fetch 3. Verifies that number of generated urls match 4. Verifies that
* highest scoring urls are generated
*
* @author nutch-dev <nutch-dev at lucene.apache.org>
+ * @param <URLWebPage>
*
*/
-public class TestGenerator extends TestCase {
-
- Configuration conf;
+public class TestGenerator extends HBaseClusterTestCase {
- Path dbDir;
+ public static final Log LOG = LogFactory.getLog(TestGenerator.class);
- Path segmentsDir;
-
- FileSystem fs;
+ Configuration conf;
- final static Path testdir = new Path("build/test/generator-test");
+ private DataStore<String, WebPage> webPageStore;
+ @Override
protected void setUp() throws Exception {
conf = CrawlDBTestUtil.createConfiguration();
- fs = FileSystem.get(conf);
- fs.delete(testdir, true);
+ super.conf = new HBaseConfiguration(conf);
+ super.setUp();
+ webPageStore = DataStoreFactory.getDataStore(HBaseStore.class,
+ String.class, WebPage.class);
}
- protected void tearDown() {
- delete(testdir);
- }
-
- private void delete(Path p) {
- try {
- fs.delete(p, true);
- } catch (IOException e) {
- }
+ @Override
+ protected void tearDown() throws Exception {
+ webPageStore.close();
+ super.tearDown();
+ super.cluster.shutdown();
}
/**
@@ -76,22 +77,20 @@ public class TestGenerator extends TestC
final int NUM_RESULTS = 2;
- ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+ ArrayList<URLWebPage> list = new ArrayList<URLWebPage>();
for (int i = 0; i <= 100; i++) {
- list.add(createURLCrawlDatum("http://aaa/" + pad(i),
- 1, i));
+ list.add(createURLWebPage("http://aaa/" + pad(i), 1, i));
}
- createCrawlDB(list);
+ for (URLWebPage uwp : list) {
+ webPageStore.put(TableUtil.reverseUrl(uwp.getUrl()), uwp.getDatum());
+ }
- Path generatedSegment = generateFetchlist(NUM_RESULTS, conf, false);
+ generateFetchlist(NUM_RESULTS, conf, false);
- Path fetchlist = new Path(new Path(generatedSegment,
- CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+ ArrayList<URLWebPage> l = readContents();
- ArrayList<URLCrawlDatum> l = readContents(fetchlist);
-
// sort urls by score desc
Collections.sort(l, new ScoreComparator());
@@ -99,8 +98,8 @@ public class TestGenerator extends TestC
assertEquals(NUM_RESULTS, l.size());
// verify we have the highest scoring urls
- assertEquals("http://aaa/100", (l.get(0).url.toString()));
- assertEquals("http://aaa/099", (l.get(1).url.toString()));
+ assertEquals("http://aaa/100", (l.get(0).getUrl().toString()));
+ assertEquals("http://aaa/099", (l.get(1).getUrl().toString()));
}
private String pad(int i) {
@@ -114,13 +113,13 @@ public class TestGenerator extends TestC
/**
* Comparator that sorts by score desc.
*/
- public class ScoreComparator implements Comparator<URLCrawlDatum> {
+ public class ScoreComparator implements Comparator<URLWebPage> {
- public int compare(URLCrawlDatum tuple1, URLCrawlDatum tuple2) {
- if (tuple2.datum.getScore() - tuple1.datum.getScore() < 0) {
+ public int compare(URLWebPage tuple1, URLWebPage tuple2) {
+ if (tuple2.getDatum().getScore() - tuple1.getDatum().getScore() < 0) {
return -1;
}
- if (tuple2.datum.getScore() - tuple1.datum.getScore() > 0) {
+ if (tuple2.getDatum().getScore() - tuple1.getDatum().getScore() > 0) {
return 1;
}
return 0;
@@ -129,55 +128,43 @@ public class TestGenerator extends TestC
/**
* Test that generator obeys the property "generate.max.per.host".
- * @throws Exception
+ *
+ * @throws Exception
*/
- public void testGenerateHostLimit() throws Exception{
- ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+ public void testGenerateHostLimit() throws Exception {
+ ArrayList<URLWebPage> list = new ArrayList<URLWebPage>();
- list.add(createURLCrawlDatum("http://www.example.com/index1.html",
- 1, 1));
- list.add(createURLCrawlDatum("http://www.example.com/index2.html",
- 1, 1));
- list.add(createURLCrawlDatum("http://www.example.com/index3.html",
- 1, 1));
+ list.add(createURLWebPage("http://www.example.com/index1.html", 1, 1));
+ list.add(createURLWebPage("http://www.example.com/index2.html", 1, 1));
+ list.add(createURLWebPage("http://www.example.com/index3.html", 1, 1));
- createCrawlDB(list);
+ for (URLWebPage uwp : list) {
+ webPageStore.put(TableUtil.reverseUrl(uwp.getUrl()), uwp.getDatum());
+ }
Configuration myConfiguration = new Configuration(conf);
- myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 1);
- Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
- myConfiguration, false);
-
- Path fetchlistPath = new Path(new Path(generatedSegment,
- CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+ myConfiguration.setInt(GeneratorJob.GENERATE_MAX_PER_HOST, 1);
+ generateFetchlist(Integer.MAX_VALUE, myConfiguration, false);
- ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
+ ArrayList<URLWebPage> fetchList = readContents();
// verify we got right amount of records
assertEquals(1, fetchList.size());
myConfiguration = new Configuration(conf);
- myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 2);
- generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
- false);
-
- fetchlistPath = new Path(new Path(generatedSegment,
- CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+ myConfiguration.setInt(GeneratorJob.GENERATE_MAX_PER_HOST, 2);
+ generateFetchlist(Integer.MAX_VALUE, myConfiguration, false);
- fetchList = readContents(fetchlistPath);
+ fetchList = readContents();
// verify we got right amount of records
assertEquals(2, fetchList.size());
myConfiguration = new Configuration(conf);
- myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 3);
- generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
- false);
+ myConfiguration.setInt(GeneratorJob.GENERATE_MAX_PER_HOST, 3);
+ generateFetchlist(Integer.MAX_VALUE, myConfiguration, false);
- fetchlistPath = new Path(new Path(generatedSegment,
- CrawlDatum.GENERATE_DIR_NAME), "part-00000");
-
- fetchList = readContents(fetchlistPath);
+ fetchList = readContents();
// verify we got right amount of records
assertEquals(3, fetchList.size());
@@ -186,53 +173,45 @@ public class TestGenerator extends TestC
/**
* Test that generator obeys the property "generate.max.per.host" and
* "generate.max.per.host.by.ip".
- * @throws Exception
+ *
+ * @throws Exception
*/
- public void testGenerateHostIPLimit() throws Exception{
- ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+ public void toastGenerateHostIPLimit() throws Exception {
+ ArrayList<URLWebPage> list = new ArrayList<URLWebPage>();
- list.add(createURLCrawlDatum("http://www.example.com/index.html", 1, 1));
- list.add(createURLCrawlDatum("http://www.example.net/index.html", 1, 1));
- list.add(createURLCrawlDatum("http://www.example.org/index.html", 1, 1));
+ list.add(createURLWebPage("http://www.example.com/index.html", 1, 1));
+ list.add(createURLWebPage("http://www.example.net/index.html", 1, 1));
+ list.add(createURLWebPage("http://www.example.org/index.html", 1, 1));
- createCrawlDB(list);
+ for (URLWebPage uwp : list) {
+ webPageStore.put(TableUtil.reverseUrl(uwp.getUrl()), uwp.getDatum());
+ }
Configuration myConfiguration = new Configuration(conf);
- myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 1);
- myConfiguration.setBoolean(Generator.GENERATE_MAX_PER_HOST_BY_IP, true);
-
- Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
- myConfiguration, false);
+ myConfiguration.setInt(GeneratorJob.GENERATE_MAX_PER_HOST, 1);
+ myConfiguration.setBoolean(GeneratorJob.GENERATE_MAX_PER_HOST_BY_IP, true);
- Path fetchlistPath = new Path(new Path(generatedSegment,
- CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+ generateFetchlist(Integer.MAX_VALUE, myConfiguration, false);
- ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
+ ArrayList<URLWebPage> fetchList = readContents();
// verify we got right amount of records
assertEquals(1, fetchList.size());
myConfiguration = new Configuration(myConfiguration);
- myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 2);
- generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration, false);
-
- fetchlistPath = new Path(new Path(generatedSegment,
- CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+ myConfiguration.setInt(GeneratorJob.GENERATE_MAX_PER_HOST, 2);
+ generateFetchlist(Integer.MAX_VALUE, myConfiguration, false);
- fetchList = readContents(fetchlistPath);
+ fetchList = readContents();
// verify we got right amount of records
assertEquals(2, fetchList.size());
myConfiguration = new Configuration(myConfiguration);
- myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 3);
- generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration,
- false);
+ myConfiguration.setInt(GeneratorJob.GENERATE_MAX_PER_HOST, 3);
+ generateFetchlist(Integer.MAX_VALUE, myConfiguration, false);
- fetchlistPath = new Path(new Path(generatedSegment,
- CrawlDatum.GENERATE_DIR_NAME), "part-00000");
-
- fetchList = readContents(fetchlistPath);
+ fetchList = readContents();
// verify we got right amount of records
assertEquals(3, fetchList.size());
@@ -240,109 +219,108 @@ public class TestGenerator extends TestC
/**
* Test generator obeys the filter setting.
- * @throws Exception
- * @throws IOException
+ *
+ * @throws Exception
+ * @throws IOException
*/
- public void testFilter() throws IOException, Exception{
+ public void testFilter() throws IOException, Exception {
- ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+ ArrayList<URLWebPage> list = new ArrayList<URLWebPage>();
- list.add(createURLCrawlDatum("http://www.example.com/index.html", 1, 1));
- list.add(createURLCrawlDatum("http://www.example.net/index.html", 1, 1));
- list.add(createURLCrawlDatum("http://www.example.org/index.html", 1, 1));
+ list.add(createURLWebPage("http://www.example.com/index.html", 1, 1));
+ list.add(createURLWebPage("http://www.example.net/index.html", 1, 1));
+ list.add(createURLWebPage("http://www.example.org/index.html", 1, 1));
- createCrawlDB(list);
+ for (URLWebPage uwp : list) {
+ webPageStore.put(TableUtil.reverseUrl(uwp.getUrl()), uwp.getDatum());
+ }
Configuration myConfiguration = new Configuration(conf);
myConfiguration.set("urlfilter.suffix.file", "filter-all.txt");
- Path generatedSegment = generateFetchlist(Integer.MAX_VALUE,
- myConfiguration, true);
+ generateFetchlist(Integer.MAX_VALUE, myConfiguration, true);
- assertNull("should be null (0 entries)", generatedSegment);
+ ArrayList<URLWebPage> fetchList = readContents();
- generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration, false);
+ assertEquals(0, fetchList.size());
- Path fetchlistPath = new Path(new Path(generatedSegment,
- CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+ generateFetchlist(Integer.MAX_VALUE, myConfiguration, false);
- ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
+ fetchList = readContents();
// verify nothing got filtered
assertEquals(list.size(), fetchList.size());
}
-
/**
- * Read contents of fetchlist.
- * @param fetchlist path to Generated fetchlist
- * @return Generated {@link URLCrawlDatum} objects
+ * Read entries marked as fetchable
+ *
+ * @return Generated {@link URLWebPage} objects
* @throws IOException
*/
- private ArrayList<URLCrawlDatum> readContents(Path fetchlist) throws IOException {
- // verify results
- SequenceFile.Reader reader = new SequenceFile.Reader(fs, fetchlist, conf);
-
- ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>();
-
- READ: do {
- Text key = new Text();
- CrawlDatum value = new CrawlDatum();
- if (!reader.next(key, value)) {
- break READ;
- }
- l.add(new URLCrawlDatum(key, value));
- } while (true);
+ private ArrayList<URLWebPage> readContents() throws IOException {
+ ArrayList<URLWebPage> l = new ArrayList<URLWebPage>();
+
+ Query<String, WebPage> query = webPageStore.newQuery();
+ query.setFields(WebPage.Field.MARKERS.getName());
+
+ Result<String, WebPage> results = webPageStore.execute(query);
+
+ while (results.next()) {
+ WebPage page = results.get();
+ String url = results.getKey();
+ LOG.info("FOUND IN TABLE :" + url);
+
+ if (page == null)
+ continue;
+
+ if (Mark.GENERATE_MARK.checkMark(page) == null)
+ continue;
+
+ // it has been marked as generated so it is ready for the fetch
+ l.add(new URLWebPage(TableUtil.unreverseUrl(url), page));
+ }
- reader.close();
return l;
}
/**
* Generate Fetchlist.
- * @param numResults number of results to generate
- * @param config Configuration to use
+ *
+ * @param numResults
+ * number of results to generate
+ * @param config
+ * Configuration to use
* @return path to generated segment
* @throws IOException
*/
- private Path generateFetchlist(int numResults, Configuration config,
- boolean filter) throws IOException {
+ private void generateFetchlist(int numResults, Configuration config,
+ boolean filter) throws Exception {
// generate segment
- Generator g = new Generator(config);
- Path generatedSegment = g.generate(dbDir, segmentsDir, -1, numResults,
- Long.MAX_VALUE, filter, false);
- return generatedSegment;
+ GeneratorJob g = new GeneratorJob();
+ g.setConf(config);
+ String crawlId = g.generate(numResults, Long.MAX_VALUE, filter, false);
+ if (crawlId == null)
+ throw new RuntimeException("Generator failed");
}
/**
- * Creates CrawlDB.
+ * Constructs new {@link URLWebPage} from submitted parameters.
*
- * @param list database contents
- * @throws IOException
- * @throws Exception
- */
- private void createCrawlDB(ArrayList<URLCrawlDatum> list) throws IOException,
- Exception {
- dbDir = new Path(testdir, "crawldb");
- segmentsDir = new Path(testdir, "segments");
- fs.mkdirs(dbDir);
- fs.mkdirs(segmentsDir);
-
- // create crawldb
- CrawlDBTestUtil.createCrawlDb(conf, fs, dbDir, list);
- }
-
- /**
- * Constructs new {@link URLCrawlDatum} from submitted parameters.
- * @param url url to use
- * @param fetchInterval {@link CrawlDatum#setFetchInterval(float)}
- * @param score {@link CrawlDatum#setScore(float)}
+ * @param url
+ * url to use
+ * @param fetchInterval
+ * @param score
* @return Constructed object
*/
- private URLCrawlDatum createURLCrawlDatum(final String url,
+ private URLWebPage createURLWebPage(final String url,
final int fetchInterval, final float score) {
- return new CrawlDBTestUtil.URLCrawlDatum(new Text(url), new CrawlDatum(
- CrawlDatum.STATUS_DB_UNFETCHED, fetchInterval, score));
+ WebPage page = new WebPage();
+ page.setFetchInterval(fetchInterval);
+ page.setScore(score);
+ page.setStatus(CrawlStatus.STATUS_UNFETCHED);
+ return new URLWebPage(url, page);
}
+
}
Modified: nutch/branches/nutchbase/src/test/org/apache/nutch/crawl/TestInjector.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/test/org/apache/nutch/crawl/TestInjector.java?rev=959259&r1=959258&r2=959259&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/test/org/apache/nutch/crawl/TestInjector.java (original)
+++ nutch/branches/nutchbase/src/test/org/apache/nutch/crawl/TestInjector.java Wed Jun 30 10:36:20 2010
@@ -16,108 +16,130 @@
*/
package org.apache.nutch.crawl;
-import java.io.IOException;
+import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
+import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.Text;
-
-import junit.framework.TestCase;
+import org.apache.hadoop.hbase.HBaseClusterTestCase;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.TableUtil;
+import org.gora.hbase.store.HBaseStore;
+import org.gora.query.Query;
+import org.gora.query.Result;
+import org.gora.store.DataStore;
+import org.gora.store.DataStoreFactory;
+import org.junit.Before;
/**
- * Basic injector test:
- * 1. Creates a text file with urls
- * 2. Injects them into crawldb
- * 3. Reads crawldb entries and verifies contents
- * 4. Injects more urls into webdb
- * 5. Reads crawldb entries and verifies contents
+ * Basic injector test: 1. Creates a text file with urls 2. Injects them into
+ * crawldb 3. Reads crawldb entries and verifies contents 4. Injects more urls
+ * into webdb 5. Reads crawldb entries and verifies contents
*
* @author nutch-dev <nutch-dev at lucene.apache.org>
*/
-public class TestInjector extends TestCase {
+public class TestInjector extends HBaseClusterTestCase {
private Configuration conf;
private FileSystem fs;
- final static Path testdir=new Path("build/test/inject-test");
- Path crawldbPath;
+ final static Path testdir = new Path("build/test/inject-test");
+ private DataStore<String, WebPage> webPageStore;
Path urlPath;
-
- protected void setUp() throws Exception {
+
+ @Before
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
conf = CrawlDBTestUtil.createConfiguration();
- urlPath=new Path(testdir,"urls");
- crawldbPath=new Path(testdir,"crawldb");
- fs=FileSystem.get(conf);
- if (fs.exists(urlPath)) fs.delete(urlPath, false);
- if (fs.exists(crawldbPath)) fs.delete(crawldbPath, true);
+ urlPath = new Path(testdir, "urls");
+ fs = FileSystem.get(conf);
+ if (fs.exists(urlPath))
+ fs.delete(urlPath, false);
+ webPageStore = DataStoreFactory.getDataStore(HBaseStore.class,
+ String.class, WebPage.class);
}
-
- protected void tearDown() throws IOException{
+
+ @Override
+ public void tearDown() throws Exception {
fs.delete(testdir, true);
+ webPageStore.close();
+ super.tearDown();
}
- public void testInject() throws IOException {
- ArrayList<String> urls=new ArrayList<String>();
- for(int i=0;i<100;i++) {
- urls.add("http://zzz.com/" + i + ".html");
+ public void testInject() throws Exception {
+ ArrayList<String> urls = new ArrayList<String>();
+ for (int i = 0; i < 100; i++) {
+ urls.add("http://zzz.com/" + i + ".html\tnutch.score=" + i
+ + "\tcustom.attribute=" + i);
}
CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
-
- Injector injector=new Injector(conf);
- injector.inject(crawldbPath, urlPath);
-
+
+ InjectorJob injector = new InjectorJob();
+ injector.setConf(conf);
+ injector.inject(urlPath);
+
// verify results
- List<String>read=readCrawldb();
-
+ List<String> read = readCrawldb();
+
Collections.sort(read);
Collections.sort(urls);
assertEquals(urls.size(), read.size());
-
- assertTrue(read.containsAll(urls));
+
assertTrue(urls.containsAll(read));
-
- //inject more urls
- ArrayList<String> urls2=new ArrayList<String>();
- for(int i=0;i<100;i++) {
+ assertTrue(read.containsAll(urls));
+
+ // inject more urls
+ ArrayList<String> urls2 = new ArrayList<String>();
+ for (int i = 0; i < 100; i++) {
urls2.add("http://xxx.com/" + i + ".html");
}
CrawlDBTestUtil.generateSeedList(fs, urlPath, urls2);
- injector.inject(crawldbPath, urlPath);
+ injector.inject(urlPath);
urls.addAll(urls2);
-
+
// verify results
- read=readCrawldb();
-
+ read = readCrawldb();
Collections.sort(read);
Collections.sort(urls);
assertEquals(urls.size(), read.size());
-
+
assertTrue(read.containsAll(urls));
assertTrue(urls.containsAll(read));
-
+
}
-
- private List<String> readCrawldb() throws IOException{
- Path dbfile=new Path(crawldbPath,CrawlDb.CURRENT_NAME + "/part-00000/data");
- System.out.println("reading:" + dbfile);
- SequenceFile.Reader reader=new SequenceFile.Reader(fs, dbfile, conf);
- ArrayList<String> read=new ArrayList<String>();
-
- READ:
- do {
- Text key=new Text();
- CrawlDatum value=new CrawlDatum();
- if(!reader.next(key, value)) break READ;
- read.add(key.toString());
- } while(true);
+ /**
+ * Read from a Gora datastore + make sure we get the score and custom metadata
+ *
+ * @throws ClassNotFoundException
+ **/
+ private List<String> readCrawldb() throws Exception {
+ ArrayList<String> read = new ArrayList<String>();
+
+ Query<String, WebPage> query = webPageStore.newQuery();
+ Result<String, WebPage> result = webPageStore.execute(query);
+
+ while (result.next()) {
+ String skey = result.getKey();
+ WebPage page = result.get();
+ float fscore = page.getScore();
+ String representation = TableUtil.unreverseUrl(skey);
+ ByteBuffer bb = page.getFromMetadata(new Utf8("custom.attribute"));
+ if (bb != null) {
+ representation += "\tnutch.score=" + (int) fscore;
+ representation += "\tcustom.attribute=" + Bytes.toString(bb.array());
+ }
+ read.add(representation);
+ }
+ result.close();
return read;
}
Modified: nutch/branches/nutchbase/src/test/org/apache/nutch/indexer/TestIndexingFilters.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/test/org/apache/nutch/indexer/TestIndexingFilters.java?rev=959259&r1=959258&r2=959259&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/test/org/apache/nutch/indexer/TestIndexingFilters.java (original)
+++ nutch/branches/nutchbase/src/test/org/apache/nutch/indexer/TestIndexingFilters.java Wed Jun 30 10:36:20 2010
@@ -16,19 +16,13 @@
*/
package org.apache.nutch.indexer;
+import junit.framework.TestCase;
+
+import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.NutchConfiguration;
-import junit.framework.TestCase;
-
public class TestIndexingFilters extends TestCase {
/**
@@ -42,9 +36,13 @@ public class TestIndexingFilters extends
conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
IndexingFilters filters = new IndexingFilters(conf);
- filters.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
- new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
- "http://www.example.com/"), new CrawlDatum(), new Inlinks());
+// filters.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
+// new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text(
+// "http://www.example.com/"), new CrawlDatum(), new Inlinks());
+ WebPage page = new WebPage();
+ page.setText(new Utf8("text"));
+ page.setTitle(new Utf8("title"));
+ filters.filter(new NutchDocument(),"http://www.example.com/",page);
}
}
Modified: nutch/branches/nutchbase/src/test/org/apache/nutch/parse/TestParserFactory.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/test/org/apache/nutch/parse/TestParserFactory.java?rev=959259&r1=959258&r2=959259&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/test/org/apache/nutch/parse/TestParserFactory.java (original)
+++ nutch/branches/nutchbase/src/test/org/apache/nutch/parse/TestParserFactory.java Wed Jun 30 10:36:20 2010
@@ -54,7 +54,7 @@ public class TestParserFactory extends T
ext = (Extension) parserFactory.getExtensions("text/html; charset=ISO-8859-1").get(0);
assertEquals("parse-html", ext.getDescriptor().getPluginId());
ext = (Extension)parserFactory.getExtensions("foo/bar").get(0);
- assertEquals("parse-text", ext.getDescriptor().getPluginId());
+ assertEquals("parse-tika", ext.getDescriptor().getPluginId());
}
/** Unit test to check <code>getParsers</code> method */
@@ -82,7 +82,7 @@ public class TestParserFactory extends T
parsers = parserFactory.getParsers("text/plain", "http://foo.com");
assertNotNull(parsers);
assertEquals(1, parsers.length);
- assertEquals("org.apache.nutch.parse.text.TextParser",
+ assertEquals("org.apache.nutch.parse.tika.TikaParser",
parsers[0].getClass().getName());
Parser parser1 = parserFactory.getParsers("text/plain", "http://foo.com")[0];
Modified: nutch/branches/nutchbase/src/test/org/apache/nutch/parse/parse-plugin-test.xml
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/test/org/apache/nutch/parse/parse-plugin-test.xml?rev=959259&r1=959258&r2=959259&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/test/org/apache/nutch/parse/parse-plugin-test.xml (original)
+++ nutch/branches/nutchbase/src/test/org/apache/nutch/parse/parse-plugin-test.xml Wed Jun 30 10:36:20 2010
@@ -22,9 +22,9 @@
<parse-plugins>
<!-- by default if the mimeType is set to *, or
- can't be determined, use parse-text -->
+ if it can't be determined, use parse-tika -->
<mimeType name="*">
- <plugin id="parse-text" />
+ <plugin id="parse-tika" />
</mimeType>
<!-- test these 4 plugins -->
@@ -39,7 +39,7 @@
<mimeType name="text/plain">
<!-- Test that an extension-id can be directly used here -->
- <plugin id="org.apache.nutch.parse.text.TextParser"/>
+ <plugin id="org.apache.nutch.parse.tika.TikaParser"/>
</mimeType>
<mimeType name="application/x-javascript">
@@ -59,7 +59,7 @@
extension-id="JSParser" />
<alias name="parse-rss"
extension-id="org.apache.nutch.parse.rss.RSSParser" />
- <alias name="parse-text"
- extension-id="org.apache.nutch.parse.text.TextParser" />
+ <alias name="parse-tika"
+ extension-id="org.apache.nutch.parse.tika.TikaParser" />
</aliases>
</parse-plugins>
Modified: nutch/branches/nutchbase/src/test/org/apache/nutch/protocol/TestContent.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/test/org/apache/nutch/protocol/TestContent.java?rev=959259&r1=959258&r2=959259&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/test/org/apache/nutch/protocol/TestContent.java (original)
+++ nutch/branches/nutchbase/src/test/org/apache/nutch/protocol/TestContent.java Wed Jun 30 10:36:20 2010
@@ -99,7 +99,7 @@ public class TestContent extends TestCas
"http://www.foo.com/",
"".getBytes("UTF8"),
"", p, conf);
- assertEquals(MimeTypes.DEFAULT, c.getContentType());
+ assertEquals(MimeTypes.OCTET_STREAM, c.getContentType());
c = new Content("http://www.foo.com/",
"http://www.foo.com/",
Modified: nutch/branches/nutchbase/src/test/org/apache/nutch/util/TestEncodingDetector.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/test/org/apache/nutch/util/TestEncodingDetector.java?rev=959259&r1=959258&r2=959259&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/test/org/apache/nutch/util/TestEncodingDetector.java (original)
+++ nutch/branches/nutchbase/src/test/org/apache/nutch/util/TestEncodingDetector.java Wed Jun 30 10:36:20 2010
@@ -1,13 +1,14 @@
package org.apache.nutch.util;
import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import junit.framework.TestCase;
+
+import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.Content;
-
-import junit.framework.TestCase;
+import org.apache.nutch.storage.WebPage;
public class TestEncodingDetector extends TestCase {
private static Configuration conf = NutchConfiguration.create();
@@ -30,47 +31,56 @@ public class TestEncodingDetector extend
// first disable auto detection
conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, -1);
- Metadata metadata = new Metadata();
+ //Metadata metadata = new Metadata();
EncodingDetector detector;
- Content content;
+ // Content content;
String encoding;
- content = new Content("http://www.example.com", "http://www.example.com/",
- contentInOctets, "text/plain", metadata, conf);
+ WebPage page = new WebPage();
+ page.setBaseUrl(new Utf8("http://www.example.com/"));
+ page.setContentType(new Utf8("text/plain"));
+ page.setContent(ByteBuffer.wrap(contentInOctets));
+
detector = new EncodingDetector(conf);
- detector.autoDetectClues(content, true);
- encoding = detector.guessEncoding(content, "windows-1252");
+ detector.autoDetectClues(page, true);
+ encoding = detector.guessEncoding(page, "windows-1252");
// no information is available, so it should return default encoding
assertEquals("windows-1252", encoding.toLowerCase());
- metadata.clear();
- metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
- content = new Content("http://www.example.com", "http://www.example.com/",
- contentInOctets, "text/plain", metadata, conf);
+ page = new WebPage();
+ page.setBaseUrl(new Utf8("http://www.example.com/"));
+ page.setContentType(new Utf8("text/plain"));
+ page.setContent(ByteBuffer.wrap(contentInOctets));
+ page.putToHeaders(EncodingDetector.CONTENT_TYPE_UTF8, new Utf8("text/plain; charset=UTF-16"));
+
detector = new EncodingDetector(conf);
- detector.autoDetectClues(content, true);
- encoding = detector.guessEncoding(content, "windows-1252");
+ detector.autoDetectClues(page, true);
+ encoding = detector.guessEncoding(page, "windows-1252");
assertEquals("utf-16", encoding.toLowerCase());
- metadata.clear();
- content = new Content("http://www.example.com", "http://www.example.com/",
- contentInOctets, "text/plain", metadata, conf);
+ page = new WebPage();
+ page.setBaseUrl(new Utf8("http://www.example.com/"));
+ page.setContentType(new Utf8("text/plain"));
+ page.setContent(ByteBuffer.wrap(contentInOctets));
+
detector = new EncodingDetector(conf);
- detector.autoDetectClues(content, true);
+ detector.autoDetectClues(page, true);
detector.addClue("windows-1254", "sniffed");
- encoding = detector.guessEncoding(content, "windows-1252");
+ encoding = detector.guessEncoding(page, "windows-1252");
assertEquals("windows-1254", encoding.toLowerCase());
// enable autodetection
conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50);
- metadata.clear();
- metadata.set(Response.CONTENT_TYPE, "text/plain; charset=UTF-16");
- content = new Content("http://www.example.com", "http://www.example.com/",
- contentInOctets, "text/plain", metadata, conf);
+ page = new WebPage();
+ page.setBaseUrl(new Utf8("http://www.example.com/"));
+ page.setContentType(new Utf8("text/plain"));
+ page.setContent(ByteBuffer.wrap(contentInOctets));
+ page.putToMetadata(new Utf8(Response.CONTENT_TYPE), ByteBuffer.wrap("text/plain; charset=UTF-16".getBytes()));
+
detector = new EncodingDetector(conf);
- detector.autoDetectClues(content, true);
+ detector.autoDetectClues(page, true);
detector.addClue("utf-32", "sniffed");
- encoding = detector.guessEncoding(content, "windows-1252");
+ encoding = detector.guessEncoding(page, "windows-1252");
assertEquals("utf-8", encoding.toLowerCase());
}
Modified: nutch/branches/nutchbase/src/test/org/apache/nutch/util/TestTableUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/nutchbase/src/test/org/apache/nutch/util/TestTableUtil.java?rev=959259&r1=959258&r2=959259&view=diff
==============================================================================
--- nutch/branches/nutchbase/src/test/org/apache/nutch/util/TestTableUtil.java (original)
+++ nutch/branches/nutchbase/src/test/org/apache/nutch/util/TestTableUtil.java Wed Jun 30 10:36:20 2010
@@ -1,6 +1,6 @@
package org.apache.nutch.util;
-import org.apache.nutch.util.hbase.TableUtil;
+import org.apache.nutch.util.TableUtil;
import junit.framework.TestCase;
public class TestTableUtil extends TestCase {
@@ -41,11 +41,11 @@ public class TestTableUtil extends TestC
private static void assertReverse(String url, String expectedReversedUrl) throws Exception {
String reversed = TableUtil.reverseUrl(url);
- assertEquals(reversed, expectedReversedUrl);
+ assertEquals(expectedReversedUrl, reversed);
}
private static void assertUnreverse(String reversedUrl, String expectedUrl) {
String unreversed = TableUtil.unreverseUrl(reversedUrl);
- assertEquals(unreversed, expectedUrl);
+ assertEquals(expectedUrl, unreversed);
}
}