You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@gora.apache.org by le...@apache.org on 2014/06/04 18:36:42 UTC
[25/50] [abbrv] git commit: Remove nutch references.
Remove nutch references.
Project: http://git-wip-us.apache.org/repos/asf/gora/repo
Commit: http://git-wip-us.apache.org/repos/asf/gora/commit/77ccfec6
Tree: http://git-wip-us.apache.org/repos/asf/gora/tree/77ccfec6
Diff: http://git-wip-us.apache.org/repos/asf/gora/diff/77ccfec6
Branch: refs/heads/master
Commit: 77ccfec66f5cc9249898fe4f2d9cf0e37f398098
Parents: a7c5f77
Author: Damien Raude-Morvan <da...@dictanova.com>
Authored: Sun May 18 00:02:46 2014 +0200
Committer: Damien Raude-Morvan <da...@dictanova.com>
Committed: Sun May 18 00:02:46 2014 +0200
----------------------------------------------------------------------
.../conf/nutch/gora-mongodb-mapping.xml | 51 -----
.../src/examples/conf/nutch/gora.properties | 90 ---------
.../src/examples/conf/nutch/log4j.properties | 96 ----------
.../src/examples/conf/nutch/nutch-site.xml | 189 -------------------
.../gora/mongodb/store/TestMongoStoreNutch.java | 77 --------
5 files changed, 503 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/gora/blob/77ccfec6/gora-mongodb/src/examples/conf/nutch/gora-mongodb-mapping.xml
----------------------------------------------------------------------
diff --git a/gora-mongodb/src/examples/conf/nutch/gora-mongodb-mapping.xml b/gora-mongodb/src/examples/conf/nutch/gora-mongodb-mapping.xml
deleted file mode 100644
index 48d496c..0000000
--- a/gora-mongodb/src/examples/conf/nutch/gora-mongodb-mapping.xml
+++ /dev/null
@@ -1,51 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<gora-orm>
-
- <class document="frontier" keyClass="java.lang.String" name="org.apache.nutch.storage.WebPage">
- <field name="baseUrl" docfield="baseUrl" type="string"/>
- <field name="status" docfield="status" type="int32"/>
- <field name="fetchTime" docfield="fetchTime" type="int64"/>
- <field name="prevFetchTime" docfield="prevFetchTime" type="int64"/>
- <field name="fetchInterval" docfield="fetchInterval" type="int32"/>
- <field name="retriesSinceFetch" docfield="retriesSinceFetch" type="int32"/>
- <field name="modifiedTime" docfield="modifiedTime" type="int64"/>
- <field name="protocolStatus" docfield="protocolStatus" type="document"/>
- <field name="content" docfield="content" type="binary"/>
- <field name="contentType" docfield="contentType" type="string"/>
- <field name="signature" docfield="signature" type="binary"/>
- <field name="prevSignature" docfield="prevSignature" type="binary"/>
- <field name="title" docfield="title" type="string"/>
- <field name="text" docfield="text" type="string"/>
- <field name="parseStatus" docfield="parseStatus" type="document"/>
- <field name="score" docfield="score" type="double"/>
- <field name="reprUrl" docfield="reprUrl" type="string"/>
- <field name="headers" docfield="headers" type="document"/>
- <field name="outlinks" docfield="outlinks" type="document"/>
- <field name="inlinks" docfield="inlinks" type="document"/>
- <field name="markers" docfield="markers" type="document"/>
- <field name="metadata" docfield="metadata" type="document"/>
- </class>
-
- <class document="hosts" keyClass="java.lang.String" name="org.apache.nutch.storage.Host">
- <field name="metadata" docfield="metadata" type="document"/>
- <field name="outlinks" docfield="links.out" type="document"/>
- <field name="inlinks" docfield="links.in" type="document"/>
- </class>
-
-</gora-orm>
http://git-wip-us.apache.org/repos/asf/gora/blob/77ccfec6/gora-mongodb/src/examples/conf/nutch/gora.properties
----------------------------------------------------------------------
diff --git a/gora-mongodb/src/examples/conf/nutch/gora.properties b/gora-mongodb/src/examples/conf/nutch/gora.properties
deleted file mode 100644
index 1d15229..0000000
--- a/gora-mongodb/src/examples/conf/nutch/gora.properties
+++ /dev/null
@@ -1,90 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#gora.datastore.default=org.apache.gora.mock.store.MockDataStore
-gora.datastore.autocreateschema=true
-
-###############################
-# Default SqlStore properties #
-###############################
-
-gora.sqlstore.jdbc.driver=org.hsqldb.jdbc.JDBCDriver
-gora.sqlstore.jdbc.url=jdbc:hsqldb:hsql://localhost/nutchtest
-gora.sqlstore.jdbc.user=sa
-gora.sqlstore.jdbc.password=
-
-################################
-# Default AvroStore properties #
-################################
-
-# gora.avrostore.codec.type=BINARY||JSON
-# gora.avrostore.output.path=file:///tmp/gora.avrostore.test.output
-
-################################
-# DatafileAvroStore properties #
-################################
-# DataFileAvroStore is file based store which uses Avro's
-# DataFile{Writer,Reader}'s as a backend. This datastore supports
-# mapreduce.
-
-# gora.datafileavrostore.###=
-
-#########################
-# HBaseStore properties #
-#########################
-# HBase requires that the Configuration has a valid "hbase.zookeeper.quorum"
-# property. It should be included within hbase-site.xml on the classpath. When
-# this property is omitted, it expects Zookeeper to run on localhost:2181.
-
-# To greatly improve scan performance, increase the hbase-site Configuration
-# property "hbase.client.scanner.caching". This sets the number of rows to grab
-# per request.
-
-# HBase autoflushing. Enabling autoflush decreases write performance.
-# Available since Gora 0.2. Defaults to disabled.
-# hbase.client.autoflush.default=false
-
-#############################
-# CassandraStore properties #
-#############################
-
-# gora.cassandrastore.servers=localhost:9160
-
-#######################
-# MemStore properties #
-#######################
-# This is a memory based {@link DataStore} implementation for tests.
-
-# gora.memstore.###=
-
-############################
-# AccumuloStore properties #
-############################
-#gora.datastore.default=org.apache.gora.accumulo.store.AccumuloStore
-#gora.datastore.accumulo.mock=true
-#gora.datastore.accumulo.instance=a14
-#gora.datastore.accumulo.zookeepers=localhost
-#gora.datastore.accumulo.user=root
-#gora.datastore.accumulo.password=secret
-
-############################
-# MongoDBStore properties #
-############################
-gora.datastore.default=org.apache.gora.mongodb.store.MongoStore
-gora.mongodb.override_hadoop_configuration=true
-gora.mongodb.mapping.file=/gora-mongodb-mapping.xml
-gora.mongodb.servers=localhost
-gora.mongodb.db=nutchtest
-
http://git-wip-us.apache.org/repos/asf/gora/blob/77ccfec6/gora-mongodb/src/examples/conf/nutch/log4j.properties
----------------------------------------------------------------------
diff --git a/gora-mongodb/src/examples/conf/nutch/log4j.properties b/gora-mongodb/src/examples/conf/nutch/log4j.properties
deleted file mode 100644
index e1e839e..0000000
--- a/gora-mongodb/src/examples/conf/nutch/log4j.properties
+++ /dev/null
@@ -1,96 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Define some default values that can be overridden by system properties
-hadoop.log.dir=.
-hadoop.log.file=hadoop.log
-
-# RootLogger - DailyRollingFileAppender
-log4j.rootLogger=INFO,DRFA
-
-# Logging Threshold
-log4j.threshhold=ALL
-
-#special logging requirements for some commandline tools
-log4j.logger.org.apache.nutch.crawl.Crawl=INFO,cmdstdout
-log4j.logger.org.apache.nutch.crawl.InjectorJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.host.HostInjectorJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.crawl.GeneratorJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.crawl.DbUpdaterJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.host.HostDbUpdateJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.fetcher.FetcherJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.parse.ParserJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.IndexerJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.solr.SolrIndexerJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.solr.SolrWriter=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.DeleteDuplicates=INFO,cmdstdout
-log4j.logger.org.apache.nutch.crawl.WebTableReader=INFO,cmdstdout
-log4j.logger.org.apache.nutch.host.HostDbReader=INFO,cmdstdout
-
-log4j.logger.org.apache.nutch=WARN
-log4j.logger.org.apache.hadoop=WARN
-log4j.logger.org.apache.zookeeper=WARN
-
-#
-# Daily Rolling File Appender
-#
-
-log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender
-log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file}
-
-# Rollover at midnight
-log4j.appender.DRFA.DatePattern=.yyyy-MM-dd
-
-# 30-day backup
-#log4j.appender.DRFA.MaxBackupIndex=30
-log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout
-
-# Pattern format: Date LogLevel LoggerName LogMessage
-log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} - %m%n
-# Debugging Pattern format: Date LogLevel LoggerName (FileName:MethodName:LineNo) LogMessage
-#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
-
-
-#
-# stdout
-# Add *stdout* to rootlogger above if you want to use this
-#
-
-log4j.appender.stdout=org.apache.log4j.ConsoleAppender
-log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
-log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
-
-#
-# plain layout used for commandline tools to output to console
-#
-log4j.appender.cmdstdout=org.apache.log4j.ConsoleAppender
-log4j.appender.cmdstdout.layout=org.apache.log4j.PatternLayout
-log4j.appender.cmdstdout.layout.ConversionPattern=%m%n
-
-#
-# Rolling File Appender
-#
-
-#log4j.appender.RFA=org.apache.log4j.RollingFileAppender
-#log4j.appender.RFA.File=${hadoop.log.dir}/${hadoop.log.file}
-
-# Logfile size and 30-day backups
-#log4j.appender.RFA.MaxFileSize=1MB
-#log4j.appender.RFA.MaxBackupIndex=30
-
-#log4j.appender.RFA.layout=org.apache.log4j.PatternLayout
-#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} - %m%n
-#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
-
http://git-wip-us.apache.org/repos/asf/gora/blob/77ccfec6/gora-mongodb/src/examples/conf/nutch/nutch-site.xml
----------------------------------------------------------------------
diff --git a/gora-mongodb/src/examples/conf/nutch/nutch-site.xml b/gora-mongodb/src/examples/conf/nutch/nutch-site.xml
deleted file mode 100644
index 49083d9..0000000
--- a/gora-mongodb/src/examples/conf/nutch/nutch-site.xml
+++ /dev/null
@@ -1,189 +0,0 @@
-<?xml version="1.0"?>
-<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
-
-<!-- Put site-specific property overrides in this file. -->
-
-<configuration>
-
-<property>
- <name>http.agent.name</name>
- <value>nutch-crawler</value>
-</property>
-
-<property>
- <name>http.robots.agents</name>
- <value>nutch-crawler,*</value>
-</property>
-
-<property>
- <name>http.accept.language</name>
- <value>fr-fr,fr,en;q=0.7,*;q=0.3</value>
-</property>
-
-
-<property>
- <name>db.fetch.schedule.class</name>
- <value>org.apache.nutch.crawl.AdaptiveFetchSchedule</value>
-</property>
-
-<property>
- <name>fetcher.throughput.threshold.pages</name>
- <value>0.8</value>
- <description>The threshold of minimum pages per second. If the fetcher downloads fewer
- pages per second than the configured threshold, the fetcher stops, preventing slow queues
- from stalling the throughput. This threshold must be an integer. This can be useful when
- fetcher.timelimit.mins is hard to determine. The default value of -1 disables this check.
- </description>
-</property>
-
-<property>
- <name>generate.update.crawldb</name>
- <value>true</value>
- <description>For highly-concurrent environments, where several
- generate/fetch/update cycles may overlap, setting this to true ensures
- that generate will create different fetchlists even without intervening
- updatedb-s, at the cost of running an additional job to update CrawlDB.
- If false, running generate twice without intervening
- updatedb will generate identical fetchlists.</description>
-</property>
-
-<property>
- <name>crawl.gen.delay</name>
- <value>86400000</value> <!-- Make it one day -->
- <description>
- This value, expressed in milliseconds, defines how long we should keep the lock on records
- in CrawlDb that were just selected for fetching. If these records are not updated
- in the meantime, the lock is canceled, i.e. they become eligible for selecting.
- Default value of this is 7 days.
- </description>
-</property>
-
-<property>
- <name>fetcher.parse</name>
- <value>true</value>
- <description>If true, fetcher will parse content. NOTE: previous releases would
- default to true. Since 2.0 this is set to false as a safer default.</description>
-</property>
-
-<property>
- <name>parser.html.outlinks.ignore_tags</name>
- <value>img,script,link</value>
- <description>Comma separated list of HTML tags, from which outlinks
- shouldn't be extracted. Nutch takes links from: a, area, form, frame,
- iframe, script, link, img. If you add any of those tags here, it
- won't be taken. Default is empty list. Probably reasonable value
- for most people would be "img,script,link".</description>
-</property>
-
-<property>
- <name>plugin.includes</name>
- <value>protocol-http|urlfilter-regex|parse-(html|tika)|urlnormalizer-(pass|regex|basic)|scoring-opic</value>
-</property>
-
-<property>
- <name>mapred.task.timeout</name>
- <value>600000</value>
- <!-- Max 10 minutes idle -->
-</property>
-
-<property>
- <name>parser.html.impl</name>
- <value>tagsoup</value>
- <description>HTML Parser implementation. Currently the following keywords
- are recognized: "neko" uses NekoHTML, "tagsoup" uses TagSoup.
- </description>
-</property>
-
-<property>
- <name>db.update.additions.allowed</name>
- <value>true</value>
- <description>If true, updatedb will add newly discovered URLs, if false
- only already existing URLs in the CrawlDb will be updated and no new
- URLs will be added.
- </description>
-</property>
-
-<property>
- <name>db.ignore.internal.links</name>
- <value>false</value>
- <description>If true, when adding new links to a page, links from
- the same host are ignored. This is an effective way to limit the
- size of the link database, keeping only the highest quality
- links.
- </description>
-</property>
-
-<property>
- <name>db.score.link.external</name>
- <value>2.0</value>
- <description>The score factor for new pages added due to a link from
- another host relative to the referencing page's score. Scoring plugins
- may use this value to affect initial scores of external links.
- </description>
-</property>
-
-<property>
- <name>db.score.link.internal</name>
- <value>0.5</value>
- <description>The score factor for pages added due to a link from the
- same host, relative to the referencing page's score. Scoring plugins
- may use this value to affect initial scores of internal links.
- </description>
-</property>
-
-<property>
- <name>db.parsemeta.to.crawldb</name>
- <value>lang</value>
- <description>Comma-separated list of parse metadata keys to transfer to the crawldb (NUTCH-779).
- Assuming for instance that the languageidentifier plugin is enabled, setting the value to 'lang'
- will copy both the key 'lang' and its value to the corresponding entry in the crawldb.
- </description>
-</property>
-
-<property>
- <name>generate.max.count</name>
- <value>10000</value>
- <description>The maximum number of urls in a single
- fetchlist. -1 if unlimited. The urls are counted according
- to the value of the parameter generator.count.mode.
- </description>
-</property>
-
-<!-- storage properties -->
-
-<property>
- <name>storage.data.store.class</name>
- <value>org.apache.gora.mongodb.store.MongoStore</value>
-</property>
-
-<property>
- <name>storage.schema.webpage</name>
- <value>frontier</value>
- <description>This value holds the schema name used for Nutch web db.
- Note that Nutch ignores the value in the gora mapping files, and uses
- this as the webpage schema name.
- </description>
-</property>
-
-<property>
- <name>storage.schema.host</name>
- <value>host</value>
- <description>This value holds the schema name used for Nutch host db.
- Note that Nutch ignores the value in the gora mapping files, and uses
- this as the host schema name.
- </description>
-</property>
-
-<property>
- <name>storage.crawl.id</name>
- <value></value>
- <description>This value helps differentiate between the datasets that
- the jobs in the crawl cycle generate and operate on. The value will
- be input to all the jobs which then will use it as a prefix when
- accessing the schemas. The default configuration uses no id to prefix
- the schemas. The value could also be given as a command line argument
- to each job.
- </description>
-</property>
-
-</configuration>
http://git-wip-us.apache.org/repos/asf/gora/blob/77ccfec6/gora-mongodb/src/test/java/org/apache/gora/mongodb/store/TestMongoStoreNutch.java
----------------------------------------------------------------------
diff --git a/gora-mongodb/src/test/java/org/apache/gora/mongodb/store/TestMongoStoreNutch.java b/gora-mongodb/src/test/java/org/apache/gora/mongodb/store/TestMongoStoreNutch.java
deleted file mode 100644
index 8487b45..0000000
--- a/gora-mongodb/src/test/java/org/apache/gora/mongodb/store/TestMongoStoreNutch.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.gora.mongodb.store;
-
-import org.apache.gora.mapreduce.GoraOutputFormat;
-//import org.apache.gora.mongodb.beans.tests.WebPage;
-import org.apache.gora.store.DataStore;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-//import org.apache.nutch.crawl.InjectorJob.UrlMapper;
-//import org.apache.nutch.storage.StorageUtils;
-//import org.apache.nutch.util.NutchJob;
-//import org.apache.nutch.util.ToolUtil;
-import org.junit.Ignore;
-import org.junit.Test;
-
-import java.io.IOException;
-import java.util.HashMap;
-
-@Ignore("Needs Nutch configuration")
-public class TestMongoStoreNutch {
-
- /**
- * For this test to work, it is necessary to provide:
- * - a plugin directory with at least one urlnormalizer
- * - include the jar of the used plugins in the classpath
- *
- * @throws IOException
- * @throws ClassNotFoundException
- * @throws InterruptedException
- */
- /*@Test
- public void testNutchInjection() throws IOException, ClassNotFoundException, InterruptedException {
- Path input = new Path("src/it/resources/test-nutch-inject.csv");
- Configuration conf = new Configuration();
- conf.set("storage.data.store.class", "org.apache.gora.mongodb.store.MongoStore");
- conf.set("plugin.folders", "/home/grdscarabe/PROJECTS/DictaLab/workspace-nutch/nutch-2.1/runtime/local/plugins"); // FIXME
- conf.set("plugin.auto-activation", "true");
- conf.set("plugin.includes", "urlnormalizer-basic");
- conf.set("plugin.excludes", "");
-
- HashMap<String, Object> results = new HashMap<String, Object>();
- Job currentJob = new NutchJob(conf, "inject " + input);
- FileInputFormat.addInputPath(currentJob, input);
- currentJob.setMapperClass(UrlMapper.class);
- currentJob.setMapOutputKeyClass(String.class);
- currentJob.setMapOutputValueClass(WebPage.class);
- currentJob.setOutputFormatClass(GoraOutputFormat.class);
- DataStore<String, org.apache.nutch.storage.WebPage> store =
- StorageUtils.createWebStore(currentJob.getConfiguration(),
- String.class, org.apache.nutch.storage.WebPage.class);
- GoraOutputFormat.setOutput(currentJob, store, true);
- currentJob.setReducerClass(Reducer.class);
- currentJob.setNumReduceTasks(0);
- currentJob.waitForCompletion(true);
- ToolUtil.recordJobStatus(null, currentJob, results);
- }
- */
-}