You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@gora.apache.org by le...@apache.org on 2014/06/04 18:36:42 UTC
[25/50] [abbrv] git commit: Remove nutch references.

Remove nutch references.


Project: http://git-wip-us.apache.org/repos/asf/gora/repo
Commit: http://git-wip-us.apache.org/repos/asf/gora/commit/77ccfec6
Tree: http://git-wip-us.apache.org/repos/asf/gora/tree/77ccfec6
Diff: http://git-wip-us.apache.org/repos/asf/gora/diff/77ccfec6

Branch: refs/heads/master
Commit: 77ccfec66f5cc9249898fe4f2d9cf0e37f398098
Parents: a7c5f77
Author: Damien Raude-Morvan <da...@dictanova.com>
Authored: Sun May 18 00:02:46 2014 +0200
Committer: Damien Raude-Morvan <da...@dictanova.com>
Committed: Sun May 18 00:02:46 2014 +0200

----------------------------------------------------------------------
 .../conf/nutch/gora-mongodb-mapping.xml         |  51 -----
 .../src/examples/conf/nutch/gora.properties     |  90 ---------
 .../src/examples/conf/nutch/log4j.properties    |  96 ----------
 .../src/examples/conf/nutch/nutch-site.xml      | 189 -------------------
 .../gora/mongodb/store/TestMongoStoreNutch.java |  77 --------
 5 files changed, 503 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/gora/blob/77ccfec6/gora-mongodb/src/examples/conf/nutch/gora-mongodb-mapping.xml
----------------------------------------------------------------------
diff --git a/gora-mongodb/src/examples/conf/nutch/gora-mongodb-mapping.xml b/gora-mongodb/src/examples/conf/nutch/gora-mongodb-mapping.xml
deleted file mode 100644
index 48d496c..0000000
--- a/gora-mongodb/src/examples/conf/nutch/gora-mongodb-mapping.xml
+++ /dev/null
@@ -1,51 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-  
-  http://www.apache.org/licenses/LICENSE-2.0
-  
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
--->
-<gora-orm>
-   
-	<class document="frontier" keyClass="java.lang.String" name="org.apache.nutch.storage.WebPage">
-		<field name="baseUrl" docfield="baseUrl" type="string"/>
-		<field name="status" docfield="status" type="int32"/>
-		<field name="fetchTime" docfield="fetchTime" type="int64"/>
-		<field name="prevFetchTime" docfield="prevFetchTime" type="int64"/>
-		<field name="fetchInterval" docfield="fetchInterval" type="int32"/>
-		<field name="retriesSinceFetch" docfield="retriesSinceFetch" type="int32"/>
-		<field name="modifiedTime" docfield="modifiedTime" type="int64"/>
-		<field name="protocolStatus" docfield="protocolStatus" type="document"/>
-		<field name="content" docfield="content" type="binary"/>
-		<field name="contentType" docfield="contentType" type="string"/>
-		<field name="signature" docfield="signature" type="binary"/>
-		<field name="prevSignature" docfield="prevSignature" type="binary"/>
-		<field name="title" docfield="title" type="string"/>
-		<field name="text" docfield="text" type="string"/>
-		<field name="parseStatus" docfield="parseStatus" type="document"/>
-		<field name="score" docfield="score" type="double"/>
-		<field name="reprUrl" docfield="reprUrl" type="string"/>
-		<field name="headers" docfield="headers" type="document"/>
-		<field name="outlinks" docfield="outlinks" type="document"/>
-		<field name="inlinks" docfield="inlinks" type="document"/>
-		<field name="markers" docfield="markers" type="document"/>
-		<field name="metadata" docfield="metadata" type="document"/>
-	</class>
-
-	<class document="hosts" keyClass="java.lang.String" name="org.apache.nutch.storage.Host">
-		<field name="metadata" docfield="metadata" type="document"/>
-		<field name="outlinks" docfield="links.out" type="document"/>
-		<field name="inlinks" docfield="links.in" type="document"/>
-	</class>
-    
-</gora-orm>

http://git-wip-us.apache.org/repos/asf/gora/blob/77ccfec6/gora-mongodb/src/examples/conf/nutch/gora.properties
----------------------------------------------------------------------
diff --git a/gora-mongodb/src/examples/conf/nutch/gora.properties b/gora-mongodb/src/examples/conf/nutch/gora.properties
deleted file mode 100644
index 1d15229..0000000
--- a/gora-mongodb/src/examples/conf/nutch/gora.properties
+++ /dev/null
@@ -1,90 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#gora.datastore.default=org.apache.gora.mock.store.MockDataStore
-gora.datastore.autocreateschema=true
-
-###############################
-# Default SqlStore properties #
-###############################
-
-gora.sqlstore.jdbc.driver=org.hsqldb.jdbc.JDBCDriver
-gora.sqlstore.jdbc.url=jdbc:hsqldb:hsql://localhost/nutchtest
-gora.sqlstore.jdbc.user=sa
-gora.sqlstore.jdbc.password=
-
-################################
-# Default AvroStore properties #
-################################
-
-# gora.avrostore.codec.type=BINARY||JSON
-# gora.avrostore.output.path=file:///tmp/gora.avrostore.test.output
-
-################################
-# DatafileAvroStore properties #
-################################
-# DataFileAvroStore is file based store which uses Avro's 
-# DataFile{Writer,Reader}'s as a backend. This datastore supports 
-# mapreduce.
-
-# gora.datafileavrostore.###=
-
-#########################
-# HBaseStore properties #
-#########################
-# HBase requires that the Configuration has a valid "hbase.zookeeper.quorum"
-# property. It should be included within hbase-site.xml on the classpath. When
-# this property is omitted, it expects Zookeeper to run on localhost:2181.
-
-# To greatly improve scan performance, increase the hbase-site Configuration
-# property "hbase.client.scanner.caching". This sets the number of rows to grab
-# per request.
-
-# HBase autoflushing. Enabling autoflush decreases write performance. 
-# Available since Gora 0.2. Defaults to disabled.
-# hbase.client.autoflush.default=false
-
-#############################
-# CassandraStore properties #
-#############################
-
-# gora.cassandrastore.servers=localhost:9160
-
-#######################
-# MemStore properties #
-#######################
-# This is a memory based {@link DataStore} implementation for tests.
-
-# gora.memstore.###=
-
-############################
-# AccumuloStore properties #
-############################
-#gora.datastore.default=org.apache.gora.accumulo.store.AccumuloStore
-#gora.datastore.accumulo.mock=true
-#gora.datastore.accumulo.instance=a14
-#gora.datastore.accumulo.zookeepers=localhost
-#gora.datastore.accumulo.user=root
-#gora.datastore.accumulo.password=secret
-
-############################
-# MongoDBStore properties  #
-############################
-gora.datastore.default=org.apache.gora.mongodb.store.MongoStore
-gora.mongodb.override_hadoop_configuration=true
-gora.mongodb.mapping.file=/gora-mongodb-mapping.xml
-gora.mongodb.servers=localhost
-gora.mongodb.db=nutchtest
-

http://git-wip-us.apache.org/repos/asf/gora/blob/77ccfec6/gora-mongodb/src/examples/conf/nutch/log4j.properties
----------------------------------------------------------------------
diff --git a/gora-mongodb/src/examples/conf/nutch/log4j.properties b/gora-mongodb/src/examples/conf/nutch/log4j.properties
deleted file mode 100644
index e1e839e..0000000
--- a/gora-mongodb/src/examples/conf/nutch/log4j.properties
+++ /dev/null
@@ -1,96 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Define some default values that can be overridden by system properties
-hadoop.log.dir=.
-hadoop.log.file=hadoop.log
-
-# RootLogger - DailyRollingFileAppender
-log4j.rootLogger=INFO,DRFA
-
-# Logging Threshold
-log4j.threshhold=ALL
-
-#special logging requirements for some commandline tools
-log4j.logger.org.apache.nutch.crawl.Crawl=INFO,cmdstdout
-log4j.logger.org.apache.nutch.crawl.InjectorJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.host.HostInjectorJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.crawl.GeneratorJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.crawl.DbUpdaterJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.host.HostDbUpdateJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.fetcher.FetcherJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.parse.ParserJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.IndexerJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.solr.SolrIndexerJob=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.solr.SolrWriter=INFO,cmdstdout
-log4j.logger.org.apache.nutch.indexer.DeleteDuplicates=INFO,cmdstdout
-log4j.logger.org.apache.nutch.crawl.WebTableReader=INFO,cmdstdout
-log4j.logger.org.apache.nutch.host.HostDbReader=INFO,cmdstdout
-
-log4j.logger.org.apache.nutch=WARN
-log4j.logger.org.apache.hadoop=WARN
-log4j.logger.org.apache.zookeeper=WARN
-
-#
-# Daily Rolling File Appender
-#
-
-log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender
-log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file}
-
-# Rollover at midnight
-log4j.appender.DRFA.DatePattern=.yyyy-MM-dd
-
-# 30-day backup
-#log4j.appender.DRFA.MaxBackupIndex=30
-log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout
-
-# Pattern format: Date LogLevel LoggerName LogMessage
-log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} - %m%n
-# Debugging Pattern format: Date LogLevel LoggerName (FileName:MethodName:LineNo) LogMessage
-#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
-
-
-#
-# stdout
-# Add *stdout* to rootlogger above if you want to use this 
-#
-
-log4j.appender.stdout=org.apache.log4j.ConsoleAppender
-log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
-log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
-
-#
-# plain layout used for commandline tools to output to console
-#
-log4j.appender.cmdstdout=org.apache.log4j.ConsoleAppender
-log4j.appender.cmdstdout.layout=org.apache.log4j.PatternLayout
-log4j.appender.cmdstdout.layout.ConversionPattern=%m%n
-
-#
-# Rolling File Appender
-#
-
-#log4j.appender.RFA=org.apache.log4j.RollingFileAppender
-#log4j.appender.RFA.File=${hadoop.log.dir}/${hadoop.log.file}
-
-# Logfile size and 30-day backups
-#log4j.appender.RFA.MaxFileSize=1MB
-#log4j.appender.RFA.MaxBackupIndex=30
-
-#log4j.appender.RFA.layout=org.apache.log4j.PatternLayout
-#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} - %m%n
-#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
-

http://git-wip-us.apache.org/repos/asf/gora/blob/77ccfec6/gora-mongodb/src/examples/conf/nutch/nutch-site.xml
----------------------------------------------------------------------
diff --git a/gora-mongodb/src/examples/conf/nutch/nutch-site.xml b/gora-mongodb/src/examples/conf/nutch/nutch-site.xml
deleted file mode 100644
index 49083d9..0000000
--- a/gora-mongodb/src/examples/conf/nutch/nutch-site.xml
+++ /dev/null
@@ -1,189 +0,0 @@
-<?xml version="1.0"?>
-<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
-
-<!-- Put site-specific property overrides in this file. -->
-
-<configuration>
-
-<property>
-  <name>http.agent.name</name>
-  <value>nutch-crawler</value>
-</property>
-
-<property>
-  <name>http.robots.agents</name>
-  <value>nutch-crawler,*</value>
-</property>
-
-<property>
-  <name>http.accept.language</name>
-  <value>fr-fr,fr,en;q=0.7,*;q=0.3</value>
-</property>
-
-
-<property>
-  <name>db.fetch.schedule.class</name>
-  <value>org.apache.nutch.crawl.AdaptiveFetchSchedule</value>
-</property>
-
-<property>
-  <name>fetcher.throughput.threshold.pages</name>
-  <value>0.8</value>
-  <description>The threshold of minimum pages per second. If the fetcher downloads fewer
-  pages per second than the configured threshold, the fetcher stops, preventing slow queues
-  from stalling the throughput. This threshold must be an integer. This can be useful when
-  fetcher.timelimit.mins is hard to determine. The default value of -1 disables this check.
-  </description>
-</property>
-
-<property>
-  <name>generate.update.crawldb</name>
-  <value>true</value>
-  <description>For highly-concurrent environments, where several
-  generate/fetch/update cycles may overlap, setting this to true ensures
-  that generate will create different fetchlists even without intervening
-  updatedb-s, at the cost of running an additional job to update CrawlDB.
-  If false, running generate twice without intervening
-  updatedb will generate identical fetchlists.</description>
-</property>
-
-<property>
-  <name>crawl.gen.delay</name>
-  <value>86400000</value> <!-- Make it one day -->
-  <description>
-   This value, expressed in milliseconds, defines how long we should keep the lock on records
-   in CrawlDb that were just selected for fetching. If these records are not updated
-   in the meantime, the lock is canceled, i.e. they become eligible for selecting.
-   Default value of this is 7 days.
-  </description>
-</property>
-
-<property>
-  <name>fetcher.parse</name>
-  <value>true</value>
-  <description>If true, fetcher will parse content. NOTE: previous releases would
-  default to true. Since 2.0 this is set to false as a safer default.</description>
-</property>
-
-<property>
-  <name>parser.html.outlinks.ignore_tags</name>
-  <value>img,script,link</value>
-  <description>Comma separated list of HTML tags, from which outlinks
-  shouldn't be extracted. Nutch takes links from: a, area, form, frame,
-  iframe, script, link, img. If you add any of those tags here, it
-  won't be taken. Default is empty list. Probably reasonable value
-  for most people would be "img,script,link".</description>
-</property>
-
-<property>
-  <name>plugin.includes</name>
- <value>protocol-http|urlfilter-regex|parse-(html|tika)|urlnormalizer-(pass|regex|basic)|scoring-opic</value>
-</property>
-
-<property>
-	<name>mapred.task.timeout</name>
-	<value>600000</value>
-	<!-- Max 10 minutes idle -->
-</property>
-
-<property>
-  <name>parser.html.impl</name>
-  <value>tagsoup</value>
-  <description>HTML Parser implementation. Currently the following keywords
-  are recognized: "neko" uses NekoHTML, "tagsoup" uses TagSoup.
-  </description>
-</property>
-
-<property>
-  <name>db.update.additions.allowed</name>
-  <value>true</value>
-  <description>If true, updatedb will add newly discovered URLs, if false
-  only already existing URLs in the CrawlDb will be updated and no new
-  URLs will be added.
-  </description>
-</property>
-
-<property>
-  <name>db.ignore.internal.links</name>
-  <value>false</value>
-  <description>If true, when adding new links to a page, links from
-  the same host are ignored.  This is an effective way to limit the
-  size of the link database, keeping only the highest quality
-  links.
-  </description>
-</property>
-
-<property>
-  <name>db.score.link.external</name>
-  <value>2.0</value>
-  <description>The score factor for new pages added due to a link from
-  another host relative to the referencing page's score. Scoring plugins
-  may use this value to affect initial scores of external links.
-  </description>
-</property>
-
-<property>
-  <name>db.score.link.internal</name>
-  <value>0.5</value>
-  <description>The score factor for pages added due to a link from the
-  same host, relative to the referencing page's score. Scoring plugins
-  may use this value to affect initial scores of internal links.
-  </description>
-</property>
-
-<property>
-  <name>db.parsemeta.to.crawldb</name>
-  <value>lang</value>
-  <description>Comma-separated list of parse metadata keys to transfer to the crawldb (NUTCH-779).
-   Assuming for instance that the languageidentifier plugin is enabled, setting the value to 'lang'
-   will copy both the key 'lang' and its value to the corresponding entry in the crawldb.
-  </description>
-</property>
-
-<property>
-  <name>generate.max.count</name>
-  <value>10000</value>
-  <description>The maximum number of urls in a single
-  fetchlist.  -1 if unlimited. The urls are counted according
-  to the value of the parameter generator.count.mode.
-  </description>
-</property>
-
-<!-- storage properties -->
-
-<property>
-  <name>storage.data.store.class</name>
-  <value>org.apache.gora.mongodb.store.MongoStore</value>
-</property>
-
-<property>
-  <name>storage.schema.webpage</name>
-  <value>frontier</value>
-  <description>This value holds the schema name used for Nutch web db.
-  Note that Nutch ignores the value in the gora mapping files, and uses
-  this as the webpage schema name.
-  </description>
-</property>
-
-<property>
-  <name>storage.schema.host</name>
-  <value>host</value>
-  <description>This value holds the schema name used for Nutch host db.
-  Note that Nutch ignores the value in the gora mapping files, and uses
-  this as the host schema name.
-  </description>
-</property>
-
-<property>
-  <name>storage.crawl.id</name>
-  <value></value>
-  <description>This value helps differentiate between the datasets that
-  the jobs in the crawl cycle generate and operate on. The value will
-  be input to all the jobs which then will use it as a prefix when
-  accessing the schemas. The default configuration uses no id to prefix
-  the schemas. The value could also be given as a command line argument
-  to each job.
-  </description>
-</property>
-
-</configuration>

http://git-wip-us.apache.org/repos/asf/gora/blob/77ccfec6/gora-mongodb/src/test/java/org/apache/gora/mongodb/store/TestMongoStoreNutch.java
----------------------------------------------------------------------
diff --git a/gora-mongodb/src/test/java/org/apache/gora/mongodb/store/TestMongoStoreNutch.java b/gora-mongodb/src/test/java/org/apache/gora/mongodb/store/TestMongoStoreNutch.java
deleted file mode 100644
index 8487b45..0000000
--- a/gora-mongodb/src/test/java/org/apache/gora/mongodb/store/TestMongoStoreNutch.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.gora.mongodb.store;
-
-import org.apache.gora.mapreduce.GoraOutputFormat;
-//import org.apache.gora.mongodb.beans.tests.WebPage;
-import org.apache.gora.store.DataStore;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-//import org.apache.nutch.crawl.InjectorJob.UrlMapper;
-//import org.apache.nutch.storage.StorageUtils;
-//import org.apache.nutch.util.NutchJob;
-//import org.apache.nutch.util.ToolUtil;
-import org.junit.Ignore;
-import org.junit.Test;
-
-import java.io.IOException;
-import java.util.HashMap;
-
-@Ignore("Needs Nutch configuration")
-public class TestMongoStoreNutch {
-
-    /**
-     * For this test to work, it is necessary to provide:
-     * - a plugin directory with at least one urlnormalizer
-     * - include the jar of the used plugins in the classpath
-     *
-     * @throws IOException
-     * @throws ClassNotFoundException
-     * @throws InterruptedException
-     */
-    /*@Test
-    public void testNutchInjection() throws IOException, ClassNotFoundException, InterruptedException {
-        Path input = new Path("src/it/resources/test-nutch-inject.csv");
-        Configuration conf = new Configuration();
-        conf.set("storage.data.store.class", "org.apache.gora.mongodb.store.MongoStore");
-        conf.set("plugin.folders", "/home/grdscarabe/PROJECTS/DictaLab/workspace-nutch/nutch-2.1/runtime/local/plugins"); // FIXME
-        conf.set("plugin.auto-activation", "true");
-        conf.set("plugin.includes", "urlnormalizer-basic");
-        conf.set("plugin.excludes", "");
-
-        HashMap<String, Object> results = new HashMap<String, Object>();
-        Job currentJob = new NutchJob(conf, "inject " + input);
-        FileInputFormat.addInputPath(currentJob, input);
-        currentJob.setMapperClass(UrlMapper.class);
-        currentJob.setMapOutputKeyClass(String.class);
-        currentJob.setMapOutputValueClass(WebPage.class);
-        currentJob.setOutputFormatClass(GoraOutputFormat.class);
-        DataStore<String, org.apache.nutch.storage.WebPage> store =
-                StorageUtils.createWebStore(currentJob.getConfiguration(),
-                        String.class, org.apache.nutch.storage.WebPage.class);
-        GoraOutputFormat.setOutput(currentJob, store, true);
-        currentJob.setReducerClass(Reducer.class);
-        currentJob.setNumReduceTasks(0);
-        currentJob.waitForCompletion(true);
-        ToolUtil.recordJobStatus(null, currentJob, results);
-    }
-    */
-}