You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cassandra.apache.org by jb...@apache.org on 2010/02/12 22:55:34 UTC
svn commit: r909626 - in /incubator/cassandra/trunk: ./ contrib/word_count/
contrib/word_count/bin/ contrib/word_count/src/
src/java/org/apache/cassandra/config/ src/java/org/apache/cassandra/db/
Author: jbellis
Date: Fri Feb 12 21:55:31 2010
New Revision: 909626
URL: http://svn.apache.org/viewvc?rev=909626&view=rev
Log:
add wordcount hadoop example
patch by jbellis; reviewed by Stu Hood for CASSANDRA-342
Added:
incubator/cassandra/trunk/contrib/word_count/
incubator/cassandra/trunk/contrib/word_count/README.txt (with props)
incubator/cassandra/trunk/contrib/word_count/bin/
incubator/cassandra/trunk/contrib/word_count/bin/word_count
incubator/cassandra/trunk/contrib/word_count/bin/word_count_setup
incubator/cassandra/trunk/contrib/word_count/build.xml (with props)
incubator/cassandra/trunk/contrib/word_count/src/
incubator/cassandra/trunk/contrib/word_count/src/WordCount.java (with props)
incubator/cassandra/trunk/contrib/word_count/src/WordCountSetup.java (with props)
incubator/cassandra/trunk/contrib/word_count/storage-conf.xml (with props)
Modified:
incubator/cassandra/trunk/ivy.xml
incubator/cassandra/trunk/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
incubator/cassandra/trunk/src/java/org/apache/cassandra/db/RangeSliceCommand.java
Added: incubator/cassandra/trunk/contrib/word_count/README.txt
URL: http://svn.apache.org/viewvc/incubator/cassandra/trunk/contrib/word_count/README.txt?rev=909626&view=auto
==============================================================================
--- incubator/cassandra/trunk/contrib/word_count/README.txt (added)
+++ incubator/cassandra/trunk/contrib/word_count/README.txt Fri Feb 12 21:55:31 2010
@@ -0,0 +1,18 @@
+WordCount hadoop example: Inserts a bunch of words across multiple rows,
+and counts them, with RandomPartitioner.
+
+The scripts in bin/ assume you are running with cwd of contrib/word_count.
+
+First build and start a Cassandra server with the default configuration*,
+then run
+
+contrib/word_count$ ant
+contrib/word_count$ bin/word_count_setup
+contrib/word_count$ bin/word_count
+
+Output will be in /tmp/word_count*.
+
+Read the code in src/ for more details.
+
+*If you want to point wordcount at a real cluster, modify the seed
+and listenaddress settings in storage-conf.xml accordingly.
Propchange: incubator/cassandra/trunk/contrib/word_count/README.txt
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/cassandra/trunk/contrib/word_count/bin/word_count
URL: http://svn.apache.org/viewvc/incubator/cassandra/trunk/contrib/word_count/bin/word_count?rev=909626&view=auto
==============================================================================
--- incubator/cassandra/trunk/contrib/word_count/bin/word_count (added)
+++ incubator/cassandra/trunk/contrib/word_count/bin/word_count Fri Feb 12 21:55:31 2010
@@ -0,0 +1,53 @@
+#!/bin/sh
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cwd=`dirname $0`
+
+# Cassandra class files.
+if [ ! -d $cwd/../../../build/classes ]; then
+ echo "Unable to locate cassandra class files" >&2
+ exit 1
+fi
+
+# word_count Jar.
+if [ ! -e $cwd/../build/*.jar ]; then
+ echo "Unable to locate word_count jar" >&2
+ exit 1
+fi
+
+CLASSPATH=$CLASSPATH:`ls -1 $cwd/../build/*.jar`
+CLASSPATH=$CLASSPATH:.:$cwd/../../../build/classes
+for jar in $cwd/../../../lib/*.jar; do
+ CLASSPATH=$CLASSPATH:$jar
+done
+for jar in $cwd/../../../build/lib/jars/*.jar; do
+ CLASSPATH=$CLASSPATH:$jar
+done
+
+if [ -x $JAVA_HOME/bin/java ]; then
+ JAVA=$JAVA_HOME/bin/java
+else
+ JAVA=`which java`
+fi
+
+if [ "x$JAVA" = "x" ]; then
+ echo "Java executable not found (hint: set JAVA_HOME)" >&2
+ exit 1
+fi
+
+$JAVA -Xmx1G -ea -cp $CLASSPATH WordCount
Added: incubator/cassandra/trunk/contrib/word_count/bin/word_count_setup
URL: http://svn.apache.org/viewvc/incubator/cassandra/trunk/contrib/word_count/bin/word_count_setup?rev=909626&view=auto
==============================================================================
--- incubator/cassandra/trunk/contrib/word_count/bin/word_count_setup (added)
+++ incubator/cassandra/trunk/contrib/word_count/bin/word_count_setup Fri Feb 12 21:55:31 2010
@@ -0,0 +1,53 @@
+#!/bin/sh
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cwd=`dirname $0`
+
+# Cassandra class files.
+if [ ! -d $cwd/../../../build/classes ]; then
+ echo "Unable to locate cassandra class files" >&2
+ exit 1
+fi
+
+# word_count Jar.
+if [ ! -e $cwd/../build/*.jar ]; then
+ echo "Unable to locate word_count jar" >&2
+ exit 1
+fi
+
+CLASSPATH=$CLASSPATH:`ls -1 $cwd/../build/*.jar`
+CLASSPATH=$CLASSPATH:.:$cwd/../../../build/classes
+for jar in $cwd/../../../lib/*.jar; do
+ CLASSPATH=$CLASSPATH:$jar
+done
+for jar in $cwd/../../../build/lib/jars/*.jar; do
+ CLASSPATH=$CLASSPATH:$jar
+done
+
+if [ -x $JAVA_HOME/bin/java ]; then
+ JAVA=$JAVA_HOME/bin/java
+else
+ JAVA=`which java`
+fi
+
+if [ "x$JAVA" = "x" ]; then
+ echo "Java executable not found (hint: set JAVA_HOME)" >&2
+ exit 1
+fi
+
+$JAVA -Xmx1G -ea -cp $CLASSPATH WordCountSetup
Added: incubator/cassandra/trunk/contrib/word_count/build.xml
URL: http://svn.apache.org/viewvc/incubator/cassandra/trunk/contrib/word_count/build.xml?rev=909626&view=auto
==============================================================================
--- incubator/cassandra/trunk/contrib/word_count/build.xml (added)
+++ incubator/cassandra/trunk/contrib/word_count/build.xml Fri Feb 12 21:55:31 2010
@@ -0,0 +1,65 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one
+ ~ or more contributor license agreements. See the NOTICE file
+ ~ distributed with this work for additional information
+ ~ regarding copyright ownership. The ASF licenses this file
+ ~ to you under the Apache License, Version 2.0 (the
+ ~ "License"); you may not use this file except in compliance
+ ~ with the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing,
+ ~ software distributed under the License is distributed on an
+ ~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ ~ KIND, either express or implied. See the License for the
+ ~ specific language governing permissions and limitations
+ ~ under the License.
+ -->
+<project basedir="." default="jar" name="word_count">
+ <!-- Ant build for the word_count example. Running "ant" with no target runs "jar". -->
+
+ <!-- Cassandra tree this example builds against: two directories above this one. -->
+ <property name="cassandra.dir" value="../.." />
+ <!-- NOTE(review): defined empty and not referenced anywhere in this file; looks removable. -->
+ <property name="cassandra.lib" value="" />
+ <!-- Compiled Cassandra classes; used on the compile classpath and copied into the jar. -->
+ <property name="cassandra.classes" value="${cassandra.dir}/build/classes" />
+ <property name="build.src" value="${basedir}/src" />
+ <property name="build.out" value="${basedir}/build" />
+ <property name="build.classes" value="${build.out}/classes" />
+ <!-- Base name of the jar written by the "jar" target. -->
+ <property name="final.name" value="word_count" />
+
+ <!-- Creates the directory that receives the compiled example classes. -->
+ <target name="init">
+ <mkdir dir="${build.classes}" />
+ </target>
+
+ <!--
+ ~ Compiles everything under src/ against the jars in Cassandra's lib/ and
+ ~ build/lib/jars directories plus Cassandra's own compiled classes.
+ -->
+ <target depends="init" name="build">
+ <javac destdir="${build.classes}">
+ <src path="${build.src}" />
+ <classpath>
+ <path>
+ <fileset dir="${cassandra.dir}/lib">
+ <include name="**/*.jar" />
+ </fileset>
+ <fileset dir="${cassandra.dir}/build/lib/jars">
+ <include name="**/*.jar" />
+ </fileset>
+ <pathelement location="${cassandra.classes}" />
+ </path>
+ </classpath>
+ </javac>
+ </target>
+
+ <!--
+ ~ Writes ${build.out}/${final.name}.jar containing the example classes, the
+ ~ Cassandra classes, Cassandra's lib jars (kept under a lib/ prefix because
+ ~ the fileset is rooted at cassandra.dir) and this directory's
+ ~ storage-conf.xml at the jar root.
+ ~ NOTE(review): presumably the nested lib/ jars are there so the jar is
+ ~ self-contained when handed to Hadoop; confirm.
+ -->
+ <target name="jar" depends="build">
+ <mkdir dir="${build.classes}/META-INF" />
+ <jar jarfile="${build.out}/${final.name}.jar">
+ <fileset dir="${build.classes}" />
+ <fileset dir="${cassandra.classes}" />
+ <fileset dir="${cassandra.dir}">
+ <include name="lib/**/*.jar" />
+ </fileset>
+ <fileset file="${basedir}/storage-conf.xml" />
+ </jar>
+ </target>
+
+ <!-- Removes the whole build output directory (classes and jar). -->
+ <target name="clean">
+ <delete dir="${build.out}" />
+ </target>
+</project>
Propchange: incubator/cassandra/trunk/contrib/word_count/build.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/cassandra/trunk/contrib/word_count/src/WordCount.java
URL: http://svn.apache.org/viewvc/incubator/cassandra/trunk/contrib/word_count/src/WordCount.java?rev=909626&view=auto
==============================================================================
--- incubator/cassandra/trunk/contrib/word_count/src/WordCount.java (added)
+++ incubator/cassandra/trunk/contrib/word_count/src/WordCount.java Fri Feb 12 21:55:31 2010
@@ -0,0 +1,107 @@
+import java.io.IOException;
+import java.util.SortedMap;
+import java.util.StringTokenizer;
+
+import org.apache.log4j.Logger;
+
+import org.apache.cassandra.db.IColumn;
+import org.apache.cassandra.hadoop.ColumnFamilyInputFormat;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+/**
+ * This counts the occurrences of words in ColumnFamily Standard1, that has a single column (that we care about)
+ * "text" containing a sequence of words.
+ *
+ * For each word, we output the total number of occurrences across all texts.
+ */
+public class WordCount extends Configured implements Tool
+{
+    private static final Logger logger = Logger.getLogger(WordCount.class);
+
+    static final String KEYSPACE = "Keyspace1";
+    static final String COLUMN_FAMILY = "Standard1";
+    // Configuration key under which run() publishes the column name that the mapper should read.
+    private static final String CONF_COLUMN_NAME = "columnname";
+    private static final String OUTPUT_PATH_PREFIX = "/tmp/word_count";
+    static final int RING_DELAY = 3000; // this is enough for testing a single server node; may need more for a real cluster
+
+    /**
+     * Runs the tool through ToolRunner and exits with the status returned by {@link #run(String[])}.
+     *
+     * @param args command-line arguments, passed to ToolRunner unchanged
+     * @throws Exception whatever ToolRunner or the jobs throw
+     */
+    public static void main(String[] args) throws Exception
+    {
+        // Let ToolRunner handle generic command-line options
+        int status = ToolRunner.run(new Configuration(), new WordCount(), args);
+        System.exit(status);
+    }
+
+    /**
+     * Emits (word, 1) for every whitespace-separated token found in the configured column of a row.
+     * Rows that do not contain that column are skipped.
+     */
+    public static class TokenizerMapper extends Mapper<String, SortedMap<byte[], IColumn>, Text, IntWritable>
+    {
+        private final static IntWritable one = new IntWritable(1);
+        private Text word = new Text();
+        // Name of the column to tokenize; read from the job configuration in setup().
+        private String columnName;
+
+        /**
+         * Reads the column name from the job configuration, so the mapper does not depend on
+         * static state that only exists in the JVM that submitted the job.
+         */
+        @Override
+        protected void setup(Context context) throws IOException, InterruptedException
+        {
+            columnName = context.getConfiguration().get(CONF_COLUMN_NAME);
+        }
+
+        /**
+         * @param key     row key (only used for debug logging)
+         * @param columns columns of the row keyed by column name; may be null
+         * @param context sink for the (word, 1) pairs
+         */
+        @Override
+        public void map(String key, SortedMap<byte[], IColumn> columns, Context context) throws IOException, InterruptedException
+        {
+            if (columns == null)
+                return;
+            IColumn column = columns.get(columnName.getBytes());
+            // A row is not required to contain every column (WordCountSetup writes "text1" and
+            // "text2" to a single row only), so a missing column means "nothing to count".
+            if (column == null)
+                return;
+            String value = new String(column.value());
+            logger.debug("read " + key + ":" + value + " from " + context.getInputSplit());
+
+            StringTokenizer itr = new StringTokenizer(value);
+            while (itr.hasMoreTokens())
+            {
+                word.set(itr.nextToken());
+                context.write(word, one);
+            }
+        }
+    }
+
+    /**
+     * Sums the counts received for a word and writes (word, total). Also installed as the combiner.
+     */
+    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable>
+    {
+        private IntWritable result = new IntWritable();
+
+        @Override
+        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException
+        {
+            int sum = 0;
+            for (IntWritable val : values)
+            {
+                sum += val.get();
+            }
+
+            result.set(sum);
+            context.write(key, result);
+        }
+    }
+
+    /**
+     * Runs one word-count job per test column ("text0" .. "text" + (TEST_COUNT - 1)), writing each
+     * job's output to OUTPUT_PATH_PREFIX followed by the column index.
+     *
+     * @param args unused
+     * @return 0 if every job reported success, 1 otherwise
+     * @throws Exception if a job cannot be configured or submitted
+     */
+    public int run(String[] args) throws Exception
+    {
+        boolean allSucceeded = true;
+
+        for (int i = 0; i < WordCountSetup.TEST_COUNT; i++)
+        {
+            // Publish the column name before constructing the Job: the Job works on its own
+            // copy of the Configuration taken at construction time.
+            getConf().set(CONF_COLUMN_NAME, "text" + i);
+            Job job = new Job(getConf(), "wordcount");
+            job.setJarByClass(WordCount.class);
+            job.setMapperClass(TokenizerMapper.class);
+            job.setCombinerClass(IntSumReducer.class);
+            job.setReducerClass(IntSumReducer.class);
+            job.setOutputKeyClass(Text.class);
+            job.setOutputValueClass(IntWritable.class);
+
+            job.setInputFormatClass(ColumnFamilyInputFormat.class);
+            FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX + i));
+
+            ColumnFamilyInputFormat.setColumnFamily(job, KEYSPACE, COLUMN_FAMILY);
+
+            // Keep going after a failure so the remaining columns are still counted,
+            // but remember it so the process exit status reflects the failure.
+            if (!job.waitForCompletion(true))
+                allSucceeded = false;
+        }
+        return allSucceeded ? 0 : 1;
+    }
+}
\ No newline at end of file
Propchange: incubator/cassandra/trunk/contrib/word_count/src/WordCount.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/cassandra/trunk/contrib/word_count/src/WordCountSetup.java
URL: http://svn.apache.org/viewvc/incubator/cassandra/trunk/contrib/word_count/src/WordCountSetup.java?rev=909626&view=auto
==============================================================================
--- incubator/cassandra/trunk/contrib/word_count/src/WordCountSetup.java (added)
+++ incubator/cassandra/trunk/contrib/word_count/src/WordCountSetup.java Fri Feb 12 21:55:31 2010
@@ -0,0 +1,61 @@
+import java.util.Arrays;
+
+import org.apache.log4j.Logger;
+
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.service.StorageProxy;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.thrift.ConsistencyLevel;
+
+public class WordCountSetup
+{
+    private static final Logger logger = Logger.getLogger(WordCountSetup.class);
+
+    public static final int TEST_COUNT = 4;
+
+    /**
+     * Loads the sample data that WordCount later reads: nothing for "text0", one row each for
+     * "text1" and "text2", and 1000 rows for "text3". Exits the JVM with status 0 when done.
+     *
+     * @param args unused
+     * @throws Exception if the sleep is interrupted or a write fails
+     */
+    public static void main(String[] args) throws Exception
+    {
+        StorageService.instance.initClient();
+        logger.info("Sleeping " + WordCount.RING_DELAY);
+        Thread.sleep(WordCount.RING_DELAY);
+        // Only checked when assertions are enabled (the launcher script passes -ea).
+        assert !StorageService.instance.getLiveNodes().isEmpty();
+
+        // text0: no rows
+
+        // text1: 1 row, 1 word
+        writeColumn("Key0", "text1", "word1");
+        logger.info("added text1");
+
+        // text2: 1 row, 2 words
+        writeColumn("Key0", "text2", "word1 word2");
+        logger.info("added text2");
+
+        // text3: 1000 rows, 1 word
+        for (int row = 0; row < 1000; row++)
+        {
+            writeColumn("Key" + row, "text3", "word1");
+        }
+        logger.info("added text3");
+
+        System.exit(0);
+    }
+
+    /**
+     * Writes a single column (timestamp 0) into the WordCount column family of the given row,
+     * blocking at ConsistencyLevel.ONE.
+     *
+     * @param rowKey key of the row to write to
+     * @param name   column name
+     * @param value  column value
+     */
+    private static void writeColumn(String rowKey, String name, String value) throws Exception
+    {
+        RowMutation mutation = new RowMutation(WordCount.KEYSPACE, rowKey);
+        ColumnFamily family = ColumnFamily.create(WordCount.KEYSPACE, WordCount.COLUMN_FAMILY);
+        family.addColumn(new Column(name.getBytes(), value.getBytes(), 0));
+        mutation.add(family);
+        StorageProxy.mutateBlocking(Arrays.asList(mutation), ConsistencyLevel.ONE);
+    }
+}
Propchange: incubator/cassandra/trunk/contrib/word_count/src/WordCountSetup.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/cassandra/trunk/contrib/word_count/storage-conf.xml
URL: http://svn.apache.org/viewvc/incubator/cassandra/trunk/contrib/word_count/storage-conf.xml?rev=909626&view=auto
==============================================================================
--- incubator/cassandra/trunk/contrib/word_count/storage-conf.xml (added)
+++ incubator/cassandra/trunk/contrib/word_count/storage-conf.xml Fri Feb 12 21:55:31 2010
@@ -0,0 +1,369 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one
+ ~ or more contributor license agreements. See the NOTICE file
+ ~ distributed with this work for additional information
+ ~ regarding copyright ownership. The ASF licenses this file
+ ~ to you under the Apache License, Version 2.0 (the
+ ~ "License"); you may not use this file except in compliance
+ ~ with the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing,
+ ~ software distributed under the License is distributed on an
+ ~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ ~ KIND, either express or implied. See the License for the
+ ~ specific language governing permissions and limitations
+ ~ under the License.
+-->
+<Storage>
+ <!--======================================================================-->
+ <!-- Basic Configuration -->
+ <!--======================================================================-->
+
+ <!--
+ ~ The name of this cluster. This is mainly used to prevent machines in
+ ~ one logical cluster from joining another.
+ -->
+ <ClusterName>Test Cluster</ClusterName>
+
+ <!--
+ ~ Turn on to make new [non-seed] nodes automatically migrate the right data
+ ~ to themselves. (If no InitialToken is specified, they will pick one
+ ~ such that they will get half the range of the most-loaded node.)
+ ~ If a node starts up without bootstrapping, it will mark itself bootstrapped
+ ~ so that you can't subsequently accidentally bootstrap a node with
+ ~ data on it. (You can reset this by wiping your data and commitlog
+ ~ directories.)
+ ~
+ ~ Off by default so that new clusters and upgraders from 0.4 don't
+ ~ bootstrap immediately. You should turn this on when you start adding
+ ~ new nodes to a cluster that already has data on it. (If you are upgrading
+ ~ from 0.4, start your cluster with it off once before changing it to true.
+ ~ Otherwise, no data will be lost but you will incur a lot of unnecessary
+ ~ I/O before your cluster starts up.)
+ -->
+ <AutoBootstrap>false</AutoBootstrap>
+
+ <!--
+ ~ Keyspaces and ColumnFamilies:
+ ~ A ColumnFamily is the Cassandra concept closest to a relational
+ ~ table. Keyspaces are separate groups of ColumnFamilies. Except in
+ ~ very unusual circumstances you will have one Keyspace per application.
+
+ ~ There is an implicit keyspace named 'system' for Cassandra internals.
+ -->
+ <Keyspaces>
+ <Keyspace Name="Keyspace1">
+ <!--
+ ~ ColumnFamily definitions have one required attribute (Name)
+ ~ and several optional ones.
+ ~
+ ~ The CompareWith attribute tells Cassandra how to sort the columns
+ ~ for slicing operations. The default is BytesType, which is a
+ ~ straightforward lexical comparison of the bytes in each column.
+ ~ Other options are AsciiType, UTF8Type, LexicalUUIDType, TimeUUIDType,
+ ~ and LongType. You can also specify the fully-qualified class
+ ~ name to a class of your choice extending
+ ~ org.apache.cassandra.db.marshal.AbstractType.
+ ~
+ ~ SuperColumns have a similar CompareSubcolumnsWith attribute.
+ ~
+ ~ BytesType: Simple sort by byte value. No validation is performed.
+ ~ AsciiType: Like BytesType, but validates that the input can be
+ ~ parsed as US-ASCII.
+ ~ UTF8Type: A string encoded as UTF8
+ ~ LongType: A 64bit long
+ ~ LexicalUUIDType: A 128bit UUID, compared lexically (by byte value)
+ ~ TimeUUIDType: a 128bit version 1 UUID, compared by timestamp
+ ~
+ ~ (To get the closest approximation to 0.3-style supercolumns, you
+ ~ would use CompareWith=UTF8Type CompareSubcolumnsWith=LongType.)
+ ~
+ ~ An optional `Comment` attribute may be used to attach additional
+ ~ human-readable information about the column family to its definition.
+ ~
+ ~ The optional KeysCachedFraction attribute specifies
+ ~ The fraction of keys per sstable whose locations we keep in
+ ~ memory in "mostly LRU" order. (JUST the key locations, NOT any
+ ~ column values.) The amount of memory used by the default setting of
+ ~ 0.01 is comparable to the amount used by the internal per-sstable key
+ ~ index. Consider increasing this if you have fewer, wider rows.
+ ~ Set to 0 to disable entirely.
+ ~
+ ~ The optional RowsCached attribute specifies the number of rows
+ ~ whose entire contents we cache in memory, either as a fixed number
+ ~ of rows or as a percent of rows in the ColumnFamily.
+ ~ Do not use this on ColumnFamilies with large rows, or
+ ~ ColumnFamilies with high write:read ratios. As with key caching,
+ ~ valid values are from 0 to 1. The default 0 disables it entirely.
+ -->
+ <ColumnFamily CompareWith="BytesType"
+ Name="Standard1"
+ RowsCached="10%"
+ KeysCachedFraction="0"/>
+ <ColumnFamily CompareWith="UTF8Type" Name="Standard2"/>
+ <ColumnFamily CompareWith="TimeUUIDType" Name="StandardByUUID1"/>
+ <ColumnFamily ColumnType="Super"
+ CompareWith="UTF8Type"
+ CompareSubcolumnsWith="UTF8Type"
+ Name="Super1"
+ RowsCached="1000"
+ KeysCachedFraction="0"
+ Comment="A column family with supercolumns, whose column and subcolumn names are UTF8 strings"/>
+
+ <!--
+ ~ Strategy: Setting this to the class that implements
+ ~ IReplicaPlacementStrategy will change the way the node picker works.
+ ~ Out of the box, Cassandra provides
+ ~ org.apache.cassandra.locator.RackUnawareStrategy and
+ ~ org.apache.cassandra.locator.RackAwareStrategy (place one replica in
+ ~ a different datacenter, and the others on different racks in the same
+ ~ one.)
+ -->
+ <ReplicaPlacementStrategy>org.apache.cassandra.locator.RackUnawareStrategy</ReplicaPlacementStrategy>
+
+ <!-- Number of replicas of the data -->
+ <ReplicationFactor>1</ReplicationFactor>
+
+ <!--
+ ~ EndPointSnitch: Setting this to the class that implements
+ ~ AbstractEndpointSnitch, which lets Cassandra know enough
+ ~ about your network topology to route requests efficiently.
+ ~ Out of the box, Cassandra provides org.apache.cassandra.locator.EndPointSnitch,
+ ~ and PropertyFileEndPointSnitch is available in contrib/.
+ -->
+ <EndPointSnitch>org.apache.cassandra.locator.EndPointSnitch</EndPointSnitch>
+ </Keyspace>
+ </Keyspaces>
+
+ <!--
+ ~ Authenticator: any IAuthenticator may be used, including your own as long
+ ~ as it is on the classpath. Out of the box, Cassandra provides
+ ~ org.apache.cassandra.auth.AllowAllAuthenticator and,
+ ~ org.apache.cassandra.auth.SimpleAuthenticator
+ ~ (SimpleAuthenticator uses access.properties and passwd.properties by
+ ~ default).
+ ~
+ ~ If you don't specify an authenticator, AllowAllAuthenticator is used.
+ -->
+ <Authenticator>org.apache.cassandra.auth.AllowAllAuthenticator</Authenticator>
+
+ <!--
+ ~ Partitioner: any IPartitioner may be used, including your own as long
+ ~ as it is on the classpath. Out of the box, Cassandra provides
+ ~ org.apache.cassandra.dht.RandomPartitioner,
+ ~ org.apache.cassandra.dht.OrderPreservingPartitioner, and
+ ~ org.apache.cassandra.dht.CollatingOrderPreservingPartitioner.
+ ~ (CollatingOPP collates according to EN,US rules, not naive byte
+ ~ ordering. Use this as an example if you need locale-aware collation.)
+ ~ Range queries require using an order-preserving partitioner.
+ ~
+ ~ Achtung! Changing this parameter requires wiping your data
+ ~ directories, since the partitioner can modify the sstable on-disk
+ ~ format.
+ -->
+ <Partitioner>org.apache.cassandra.dht.RandomPartitioner</Partitioner>
+
+ <!--
+ ~ If you are using an order-preserving partitioner and you know your key
+ ~ distribution, you can specify the token for this node to use. (Keys
+ ~ are sent to the node with the "closest" token, so distributing your
+ ~ tokens equally along the key distribution space will spread keys
+ ~ evenly across your cluster.) This setting is only checked the first
+ ~ time a node is started.
+
+ ~ This can also be useful with RandomPartitioner to force equal spacing
+ ~ of tokens around the hash space, especially for clusters with a small
+ ~ number of nodes.
+ -->
+ <InitialToken></InitialToken>
+
+ <!--
+ ~ Directories: Specify where Cassandra should store different data on
+ ~ disk. Keep the data disks and the CommitLog disks separate for best
+ ~ performance
+ -->
+ <CommitLogDirectory>/var/lib/cassandra/commitlog</CommitLogDirectory>
+ <DataFileDirectories>
+ <DataFileDirectory>/var/lib/cassandra/data</DataFileDirectory>
+ </DataFileDirectories>
+ <CalloutLocation>/var/lib/cassandra/callouts</CalloutLocation>
+ <StagingFileDirectory>/var/lib/cassandra/staging</StagingFileDirectory>
+
+
+ <!--
+ ~ Addresses of hosts that are deemed contact points. Cassandra nodes
+ ~ use this list of hosts to find each other and learn the topology of
+ ~ the ring. You must change this if you are running multiple nodes!
+ -->
+ <Seeds>
+ <Seed>127.0.0.1</Seed>
+ </Seeds>
+
+
+ <!-- Miscellaneous -->
+
+ <!-- Time to wait for a reply from other nodes before failing the command -->
+ <RpcTimeoutInMillis>5000</RpcTimeoutInMillis>
+ <!-- Size to allow commitlog to grow to before creating a new segment -->
+ <CommitLogRotationThresholdInMB>128</CommitLogRotationThresholdInMB>
+
+
+ <!-- Local hosts and ports -->
+
+ <!--
+ ~ Address to bind to and tell other nodes to connect to. You _must_
+ ~ change this if you want multiple nodes to be able to communicate!
+ ~
+ ~ Leaving it blank leaves it up to InetAddress.getLocalHost(). This
+ ~ will always do the Right Thing *if* the node is properly configured
+ ~ (hostname, name resolution, etc), and the Right Thing is to use the
+ ~ address associated with the hostname (it might not be).
+ -->
+ <ListenAddress>127.0.0.2</ListenAddress>
+ <!-- internal communications port -->
+ <StoragePort>7000</StoragePort>
+
+ <!--
+ ~ The address to bind the Thrift RPC service to. Unlike ListenAddress
+ ~ above, you *can* specify 0.0.0.0 here if you want Thrift to listen on
+ ~ all interfaces.
+ ~
+ ~ Leaving this blank has the same effect it does for ListenAddress,
+ ~ (i.e. it will be based on the configured hostname of the node).
+ -->
+ <ThriftAddress>127.0.0.2</ThriftAddress>
+ <!-- Thrift RPC port (the port clients connect to). -->
+ <ThriftPort>9160</ThriftPort>
+ <!--
+ ~ Whether or not to use a framed transport for Thrift. If this option
+ ~ is set to true then you must also use a framed transport on the
+ ~ client-side, (framed and non-framed transports are not compatible).
+ -->
+ <ThriftFramedTransport>false</ThriftFramedTransport>
+
+
+ <!--======================================================================-->
+ <!-- Memory, Disk, and Performance -->
+ <!--======================================================================-->
+
+ <!--
+ ~ Access mode. mmapped i/o is substantially faster, but only practical on
+ ~ a 64bit machine (which notably does not include EC2 "small" instances)
+ ~ or relatively small datasets. "auto", the safe choice, will enable
+ ~ mmapping on a 64bit JVM. Other values are "mmap", "mmap_index_only"
+ ~ (which may allow you to get part of the benefits of mmap on a 32bit
+ ~ machine by mmapping only index files) and "standard".
+ ~ (The buffer size settings that follow only apply to standard,
+ ~ non-mmapped i/o.)
+ -->
+ <DiskAccessMode>auto</DiskAccessMode>
+
+ <!--
+ ~ Buffer size to use when performing contiguous column slices. Increase
+ ~ this to the size of the column slices you typically perform.
+ ~ (Name-based queries are performed with a buffer size of
+ ~ ColumnIndexSizeInKB.)
+ -->
+ <SlicedBufferSizeInKB>64</SlicedBufferSizeInKB>
+
+ <!--
+ ~ Buffer size to use when flushing memtables to disk. (Only one
+ ~ memtable is ever flushed at a time.) Increase (decrease) the index
+ ~ buffer size relative to the data buffer if you have few (many)
+ ~ columns per key. Bigger is only better _if_ your memtables get large
+ ~ enough to use the space. (Check in your data directory after your
+ ~ app has been running long enough.) -->
+ <FlushDataBufferSizeInMB>32</FlushDataBufferSizeInMB>
+ <FlushIndexBufferSizeInMB>8</FlushIndexBufferSizeInMB>
+
+ <!--
+ ~ Add column indexes to a row after its contents reach this size.
+ ~ Increase if your column values are large, or if you have a very large
+ ~ number of columns. The competing causes are, Cassandra has to
+ ~ deserialize this much of the row to read a single column, so you want
+ ~ it to be small - at least if you do many partial-row reads - but all
+ ~ the index data is read for each access, so you don't want to generate
+ ~ that wastefully either.
+ -->
+ <ColumnIndexSizeInKB>64</ColumnIndexSizeInKB>
+
+ <!--
+ ~ Flush memtable after this much data has been inserted, including
+ ~ overwritten data. There is one memtable per column family, and
+ ~ this threshold is based solely on the amount of data stored, not
+ ~ actual heap memory usage (there is some overhead in indexing the
+ ~ columns).
+ -->
+ <MemtableThroughputInMB>64</MemtableThroughputInMB>
+ <!--
+ ~ Throughput setting for Binary Memtables. Typically these are
+ ~ used for bulk load so you want them to be larger.
+ -->
+ <BinaryMemtableThroughputInMB>256</BinaryMemtableThroughputInMB>
+ <!--
+ ~ The maximum number of columns in millions to store in memory per
+ ~ ColumnFamily before flushing to disk. This is also a per-memtable
+ ~ setting. Use with MemtableThroughputInMB to tune memory usage.
+ -->
+ <MemtableOperationsInMillions>0.1</MemtableOperationsInMillions>
+ <!--
+ ~ The maximum time to leave a dirty memtable unflushed.
+ ~ (While any affected columnfamilies have unflushed data from a
+ ~ commit log segment, that segment cannot be deleted.)
+ ~ This needs to be large enough that it won't cause a flush storm
+ ~ of all your memtables flushing at once because none has hit
+ ~ the size or count thresholds yet. For production, a larger
+ ~ value such as 1440 is recommended.
+ -->
+ <MemtableFlushAfterMinutes>60</MemtableFlushAfterMinutes>
+
+ <!--
+ ~ Unlike most systems, in Cassandra writes are faster than reads, so
+ ~ you can afford more of those in parallel. A good rule of thumb is 2
+ ~ concurrent reads per processor core. Increase ConcurrentWrites to
+ ~ the number of clients writing at once if you enable CommitLogSync +
+ ~ CommitLogSyncDelay. -->
+ <ConcurrentReads>8</ConcurrentReads>
+ <ConcurrentWrites>32</ConcurrentWrites>
+
+ <!--
+ ~ CommitLogSync may be either "periodic" or "batch." When in batch
+ ~ mode, Cassandra won't ack writes until the commit log has been
+ ~ fsynced to disk. It will wait up to CommitLogSyncBatchWindowInMS
+ ~ milliseconds for other writes, before performing the sync.
+
+ ~ This is less necessary in Cassandra than in traditional databases
+ ~ since replication reduces the odds of losing data from a failure
+ ~ after writing the log entry but before it actually reaches the disk.
+ ~ So the other option is "periodic," where writes may be acked immediately
+ ~ and the CommitLog is simply synced every CommitLogSyncPeriodInMS
+ ~ milliseconds.
+ -->
+ <CommitLogSync>periodic</CommitLogSync>
+ <!--
+ ~ Interval at which to perform syncs of the CommitLog in periodic mode.
+ ~ Usually the default of 10000ms is fine; increase it if your i/o
+ ~ load is such that syncs are taking excessively long times.
+ -->
+ <CommitLogSyncPeriodInMS>10000</CommitLogSyncPeriodInMS>
+ <!--
+ ~ Delay (in milliseconds) during which additional commit log entries
+ ~ may be written before fsync in batch mode. This will increase
+ ~ latency slightly, but can vastly improve throughput where there are
+ ~ many writers. Set to zero to disable (each entry will be synced
+ ~ individually). Reasonable values range from a minimal 0.1 to 10 or
+ ~ even more if throughput matters more than latency.
+ -->
+ <!-- <CommitLogSyncBatchWindowInMS>1</CommitLogSyncBatchWindowInMS> -->
+
+ <!--
+ ~ Time to wait before garbage-collecting deletion markers. Set this to
+ ~ a large enough value that you are confident that the deletion marker
+ ~ will be propagated to all replicas by the time this many seconds has
+ ~ elapsed, even in the face of hardware failures. The default value is
+ ~ ten days.
+ -->
+ <GCGraceSeconds>864000</GCGraceSeconds>
+</Storage>
Propchange: incubator/cassandra/trunk/contrib/word_count/storage-conf.xml
------------------------------------------------------------------------------
svn:eol-style = native
Modified: incubator/cassandra/trunk/ivy.xml
URL: http://svn.apache.org/viewvc/incubator/cassandra/trunk/ivy.xml?rev=909626&r1=909625&r2=909626&view=diff
==============================================================================
--- incubator/cassandra/trunk/ivy.xml (original)
+++ incubator/cassandra/trunk/ivy.xml Fri Feb 12 21:55:31 2010
@@ -19,8 +19,11 @@
<ivy-module version="2.0">
<info organisation="apache-cassandra" module="cassandra"/>
<dependencies>
- <dependency org="org.apache.mahout.hadoop"
- name="hadoop-core" rev="0.20.1"/>
+ <!-- for hadoop -->
+ <dependency org="commons-logging" name="commons-logging" rev="1.1.1"/>
+ <dependency org="org.apache.mahout.hadoop" name="hadoop-core" rev="0.20.1"/>
+ <dependency org="commons-httpclient" name="commons-httpclient" rev="3.1"/>
+
<!-- FIXME: paranamer and jackson can be dropped after we're depending
on avro (since it depends on them). -->
<dependency org="com.thoughtworks.paranamer"
Modified: incubator/cassandra/trunk/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
URL: http://svn.apache.org/viewvc/incubator/cassandra/trunk/src/java/org/apache/cassandra/config/DatabaseDescriptor.java?rev=909626&r1=909625&r2=909626&view=diff
==============================================================================
--- incubator/cassandra/trunk/src/java/org/apache/cassandra/config/DatabaseDescriptor.java (original)
+++ incubator/cassandra/trunk/src/java/org/apache/cassandra/config/DatabaseDescriptor.java Fri Feb 12 21:55:31 2010
@@ -42,6 +42,7 @@
import java.util.*;
import java.net.InetAddress;
import java.net.UnknownHostException;
+import java.net.URL;
public class DatabaseDescriptor
{
@@ -145,11 +146,29 @@
private static IAuthenticator authenticator = new AllowAllAuthenticator();
+ private final static String STORAGE_CONF_FILE = "storage-conf.xml";
+
+ /**
+ * Locates the storage-conf.xml configuration file.
+ * First tries the directory named by the storage-config system property;
+ * if no such file exists there, falls back to looking up storage-conf.xml
+ * as a resource through this class's class loader.
+ * NOTE(review): when the storage-config property is unset, the first
+ * candidate path is built from the string "null", so the lookup normally
+ * just falls through to the classpath. The classpath result is the URL's
+ * getFile() value returned as-is, without decoding; confirm that
+ * locations containing spaces or other escaped characters still load.
+ * @return path of the configuration file that was found
+ * @throws RuntimeException if the file is found in neither location
+ */
+ static String getStorageConfigPath()
+ {
+ String scp = System.getProperty("storage-config") + File.separator + STORAGE_CONF_FILE;
+ if (new File(scp).exists())
+ return scp;
+ // try the classpath
+ ClassLoader loader = DatabaseDescriptor.class.getClassLoader();
+ URL scpurl = loader.getResource(STORAGE_CONF_FILE);
+ if (scpurl != null)
+ return scpurl.getFile();
+ throw new RuntimeException("Cannot locate " + STORAGE_CONF_FILE + " via storage-config system property or classpath lookup.");
+ }
+
static
{
try
{
- configFileName_ = System.getProperty("storage-config") + File.separator + "storage-conf.xml";
+ configFileName_ = getStorageConfigPath();
if (logger_.isDebugEnabled())
logger_.debug("Loading settings from " + configFileName_);
XMLUtils xmlUtils = new XMLUtils(configFileName_);
Modified: incubator/cassandra/trunk/src/java/org/apache/cassandra/db/RangeSliceCommand.java
URL: http://svn.apache.org/viewvc/incubator/cassandra/trunk/src/java/org/apache/cassandra/db/RangeSliceCommand.java?rev=909626&r1=909625&r2=909626&view=diff
==============================================================================
--- incubator/cassandra/trunk/src/java/org/apache/cassandra/db/RangeSliceCommand.java (original)
+++ incubator/cassandra/trunk/src/java/org/apache/cassandra/db/RangeSliceCommand.java Fri Feb 12 21:55:31 2010
@@ -39,7 +39,6 @@
import org.apache.cassandra.concurrent.StageManager;
import org.apache.cassandra.dht.AbstractBounds;
-import org.apache.cassandra.dht.Bounds;
import org.apache.cassandra.io.util.DataOutputBuffer;
import org.apache.cassandra.io.ICompactSerializer;
import org.apache.cassandra.net.Message;
@@ -129,7 +128,7 @@
TSerializer ser = new TSerializer(new TBinaryProtocol.Factory());
FBUtilities.serialize(ser, sliceCommand.predicate, dos);
- Bounds.serializer().serialize(sliceCommand.range, dos);
+ AbstractBounds.serializer().serialize(sliceCommand.range, dos);
dos.writeInt(sliceCommand.max_keys);
}