You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cassandra.apache.org by bl...@apache.org on 2017/06/01 08:24:16 UTC
[5/8] cassandra git commit: Merge branch cassandra-3.0 into
cassandra-3.11
Merge branch cassandra-3.0 into cassandra-3.11
Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/5c9db9af
Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/5c9db9af
Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/5c9db9af
Branch: refs/heads/trunk
Commit: 5c9db9af96daeda02610623ccd8587b0a1510d4d
Parents: 88fa169 e22cb27
Author: Benjamin Lerer <b....@gmail.com>
Authored: Thu Jun 1 10:14:19 2017 +0200
Committer: Benjamin Lerer <b....@gmail.com>
Committed: Thu Jun 1 10:17:09 2017 +0200
----------------------------------------------------------------------
CHANGES.txt | 1 +
NEWS.txt | 2 +
doc/source/operating/metrics.rst | 2 +-
.../cassandra/db/PartitionRangeReadCommand.java | 20 ++++-
.../db/SinglePartitionReadCommand.java | 86 ++++++++++++++------
.../org/apache/cassandra/db/StorageHook.java | 64 +++++++++------
.../UnfilteredRowIteratorWithLowerBound.java | 10 ++-
.../io/sstable/format/SSTableReader.java | 72 ++++++++++------
.../io/sstable/format/SSTableReadsListener.java | 82 +++++++++++++++++++
.../io/sstable/format/big/BigTableReader.java | 42 ++++++----
.../io/sstable/format/big/BigTableScanner.java | 41 ++++++++--
.../miscellaneous/SSTablesIteratedTest.java | 69 +++++++++++++++-
.../cassandra/db/compaction/TTLExpiryTest.java | 6 +-
.../sstable/SSTableCorruptionDetectionTest.java | 8 +-
.../io/sstable/SSTableScannerTest.java | 3 +-
.../cassandra/io/sstable/SSTableWriterTest.java | 8 +-
.../sstable/format/ClientModeSSTableTest.java | 7 +-
17 files changed, 415 insertions(+), 108 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/cassandra/blob/5c9db9af/CHANGES.txt
----------------------------------------------------------------------
diff --cc CHANGES.txt
index b28f75f,4232084..2c96521
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@@ -1,41 -1,5 +1,42 @@@
-3.0.14
+3.11.0
+ * Fix formatting of duration columns in CQLSH (CASSANDRA-13549)
+ * Fix the problem with duplicated rows when using paging with SASI (CASSANDRA-13302)
+ * Allow CONTAINS statements filtering on the partition key and its parts (CASSANDRA-13275)
+ * Fall back to even ranges calculation in clusters with vnodes when tokens are distributed unevenly (CASSANDRA-13229)
+ * Fix duration type validation to prevent overflow (CASSANDRA-13218)
+ * Forbid unsupported creation of SASI indexes over partition key columns (CASSANDRA-13228)
+ * Reject multiple values for a key in CQL grammar. (CASSANDRA-13369)
+ * UDA fails without input rows (CASSANDRA-13399)
+ * Fix compaction-stress by using daemonInitialization (CASSANDRA-13188)
+ * V5 protocol flags decoding broken (CASSANDRA-13443)
+ * Use write lock not read lock for removing sstables from compaction strategies. (CASSANDRA-13422)
+ * Use corePoolSize equal to maxPoolSize in JMXEnabledThreadPoolExecutors (CASSANDRA-13329)
+ * Avoid rebuilding SASI indexes containing no values (CASSANDRA-12962)
+ * Add charset to Analyser input stream (CASSANDRA-13151)
+ * Fix testLimitSSTables flake caused by concurrent flush (CASSANDRA-12820)
+ * cdc column addition strikes again (CASSANDRA-13382)
+ * Fix static column indexes (CASSANDRA-13277)
+ * DataOutputBuffer.asNewBuffer broken (CASSANDRA-13298)
+ * unittest CipherFactoryTest failed on MacOS (CASSANDRA-13370)
+ * Forbid SELECT restrictions and CREATE INDEX over non-frozen UDT columns (CASSANDRA-13247)
+ * Default logging we ship will incorrectly print "?:?" for "%F:%L" pattern (CASSANDRA-13317)
+ * Possible AssertionError in UnfilteredRowIteratorWithLowerBound (CASSANDRA-13366)
+ * Support unaligned memory access for AArch64 (CASSANDRA-13326)
+ * Improve SASI range iterator efficiency on intersection with an empty range (CASSANDRA-12915).
+ * Fix equality comparisons of columns using the duration type (CASSANDRA-13174)
+ * Obfuscate password in stress-graphs (CASSANDRA-12233)
+ * Move to FastThreadLocalThread and FastThreadLocal (CASSANDRA-13034)
+ * nodetool stopdaemon errors out (CASSANDRA-13030)
+ * Tables in system_distributed should not use gcgs of 0 (CASSANDRA-12954)
+ * Fix primary index calculation for SASI (CASSANDRA-12910)
+ * More fixes to the TokenAllocator (CASSANDRA-12990)
+ * NoReplicationTokenAllocator should work with zero replication factor (CASSANDRA-12983)
+ * Address message coalescing regression (CASSANDRA-12676)
+ * Delete illegal character from StandardTokenizerImpl.jflex (CASSANDRA-13417)
+ * Fix cqlsh automatic protocol downgrade regression (CASSANDRA-13307)
+ * Tracing payload not passed from QueryMessage to tracing session (CASSANDRA-12835)
+Merged from 3.0:
+ * Fix the reported number of sstable data files accessed per read (CASSANDRA-13120)
* Fix schema digest mismatch during rolling upgrades from versions before 3.0.12 (CASSANDRA-13559)
* Upgrade JNA version to 4.4.0 (CASSANDRA-13072)
* Interned ColumnIdentifiers should use minimal ByteBuffers (CASSANDRA-13533)
http://git-wip-us.apache.org/repos/asf/cassandra/blob/5c9db9af/NEWS.txt
----------------------------------------------------------------------
diff --cc NEWS.txt
index 1c14748,a92fc5d..a56ced6
--- a/NEWS.txt
+++ b/NEWS.txt
@@@ -26,139 -38,40 +26,141 @@@ Upgradin
- In 2.1, the default for otc_coalescing_strategy was 'DISABLED'.
In 2.2 and 3.0, it was changed to 'TIMEHORIZON', but that value was shown
to be a performance regression. The default for 3.11.0 and newer has
- been reverted to 'DISABLED'. Users upgrading to Cassandra 3.0 should
- consider setting otc_coalescing_strategy to 'DISABLED'.
-
-3.0.11
-======
-
-Upgrading
----------
- - Support for alter types of already defined tables and of UDTs fields has been disabled.
- If it is necessary to return a different type, please use casting instead. See
- CASSANDRA-12443 for more details.
- - Specifying the default_time_to_live option when creating or altering a
- materialized view was erroneously accepted (and ignored). It is now
- properly rejected.
- - Only Java and JavaScript are now supported UDF languages.
- The sandbox in 3.0 already prevented the use of script languages except Java
- and JavaScript.
- - Compaction now correctly drops sstables out of CompactionTask when there
- isn't enough disk space to perform the full compaction. This should reduce
- pending compaction tasks on systems with little remaining disk space.
- - Primary ranges in the system.size_estimates table are now based on the keyspace
- replication settings and adjacent ranges are no longer merged (CASSANDRA-9639).
+ been reverted to 'DISABLED'. Users upgrading from Cassandra 2.2 or 3.0 should
+ be aware that the default has changed.
++ - The StorageHook interface has been modified to allow retrieving read information from
++ SSTableReader (CASSANDRA-13120).
-3.0.10
-======
+3.10
+====
-Upgrading
----------
- - memtable_allocation_type: offheap_buffers is no longer allowed to be specified in the 3.0 series.
- This was an oversight that can cause segfaults. Offheap was re-introduced in 3.4 see CASSANDRA-11039
- and CASSANDRA-9472 for details.
+New features
+------------
+ - New `DurationType` (cql duration). See CASSANDRA-11873
+ - Runtime modification of concurrent_compactors is now available via nodetool
+ - Support for the assignment operators +=/-= has been added for update queries.
+ - An Index implementation may now provide a task which runs prior to joining
+ the ring. See CASSANDRA-12039
+ - Filtering on partition key columns is now also supported for queries without
+ secondary indexes.
+ - A slow query log has been added: slow queries will be logged at DEBUG level.
+ For more details refer to CASSANDRA-12403 and slow_query_log_timeout_in_ms
+ in cassandra.yaml.
+ - Support for GROUP BY queries has been added.
+ - A new compaction-stress tool has been added to test the throughput of compaction
+ for any cassandra-stress user schema. see compaction-stress help for how to use.
+ - Compaction can now take into account overlapping tables that don't take part
+ in the compaction to look for deleted or overwritten data in the compacted tables.
+ When such data is found, it can be safely discarded, which in turn should enable
+ the removal of tombstones over that data.
+
+ The behavior can be engaged in two ways:
+ - as a "nodetool garbagecollect -g CELL/ROW" operation, which applies
+ single-table compaction on all sstables to discard deleted data in one step.
+ - as a "provide_overlapping_tombstones:CELL/ROW/NONE" compaction strategy flag,
+ which uses overlapping tables as a source of deletions/overwrites during all
+ compactions.
+ The argument specifies the granularity at which deleted data is to be found:
+ - If ROW is specified, only whole deleted rows (or sets of rows) will be
+ discarded.
+ - If CELL is specified, any columns whose value is overwritten or deleted
+ will also be discarded.
+ - NONE (default) specifies the old behavior, overlapping tables are not used to
+ decide when to discard data.
+ Which option to use depends on your workload, both ROW and CELL increase the
+ disk load on compaction (especially with the size-tiered compaction strategy),
+ with CELL being more resource-intensive. Both should lead to better read
+ performance if deleting rows (resp. overwriting or deleting cells) is common.
+ - Prepared statements are now persisted in the table prepared_statements in
+ the system keyspace. Upon startup, this table is used to preload all
+ previously prepared statements - i.e. in many cases clients do not need to
+ re-prepare statements against restarted nodes.
+ - cqlsh can now connect to older Cassandra versions by downgrading the native
+ protocol version. Please note that this is currently not part of our release
+ testing and, as a consequence, it is not guaranteed to work in all cases.
+ See CASSANDRA-12150 for more details.
+ - Snapshots that are automatically taken before a table is dropped or truncated
+ will have a "dropped" or "truncated" prefix on their snapshot tag name.
+ - Metrics are exposed for successful and failed authentication attempts.
+ These can be located using the object names org.apache.cassandra.metrics:type=Client,name=AuthSuccess
+ and org.apache.cassandra.metrics:type=Client,name=AuthFailure respectively.
+ - Add support to "unset" JSON fields in prepared statements by specifying DEFAULT UNSET.
+ See CASSANDRA-11424 for details
+ - Allow TTL with null value on insert and update. It will be treated as equivalent to inserting a 0.
+ - Removed outboundBindAny configuration property. See CASSANDRA-12673 for details.
+
+Upgrading
+---------
+ - Support for alter types of already defined tables and of UDTs fields has been disabled.
+ If it is necessary to return a different type, please use casting instead. See
+ CASSANDRA-12443 for more details.
+ - Specifying the default_time_to_live option when creating or altering a
+ materialized view was erroneously accepted (and ignored). It is now
+ properly rejected.
+ - Only Java and JavaScript are now supported UDF languages.
+ The sandbox in 3.0 already prevented the use of script languages except Java
+ and JavaScript.
+ - Compaction now correctly drops sstables out of CompactionTask when there
+ isn't enough disk space to perform the full compaction. This should reduce
+ pending compaction tasks on systems with little remaining disk space.
+ - Request timeouts in cassandra.yaml (read_request_timeout_in_ms, etc) now apply to the
+ "full" request time on the coordinator. Previously, they only covered the time from
+ when the coordinator sent a message to a replica until the time that the replica
+ responded. Additionally, the previous behavior was to reset the timeout when performing
+ a read repair, making a second read to fix a short read, and when subranges were read
+ as part of a range scan or secondary index query. In 3.10 and higher, the timeout
+ is no longer reset for these "subqueries". The entire request must complete within
+ the specified timeout. As a consequence, your timeouts may need to be adjusted
+ to account for this. See CASSANDRA-12256 for more details.
+ - Logs written to stdout are now consistent with logs written to files.
+ Time is now local (it was UTC on the console and local in files). Date, thread, file
+ and line info were added to stdout. (see CASSANDRA-12004)
+ - The 'clientutil' jar, which has been somewhat broken on the 3.x branch, is no longer provided.
+ The features provided by that jar are provided by any good java driver and we advise relying on drivers rather than on
+ that jar, but if you need that jar for backward compatibility until you do so, you should use the version provided
+ on a previous Cassandra branch, like the 3.0 branch (by design, the functionality provided by that jar is stable
+ across versions so using the 3.0 jar for a client connecting to 3.x should work without issues).
+ - (Tools development) DatabaseDescriptor no longer implicitly starts up components/services like
+ commit log replay. This may break existing 3rd party tools and clients. In order to start up
+ a standalone tool or client application, use the DatabaseDescriptor.toolInitialization() or
+ DatabaseDescriptor.clientInitialization() methods. Tool initialization sets up partitioner,
+ snitch, encryption context. Client initialization just applies the configuration but does not
+ set up anything. Instead of using Config.setClientMode() or Config.isClientMode(), which are
+ deprecated now, use one of the appropriate new methods in DatabaseDescriptor.
+ - Application layer keep-alives were added to the streaming protocol to prevent idle incoming connections from
+ timing out and failing the stream session (CASSANDRA-11839). This effectively deprecates the streaming_socket_timeout_in_ms
+ property in favor of streaming_keep_alive_period_in_secs. See cassandra.yaml for more details about this property.
+ - Duration literals support the ISO 8601 format. As a consequence, identifiers matching that format
+ (e.g. P2Y or P1MT6H) will not be supported anymore (CASSANDRA-11873).
+
+3.8
+===
-3.0.9
-=====
+New features
+------------
+ - Shared pool threads are now named according to the stage they are executing
+ tasks for. Thread names mentioned in traced queries change accordingly.
+ - A new option has been added to cassandra-stress "-rate fixed={number}/s"
+ that forces a scheduled rate of operations/sec over time. Using this, stress can
+ accurately account for coordinated omission from the stress process.
+ - The cassandra-stress "-rate limit=" option has been renamed to "-rate throttle="
+ - hdr histograms have been added to stress runs, its output can be saved to disk using:
+ "-log hdrfile=" option. This histogram includes response/service/wait times when used with the
+ fixed or throttle rate options. The histogram file can be plotted on
+ http://hdrhistogram.github.io/HdrHistogram/plotFiles.html
+ - TimeWindowCompactionStrategy has been added. This has proven to be a better approach
+ to time series compaction and new tables should use this instead of DTCS. See
+ CASSANDRA-9666 for details.
+ - Change-Data-Capture is now available. See cassandra.yaml for cdc-specific flags and
+ a brief explanation of on-disk locations for archived data in CommitLog form. This can
+ be enabled via ALTER TABLE ... WITH cdc=true.
+ Upon flush, CommitLogSegments containing data for CDC-enabled tables are moved to
+ the data/cdc_raw directory until removed by the user and writes to CDC-enabled tables
+ will be rejected with a WriteTimeoutException once cdc_total_space_in_mb is reached
+ between unflushed CommitLogSegments and cdc_raw.
+ NOTE: CDC is disabled by default in the .yaml file. Do not enable CDC on a mixed-version
+ cluster as it will lead to exceptions which can interrupt traffic. Once all nodes
+ have been upgraded to 3.8 it is safe to enable this feature and restart the cluster.
Upgrading
---------
http://git-wip-us.apache.org/repos/asf/cassandra/blob/5c9db9af/doc/source/operating/metrics.rst
----------------------------------------------------------------------
diff --cc doc/source/operating/metrics.rst
index 546d9c2,0000000..04abb48
mode 100644,000000..100644
--- a/doc/source/operating/metrics.rst
+++ b/doc/source/operating/metrics.rst
@@@ -1,706 -1,0 +1,706 @@@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+..
+.. http://www.apache.org/licenses/LICENSE-2.0
+..
+.. Unless required by applicable law or agreed to in writing, software
+.. distributed under the License is distributed on an "AS IS" BASIS,
+.. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. See the License for the specific language governing permissions and
+.. limitations under the License.
+
+.. highlight:: none
+
+Monitoring
+----------
+
+Metrics in Cassandra are managed using the `Dropwizard Metrics <http://metrics.dropwizard.io>`__ library. These metrics
+can be queried via JMX or pushed to external monitoring systems using a number of `built in
+<http://metrics.dropwizard.io/3.1.0/getting-started/#other-reporting>`__ and `third party
+<http://metrics.dropwizard.io/3.1.0/manual/third-party/>`__ reporter plugins.
+
+Metrics are collected for a single node. It's up to the operator to use an external monitoring system to aggregate them.
+
+Metric Types
+^^^^^^^^^^^^
+All metrics reported by cassandra fit into one of the following types.
+
+``Gauge``
+ An instantaneous measurement of a value.
+
+``Counter``
+ A gauge for an ``AtomicLong`` instance. Typically this is consumed by monitoring the change since the last call to
+ see if there is a large increase compared to the norm.
+
+``Histogram``
+ Measures the statistical distribution of values in a stream of data.
+
+ In addition to minimum, maximum, mean, etc., it also measures median, 75th, 90th, 95th, 98th, 99th, and 99.9th
+ percentiles.
+
+``Timer``
+ Measures both the rate that a particular piece of code is called and the histogram of its duration.
+
+``Latency``
+ Special type that tracks latency (in microseconds) with a ``Timer`` plus a ``Counter`` that tracks the total latency
+ accrued since starting. The former is useful if you track the change in total latency since the last check. Each
+ metric name of this type will have 'Latency' and 'TotalLatency' appended to it.
+
+``Meter``
+ A meter metric which measures mean throughput and one-, five-, and fifteen-minute exponentially-weighted moving
+ average throughputs.
+
+Table Metrics
+^^^^^^^^^^^^^
+
+Each table in Cassandra has metrics responsible for tracking its state and performance.
+
+The metric names are all appended with the specific ``Keyspace`` and ``Table`` name.
+
+Reported name format:
+
+**Metric Name**
+ ``org.apache.cassandra.metrics.Table.<MetricName>.<Keyspace>.<Table>``
+
+**JMX MBean**
+ ``org.apache.cassandra.metrics:type=Table keyspace=<Keyspace> scope=<Table> name=<MetricName>``
+
+.. NOTE::
+ There is a special table called '``all``' without a keyspace. This represents the aggregation of metrics across
+ **all** tables and keyspaces on the node.
+
+
+======================================= ============== ===========
+Name Type Description
+======================================= ============== ===========
+MemtableOnHeapSize Gauge<Long> Total amount of data stored in the memtable that resides **on**-heap, including column related overhead and partitions overwritten.
+MemtableOffHeapSize Gauge<Long> Total amount of data stored in the memtable that resides **off**-heap, including column related overhead and partitions overwritten.
+MemtableLiveDataSize Gauge<Long> Total amount of live data stored in the memtable, excluding any data structure overhead.
+AllMemtablesOnHeapSize Gauge<Long> Total amount of data stored in the memtables (2i and pending flush memtables included) that resides **on**-heap.
+AllMemtablesOffHeapSize Gauge<Long> Total amount of data stored in the memtables (2i and pending flush memtables included) that resides **off**-heap.
+AllMemtablesLiveDataSize Gauge<Long> Total amount of live data stored in the memtables (2i and pending flush memtables included) that resides off-heap, excluding any data structure overhead.
+MemtableColumnsCount Gauge<Long> Total number of columns present in the memtable.
+MemtableSwitchCount Counter Number of times flush has resulted in the memtable being switched out.
+CompressionRatio Gauge<Double> Current compression ratio for all SSTables.
+EstimatedPartitionSizeHistogram Gauge<long[]> Histogram of estimated partition size (in bytes).
+EstimatedPartitionCount Gauge<Long> Approximate number of keys in table.
+EstimatedColumnCountHistogram Gauge<long[]> Histogram of estimated number of columns.
- SSTablesPerReadHistogram Histogram Histogram of the number of sstable data files accessed per read.
++SSTablesPerReadHistogram Histogram Histogram of the number of sstable data files accessed per single partition read. SSTables skipped due to Bloom Filters, min-max key or partition index lookup are not taken into account.
+ReadLatency Latency Local read latency for this table.
+RangeLatency Latency Local range scan latency for this table.
+WriteLatency Latency Local write latency for this table.
+CoordinatorReadLatency Timer Coordinator read latency for this table.
+CoordinatorScanLatency Timer Coordinator range scan latency for this table.
+PendingFlushes Counter Estimated number of flush tasks pending for this table.
+BytesFlushed Counter Total number of bytes flushed since server [re]start.
+CompactionBytesWritten Counter Total number of bytes written by compaction since server [re]start.
+PendingCompactions Gauge<Integer> Estimate of number of pending compactions for this table.
+LiveSSTableCount Gauge<Integer> Number of SSTables on disk for this table.
+LiveDiskSpaceUsed Counter Disk space used by SSTables belonging to this table (in bytes).
+TotalDiskSpaceUsed Counter Total disk space used by SSTables belonging to this table, including obsolete ones waiting to be GC'd.
+MinPartitionSize Gauge<Long> Size of the smallest compacted partition (in bytes).
+MaxPartitionSize Gauge<Long> Size of the largest compacted partition (in bytes).
+MeanPartitionSize Gauge<Long> Size of the average compacted partition (in bytes).
+BloomFilterFalsePositives Gauge<Long> Number of false positives on table's bloom filter.
+BloomFilterFalseRatio Gauge<Double> False positive ratio of table's bloom filter.
+BloomFilterDiskSpaceUsed Gauge<Long> Disk space used by bloom filter (in bytes).
+BloomFilterOffHeapMemoryUsed Gauge<Long> Off-heap memory used by bloom filter.
+IndexSummaryOffHeapMemoryUsed Gauge<Long> Off-heap memory used by index summary.
+CompressionMetadataOffHeapMemoryUsed Gauge<Long> Off-heap memory used by compression meta data.
+KeyCacheHitRate Gauge<Double> Key cache hit rate for this table.
+TombstoneScannedHistogram Histogram Histogram of tombstones scanned in queries on this table.
+LiveScannedHistogram Histogram Histogram of live cells scanned in queries on this table.
+ColUpdateTimeDeltaHistogram Histogram Histogram of column update time delta on this table.
+ViewLockAcquireTime Timer Time taken acquiring a partition lock for materialized view updates on this table.
+ViewReadTime Timer Time taken during the local read of a materialized view update.
+TrueSnapshotsSize Gauge<Long> Disk space used by snapshots of this table including all SSTable components.
+RowCacheHitOutOfRange Counter Number of table row cache hits that do not satisfy the query filter, thus went to disk.
+RowCacheHit Counter Number of table row cache hits.
+RowCacheMiss Counter Number of table row cache misses.
+CasPrepare Latency Latency of paxos prepare round.
+CasPropose Latency Latency of paxos propose round.
+CasCommit Latency Latency of paxos commit round.
+PercentRepaired Gauge<Double> Percent of table data that is repaired on disk.
+SpeculativeRetries Counter Number of times speculative retries were sent for this table.
+WaitingOnFreeMemtableSpace Histogram Histogram of time spent waiting for free memtable space, either on- or off-heap.
+DroppedMutations Counter Number of dropped mutations on this table.
+======================================= ============== ===========
+
+Keyspace Metrics
+^^^^^^^^^^^^^^^^
+Each keyspace in Cassandra has metrics responsible for tracking its state and performance.
+
+These metrics are the same as the ``Table Metrics`` above, only they are aggregated at the Keyspace level.
+
+Reported name format:
+
+**Metric Name**
+ ``org.apache.cassandra.metrics.keyspace.<MetricName>.<Keyspace>``
+
+**JMX MBean**
+ ``org.apache.cassandra.metrics:type=Keyspace scope=<Keyspace> name=<MetricName>``
+
+ThreadPool Metrics
+^^^^^^^^^^^^^^^^^^
+
+Cassandra splits work of a particular type into its own thread pool. This provides back-pressure and asynchrony for
+requests on a node. It's important to monitor the state of these thread pools since they can tell you how saturated a
+node is.
+
+The metric names are all appended with the specific ``ThreadPool`` name. The thread pools are also categorized under a
+specific type.
+
+Reported name format:
+
+**Metric Name**
+ ``org.apache.cassandra.metrics.ThreadPools.<MetricName>.<Path>.<ThreadPoolName>``
+
+**JMX MBean**
+ ``org.apache.cassandra.metrics:type=ThreadPools scope=<ThreadPoolName> type=<Type> name=<MetricName>``
+
+===================== ============== ===========
+Name Type Description
+===================== ============== ===========
+ActiveTasks Gauge<Integer> Number of tasks being actively worked on by this pool.
+PendingTasks Gauge<Integer> Number of tasks queued up on this pool.
+CompletedTasks Counter Number of tasks completed.
+TotalBlockedTasks Counter Number of tasks that were blocked due to queue saturation.
+CurrentlyBlockedTask Counter Number of tasks that are currently blocked due to queue saturation but on retry will become unblocked.
+MaxPoolSize Gauge<Integer> The maximum number of threads in this pool.
+===================== ============== ===========
+
+The following thread pools can be monitored.
+
+============================ ============== ===========
+Name Type Description
+============================ ============== ===========
+Native-Transport-Requests transport Handles client CQL requests
+CounterMutationStage request Responsible for counter writes
+ViewMutationStage request Responsible for materialized view writes
+MutationStage request Responsible for all other writes
+ReadRepairStage request ReadRepair happens on this thread pool
+ReadStage request Local reads run on this thread pool
+RequestResponseStage request Coordinator requests to the cluster run on this thread pool
+AntiEntropyStage internal Builds merkle tree for repairs
+CacheCleanupExecutor internal Cache maintenance performed on this thread pool
+CompactionExecutor internal Compactions are run on these threads
+GossipStage internal Handles gossip requests
+HintsDispatcher internal Performs hinted handoff
+InternalResponseStage internal Responsible for intra-cluster callbacks
+MemtableFlushWriter internal Writes memtables to disk
+MemtablePostFlush internal Cleans up commit log after memtable is written to disk
+MemtableReclaimMemory internal Memtable recycling
+MigrationStage internal Runs schema migrations
+MiscStage internal Miscellaneous tasks run here
+PendingRangeCalculator internal Calculates token range
+PerDiskMemtableFlushWriter_0 internal Responsible for writing a spec (there is one of these per disk 0-N)
+Sampler internal Responsible for re-sampling the index summaries of SSTables
+SecondaryIndexManagement internal Performs updates to secondary indexes
+ValidationExecutor internal Performs validation compaction or scrubbing
+============================ ============== ===========
+
+.. |nbsp| unicode:: 0xA0 .. nonbreaking space
+
+Client Request Metrics
+^^^^^^^^^^^^^^^^^^^^^^
+
+Client requests have their own set of metrics that encapsulate the work happening at coordinator level.
+
+Different types of client requests are broken down by ``RequestType``.
+
+Reported name format:
+
+**Metric Name**
+ ``org.apache.cassandra.metrics.ClientRequest.<MetricName>.<RequestType>``
+
+**JMX MBean**
+ ``org.apache.cassandra.metrics:type=ClientRequest scope=<RequestType> name=<MetricName>``
+
+
+:RequestType: CASRead
+:Description: Metrics related to transactional read requests.
+:Metrics:
+ ===================== ============== =============================================================
+ Name Type Description
+ ===================== ============== =============================================================
+ Timeouts Counter Number of timeouts encountered.
+ Failures Counter Number of transaction failures encountered.
+ |nbsp| Latency Transaction read latency.
+ Unavailables Counter Number of unavailable exceptions encountered.
+ UnfinishedCommit Counter Number of transactions that were committed on read.
+ ConditionNotMet Counter Number of transaction preconditions that did not match current values.
+ ContentionHistogram Histogram How many contended reads were encountered
+ ===================== ============== =============================================================
+
+:RequestType: CASWrite
+:Description: Metrics related to transactional write requests.
+:Metrics:
+ ===================== ============== =============================================================
+ Name Type Description
+ ===================== ============== =============================================================
+ Timeouts Counter Number of timeouts encountered.
+ Failures Counter Number of transaction failures encountered.
+ |nbsp| Latency Transaction write latency.
+ UnfinishedCommit Counter Number of transactions that were committed on write.
+ ConditionNotMet Counter Number of transaction preconditions that did not match current values.
+ ContentionHistogram Histogram How many contended writes were encountered
+ ===================== ============== =============================================================
+
+
+:RequestType: Read
+:Description: Metrics related to standard read requests.
+:Metrics:
+ ===================== ============== =============================================================
+ Name Type Description
+ ===================== ============== =============================================================
+ Timeouts Counter Number of timeouts encountered.
+ Failures Counter Number of read failures encountered.
+ |nbsp| Latency Read latency.
+ Unavailables Counter Number of unavailable exceptions encountered.
+ ===================== ============== =============================================================
+
+:RequestType: RangeSlice
+:Description: Metrics related to token range read requests.
+:Metrics:
+ ===================== ============== =============================================================
+ Name Type Description
+ ===================== ============== =============================================================
+ Timeouts Counter Number of timeouts encountered.
+ Failures Counter Number of range query failures encountered.
+ |nbsp| Latency Range query latency.
+ Unavailables Counter Number of unavailable exceptions encountered.
+ ===================== ============== =============================================================
+
+:RequestType: Write
+:Description: Metrics related to regular write requests.
+:Metrics:
+ ===================== ============== =============================================================
+ Name Type Description
+ ===================== ============== =============================================================
+ Timeouts Counter Number of timeouts encountered.
+ Failures Counter Number of write failures encountered.
+ |nbsp| Latency Write latency.
+ Unavailables Counter Number of unavailable exceptions encountered.
+ ===================== ============== =============================================================
+
+
+:RequestType: ViewWrite
+:Description: Metrics related to materialized view writes.
+:Metrics:
+ ===================== ============== =============================================================
+ Timeouts Counter Number of timeouts encountered.
+ Failures Counter Number of transaction failures encountered.
+ Unavailables Counter Number of unavailable exceptions encountered.
+ ViewReplicasAttempted Counter Total number of attempted view replica writes.
+ ViewReplicasSuccess Counter Total number of succeeded view replica writes.
+ ViewPendingMutations Gauge<Long> ViewReplicasAttempted - ViewReplicasSuccess.
+ ViewWriteLatency Timer Time between when mutation is applied to base table and when CL.ONE is achieved on view.
+ ===================== ============== =============================================================
+
+Cache Metrics
+^^^^^^^^^^^^^
+
+Cassandra caches have metrics to track the effectiveness of the caches, though the ``Table Metrics`` might be more useful.
+
+Reported name format:
+
+**Metric Name**
+ ``org.apache.cassandra.metrics.Cache.<MetricName>.<CacheName>``
+
+**JMX MBean**
+ ``org.apache.cassandra.metrics:type=Cache scope=<CacheName> name=<MetricName>``
+
+========================== ============== ===========
+Name Type Description
+========================== ============== ===========
+Capacity Gauge<Long> Cache capacity in bytes.
+Entries Gauge<Integer> Total number of cache entries.
+FifteenMinuteCacheHitRate Gauge<Double> 15m cache hit rate.
+FiveMinuteCacheHitRate Gauge<Double> 5m cache hit rate.
+OneMinuteCacheHitRate Gauge<Double> 1m cache hit rate.
+HitRate Gauge<Double> All time cache hit rate.
+Hits Meter Total number of cache hits.
+Misses Meter Total number of cache misses.
+MissLatency Timer Latency of misses.
+Requests Gauge<Long> Total number of cache requests.
+Size Gauge<Long> Total size of occupied cache, in bytes.
+========================== ============== ===========
+
+The following caches are covered:
+
+============================ ===========
+Name Description
+============================ ===========
+CounterCache Keeps hot counters in memory for performance.
+ChunkCache In process uncompressed page cache.
+KeyCache Cache for partition to sstable offsets.
+RowCache Cache for rows kept in memory.
+============================ ===========
+
+.. NOTE::
+ Misses and MissLatency are only defined for the ChunkCache
+
+CQL Metrics
+^^^^^^^^^^^
+
+Metrics specific to CQL prepared statement caching.
+
+Reported name format:
+
+**Metric Name**
+ ``org.apache.cassandra.metrics.CQL.<MetricName>``
+
+**JMX MBean**
+ ``org.apache.cassandra.metrics:type=CQL name=<MetricName>``
+
+========================== ============== ===========
+Name Type Description
+========================== ============== ===========
+PreparedStatementsCount Gauge<Integer> Number of cached prepared statements.
+PreparedStatementsEvicted Counter Number of prepared statements evicted from the prepared statement cache
+PreparedStatementsExecuted Counter Number of prepared statements executed.
+RegularStatementsExecuted Counter Number of **non** prepared statements executed.
+PreparedStatementsRatio Gauge<Double> Percentage of statements that are prepared vs unprepared.
+========================== ============== ===========
+
+
+DroppedMessage Metrics
+^^^^^^^^^^^^^^^^^^^^^^
+
+Metrics specific to tracking dropped messages for different types of requests.
+Dropped writes are stored and retried by ``Hinted Handoff``
+
+Reported name format:
+
+**Metric Name**
+ ``org.apache.cassandra.metrics.DroppedMessages.<MetricName>.<Type>``
+
+**JMX MBean**
+ ``org.apache.cassandra.metrics:type=DroppedMetrics scope=<Type> name=<MetricName>``
+
+========================== ============== ===========
+Name Type Description
+========================== ============== ===========
+CrossNodeDroppedLatency Timer The dropped latency across nodes.
+InternalDroppedLatency Timer The dropped latency within node.
+Dropped Meter Number of dropped messages.
+========================== ============== ===========
+
+The different types of messages tracked are:
+
+============================ ===========
+Name Description
+============================ ===========
+BATCH_STORE Batchlog write
+BATCH_REMOVE Batchlog cleanup (after successfully applied)
+COUNTER_MUTATION Counter writes
+HINT Hint replay
+MUTATION Regular writes
+READ Regular reads
+READ_REPAIR Read repair
+PAGED_SLICE Paged read
+RANGE_SLICE Token range read
+REQUEST_RESPONSE RPC Callbacks
+_TRACE Tracing writes
+============================ ===========
+
+Streaming Metrics
+^^^^^^^^^^^^^^^^^
+
+Metrics reported during ``Streaming`` operations, such as repair, bootstrap, rebuild.
+
+These metrics are specific to a peer endpoint, with the source node being the node you are pulling the metrics from.
+
+Reported name format:
+
+**Metric Name**
+ ``org.apache.cassandra.metrics.Streaming.<MetricName>.<PeerIP>``
+
+**JMX MBean**
+ ``org.apache.cassandra.metrics:type=Streaming scope=<PeerIP> name=<MetricName>``
+
+========================== ============== ===========
+Name Type Description
+========================== ============== ===========
+IncomingBytes Counter Number of bytes streamed to this node from the peer.
+OutgoingBytes Counter Number of bytes streamed to the peer endpoint from this node.
+========================== ============== ===========
+
+
+Compaction Metrics
+^^^^^^^^^^^^^^^^^^
+
+Metrics specific to ``Compaction`` work.
+
+Reported name format:
+
+**Metric Name**
+ ``org.apache.cassandra.metrics.Compaction.<MetricName>``
+
+**JMX MBean**
+ ``org.apache.cassandra.metrics:type=Compaction name=<MetricName>``
+
+========================== ======================================== ===============================================
+Name Type Description
+========================== ======================================== ===============================================
+BytesCompacted Counter Total number of bytes compacted since server [re]start.
+PendingTasks Gauge<Integer> Estimated number of compactions remaining to perform.
+CompletedTasks Gauge<Long> Number of completed compactions since server [re]start.
+TotalCompactionsCompleted Meter Throughput of completed compactions since server [re]start.
+PendingTasksByTableName Gauge<Map<String, Map<String, Integer>>> Estimated number of compactions remaining to perform, grouped by keyspace and then table name. This info is also kept in ``Table Metrics``.
+========================== ======================================== ===============================================
+
+CommitLog Metrics
+^^^^^^^^^^^^^^^^^
+
+Metrics specific to the ``CommitLog``
+
+Reported name format:
+
+**Metric Name**
+ ``org.apache.cassandra.metrics.CommitLog.<MetricName>``
+
+**JMX MBean**
+ ``org.apache.cassandra.metrics:type=CommitLog name=<MetricName>``
+
+========================== ============== ===========
+Name Type Description
+========================== ============== ===========
+CompletedTasks Gauge<Long> Total number of commit log messages written since [re]start.
+PendingTasks Gauge<Long> Number of commit log messages written but yet to be fsync'd.
+TotalCommitLogSize Gauge<Long> Current size, in bytes, used by all the commit log segments.
+WaitingOnSegmentAllocation Timer Time spent waiting for a CommitLogSegment to be allocated - under normal conditions this should be zero.
+WaitingOnCommit Timer The time spent waiting on CL fsync; for Periodic this only occurs when the sync is lagging its sync interval.
+========================== ============== ===========
+
+Storage Metrics
+^^^^^^^^^^^^^^^
+
+Metrics specific to the storage engine.
+
+Reported name format:
+
+**Metric Name**
+ ``org.apache.cassandra.metrics.Storage.<MetricName>``
+
+**JMX MBean**
+ ``org.apache.cassandra.metrics:type=Storage name=<MetricName>``
+
+========================== ============== ===========
+Name Type Description
+========================== ============== ===========
+Exceptions Counter Number of internal exceptions caught. Under normal conditions this should be zero.
+Load Counter Size, in bytes, of the on disk data size this node manages.
+TotalHints Counter Number of hint messages written to this node since [re]start. Includes one entry for each host to be hinted per hint.
+TotalHintsInProgress Counter Number of hints currently attempting to be sent.
+========================== ============== ===========
+
+HintedHandoff Metrics
+^^^^^^^^^^^^^^^^^^^^^
+
+Metrics specific to Hinted Handoff. There are also some metrics related to hints tracked in ``Storage Metrics``
+
+These metrics include the peer endpoint **in the metric name**
+
+Reported name format:
+
+**Metric Name**
+ ``org.apache.cassandra.metrics.HintedHandOffManager.<MetricName>``
+
+**JMX MBean**
+ ``org.apache.cassandra.metrics:type=HintedHandOffManager name=<MetricName>``
+
+=========================== ============== ===========
+Name Type Description
+=========================== ============== ===========
+Hints_created-<PeerIP> Counter Number of hints on disk for this peer.
+Hints_not_stored-<PeerIP> Counter Number of hints not stored for this peer, due to being down past the configured hint window.
+=========================== ============== ===========
+
+SSTable Index Metrics
+^^^^^^^^^^^^^^^^^^^^^
+
+Metrics specific to the SSTable index metadata.
+
+Reported name format:
+
+**Metric Name**
+ ``org.apache.cassandra.metrics.Index.<MetricName>.RowIndexEntry``
+
+**JMX MBean**
+ ``org.apache.cassandra.metrics:type=Index scope=RowIndexEntry name=<MetricName>``
+
+=========================== ============== ===========
+Name Type Description
+=========================== ============== ===========
+IndexedEntrySize Histogram Histogram of the on-heap size, in bytes, of the index across all SSTables.
+IndexInfoCount Histogram Histogram of the number of on-heap index entries managed across all SSTables.
+IndexInfoGets Histogram Histogram of the number of index seeks performed per SSTable.
+=========================== ============== ===========
+
+BufferPool Metrics
+^^^^^^^^^^^^^^^^^^
+
+Metrics specific to the internal recycled buffer pool Cassandra manages. This pool is meant to keep allocations and GC
+lower by recycling on and off heap buffers.
+
+Reported name format:
+
+**Metric Name**
+ ``org.apache.cassandra.metrics.BufferPool.<MetricName>``
+
+**JMX MBean**
+ ``org.apache.cassandra.metrics:type=BufferPool name=<MetricName>``
+
+=========================== ============== ===========
+Name Type Description
+=========================== ============== ===========
+Size Gauge<Long> Size, in bytes, of the managed buffer pool
+Misses Meter The rate of misses in the pool. The higher this is the more allocations incurred.
+=========================== ============== ===========
+
+
+Client Metrics
+^^^^^^^^^^^^^^
+
+Metrics specific to client management.
+
+Reported name format:
+
+**Metric Name**
+ ``org.apache.cassandra.metrics.Client.<MetricName>``
+
+**JMX MBean**
+ ``org.apache.cassandra.metrics:type=Client name=<MetricName>``
+
+=========================== ============== ===========
+Name Type Description
+=========================== ============== ===========
+connectedNativeClients Counter Number of clients connected to this node's native protocol server
+connectedThriftClients Counter Number of clients connected to this node's thrift protocol server
+=========================== ============== ===========
+
+JVM Metrics
+^^^^^^^^^^^
+
+JVM metrics such as memory and garbage collection statistics can either be accessed by connecting to the JVM using JMX or can be exported using `Metric Reporters`_.
+
+BufferPool
+++++++++++
+
+**Metric Name**
+ ``jvm.buffers.<direct|mapped>.<MetricName>``
+
+**JMX MBean**
+ ``java.nio:type=BufferPool name=<direct|mapped>``
+
+========================== ============== ===========
+Name Type Description
+========================== ============== ===========
+Capacity Gauge<Long> Estimated total capacity of the buffers in this pool
+Count Gauge<Long> Estimated number of buffers in the pool
+Used Gauge<Long> Estimated memory that the Java virtual machine is using for this buffer pool
+========================== ============== ===========
+
+FileDescriptorRatio
++++++++++++++++++++
+
+**Metric Name**
+ ``jvm.fd.<MetricName>``
+
+**JMX MBean**
+ ``java.lang:type=OperatingSystem name=<OpenFileDescriptorCount|MaxFileDescriptorCount>``
+
+========================== ============== ===========
+Name Type Description
+========================== ============== ===========
+Usage Ratio Ratio of used to total file descriptors
+========================== ============== ===========
+
+GarbageCollector
+++++++++++++++++
+
+**Metric Name**
+ ``jvm.gc.<gc_type>.<MetricName>``
+
+**JMX MBean**
+ ``java.lang:type=GarbageCollector name=<gc_type>``
+
+========================== ============== ===========
+Name Type Description
+========================== ============== ===========
+Count Gauge<Long> Total number of collections that have occurred
+Time Gauge<Long> Approximate accumulated collection elapsed time in milliseconds
+========================== ============== ===========
+
+Memory
+++++++
+
+**Metric Name**
+ ``jvm.memory.<heap/non-heap/total>.<MetricName>``
+
+**JMX MBean**
+ ``java.lang:type=Memory``
+
+========================== ============== ===========
+Committed Gauge<Long> Amount of memory in bytes that is committed for the JVM to use
+Init Gauge<Long> Amount of memory in bytes that the JVM initially requests from the OS
+Max Gauge<Long> Maximum amount of memory in bytes that can be used for memory management
+Usage Ratio Ratio of used to maximum memory
+Used Gauge<Long> Amount of used memory in bytes
+========================== ============== ===========
+
+MemoryPool
+++++++++++
+
+**Metric Name**
+ ``jvm.memory.pools.<memory_pool>.<MetricName>``
+
+**JMX MBean**
+ ``java.lang:type=MemoryPool name=<memory_pool>``
+
+========================== ============== ===========
+Committed Gauge<Long> Amount of memory in bytes that is committed for the JVM to use
+Init Gauge<Long> Amount of memory in bytes that the JVM initially requests from the OS
+Max Gauge<Long> Maximum amount of memory in bytes that can be used for memory management
+Usage Ratio Ratio of used to maximum memory
+Used Gauge<Long> Amount of used memory in bytes
+========================== ============== ===========
+
+JMX
+^^^
+
+Any JMX based client can access metrics from cassandra.
+
+If you wish to access JMX metrics over http it's possible to download `Mx4jTool <http://mx4j.sourceforge.net/>`__ and
+place ``mx4j-tools.jar`` into the classpath. On startup you will see in the log::
+
+ HttpAdaptor version 3.0.2 started on port 8081
+
+To choose a different port (8081 is the default) or a different listen address (0.0.0.0 is not the default) edit
+``conf/cassandra-env.sh`` and uncomment::
+
+ #MX4J_ADDRESS="-Dmx4jaddress=0.0.0.0"
+
+ #MX4J_PORT="-Dmx4jport=8081"
+
+
+Metric Reporters
+^^^^^^^^^^^^^^^^
+
+As mentioned at the top of this section on monitoring, the Cassandra metrics can be exported to a number of monitoring
+systems via a number of `built in <http://metrics.dropwizard.io/3.1.0/getting-started/#other-reporting>`__ and `third party
+<http://metrics.dropwizard.io/3.1.0/manual/third-party/>`__ reporter plugins.
+
+The configuration of these plugins is managed by the `metrics reporter config project
+<https://github.com/addthis/metrics-reporter-config>`__. There is a sample configuration file located at
+``conf/metrics-reporter-config-sample.yaml``.
+
+Once configured, you simply start cassandra with the flag
+``-Dcassandra.metricsReporterConfigFile=metrics-reporter-config.yaml``. The specified .yaml file plus any 3rd party
+reporter jars must all be in Cassandra's classpath.
http://git-wip-us.apache.org/repos/asf/cassandra/blob/5c9db9af/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/cassandra/blob/5c9db9af/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
----------------------------------------------------------------------
diff --cc src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
index 1358f29,72b4465..47c426e
--- a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
+++ b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
@@@ -39,9 -36,9 +39,10 @@@ import org.apache.cassandra.db.lifecycl
import org.apache.cassandra.db.filter.*;
import org.apache.cassandra.db.partitions.*;
import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.transform.Transformation;
import org.apache.cassandra.exceptions.RequestExecutionException;
import org.apache.cassandra.io.sstable.format.SSTableReader;
+ import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
import org.apache.cassandra.io.util.DataInputPlus;
import org.apache.cassandra.io.util.DataOutputPlus;
import org.apache.cassandra.metrics.TableMetrics;
@@@ -570,9 -553,11 +571,10 @@@ public class SinglePartitionReadComman
* in one pass, and minimize the number of sstables for which we read a partition tombstone.
*/
Collections.sort(view.sstables, SSTableReader.maxTimestampComparator);
- List<SSTableReader> skippedSSTables = null;
long mostRecentPartitionTombstone = Long.MIN_VALUE;
- long minTimestamp = Long.MAX_VALUE;
int nonIntersectingSSTables = 0;
+ List<SSTableReader> skippedSSTablesWithTombstones = null;
+ SSTableReadMetricsCollector metricsCollector = new SSTableReadMetricsCollector();
for (SSTableReader sstable : view.sstables)
{
@@@ -594,11 -580,12 +596,11 @@@
continue;
}
- @SuppressWarnings("resource") // 'iter' is added to iterators which is closed on exception, or through the closing of the final merged iterator
- UnfilteredRowIterator iter = filter.filter(sstable.iterator(partitionKey(),
- columnFilter(),
- filter.isReversed(),
- isForThrift(),
- metricsCollector));
+ minTimestamp = Math.min(minTimestamp, sstable.getMinTimestamp());
+
+ @SuppressWarnings("resource") // 'iter' is added to iterators which is closed on exception,
+ // or through the closing of the final merged iterator
- UnfilteredRowIteratorWithLowerBound iter = makeIterator(cfs, sstable, true);
++ UnfilteredRowIteratorWithLowerBound iter = makeIterator(cfs, sstable, true, metricsCollector);
if (!sstable.isRepaired())
oldestUnrepairedTombstone = Math.min(oldestUnrepairedTombstone, sstable.getMinLocalDeletionTime());
@@@ -616,14 -602,24 +618,14 @@@
if (sstable.getMaxTimestamp() <= minTimestamp)
continue;
- @SuppressWarnings("resource") // 'iter' is either closed right away, or added to iterators which is close on exception, or through the closing of the final merged iterator
- UnfilteredRowIterator iter = filter.filter(sstable.iterator(partitionKey(),
- columnFilter(),
- filter.isReversed(),
- isForThrift(),
- metricsCollector));
+ @SuppressWarnings("resource") // 'iter' is added to iterators which is close on exception,
+ // or through the closing of the final merged iterator
- UnfilteredRowIteratorWithLowerBound iter = makeIterator(cfs, sstable, false);
++ UnfilteredRowIteratorWithLowerBound iter = makeIterator(cfs, sstable, false, metricsCollector);
+ if (!sstable.isRepaired())
+ oldestUnrepairedTombstone = Math.min(oldestUnrepairedTombstone, sstable.getMinLocalDeletionTime());
- if (iter.partitionLevelDeletion().markedForDeleteAt() > minTimestamp)
- {
- iterators.add(iter);
- if (!sstable.isRepaired())
- oldestUnrepairedTombstone = Math.min(oldestUnrepairedTombstone, sstable.getMinLocalDeletionTime());
- includedDueToTombstones++;
- }
- else
- {
- iter.close();
- }
+ iterators.add(iter);
+ includedDueToTombstones++;
}
}
if (Tracing.isTracing())
@@@ -633,8 -631,17 +635,8 @@@
if (iterators.isEmpty())
return EmptyIterators.unfilteredRow(cfs.metadata, partitionKey(), filter.isReversed());
- Tracing.trace("Merging data from memtables and {} sstables", metricsCollector.getMergedSSTables());
-
- @SuppressWarnings("resource") // Closed through the closing of the result of that method.
- UnfilteredRowIterator merged = UnfilteredRowIterators.merge(iterators, nowInSec());
- if (!merged.isEmpty())
- {
- DecoratedKey key = merged.partitionKey();
- cfs.metric.samplers.get(TableMetrics.Sampler.READS).addSample(key.getKey(), key.hashCode(), 1);
- }
-
- return merged;
+ StorageHook.instance.reportRead(cfs.metadata.cfId, partitionKey());
- return withSSTablesIterated(iterators, cfs.metric);
++ return withSSTablesIterated(iterators, cfs.metric, metricsCollector);
}
catch (RuntimeException | Error e)
{
@@@ -661,52 -668,6 +663,53 @@@
return clusteringIndexFilter().shouldInclude(sstable);
}
- private UnfilteredRowIteratorWithLowerBound makeIterator(ColumnFamilyStore cfs, final SSTableReader sstable, boolean applyThriftTransformation)
++ private UnfilteredRowIteratorWithLowerBound makeIterator(ColumnFamilyStore cfs,
++ final SSTableReader sstable,
++ boolean applyThriftTransformation,
++ SSTableReadsListener listener)
+ {
+ return StorageHook.instance.makeRowIteratorWithLowerBound(cfs,
+ partitionKey(),
+ sstable,
+ clusteringIndexFilter(),
+ columnFilter(),
+ isForThrift(),
+ nowInSec(),
- applyThriftTransformation);
++ applyThriftTransformation,
++ listener);
+
+ }
+
+ /**
+ * Return a wrapped iterator that when closed will update the sstables iterated and READ sample metrics.
+ * Note that we cannot use the Transformations framework because they greedily get the static row, which
+ * would cause all iterators to be initialized and hence all sstables to be accessed.
+ */
+ private UnfilteredRowIterator withSSTablesIterated(List<UnfilteredRowIterator> iterators,
- TableMetrics metrics)
++ TableMetrics metrics,
++ SSTableReadMetricsCollector metricsCollector)
+ {
+ @SuppressWarnings("resource") // Closed through the closing of the result of the caller method.
+ UnfilteredRowIterator merged = UnfilteredRowIterators.merge(iterators, nowInSec());
+
+ if (!merged.isEmpty())
+ {
+ DecoratedKey key = merged.partitionKey();
+ metrics.samplers.get(TableMetrics.Sampler.READS).addSample(key.getKey(), key.hashCode(), 1);
+ }
+
+ class UpdateSstablesIterated extends Transformation
+ {
+ public void onPartitionClose()
+ {
- int sstablesIterated = (int)iterators.stream()
- .filter(it -> it instanceof LazilyInitializedUnfilteredRowIterator)
- .filter(it -> ((LazilyInitializedUnfilteredRowIterator)it).initialized())
- .count();
-
- metrics.updateSSTableIterated(sstablesIterated);
- Tracing.trace("Merged data from memtables and {} sstables", sstablesIterated);
++ int mergedSSTablesIterated = metricsCollector.getMergedSSTables();
++ metrics.updateSSTableIterated(mergedSSTablesIterated);
++ Tracing.trace("Merged data from memtables and {} sstables", mergedSSTablesIterated);
+ }
+ };
+ return Transformation.apply(merged, new UpdateSstablesIterated());
+ }
+
private boolean queriesMulticellType()
{
for (ColumnDefinition column : columnFilter().fetchedColumns())
@@@ -773,14 -737,16 +776,19 @@@
// however: if it is set, it impacts everything and must be included. Getting that top-level partition deletion costs us
// some seek in general however (unless the partition is indexed and is in the key cache), so we first check if the sstable
// has any tombstone at all as a shortcut.
- if (!sstable.hasTombstones())
- continue; // Means no tombstone at all, we can skip that sstable
+ if (!sstable.mayHaveTombstones())
+ continue; // no tombstone at all, we can skip that sstable
// We need to get the partition deletion and include it if it's live. In any case though, we're done with that sstable.
- sstable.incrementReadCount();
- try (UnfilteredRowIterator iter = StorageHook.instance.makeRowIterator(cfs, sstable, partitionKey(), filter.getSlices(metadata()), columnFilter(), filter.isReversed(), isForThrift()))
- try (UnfilteredRowIterator iter = filter.filter(sstable.iterator(partitionKey(),
- columnFilter(),
- filter.isReversed(),
- isForThrift(),
- metricsCollector)))
++ try (UnfilteredRowIterator iter = StorageHook.instance.makeRowIterator(cfs,
++ sstable,
++ partitionKey(),
++ filter.getSlices(metadata()),
++ columnFilter(),
++ filter.isReversed(),
++ isForThrift(),
++ metricsCollector))
{
- sstablesIterated++;
if (!iter.partitionLevelDeletion().isLive())
result = add(UnfilteredRowIterators.noRowsIterator(iter.metadata(), iter.partitionKey(), Rows.EMPTY_STATIC_ROW, iter.partitionLevelDeletion(), filter.isReversed()), result, filter, sstable.isRepaired());
else
@@@ -789,9 -755,12 +797,14 @@@
continue;
}
-- Tracing.trace("Merging data from sstable {}", sstable.descriptor.generation);
- sstable.incrementReadCount();
- try (UnfilteredRowIterator iter = StorageHook.instance.makeRowIterator(cfs, sstable, partitionKey(), filter.getSlices(metadata()), columnFilter(), filter.isReversed(), isForThrift()))
- try (UnfilteredRowIterator iter = filter.filter(sstable.iterator(partitionKey(),
- columnFilter(),
- filter.isReversed(),
- isForThrift(),
- metricsCollector)))
++ try (UnfilteredRowIterator iter = StorageHook.instance.makeRowIterator(cfs,
++ sstable,
++ partitionKey(),
++ filter.getSlices(metadata()),
++ columnFilter(),
++ filter.isReversed(),
++ isForThrift(),
++ metricsCollector))
{
if (iter.isEmpty())
continue;
@@@ -810,10 -778,9 +822,10 @@@
DecoratedKey key = result.partitionKey();
cfs.metric.samplers.get(TableMetrics.Sampler.READS).addSample(key.getKey(), key.hashCode(), 1);
+ StorageHook.instance.reportRead(cfs.metadata.cfId, partitionKey());
// "hoist up" the requested data into a more recent sstable
- if (sstablesIterated > cfs.getMinimumCompactionThreshold()
+ if (metricsCollector.getMergedSSTables() > cfs.getMinimumCompactionThreshold()
&& onlyUnrepaired
&& !cfs.isAutoCompactionDisabled()
&& cfs.getCompactionStrategyManager().shouldDefragment())
http://git-wip-us.apache.org/repos/asf/cassandra/blob/5c9db9af/src/java/org/apache/cassandra/db/StorageHook.java
----------------------------------------------------------------------
diff --cc src/java/org/apache/cassandra/db/StorageHook.java
index 0f27adb,0000000..48d7ede
mode 100644,000000..100644
--- a/src/java/org/apache/cassandra/db/StorageHook.java
+++ b/src/java/org/apache/cassandra/db/StorageHook.java
@@@ -1,86 -1,0 +1,104 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db;
+
+import java.util.UUID;
+
+import org.apache.cassandra.db.filter.ClusteringIndexFilter;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.db.rows.UnfilteredRowIteratorWithLowerBound;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
++import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
+import org.apache.cassandra.utils.FBUtilities;
+
+public interface StorageHook
+{
+ public static final StorageHook instance = createHook();
+
+ public void reportWrite(UUID cfid, PartitionUpdate partitionUpdate);
+ public void reportRead(UUID cfid, DecoratedKey key);
+ public UnfilteredRowIteratorWithLowerBound makeRowIteratorWithLowerBound(ColumnFamilyStore cfs,
+ DecoratedKey partitionKey,
+ SSTableReader sstable,
+ ClusteringIndexFilter filter,
+ ColumnFilter selectedColumns,
+ boolean isForThrift,
+ int nowInSec,
- boolean applyThriftTransformation);
++ boolean applyThriftTransformation,
++ SSTableReadsListener listener);
++
+ public UnfilteredRowIterator makeRowIterator(ColumnFamilyStore cfs,
+ SSTableReader sstable,
+ DecoratedKey key,
+ Slices slices,
+ ColumnFilter selectedColumns,
+ boolean reversed,
- boolean isForThrift);
++ boolean isForThrift,
++ SSTableReadsListener listener);
+
+ static StorageHook createHook()
+ {
+ String className = System.getProperty("cassandra.storage_hook");
+ if (className != null)
+ {
+ return FBUtilities.construct(className, StorageHook.class.getSimpleName());
+ }
- else
++
++ return new StorageHook()
+ {
- return new StorageHook()
- {
- public void reportWrite(UUID cfid, PartitionUpdate partitionUpdate) {}
++ public void reportWrite(UUID cfid, PartitionUpdate partitionUpdate) {}
+
- public void reportRead(UUID cfid, DecoratedKey key) {}
++ public void reportRead(UUID cfid, DecoratedKey key) {}
+
- public UnfilteredRowIteratorWithLowerBound makeRowIteratorWithLowerBound(ColumnFamilyStore cfs, DecoratedKey partitionKey, SSTableReader sstable, ClusteringIndexFilter filter, ColumnFilter selectedColumns, boolean isForThrift, int nowInSec, boolean applyThriftTransformation)
- {
- return new UnfilteredRowIteratorWithLowerBound(partitionKey,
- sstable,
- filter,
- selectedColumns,
- isForThrift,
- nowInSec,
- applyThriftTransformation);
- }
++ public UnfilteredRowIteratorWithLowerBound makeRowIteratorWithLowerBound(ColumnFamilyStore cfs,
++ DecoratedKey partitionKey,
++ SSTableReader sstable,
++ ClusteringIndexFilter filter,
++ ColumnFilter selectedColumns,
++ boolean isForThrift,
++ int nowInSec,
++ boolean applyThriftTransformation,
++ SSTableReadsListener listener)
++ {
++ return new UnfilteredRowIteratorWithLowerBound(partitionKey,
++ sstable,
++ filter,
++ selectedColumns,
++ isForThrift,
++ nowInSec,
++ applyThriftTransformation,
++ listener);
++ }
+
- public UnfilteredRowIterator makeRowIterator(ColumnFamilyStore cfs, SSTableReader sstable, DecoratedKey key, Slices slices, ColumnFilter selectedColumns, boolean reversed, boolean isForThrift)
- {
- return sstable.iterator(key, slices, selectedColumns, reversed, isForThrift);
- }
- };
- }
++ public UnfilteredRowIterator makeRowIterator(ColumnFamilyStore cfs,
++ SSTableReader sstable,
++ DecoratedKey key,
++ Slices slices,
++ ColumnFilter selectedColumns,
++ boolean reversed,
++ boolean isForThrift,
++ SSTableReadsListener listener)
++ {
++ return sstable.iterator(key, slices, selectedColumns, reversed, isForThrift, listener);
++ }
++ };
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@cassandra.apache.org
For additional commands, e-mail: commits-help@cassandra.apache.org