# Cassandra storage config YAML

# NOTE:
#   See http://wiki.apache.org/cassandra/StorageConfiguration for
#   full explanations of configuration directives
# /NOTE

# The name of the cluster. This is mainly used to prevent machines in
# one logical cluster from joining another.
cluster_name: 'Test Cluster'

# You should always specify initial_token when setting up a production
# cluster for the first time, and often when adding capacity later.
# The principle is that each node should be given an equal slice of
# the token ring; see http://wiki.apache.org/cassandra/Operations
# for more details.
#
# If blank, Cassandra will request a token bisecting the range of
# the heaviest-loaded existing node. If there is no load information
# available, such as is the case with a new cluster, it will pick
# a random token, which will lead to hot spots.
initial_token:

# Set to true to make new [non-seed] nodes automatically migrate data
# to themselves from the pre-existing nodes in the cluster. Defaults
# to false because you can only bootstrap N machines at a time from
# an existing cluster of N, so if you are bringing up a cluster of
# 10 machines with 3 seeds you would have to do it in stages. Leaving
# this off for the initial start simplifies that.
auto_bootstrap: false

# See http://wiki.apache.org/cassandra/HintedHandoff
hinted_handoff_enabled: true

# authentication backend, implementing IAuthenticator; used to identify
# users
authenticator: org.apache.cassandra.auth.AllowAllAuthenticator

# authorization backend, implementing IAuthority; used to limit
# access/provide permissions
authority: org.apache.cassandra.auth.AllowAllAuthority

# any IPartitioner may be used, including your own as long as it is on
# the classpath. Out of the box, Cassandra provides
# org.apache.cassandra.dht.RandomPartitioner
# org.apache.cassandra.dht.ByteOrderedPartitioner,
# org.apache.cassandra.dht.OrderPreservingPartitioner, and
# org.apache.cassandra.dht.CollatingOrderPreservingPartitioner.
# (CollatingOPP collates according to EN,US rules, not naive byte
# ordering. Use this as an example if you need locale-aware collation.)
partitioner: org.apache.cassandra.dht.RandomPartitioner

# directories where Cassandra should store data on disk.
data_file_directories:
  - /var/lib/cassandra/data

# commit log
commitlog_directory: /var/lib/cassandra/commitlog

# saved caches
saved_caches_directory: /var/lib/cassandra/saved_caches

# Size to allow commitlog to grow to before creating a new segment
commitlog_rotation_threshold_in_mb: 128

# commitlog_sync may be either "periodic" or "batch."
# When in batch mode, Cassandra won't ack writes until the commit log
# has been fsynced to disk. It will wait up to
# CommitLogSyncBatchWindowInMS milliseconds for other writes, before
# performing the sync.
commitlog_sync: periodic

# In "periodic" mode (the value selected above), writes may be acked
# immediately and the CommitLog is simply synced every
# commitlog_sync_period_in_ms milliseconds.
commitlog_sync_period_in_ms: 10000

# Addresses of hosts that are deemed contact points.
# Cassandra nodes use this list of hosts to find each other and learn
# the topology of the ring. You must change this if you are running
# multiple nodes!
seeds:
  - 127.0.0.1

# Access mode. mmapped i/o is substantially faster, but only practical on
# a 64bit machine (which notably does not include EC2 "small" instances)
# or relatively small datasets. "auto", the safe choice, will enable
# mmapping on a 64bit JVM. Other values are "mmap", "mmap_index_only"
# (which may allow you to get part of the benefits of mmap on a 32bit
# machine by mmapping only index files) and "standard".
# (The buffer size settings that follow only apply to standard,
# non-mmapped i/o.)
disk_access_mode: auto

# Unlike most systems, in Cassandra writes are faster than reads, so
# you can afford more of those in parallel. A good rule of thumb is 2
# concurrent reads per processor core. Increase concurrent_writes to
# the number of clients writing at once if you enable CommitLogSync +
# CommitLogSyncDelay.
concurrent_reads: 8
concurrent_writes: 32

# This sets the number of memtable flush writer threads. These will
# be blocked by disk io, and each one will hold a memtable in memory
# while blocked. If you have a large heap and many data directories,
# you can increase this value for better flush performance.
# By default this will be set to the number of data directories defined.
# memtable_flush_writers: 1

# Buffer size to use when performing contiguous column slices.
# Increase this to the size of the column slices you typically perform
sliced_buffer_size_in_kb: 64

# TCP port, for commands and data
storage_port: 7000

# Address to bind to and tell other Cassandra nodes to connect to. You
# _must_ change this if you want multiple nodes to be able to
# communicate!
#
# Leaving it blank leaves it up to InetAddress.getLocalHost(). This
# will always do the Right Thing *if* the node is properly configured
# (hostname, name resolution, etc), and the Right Thing is to use the
# address associated with the hostname (it might not be).
#
# Setting this to 0.0.0.0 is always wrong.
listen_address: localhost

# The address to bind the Thrift RPC service to -- clients connect
# here. Unlike listen_address above, you *can* specify 0.0.0.0 here if
# you want Thrift to listen on all interfaces.
#
# Leaving this blank has the same effect it does for listen_address,
# (i.e. it will be based on the configured hostname of the node).
rpc_address: localhost

# port for Thrift to listen for clients on
rpc_port: 9160

# enable or disable keepalive on rpc connections
rpc_keepalive: true

# uncomment to set socket buffer sizes on rpc connections
# rpc_send_buff_size_in_bytes:
# rpc_recv_buff_size_in_bytes:

# Frame size for thrift (maximum field length).
# 0 disables TFramedTransport in favor of TSocket. This option
# is deprecated; we strongly recommend using Framed mode.
thrift_framed_transport_size_in_mb: 15

# The max length of a thrift message, including all fields and
# internal thrift overhead.
thrift_max_message_length_in_mb: 16

# Whether or not to take a snapshot before each compaction. Be
# careful using this option, since Cassandra won't clean up the
# snapshots for you. Mostly useful if you're paranoid when there
# is a data format change.
snapshot_before_compaction: false

# change this to increase the compaction thread's priority. In java, 1
# is the lowest priority and that is our default.
# compaction_thread_priority: 1

# The threshold size in megabytes the binary memtable must grow to,
# before it's submitted for flushing to disk.
binary_memtable_throughput_in_mb: 256

# Add column indexes to a row after its contents reach this size.
# Increase if your column values are large, or if you have a very large
# number of columns. The competing causes are, Cassandra has to
# deserialize this much of the row to read a single column, so you want
# it to be small - at least if you do many partial-row reads - but all
# the index data is read for each access, so you don't want to generate
# that wastefully either.
column_index_size_in_kb: 64

# Size limit for rows being compacted in memory. Larger rows will spill
# over to disk and use a slower two-pass compaction process. A message
# will be logged specifying the row key.
in_memory_compaction_limit_in_mb: 64

# Time to wait for a reply from other nodes before failing the command
rpc_timeout_in_ms: 10000

# phi value that must be reached for a host to be marked down.
# most users should never need to adjust this.
# phi_convict_threshold: 8

# endpoint_snitch -- Set this to a class that implements
# IEndpointSnitch, which will let Cassandra know enough
# about your network topology to route requests efficiently.
# Out of the box, Cassandra provides
#  - org.apache.cassandra.locator.SimpleSnitch:
#    Treats Strategy order as proximity. This improves cache locality
#    when disabling read repair, which can further improve throughput.
#  - org.apache.cassandra.locator.RackInferringSnitch:
#    Proximity is determined by rack and data center, which are
#    assumed to correspond to the 3rd and 2nd octet of each node's
#    IP address, respectively
#  - org.apache.cassandra.locator.PropertyFileSnitch:
#    Proximity is determined by rack and data center, which are
#    explicitly configured in cassandra-topology.properties.
endpoint_snitch: org.apache.cassandra.locator.SimpleSnitch

# dynamic_snitch -- This boolean controls whether the above snitch is
# wrapped with a dynamic snitch, which will monitor read latencies
# and avoid reading from hosts that have slowed (due to compaction,
# for instance)
dynamic_snitch: true
# controls how often to perform the more expensive part of host score
# calculation
dynamic_snitch_update_interval_in_ms: 100
# controls how often to reset all host scores, allowing a bad host to
# possibly recover
dynamic_snitch_reset_interval_in_ms: 600000
# if set greater than zero and read_repair_chance is < 1.0, this will allow
# 'pinning' of replicas to hosts in order to increase cache capacity.
# The badness threshold will control how much worse the pinned host has to be
# before the dynamic snitch will prefer other replicas over it. This is
# expressed as a double which represents a percentage. Thus, a value of
# 0.2 means Cassandra would continue to prefer the static snitch values
# until the pinned host was 20% worse than the fastest.
dynamic_snitch_badness_threshold: 0.0

# request_scheduler -- Set this to a class that implements
# RequestScheduler, which will schedule incoming client requests
# according to the specific policy. This is useful for multi-tenancy
# with a single Cassandra cluster.
# NOTE: This is specifically for requests from the client and does
# not affect inter node communication.
# org.apache.cassandra.scheduler.NoScheduler - No scheduling takes place
# org.apache.cassandra.scheduler.RoundRobinScheduler - Round robin of
# client requests to a node with a separate queue for each
# request_scheduler_id. The scheduler is further customized by
# request_scheduler_options as described below.
request_scheduler: org.apache.cassandra.scheduler.NoScheduler

# Scheduler Options vary based on the type of scheduler
# NoScheduler - Has no options
# RoundRobin
#  - throttle_limit -- The throttle_limit is the number of in-flight
#                      requests per client. Requests beyond
#                      that limit are queued up until
#                      running requests can complete.
#                      The value of 80 here is twice the number of
#                      concurrent_reads + concurrent_writes.
#  - default_weight -- default_weight is optional and allows for
#                      overriding the default which is 1.
#  - weights -- Weights are optional and will default to 1 or the
#               overridden default_weight. The weight translates into how
#               many requests are handled during each turn of the
#               RoundRobin, based on the scheduler id.
#
# request_scheduler_options:
#   throttle_limit: 80
#   default_weight: 5
#   weights:
#     Keyspace1: 1
#     Keyspace2: 5

# request_scheduler_id -- An identifier based on which to perform
# the request scheduling. Currently the only valid option is keyspace.
# request_scheduler_id: keyspace

# The Index Interval determines how large the sampling of row keys
# is for a given SSTable. The larger the sampling, the more effective
# the index is at the cost of space.
index_interval: 128

# Keyspaces have ColumnFamilies.        (Usually 1 KS per application.)
# ColumnFamilies have Rows.             (Dozens of CFs per KS.)
# Rows contain Columns.                 (Many per CF.)
# Columns contain name:value:timestamp. (Many per Row.)
#
# A KS is most similar to a schema, and a CF is most similar to a
# relational table.
#
# Keyspaces, ColumnFamilies, and Columns may carry additional
# metadata that change their behavior. These are as follows:
#
# Keyspace required parameters:
# - name: name of the keyspace; "system" is
#   reserved for Cassandra Internals.
# - replica_placement_strategy: the class that determines how replicas
#   are distributed among nodes. Contains both the class as well as
#   configuration information. Must extend AbstractReplicationStrategy.
#   Out of the box, Cassandra provides
#     * org.apache.cassandra.locator.SimpleStrategy
#     * org.apache.cassandra.locator.NetworkTopologyStrategy
#     * org.apache.cassandra.locator.OldNetworkTopologyStrategy
#
#   SimpleStrategy merely places the first
#   replica at the node whose token is closest to the key (as determined
#   by the Partitioner), and additional replicas on subsequent nodes
#   along the ring in increasing Token order.
#
#   With NetworkTopologyStrategy,
#   for each datacenter, you can specify how many replicas you want
#   on a per-keyspace basis. Replicas are placed on different racks
#   within each DC, if possible. This strategy also requires rack aware
#   snitch, such as RackInferringSnitch or PropertyFileSnitch.
#   An example:
#    - name: Keyspace1
#      replica_placement_strategy: org.apache.cassandra.locator.NetworkTopologyStrategy
#      strategy_options:
#        DC1 : 3
#        DC2 : 2
#        DC3 : 1
#
#   OldNetworkTopologyStrategy [formerly RackAwareStrategy]
#   places one replica in each of two datacenters, and the third on a
#   different rack in the first. Additional datacenters are not
#   guaranteed to get a replica. Additional replicas after three are placed
#   in ring order after the third without regard to rack or datacenter.
# - replication_factor: Number of replicas of each row
# Keyspace optional parameters:
# - strategy_options: Additional information for the replication strategy.
# - column_families:
#   ColumnFamily required parameters:
#   - name: name of the ColumnFamily. Must not contain the character "-".
#   - compare_with: tells Cassandra how to sort the columns for slicing
#     operations. The default is BytesType, which is a straightforward
#     lexical comparison of the bytes in each column. Other options are
#     AsciiType, UTF8Type, LexicalUUIDType, TimeUUIDType, LongType,
#     and IntegerType (a generic variable-length integer type).
#     You can also specify the fully-qualified class name to a class of
#     your choice extending org.apache.cassandra.db.marshal.AbstractType.
#
#   ColumnFamily optional parameters:
#   - keys_cached: specifies the number of keys per sstable whose
#     locations we keep in memory in "mostly LRU" order. (JUST the key
#     locations, NOT any column values.) Specify a fraction (value less
#     than 1) or an absolute number of keys to cache. Defaults to 200000
#     keys.
#   - rows_cached: specifies the number of rows whose entire contents we
#     cache in memory. Do not use this on ColumnFamilies with large rows,
#     or ColumnFamilies with high write:read ratios. Specify a fraction
#     (value less than 1) or an absolute number of rows to cache.
#     Defaults to 0. (i.e. row caching is off by default)
#   - comment: used to attach additional human-readable information about
#     the column family to its definition.
#   - read_repair_chance: specifies the probability with which read
#     repairs should be invoked on non-quorum reads. must be between 0
#     and 1. defaults to 1.0 (always read repair).
#   - gc_grace_seconds: specifies the time to wait before garbage
#     collecting tombstones (deletion markers). defaults to 864000 (10
#     days). See http://wiki.apache.org/cassandra/DistributedDeletes
#   - default_validation_class: specifies a validator class to use for
#     validating all the column values in the CF.
#     NOTE:
#       min_ must be less than max_compaction_threshold!
#   - min_compaction_threshold: the minimum number of SSTables needed
#     to start a minor compaction. increasing this will cause minor
#     compactions to start less frequently and be more intensive. setting
#     this to 0 disables minor compactions. defaults to 4.
#   - max_compaction_threshold: the maximum number of SSTables allowed
#     before a minor compaction is forced. decreasing this will cause
#     minor compactions to start more frequently and be less intensive.
#     setting this to 0 disables minor compactions. defaults to 32.
#     /NOTE
#   - row_cache_save_period_in_seconds: number of seconds between saving
#     row caches. The row caches can be saved periodically and if one
#     exists on startup it will be loaded.
#   - key_cache_save_period_in_seconds: number of seconds between saving
#     key caches. The key caches can be saved periodically and if one
#     exists on startup it will be loaded.
#   - memtable_flush_after_mins: The maximum time to leave a dirty table
#     unflushed. This should be large enough that it won't cause a flush
#     storm of all memtables during periods of inactivity.
#   - memtable_throughput_in_mb: The maximum size of the memtable before
#     it is flushed. If undefined, 1/8 * heapsize will be used.
#   - memtable_operations_in_millions: Number of operations in millions
#     before the memtable is flushed. If undefined, throughput / 64 * 0.3
#     will be used.
#   - column_metadata:
#     Column required parameters:
#     - name: binds a validator (and optionally an indexer) to columns
#       with this name in any row of the enclosing column family.
#     - validator_class: like cf.compare_with, an AbstractType that checks
#       that the value of the column is well-defined.
#     Column optional parameters:
#     NOTE:
#       index_name cannot be set if index_type is not also set!
#     - index_name: User-friendly name for the index.
#     - index_type: The type of index to be created. Currently only
#       KEYS is supported.
#     /NOTE
#
# NOTE:
#   this keyspace definition is for demonstration purposes only.
#   Cassandra will not load these definitions during startup. See
#   http://wiki.apache.org/cassandra/FAQ#no_keyspaces for an explanation.
# /NOTE
# Demonstration keyspace definitions: one keyspace using SimpleStrategy
# with a single replica, holding standard, super and indexed column
# families.
keyspaces:
  - name: Keyspace1
    replica_placement_strategy: org.apache.cassandra.locator.SimpleStrategy
    replication_factor: 1
    column_families:
      # Byte-ordered columns with key/row caching and explicit memtable
      # flush settings.
      - name: Standard1
        compare_with: BytesType
        keys_cached: 10000
        rows_cached: 1000
        row_cache_save_period_in_seconds: 0
        key_cache_save_period_in_seconds: 3600
        memtable_flush_after_mins: 59
        memtable_throughput_in_mb: 255
        memtable_operations_in_millions: 0.29

      # UTF8-ordered columns with reduced read repair, no tombstone grace
      # period and custom compaction thresholds.
      - name: Standard2
        compare_with: UTF8Type
        read_repair_chance: 0.1
        keys_cached: 100
        gc_grace_seconds: 0
        min_compaction_threshold: 5
        max_compaction_threshold: 31

      # Columns ordered as time-based UUIDs.
      - name: StandardByUUID1
        compare_with: TimeUUIDType

      # Super column family with byte-ordered columns and subcolumns.
      - name: Super1
        column_type: Super
        compare_with: BytesType
        compare_subcolumns_with: BytesType

      # NOTE(review): compare_with is not set here, while the comment
      # string below describes the column names as UTF8 -- confirm intent.
      - name: Super2
        column_type: Super
        compare_subcolumns_with: UTF8Type
        rows_cached: 10000
        keys_cached: 50
        comment: 'A column family with supercolumns, whose column and subcolumn names are UTF8 strings'

      # Super column family with Long-ordered column names.
      - name: Super3
        column_type: Super
        compare_with: LongType
        comment: 'A column family with supercolumns, whose column names are Longs (8 bytes)'

      # Values validated as Longs, with a KEYS index on the birthdate
      # column.
      - name: Indexed1
        default_validation_class: LongType
        column_metadata:
          - name: birthdate
            validator_class: LongType
            index_name: birthdate_idx
            index_type: KEYS