You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@chukwa.apache.org by ey...@apache.org on 2014/07/31 06:05:02 UTC
svn commit: r1614808 [4/7] - in /chukwa/trunk: ./ conf/ contrib/solr/
contrib/solr/logs/ contrib/solr/logs/conf/ contrib/solr/logs/conf/clustering/
contrib/solr/logs/conf/clustering/carrot2/ contrib/solr/logs/conf/lang/
contrib/solr/logs/conf/velocity/...
Added: chukwa/trunk/contrib/solr/logs/conf/protwords.txt
URL: http://svn.apache.org/viewvc/chukwa/trunk/contrib/solr/logs/conf/protwords.txt?rev=1614808&view=auto
==============================================================================
--- chukwa/trunk/contrib/solr/logs/conf/protwords.txt (added)
+++ chukwa/trunk/contrib/solr/logs/conf/protwords.txt Thu Jul 31 04:04:59 2014
@@ -0,0 +1,21 @@
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#-----------------------------------------------------------------------
+# Use a protected word file to protect against the stemmer reducing two
+# unrelated words to the same base word.
+
+# Some non-words that normally won't be encountered,
+# just to test that they won't be stemmed.
+dontstems
+zwhacky
+
Added: chukwa/trunk/contrib/solr/logs/conf/schema.xml
URL: http://svn.apache.org/viewvc/chukwa/trunk/contrib/solr/logs/conf/schema.xml?rev=1614808&view=auto
==============================================================================
--- chukwa/trunk/contrib/solr/logs/conf/schema.xml (added)
+++ chukwa/trunk/contrib/solr/logs/conf/schema.xml Thu Jul 31 04:04:59 2014
@@ -0,0 +1,369 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!--
+ This is the Solr schema file. This file should be named "schema.xml" and
+ should be in the conf directory under the solr home
+ (i.e. ./solr/conf/schema.xml by default)
+ or located where the classloader for the Solr webapp can find it.
+
+ This example schema is the recommended starting point for users.
+ It should be kept correct and concise, usable out-of-the-box.
+
+ For more information on how to customize this file, please see
+ http://wiki.apache.org/solr/SchemaXml
+
+ PERFORMANCE NOTE: this schema includes many optional features and should not
+ be used for benchmarking. To improve performance one could
+ - set stored="false" for all fields possible (esp large fields) when you
+ only need to search on the field but don't need to return the original
+ value.
+ - set indexed="false" if you don't need to search on the field, but only
+ return the field as a result of searching on other indexed fields.
+ - remove all unneeded copyField statements
+ - for best index size and searching performance, set indexed="false"
+ for all general text fields, use copyField to copy them to the
+ catchall "text" field, and use that for searching.
+ - For maximum indexing performance, use the ConcurrentUpdateSolrServer
+ java client.
+ - Remember to run the JVM in server mode, and use a higher logging level
+ that avoids logging every request
+-->
+
+<schema name="logs" version="1.5">
+ <!-- attribute "name" is the name of this schema and is only used for display purposes.
+ version="x.y" is Solr's version number for the schema syntax and
+ semantics. It is NOT a revision number of this application's schema and
+ should not normally be changed by applications.
+
+ 1.0: multiValued attribute did not exist, all fields are multiValued by nature
+ 1.1: multiValued attribute introduced, false by default
+ 1.2: omitTermFreqAndPositions attribute introduced, true by default except for text fields.
+ 1.3: removed optional field compress feature
+ 1.4: autoGeneratePhraseQueries attribute introduced to drive QueryParser behavior when a
+ single string produces multiple tokens. Defaults to off for version >= 1.4
+ 1.5: omitNorms defaults to true for primitive field types (int, float, boolean, string...)
+
+ This schema (generic Hadoop log collection) relies on the 1.5 semantics: fields are
+ single-valued unless they say multiValued="true", and norms are omitted for
+ primitive types by default, as the attribute notes in this file describe.
+ -->
+
+
+ <!-- Valid attributes for fields:
+ name: mandatory - the name for the field
+ type: mandatory - the name of a field type from the
+ <types> fieldType section
+ indexed: true if this field should be indexed (searchable or sortable)
+ stored: true if this field should be retrievable
+ docValues: true if this field should have doc values. Doc values are
+ useful for faceting, grouping, sorting and function queries. Although not
+ required, doc values will make the index faster to load, more
+ NRT-friendly and more memory-efficient. They however come with some
+ limitations: they are currently only supported by StrField, UUIDField
+ and all Trie*Fields, and depending on the field type, they might
+ require the field to be single-valued, be required or have a default
+ value (check the documentation of the field type you're interested in
+ for more information)
+ multiValued: true if this field may contain multiple values per document
+ omitNorms: (expert) set to true to omit the norms associated with
+ this field (this disables length normalization and index-time
+ boosting for the field, and saves some memory). Only full-text
+ fields or fields that need an index-time boost need norms.
+ Norms are omitted for primitive (non-analyzed) types by default.
+ termVectors: [false] set to true to store the term vector for a
+ given field.
+ When using MoreLikeThis, fields used for similarity should be
+ stored for best performance.
+ termPositions: Store position information with the term vector.
+ This will increase storage costs.
+ termOffsets: Store offset information with the term vector. This
+ will increase storage costs.
+ required: The field is required. It will throw an error if the
+ value does not exist
+ default: a value that should be used if no value is specified
+ when adding a document.
+ -->
+
+ <!-- field names should consist of alphanumeric or underscore characters only and
+ not start with a digit. This is not currently strictly enforced,
+ but other field names will not have first class support from all components
+ and back compatibility is not guaranteed. Names with both leading and
+ trailing underscores (e.g. _version_) are reserved.
+ -->
+
+ <!-- If you remove this field, you must _also_ disable the update log in solrconfig.xml
+ or Solr won't start. _version_ and update log are required for SolrCloud
+ -->
+ <field name="_version_" type="long" indexed="true" stored="true" multiValued="false" />
+
+ <!-- points to the root document of a block of nested documents. Required for nested
+ document support, may be removed otherwise
+ -->
+ <field name="_root_" type="string" indexed="true" stored="false"/>
+
+ <!-- Only remove the "id" field if you have a very good reason to. While not strictly
+ required, it is highly recommended. A <uniqueKey> is present in almost all Solr
+ installations. See the <uniqueKey> declaration below where <uniqueKey> is set to "id".
+ -->
+ <field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false" />
+
+ <!-- Log record fields. Every field here is indexed and stored. "type", "source",
+ "tags" and "data" are also copied into the catchall "text" field by the
+ copyField declarations in this schema.
+ NOTE(review): what each field carries (e.g. seqId as a sequence id, stream as a
+ stream name) is inferred from the names only; confirm against the code that
+ builds the Solr documents.
+ -->
+ <!-- verbatim (non-analyzed) string; norms explicitly omitted -->
+ <field name="seqId" type="string" indexed="true" stored="true" omitNorms="true"/>
+ <!-- analyzed with text_general; norms omitted, so no length normalization or index-time boost -->
+ <field name="type" type="text_general" indexed="true" stored="true" omitNorms="true"/>
+ <!-- verbatim string; a document may carry several values -->
+ <field name="stream" type="string" indexed="true" stored="true" multiValued="true"/>
+ <!-- analyzed with text_en_splitting (word-delimiter splitting plus stemming); several values allowed -->
+ <field name="tags" type="text_en_splitting" indexed="true" stored="true" multiValued="true"/>
+ <!-- verbatim string -->
+ <field name="source" type="string" indexed="true" stored="true"/>
+ <!-- analyzed with text_general; term vectors with positions and offsets are kept,
+ which increases storage costs -->
+ <field name="data" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />
+
+ <!-- catchall field, containing all other searchable text fields (implemented
+ via copyField further on in this schema) -->
+ <field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>
+
+ <!-- Field to use to determine and enforce document uniqueness.
+ Unless this field is marked with required="false", it will be a required field
+ -->
+ <uniqueKey>id</uniqueKey>
+
+ <!-- DEPRECATED: The defaultSearchField is consulted by various query parsers when
+ parsing a query string that isn't explicit about the field. Machine (non-user)
+ generated queries are best made explicit, or they can use the "df" request parameter
+ which takes precedence over this.
+ Note: Un-commenting defaultSearchField will be insufficient if your request handler
+ in solrconfig.xml defines "df", which takes precedence. That would need to be removed.
+ <defaultSearchField>text</defaultSearchField> -->
+
+ <!-- DEPRECATED: The defaultOperator (AND|OR) is consulted by various query parsers
+ when parsing a query string to determine if a clause of the query should be marked as
+ required or optional, assuming the clause isn't already marked by some operator.
+ The default is OR, which is generally assumed so it is not a good idea to change it
+ globally here. The "q.op" request parameter takes precedence over this.
+ <solrQueryParser defaultOperator="OR"/> -->
+
+ <!-- copyField commands copy one field to another at the time a document
+ is added to the index. It's used either to index the same field differently,
+ or to add multiple fields to the same field for easier/faster searching. -->
+
+ <copyField source="type" dest="text"/>
+ <copyField source="source" dest="text"/>
+ <copyField source="tags" dest="text"/>
+ <copyField source="data" dest="text"/>
+
+ <!-- The StrField type is not analyzed, but indexed/stored verbatim.
+ It supports doc values but in that case the field needs to be
+ single-valued and either required or have a default value.
+ -->
+ <fieldType name="string" class="solr.StrField" sortMissingLast="true" />
+
+ <!-- boolean type: "true" or "false" -->
+ <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
+
+ <!-- The optional sortMissingLast and sortMissingFirst attributes are
+ currently supported on types that are sorted internally as strings
+ and on numeric types.
+ This includes "string","boolean", and, as of 3.5 (and 4.x),
+ int, float, long, date, double, including the "Trie" variants.
+ - If sortMissingLast="true", then a sort on this field will cause documents
+ without the field to come after documents with the field,
+ regardless of the requested sort order (asc or desc).
+ - If sortMissingFirst="true", then a sort on this field will cause documents
+ without the field to come before documents with the field,
+ regardless of the requested sort order.
+ - If sortMissingLast="false" and sortMissingFirst="false" (the default),
+ then default lucene sorting will be used which places docs without the
+ field first in an ascending sort and last in a descending sort.
+ -->
+
+ <!--
+ Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
+
+ These fields support doc values, but they require the field to be
+ single-valued and either be required or have a default value.
+ -->
+ <fieldType name="int" class="solr.TrieIntField" precisionStep="0" positionIncrementGap="0"/>
+ <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/>
+ <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
+ <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/>
+
+ <!--
+ Numeric field types that index each value at various levels of precision
+ to accelerate range queries when the number of values between the range
+ endpoints is large. See the javadoc for NumericRangeQuery for internal
+ implementation details.
+
+ Smaller precisionStep values (specified in bits) will lead to more tokens
+ indexed per value, slightly larger index size, and faster range queries.
+ A precisionStep of 0 disables indexing at different precision levels.
+ -->
+ <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" positionIncrementGap="0"/>
+ <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" positionIncrementGap="0"/>
+ <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" positionIncrementGap="0"/>
+ <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" positionIncrementGap="0"/>
+
+ <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
+ is a more restricted form of the canonical representation of dateTime
+ http://www.w3.org/TR/xmlschema-2/#dateTime
+ The trailing "Z" designates UTC time and is mandatory.
+ Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
+ All other components are mandatory.
+
+ Expressions can also be used to denote calculations that should be
+ performed relative to "NOW" to determine the value, ie...
+
+ NOW/HOUR
+ ... Round to the start of the current hour
+ NOW-1DAY
+ ... Exactly 1 day prior to now
+ NOW/DAY+6MONTHS+3DAYS
+ ... 6 months and 3 days in the future from the start of
+ the current day
+
+ Consult the DateField javadocs for more information.
+
+ Note: For faster range queries, add a "tdate" type (TrieDateField with a non-zero
+ precisionStep); no such type is declared in this schema.
+ -->
+ <fieldType name="date" class="solr.TrieDateField" precisionStep="0" positionIncrementGap="0"/>
+
+ <!-- A general text field that has reasonable, generic
+ cross-language defaults: it tokenizes with StandardTokenizer,
+ removes stop words from case-insensitive "stopwords.txt"
+ (empty by default), and down cases. At query time only, it
+ also applies synonyms. -->
+ <fieldType name="text_general"
+            class="solr.TextField"
+            positionIncrementGap="100">
+   <!-- Indexing: standard tokenization, stop word removal, then lower-casing. -->
+   <analyzer type="index">
+     <tokenizer class="solr.StandardTokenizerFactory"/>
+     <filter class="solr.StopFilterFactory"
+             ignoreCase="true"
+             words="stopwords.txt"/>
+     <!-- Synonyms are applied at query time only. To apply them while indexing
+          instead, enable this filter:
+     <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+     -->
+     <filter class="solr.LowerCaseFilterFactory"/>
+   </analyzer>
+   <!-- Querying: the indexing chain plus synonyms from synonyms.txt, inserted
+        between stop word removal and lower-casing. -->
+   <analyzer type="query">
+     <tokenizer class="solr.StandardTokenizerFactory"/>
+     <filter class="solr.StopFilterFactory"
+             ignoreCase="true"
+             words="stopwords.txt"/>
+     <filter class="solr.SynonymFilterFactory"
+             synonyms="synonyms.txt"
+             ignoreCase="true"
+             expand="true"/>
+     <filter class="solr.LowerCaseFilterFactory"/>
+   </analyzer>
+ </fieldType>
+
+ <!-- A text field with defaults appropriate for English: it
+ tokenizes with StandardTokenizer, removes English stop words
+ (lang/stopwords_en.txt), down cases, protects words from protwords.txt, and
+ finally applies Porter's stemming. The query time analyzer
+ also applies synonyms from synonyms.txt. -->
+ <fieldType name="text_en"
+            class="solr.TextField"
+            positionIncrementGap="100">
+   <!-- Indexing: tokenize, drop English stop words (ignoring case), lower-case,
+        run the English possessive filter, protect the words listed in
+        protwords.txt, and finally apply Porter stemming. -->
+   <analyzer type="index">
+     <tokenizer class="solr.StandardTokenizerFactory"/>
+     <!-- Synonyms are applied at query time only. To apply them while indexing
+          instead, enable this filter:
+     <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+     -->
+     <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
+     <filter class="solr.LowerCaseFilterFactory"/>
+     <filter class="solr.EnglishPossessiveFilterFactory"/>
+     <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+     <!-- A less aggressive alternative to the Porter stemmer below:
+     <filter class="solr.EnglishMinimalStemFilterFactory"/>
+     -->
+     <filter class="solr.PorterStemFilterFactory"/>
+   </analyzer>
+   <!-- Querying: same chain, with synonyms from synonyms.txt applied right after
+        tokenization. -->
+   <analyzer type="query">
+     <tokenizer class="solr.StandardTokenizerFactory"/>
+     <filter class="solr.SynonymFilterFactory"
+             synonyms="synonyms.txt"
+             ignoreCase="true"
+             expand="true"/>
+     <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
+     <filter class="solr.LowerCaseFilterFactory"/>
+     <filter class="solr.EnglishPossessiveFilterFactory"/>
+     <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+     <!-- A less aggressive alternative to the Porter stemmer below:
+     <filter class="solr.EnglishMinimalStemFilterFactory"/>
+     -->
+     <filter class="solr.PorterStemFilterFactory"/>
+   </analyzer>
+ </fieldType>
+
+ <!-- A text field with defaults appropriate for English, plus
+ aggressive word-splitting and autophrase features enabled.
+ This field is just like text_en, except it adds
+ WordDelimiterFilter to enable splitting and matching of
+ words on case-change, alpha numeric boundaries, and
+ non-alphanumeric chars. This means certain compound word
+ cases will work, for example query "wi fi" will match
+ document "WiFi" or "wi-fi".
+ -->
+ <fieldType name="text_en_splitting"
+            class="solr.TextField"
+            positionIncrementGap="100"
+            autoGeneratePhraseQueries="true">
+   <!-- Indexing: split on whitespace, drop English stop words (ignoring case),
+        split words with WordDelimiterFilter, lower-case, protect the words
+        listed in protwords.txt, then apply Porter stemming.
+        catenateWords and catenateNumbers are switched on here and off in the
+        query analyzer. -->
+   <analyzer type="index">
+     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+     <!-- Synonyms are applied at query time only. To apply them while indexing
+          instead, enable this filter:
+     <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
+     -->
+     <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
+     <filter class="solr.WordDelimiterFilterFactory"
+             generateWordParts="1"
+             generateNumberParts="1"
+             catenateWords="1"
+             catenateNumbers="1"
+             catenateAll="0"
+             splitOnCaseChange="1"/>
+     <filter class="solr.LowerCaseFilterFactory"/>
+     <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+     <filter class="solr.PorterStemFilterFactory"/>
+   </analyzer>
+   <!-- Querying: synonyms from synonyms.txt are applied right after tokenization;
+        the word delimiter filter does not catenate words or numbers. -->
+   <analyzer type="query">
+     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+     <filter class="solr.SynonymFilterFactory"
+             synonyms="synonyms.txt"
+             ignoreCase="true"
+             expand="true"/>
+     <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
+     <filter class="solr.WordDelimiterFilterFactory"
+             generateWordParts="1"
+             generateNumberParts="1"
+             catenateWords="0"
+             catenateNumbers="0"
+             catenateAll="0"
+             splitOnCaseChange="1"/>
+     <filter class="solr.LowerCaseFilterFactory"/>
+     <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+     <filter class="solr.PorterStemFilterFactory"/>
+   </analyzer>
+ </fieldType>
+
+ <!-- Less flexible matching, but fewer false matches. Probably not ideal for product
+ names, but may be good for SKUs. Dashes can be inserted in the wrong place and
+ the value will still match. -->
+ <fieldType name="text_en_splitting_tight"
+            class="solr.TextField"
+            positionIncrementGap="100"
+            autoGeneratePhraseQueries="true">
+   <!-- One analyzer, shared by indexing and querying. -->
+   <analyzer>
+     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+     <filter class="solr.SynonymFilterFactory"
+             synonyms="synonyms.txt"
+             ignoreCase="true"
+             expand="false"/>
+     <filter class="solr.StopFilterFactory"
+             ignoreCase="true"
+             words="lang/stopwords_en.txt"/>
+     <!-- Word and number parts are not generated; only catenated forms are. -->
+     <filter class="solr.WordDelimiterFilterFactory"
+             generateWordParts="0"
+             generateNumberParts="0"
+             catenateWords="1"
+             catenateNumbers="1"
+             catenateAll="0"/>
+     <filter class="solr.LowerCaseFilterFactory"/>
+     <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+     <filter class="solr.EnglishMinimalStemFilterFactory"/>
+     <!-- Removes duplicate tokens that appear at the same position, which is
+          sometimes possible with WordDelimiterFilter in conjunction with stemming. -->
+     <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+   </analyzer>
+ </fieldType>
+
+ <!-- Just like text_general except it reverses the characters of
+ each token, to enable more efficient leading wildcard queries. -->
+ <fieldType name="text_general_rev"
+            class="solr.TextField"
+            positionIncrementGap="100">
+   <!-- Indexing: the text_general chain followed by the reversed wildcard filter,
+        which reverses the characters of each token to enable more efficient
+        leading wildcard queries. -->
+   <analyzer type="index">
+     <tokenizer class="solr.StandardTokenizerFactory"/>
+     <filter class="solr.StopFilterFactory"
+             ignoreCase="true"
+             words="stopwords.txt"/>
+     <filter class="solr.LowerCaseFilterFactory"/>
+     <filter class="solr.ReversedWildcardFilterFactory"
+             withOriginal="true"
+             maxPosAsterisk="3"
+             maxPosQuestion="2"
+             maxFractionAsterisk="0.33"/>
+   </analyzer>
+   <!-- Querying: no reversal filter is configured here; synonyms from synonyms.txt
+        are applied right after tokenization. -->
+   <analyzer type="query">
+     <tokenizer class="solr.StandardTokenizerFactory"/>
+     <filter class="solr.SynonymFilterFactory"
+             synonyms="synonyms.txt"
+             ignoreCase="true"
+             expand="true"/>
+     <filter class="solr.StopFilterFactory"
+             ignoreCase="true"
+             words="stopwords.txt"/>
+     <filter class="solr.LowerCaseFilterFactory"/>
+   </analyzer>
+ </fieldType>
+
+</schema>
Added: chukwa/trunk/contrib/solr/logs/conf/scripts.conf
URL: http://svn.apache.org/viewvc/chukwa/trunk/contrib/solr/logs/conf/scripts.conf?rev=1614808&view=auto
==============================================================================
--- chukwa/trunk/contrib/solr/logs/conf/scripts.conf (added)
+++ chukwa/trunk/contrib/solr/logs/conf/scripts.conf Thu Jul 31 04:04:59 2014
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+user=
+solr_hostname=localhost
+solr_port=8983
+rsyncd_port=18983
+data_dir=
+webapp_name=solr
+master_host=
+master_data_dir=
+master_status_dir=