You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@atlas.apache.org by am...@apache.org on 2019/05/20 18:42:26 UTC
[atlas] 01/02: ATLAS-2280: Index Repair tool kit,
support for kerberos auth.
This is an automated email from the ASF dual-hosted git repository.
amestry pushed a commit to branch branch-0.8
in repository https://gitbox.apache.org/repos/asf/atlas.git
commit 51b95b4fcc8b9d7717f971c491770468470756f6
Author: Ashutosh Mestry <am...@hortonworks.com>
AuthorDate: Mon May 20 11:41:20 2019 -0700
ATLAS-2280: Index Repair tool kit, support for kerberos auth.
---
.../atlas-conf/atlas-titan.properties | 22 +++
.../atlas-conf/log4j-gremlin.properties | 50 +++++
tools/atlas-index-repair-kit/bin/atlas-gremlin.sh | 203 +++++++++++++++++++++
.../bin/atlas-index-repair.groovy | 186 +++++++++++++++++++
.../lib/titan-core-0.5.4.jar | Bin 0 -> 1414728 bytes
5 files changed, 461 insertions(+)
diff --git a/tools/atlas-index-repair-kit/atlas-conf/atlas-titan.properties b/tools/atlas-index-repair-kit/atlas-conf/atlas-titan.properties
new file mode 100644
index 0000000..52cb3c4
--- /dev/null
+++ b/tools/atlas-index-repair-kit/atlas-conf/atlas-titan.properties
@@ -0,0 +1,22 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+storage.backend=hbase
+storage.hostname=localhost.localdomain
+storage.hbase.table=atlas_titan
+
diff --git a/tools/atlas-index-repair-kit/atlas-conf/log4j-gremlin.properties b/tools/atlas-index-repair-kit/atlas-conf/log4j-gremlin.properties
new file mode 100644
index 0000000..c6543e3
--- /dev/null
+++ b/tools/atlas-index-repair-kit/atlas-conf/log4j-gremlin.properties
@@ -0,0 +1,50 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Used by gremlin.sh
+
+log4j.appender.A2=org.apache.log4j.ConsoleAppender
+log4j.appender.A2.Threshold=TRACE
+log4j.appender.A2.layout=org.apache.log4j.PatternLayout
+log4j.appender.A2.layout.ConversionPattern=%d{HH:mm:ss} %-5p %c %x - %m%n
+
+log4j.rootLogger=${gremlin.log4j.level}, A2
+
+#log4j.logger.com.thinkaurelius.titan.graphdb.database.idassigner.placement=DEBUG
+#log4j.logger.com.thinkaurelius.titan.diskstorage.hbase.HBaseStoreManager=DEBUG
+
+# Disable spurious Hadoop config deprecation warnings under 2.2.0.
+#
+# See https://issues.apache.org/jira/browse/HADOOP-10178
+#
+# This can and should be deleted when we upgrade our Hadoop 2.2.0
+# dependency to 2.3.0 or 3.0.0.
+log4j.logger.org.apache.hadoop.conf.Configuration.deprecation=OFF
+
+# Configure MR at its own loglevel. We usually want MR at INFO,
+# even if the rest of the loggers are at WARN or ERROR or FATAL,
+# because job progress information is at INFO.
+log4j.logger.org.apache.hadoop.mapred=${gremlin.mr.log4j.level}
+log4j.logger.org.apache.hadoop.mapreduce=${gremlin.mr.log4j.level}
+log4j.logger.com.thinkaurelius.titan.hadoop.compat.h2.Hadoop2Compiler=${gremlin.mr.log4j.level}
+log4j.logger.com.thinkaurelius.titan.hadoop.compat.h1.Hadoop1Compiler=${gremlin.mr.log4j.level}
+
+# This generates 3 INFO lines per jar on the classpath -- usually more
+# noise than desirable in the REPL. Switching it to the default
+# log4j level means it will be at WARN by default, which is ideal.
+log4j.logger.org.apache.hadoop.mapred.LocalDistributedCacheManager=${gremlin.log4j.level}
diff --git a/tools/atlas-index-repair-kit/bin/atlas-gremlin.sh b/tools/atlas-index-repair-kit/bin/atlas-gremlin.sh
new file mode 100755
index 0000000..ed3b938
--- /dev/null
+++ b/tools/atlas-index-repair-kit/bin/atlas-gremlin.sh
@@ -0,0 +1,203 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# ATLAS_WEBAPP_DIR=/home/atlas/atlas-server/server/webapp
+ATLAS_WEBAPP_DIR=
+
+# STORE_CONF_DIR=/etc/hbase/conf
+STORE_CONF_DIR=
+
+# JAAS_CONF_FILE=infra_solr_jaas.conf
+JAAS_CONF_FILE=
+
+
+set -e
+set -u
+
+# Environment variables that affect this script:
+#
+# TITAN_JAVA_OPTS
+#
+# When set to a nonempty value, this variable is interpreted as a
+# set of additional VM options. These options are appended after
+# the default JVM options that this script normally sets. This is
+# the preferred way to specify additional VM options.
+#
+# JAVA_OPTIONS
+#
+# When set to a nonempty value, this variable is interpreted as a
+# complete list of VM options. This script will invoke the VM with
+# exactly the options specified in the variable. This is rarely
+# preferable to TITAN_JAVA_OPTS, but it's available in unusual cases
+# where the default VM options need to be omitted. Note that the
+# classpath is passed to the VM by building a CLASSPATH environment
+# variable in this script and exporting it before invoking the VM,
+# not by a command-line option. See the entry on CLASSPATH for more
+# information.
+#
+# CLASSPATH
+#
+# When set to a nonempty value, this is prepended to the classpath
+# entries automatically generated by this script.
+#
+# SCRIPT_DEBUG
+#
+# When set to a nonempty value, this makes the script noisier about
+# what it's doing. The effect of this variable is limited to the
+# script. It does not affect the Log4j/Slf4j log level in the JVM
+# (use the -l <LOGLEVEL> option for that).
+#
+# JAVA_HOME
+#
+# When set to a nonempty value, this script will use the JVM binary
+# at the path $JAVA_HOME/bin/java.
+#
+# HADOOP_PREFIX, HADOOP_CONF_DIR, HADOOP_CONF, HADOOP_HOME
+#
+# When set to a nonempty value, the script attempts to add to the
+# CLASSPATH the etc/hadoop or conf subdirectory of the Hadoop
+# install to which the variable points.
+
+# Prints the absolute path of the directory containing this script, resolving symlinks
+abs_path() {
+    # Symlink-chasing technique taken from http://stackoverflow.com/a/246128
+    SOURCE="${BASH_SOURCE[0]}"
+    until [ ! -h "$SOURCE" ]; do
+        # Remember where the link lives before following it
+        DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
+        SOURCE="$(readlink "$SOURCE")"
+        # A relative link target is relative to the directory holding the link
+        if [[ $SOURCE != /* ]]; then
+            SOURCE="$DIR/$SOURCE"
+        fi
+    done
+    echo "$( cd -P "$( dirname "$SOURCE" )" && pwd )"
+}
+
+CP=`abs_path`/../atlas-conf:${STORE_CONF_DIR}
+CP=$CP:$(find -L `abs_path`/../lib/ -name '*.jar' | fgrep -v 'solr' | fgrep -v 'http' | tr '\n' ':')
+CP=$CP:$(find -L `abs_path`/../lib/ -name 'commons-httpclient*.jar' | tr '\n' ':')
+CP=$CP:$(find -L ${ATLAS_WEBAPP_DIR}/atlas/WEB-INF/lib/ -name 'http*.jar' | tr '\n' ':')
+CP=$CP:$(find -L ${ATLAS_WEBAPP_DIR}/atlas/WEB-INF/lib/ -name 'atlas-graphdb-titan0*.jar' | tr '\n' ':')
+CP=$CP:$(find -L ${ATLAS_WEBAPP_DIR}/atlas/WEB-INF/lib/ -name 'atlas-typesystem*.jar' | tr '\n' ':')
+CP=$CP:$(find -L `abs_path`/../ext/ -name '*.jar' | tr '\n' ':')
+
+
+if [ -z "${JAAS_CONF_FILE}" ]; then
+ JASS_CONF_FILE_OPTION=
+else
+ JAAS_CONF_FILE_PATH=`abs_path`/../atlas-conf/${JAAS_CONF_FILE}
+ JASS_CONF_FILE_OPTION="-Djava.security.auth.login.config=${JAAS_CONF_FILE_PATH}"
+fi
+
+# Check some Hadoop-related environment variables. Only the first variable
+# that is set is consulted; the remaining ones are ignored.
+
+# Appends to CP the configuration directory of the Hadoop install rooted at $1.
+append_hadoop_install_conf() {
+    if [ -d "$1"/etc/hadoop ]; then
+        # Hadoop 2 layout
+        CP="$CP:$1"/etc/hadoop
+    elif [ -d "$1"/conf ]; then
+        # Hadoop 1 layout
+        CP="$CP:$1"/conf
+    fi
+}
+
+if [ -n "${HADOOP_PREFIX:-}" ]; then
+    append_hadoop_install_conf "$HADOOP_PREFIX"
+elif [ -n "${HADOOP_CONF_DIR:-}" ]; then
+    CP="$CP:$HADOOP_CONF_DIR"
+elif [ -n "${HADOOP_CONF:-}" ]; then
+    CP="$CP:$HADOOP_CONF"
+elif [ -n "${HADOOP_HOME:-}" ]; then
+    append_hadoop_install_conf "$HADOOP_HOME"
+fi
+
+# Convert from *NIX to Windows path convention if needed
+case `uname` in
+ CYGWIN*) CP=`cygpath -p -w "$CP"`
+esac
+
+export CLASSPATH="${CLASSPATH:-}:$CP"
+
+# Find Java
+if [ -z "${JAVA_HOME:-}" ]; then
+ JAVA="java -server"
+else
+ JAVA="$JAVA_HOME/bin/java -server"
+fi
+
+# Set default message thresholds for Log4j Gremlin's console appender.
+# Each level is defaulted independently, so a value exported by the caller is
+# preserved and neither variable is left unset (the script runs with `set -u`).
+if [ -z "${GREMLIN_LOG_LEVEL:-}" ]; then
+    GREMLIN_LOG_LEVEL=ERROR
+fi
+if [ -z "${GREMLIN_MR_LOG_LEVEL:-}" ]; then
+    GREMLIN_MR_LOG_LEVEL=INFO
+fi
+
+# Script debugging is disabled by default, but can be enabled with -l
+# TRACE or -l DEBUG or enabled by exporting
+# SCRIPT_DEBUG=nonemptystring to gremlin.sh's environment
+if [ -z "${SCRIPT_DEBUG:-}" ]; then
+ SCRIPT_DEBUG=
+fi
+
+# Process options
+#   -e        run the remaining arguments through ScriptExecutor
+#   -i        run the remaining arguments through InlineScriptExecutor
+#   -l LEVEL  set the Log4j level; TRACE or DEBUG also turns on script debugging
+#   -v        run the Gremlin Version class
+MAIN_CLASS=com.thinkaurelius.titan.hadoop.tinkerpop.gremlin.Console
+
+while getopts "eilv" opt; do
+    case "$opt" in
+    e) MAIN_CLASS=com.thinkaurelius.titan.hadoop.tinkerpop.gremlin.ScriptExecutor
+       # For compatibility with behavior pre-Titan-0.5.0, stop
+       # processing gremlin.sh arguments as soon as the -e switch is
+       # seen; everything following -e becomes arguments to the
+       # ScriptExecutor main class
+       break;;
+    i) MAIN_CLASS=com.thinkaurelius.titan.hadoop.tinkerpop.gremlin.InlineScriptExecutor
+       # This class was brought in with Faunus/titan-hadoop. Like -e,
+       # everything after this option is treated as an argument to the
+       # main class.
+       break;;
+    l) # -l is declared without ':' in the getopts string, so its argument is
+       # fetched by hand from the positional parameter numbered $OPTIND and
+       # then skipped by bumping OPTIND.
+       if [ "$OPTIND" -gt "$#" ]; then
+           echo "Option -l requires a log level argument" >&2
+           exit 1
+       fi
+       # Indirect expansion reads that positional parameter without eval and
+       # stays correct for positions of 10 and above.
+       GREMLIN_LOG_LEVEL="${!OPTIND}"
+       GREMLIN_MR_LOG_LEVEL="$GREMLIN_LOG_LEVEL"
+       OPTIND="$(( OPTIND + 1 ))"
+       if [ "$GREMLIN_LOG_LEVEL" = "TRACE" -o \
+            "$GREMLIN_LOG_LEVEL" = "DEBUG" ]; then
+           SCRIPT_DEBUG=y
+       fi
+       ;;
+    v) MAIN_CLASS=com.tinkerpop.gremlin.Version
+       ;;
+    esac
+done
+
+# Drop the options consumed above; what remains is passed to the main class
+shift $(( OPTIND - 1 ))
+
+# The default VM options apply only when the caller did not supply JAVA_OPTIONS
+if [ -z "${JAVA_OPTIONS:-}" ]; then
+    JAVA_OPTIONS="-Dlog4j.configuration=log4j-gremlin.properties -Dgremlin.log4j.level=$GREMLIN_LOG_LEVEL -Dgremlin.mr.log4j.level=$GREMLIN_MR_LOG_LEVEL"
+fi
+
+# Extra VM options go after whatever JAVA_OPTIONS holds at this point
+if [ -n "${TITAN_JAVA_OPTS:-}" ]; then
+    JAVA_OPTIONS+=" $TITAN_JAVA_OPTS"
+fi
+
+# In debug mode show the classpath and trace the commands that follow
+if [ -n "$SCRIPT_DEBUG" ]; then
+    echo "CLASSPATH: $CLASSPATH"
+    set -x
+fi
+
+# Start the JVM ($JAVA and $JAVA_OPTIONS are left unquoted on purpose so that
+# they split into separate words)
+$JAVA $JAVA_OPTIONS ${JASS_CONF_FILE_OPTION} $MAIN_CLASS "$@"
diff --git a/tools/atlas-index-repair-kit/bin/atlas-index-repair.groovy b/tools/atlas-index-repair-kit/bin/atlas-index-repair.groovy
new file mode 100644
index 0000000..e02073e
--- /dev/null
+++ b/tools/atlas-index-repair-kit/bin/atlas-index-repair.groovy
@@ -0,0 +1,186 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Rebuilds the documents of a Titan mixed index, either for every vertex of
+ * the graph (split into ranges that are processed by parallel threads) or for
+ * the single vertex identified by a GUID.
+ */
+class GroovyIndexRepair {
+    /** Number of vertices reindexed between two index commits. */
+    final long COMMIT_BATCH_SIZE = 200
+    /** Maximum number of reindex attempts per element. */
+    final int MAX_TRIES_ON_FAILURE = 3
+    /** Number of worker threads used for a full-index repair. */
+    final int NUM_THREADS = Runtime.getRuntime().availableProcessors()
+
+    Graph g
+    def indexSerializer
+    long totalVertexCount
+
+    /**
+     * Opens the graph described by the given configuration file and caches
+     * its index serializer and total vertex count.
+     *
+     * @param configPath path handed to TitanFactory.open
+     */
+    def setup(String configPath) {
+        display("Initializing graph..")
+
+        g = TitanFactory.open(configPath)
+        indexSerializer = g.getIndexSerializer() as com.thinkaurelius.titan.graphdb.database.IndexSerializer
+        totalVertexCount = g.V.count()
+
+        display("Graph initialized!")
+    }
+
+    /** Repairs the named index for all vertices using NUM_THREADS threads. */
+    def processIndex(String indexName) {
+        processBatch(indexName, 0, totalVertexCount, NUM_THREADS)
+    }
+
+    /** Repairs each of the named indexes for the vertex with the given GUID. */
+    def processIndex(List<String> indexNames, String guid) {
+        for (String indexName : indexNames) {
+            restore(indexName, guid)
+        }
+    }
+
+    /**
+     * Splits the vertex positions [startIndex, endIndex) into at most
+     * threadCount contiguous chunks, repairs each chunk on its own thread and
+     * waits for all of them to finish.
+     *
+     * @param indexName   name of the graph index to repair
+     * @param startIndex  first vertex position (inclusive)
+     * @param endIndex    last vertex position (exclusive)
+     * @param threadCount number of chunks / threads to use
+     */
+    def processBatch(String indexName, long startIndex, long endIndex, long threadCount) {
+        Date startTime = new Date()
+        long span = endIndex - startIndex
+        // Ceiling division over the span (not over endIndex), so that
+        // threadCount chunks always cover the whole range.
+        long batchSize = (span + threadCount - 1).intdiv(threadCount)
+
+        display("Processing index: " + indexName)
+        display(" Start Index : " + startIndex)
+        display(" End Index : " + endIndex)
+        display(" Chunk Count : " + threadCount)
+        display(" Batch Size : " + batchSize)
+
+        def threads = []
+        for (long batchStartIdx = startIndex; batchStartIdx < endIndex; batchStartIdx += batchSize) {
+            // Final copies so that every closure captures its own bounds
+            final String lIdxName = indexName
+            final long lStartIdx = batchStartIdx
+            final long lEndIdx = Math.min(lStartIdx + batchSize, endIndex)
+
+            threads << new Thread({
+                restore(lIdxName, lStartIdx, lEndIdx)
+            })
+        }
+
+        threads.each { it.start() }
+        threads.each { it.join() }
+
+        Date endTime = new Date()
+
+        display("Restore complete: " + indexName + ". Time taken: " + (endTime.time - startTime.time) + "ms")
+    }
+
+    /**
+     * Rebuilds the documents of the named index for the vertex whose __guid
+     * property equals the given GUID. Reports and returns when no such vertex
+     * exists or when every reindex attempt fails.
+     */
+    void restore(String indexName, String guid) {
+        Date startTime = new Date()
+
+        // next() on an empty pipe throws instead of returning null, so the
+        // presence of the vertex has to be tested with hasNext().
+        def matches = g.V().has("__guid", guid)
+        if (!matches.hasNext()) {
+            display(" Could not find: " + guid)
+            return
+        }
+        def vertex = matches.next()
+
+        def mgmt = g.getManagementSystem()
+        def documentsPerStore = new java.util.HashMap()
+        def tx = mgmt.getWrappedTx()
+        def mutator = tx.getTxHandle()
+        def tIdx = mgmt.getGraphIndex(indexName)
+        def indexType = mgmt.getSchemaVertex(tIdx).asIndexType()
+        def mixedIndexType = indexType as com.thinkaurelius.titan.graphdb.types.indextype.MixedIndexTypeWrapper
+
+        if (!reindexWithRetry(vertex, mixedIndexType, documentsPerStore)) {
+            display(" Giving up on: " + guid + " after " + MAX_TRIES_ON_FAILURE + " attempts")
+            return
+        }
+
+        display(" commit")
+        mutator.getIndexTransaction(mixedIndexType.getBackingIndexName()).restore(documentsPerStore)
+
+        def endTime = new Date()
+        display(" thread end. Time taken: " + (endTime.time - startTime.time) + "ms")
+    }
+
+    /**
+     * Rebuilds the documents of the named index for the vertices at positions
+     * [startIndex, endIndex), committing to the index backend after every
+     * COMMIT_BATCH_SIZE vertices.
+     */
+    void restore(String indexName, long startIndex, long endIndex) {
+        Date startTime = new Date()
+        String batchId = "Batch Id: [" + startIndex + "-" + endIndex + "]"
+
+        display(batchId + " thread start")
+
+        def mgmt = g.getManagementSystem()
+        def tx = mgmt.getWrappedTx()
+        def mutator = tx.getTxHandle()
+        def tIdx = mgmt.getGraphIndex(indexName)
+        def indexType = mgmt.getSchemaVertex(tIdx).asIndexType()
+        def mixedIndexType = indexType as com.thinkaurelius.titan.graphdb.types.indextype.MixedIndexTypeWrapper
+
+        for (long batchStartIdx = startIndex; batchStartIdx < endIndex; batchStartIdx += COMMIT_BATCH_SIZE) {
+            long batchEndIdx = Math.min(batchStartIdx + COMMIT_BATCH_SIZE, endIndex)
+            String displayStr = batchId + ": " + "{" + batchStartIdx + "-" + batchEndIdx + "}"
+            // Fresh map per commit so that documents already written by an
+            // earlier commit are not sent to the index backend again.
+            def documentsPerStore = new java.util.HashMap()
+
+            // A Gremlin range is inclusive at both ends; stop one short of
+            // batchEndIdx so that adjacent batches do not overlap.
+            for (def vertex : g.V[batchStartIdx..(batchEndIdx - 1)]) {
+                if (!reindexWithRetry(vertex, mixedIndexType, documentsPerStore)) {
+                    display(displayStr + " giving up on " + vertex + " after " + MAX_TRIES_ON_FAILURE + " attempts")
+                }
+            }
+
+            display(displayStr + " commit")
+
+            mutator.getIndexTransaction(mixedIndexType.getBackingIndexName()).restore(documentsPerStore)
+        }
+        def endTime = new Date()
+
+        display(batchId + " thread end. Time taken: " + (endTime.time - startTime.time) + "ms")
+    }
+
+    /**
+     * Reindexes one element into documentsPerStore, retrying up to
+     * MAX_TRIES_ON_FAILURE times with a pause that grows with each attempt.
+     *
+     * @return true when an attempt succeeded, false when all attempts failed
+     */
+    private boolean reindexWithRetry(def element, def mixedIndexType, Map documentsPerStore) {
+        for (int attemptCount = 1; attemptCount <= MAX_TRIES_ON_FAILURE; attemptCount++) {
+            try {
+                indexSerializer.reindexElement(element, mixedIndexType, documentsPerStore)
+                return true
+            } catch (all) {
+                display("Exception: " + all)
+                // Pausing is pointless once no further attempt will be made
+                if (attemptCount < MAX_TRIES_ON_FAILURE) {
+                    display("Pausing before retry..")
+                    Thread.sleep(2000 * attemptCount)
+                }
+            }
+        }
+        return false
+    }
+
+    /** Shuts the graph down. */
+    def close() {
+        g.shutdown()
+        display("Graph shutdown!")
+    }
+
+    /** Prints the message prefixed with the current timestamp. */
+    def display(String s) {
+        println(getTimestamp() + " " + s)
+    }
+
+    /** Returns the current time formatted as yyyy-MM-dd HH:mm:ss.SSS. */
+    def getTimestamp() {
+        new Date().format("yyyy-MM-dd HH:mm:ss.SSS")
+    }
+}
+
+/**
+ * Repairs vertex_index, fulltext_index and edge_index for the whole graph
+ * opened from the given configuration file.
+ *
+ * @param configPath path of the graph configuration handed to setup()
+ */
+def repairAtlasIndex(String configPath) {
+    // Local variable: keeps the helper out of the shared script binding
+    def r = new GroovyIndexRepair()
+
+    r.setup(configPath)
+    try {
+        r.processIndex("vertex_index")
+        r.processIndex("fulltext_index")
+        r.processIndex("edge_index")
+    } finally {
+        // Shut the graph down even when one of the repair steps throws
+        r.close()
+    }
+}
+
+/**
+ * Repairs vertex_index, fulltext_index and edge_index for the single entity
+ * with the given GUID, in the graph opened from the given configuration file.
+ *
+ * @param configPath path of the graph configuration handed to setup()
+ * @param guid       value of the entity's __guid property
+ */
+def repairAtlasEntity(String configPath, String guid) {
+    // Local variable: keeps the helper out of the shared script binding
+    def r = new GroovyIndexRepair()
+
+    r.setup(configPath)
+    try {
+        r.processIndex(["vertex_index", "fulltext_index", "edge_index"], guid)
+    } finally {
+        // Shut the graph down even when the repair throws
+        r.close()
+    }
+}
diff --git a/tools/atlas-index-repair-kit/lib/titan-core-0.5.4.jar b/tools/atlas-index-repair-kit/lib/titan-core-0.5.4.jar
new file mode 100644
index 0000000..aeade01
Binary files /dev/null and b/tools/atlas-index-repair-kit/lib/titan-core-0.5.4.jar differ