You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@atlas.apache.org by am...@apache.org on 2019/05/20 18:42:26 UTC
[atlas] 01/02: ATLAS-2280: Index Repair tool kit, support for kerberos auth.

This is an automated email from the ASF dual-hosted git repository.

amestry pushed a commit to branch branch-0.8
in repository https://gitbox.apache.org/repos/asf/atlas.git

commit 51b95b4fcc8b9d7717f971c491770468470756f6
Author: Ashutosh Mestry <am...@hortonworks.com>
AuthorDate: Mon May 20 11:41:20 2019 -0700

    ATLAS-2280: Index Repair tool kit, support for kerberos auth.
---
 .../atlas-conf/atlas-titan.properties              |  22 +++
 .../atlas-conf/log4j-gremlin.properties            |  50 +++++
 tools/atlas-index-repair-kit/bin/atlas-gremlin.sh  | 203 +++++++++++++++++++++
 .../bin/atlas-index-repair.groovy                  | 186 +++++++++++++++++++
 .../lib/titan-core-0.5.4.jar                       | Bin 0 -> 1414728 bytes
 5 files changed, 461 insertions(+)

diff --git a/tools/atlas-index-repair-kit/atlas-conf/atlas-titan.properties b/tools/atlas-index-repair-kit/atlas-conf/atlas-titan.properties
new file mode 100644
index 0000000..52cb3c4
--- /dev/null
+++ b/tools/atlas-index-repair-kit/atlas-conf/atlas-titan.properties
@@ -0,0 +1,22 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+storage.backend=hbase
+storage.hostname=localhost.localdomain
+storage.hbase.table=atlas_titan
+
diff --git a/tools/atlas-index-repair-kit/atlas-conf/log4j-gremlin.properties b/tools/atlas-index-repair-kit/atlas-conf/log4j-gremlin.properties
new file mode 100644
index 0000000..c6543e3
--- /dev/null
+++ b/tools/atlas-index-repair-kit/atlas-conf/log4j-gremlin.properties
@@ -0,0 +1,50 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Used by atlas-gremlin.sh (passed to the JVM via -Dlog4j.configuration)
+
+log4j.appender.A2=org.apache.log4j.ConsoleAppender
+log4j.appender.A2.Threshold=TRACE
+log4j.appender.A2.layout=org.apache.log4j.PatternLayout
+log4j.appender.A2.layout.ConversionPattern=%d{HH:mm:ss} %-5p %c %x - %m%n
+
+log4j.rootLogger=${gremlin.log4j.level}, A2
+
+#log4j.logger.com.thinkaurelius.titan.graphdb.database.idassigner.placement=DEBUG
+#log4j.logger.com.thinkaurelius.titan.diskstorage.hbase.HBaseStoreManager=DEBUG
+
+# Disable spurious Hadoop config deprecation warnings under 2.2.0.
+#
+# See https://issues.apache.org/jira/browse/HADOOP-10178
+#
+# This can and should be deleted when we upgrade our Hadoop 2.2.0
+# dependency to 2.3.0 or 3.0.0.
+log4j.logger.org.apache.hadoop.conf.Configuration.deprecation=OFF
+
+# Configure MR at its own loglevel.  We usually want MR at INFO,
+# even if the rest of the loggers are at WARN or ERROR or FATAL,
+# because job progress information is at INFO.
+log4j.logger.org.apache.hadoop.mapred=${gremlin.mr.log4j.level}
+log4j.logger.org.apache.hadoop.mapreduce=${gremlin.mr.log4j.level}
+log4j.logger.com.thinkaurelius.titan.hadoop.compat.h2.Hadoop2Compiler=${gremlin.mr.log4j.level}
+log4j.logger.com.thinkaurelius.titan.hadoop.compat.h1.Hadoop1Compiler=${gremlin.mr.log4j.level}
+
+# This generates 3 INFO lines per jar on the classpath -- usually more
+# noise than desirable in the REPL.  Switching it to the default
+# log4j level means it will be at WARN by default, which is ideal.
+log4j.logger.org.apache.hadoop.mapred.LocalDistributedCacheManager=${gremlin.log4j.level}
diff --git a/tools/atlas-index-repair-kit/bin/atlas-gremlin.sh b/tools/atlas-index-repair-kit/bin/atlas-gremlin.sh
new file mode 100755
index 0000000..ed3b938
--- /dev/null
+++ b/tools/atlas-index-repair-kit/bin/atlas-gremlin.sh
@@ -0,0 +1,203 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# ATLAS_WEBAPP_DIR=/home/atlas/atlas-server/server/webapp
+ATLAS_WEBAPP_DIR=
+
+# STORE_CONF_DIR=/etc/hbase/conf
+STORE_CONF_DIR=
+
+# JAAS_CONF_FILE=infra_solr_jaas.conf
+JAAS_CONF_FILE=
+
+
+set -e
+set -u
+
+# Environment variables that affect this script:
+#
+# TITAN_JAVA_OPTS
+#
+#   When set to a nonempty value, this variable is interpreted as a
+#   set of additional VM options.  These options are appended after
+#   the default JVM options that this script normally sets.  This is
+#   the preferred way to specify additional VM options.
+#
+# JAVA_OPTIONS
+#
+#   When set to a nonempty value, this variable is interpreted as a
+#   complete list of VM options.  This script will invoke the VM with
+#   exactly the options specified in the variable.  This is rarely
+#   preferable to TITAN_JAVA_OPTS, but it's available in unusual cases
+#   where the default VM options need to be omitted.  Note that the
+#   classpath is passed to the VM by building a CLASSPATH environment
+#   variable in this script and exporting it before invoking the VM,
+#   not by a command-line option.  See the entry on CLASSPATH for more
+#   information.
+#
+# CLASSPATH
+#
+#   When set to a nonempty value, this is prepended to the classpath
+#   entries automatically generated by this script.
+#
+# SCRIPT_DEBUG
+#
+#   When set to a nonempty value, this makes the script noisier about
+#   what it's doing.  The effect of this variable is limited to the
+#   script.  It does not affect the Log4j/Slf4j log level in the JVM
+#   (use the -l <LOGLEVEL> option for that).
+#
+# JAVA_HOME
+#
+#   When set to a nonempty value, this script will use the JVM binary
+#   at the path $JAVA_HOME/bin/java.
+#
+# HADOOP_PREFIX, HADOOP_CONF_DIR, HADOOP_CONF, HADOOP_HOME
+#
+#   When one of these is set to a nonempty value, the script attempts
+#   to add the Hadoop configuration directory it points to (for
+#   HADOOP_PREFIX and HADOOP_HOME, the etc/hadoop or conf subdirectory
+#   of the Hadoop install) to the CLASSPATH.
+
+# Prints the absolute path of the directory that contains this script,
+# following symlinks so that the real location is reported rather than
+# the location of a link pointing at the script.
+#
+# Takes no arguments.  SOURCE and DIR are assigned as ordinary
+# (non-local) shell variables while it works; the callers in this
+# script invoke it through command substitution.
+abs_path() {
+    # From: http://stackoverflow.com/a/246128
+    #   - To resolve finding the directory after symlinks
+    SOURCE="${BASH_SOURCE[0]}"
+    # Keep dereferencing for as long as SOURCE is itself a symbolic link.
+    while [ -h "$SOURCE" ]; do
+        DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
+        SOURCE="$(readlink "$SOURCE")"
+        # A relative link target is resolved against the directory that
+        # holds the link.
+        [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE"
+    done
+    echo "$( cd -P "$( dirname "$SOURCE" )" && pwd )"
+}
+
+# Assemble the classpath in CP.  Configuration directories come first:
+# the kit's atlas-conf, then the storage backend's conf directory.
+CP=`abs_path`/../atlas-conf:${STORE_CONF_DIR}
+# Jars bundled with the kit, leaving out every path that contains
+# 'solr' or 'http' ...
+CP=$CP:$(find -L `abs_path`/../lib/ -name '*.jar' | fgrep -v 'solr' | fgrep -v 'http' | tr '\n' ':')
+# ... then add commons-httpclient back, since the 'http' filter above
+# drops it as well.
+CP=$CP:$(find -L `abs_path`/../lib/ -name 'commons-httpclient*.jar' | tr '\n' ':')
+# The http*, atlas-graphdb-titan0* and atlas-typesystem* jars are taken
+# from the Atlas webapp's WEB-INF/lib directory.
+CP=$CP:$(find -L ${ATLAS_WEBAPP_DIR}/atlas/WEB-INF/lib/ -name 'http*.jar' | tr '\n' ':')
+CP=$CP:$(find -L ${ATLAS_WEBAPP_DIR}/atlas/WEB-INF/lib/ -name 'atlas-graphdb-titan0*.jar' | tr '\n' ':')
+CP=$CP:$(find -L ${ATLAS_WEBAPP_DIR}/atlas/WEB-INF/lib/ -name 'atlas-typesystem*.jar' | tr '\n' ':')
+# Any jars found under the kit's ext/ directory go last.
+CP=$CP:$(find -L `abs_path`/../ext/ -name '*.jar' | tr '\n' ':')
+
+
+# Build the JVM option that points at the JAAS login configuration.
+# The option is left empty when JAAS_CONF_FILE is empty.
+# NOTE: the variable is spelled JASS_CONF_FILE_OPTION (sic); the JVM
+# launch line at the end of this script uses the same spelling, so the
+# two must be renamed together if at all.
+if [ -z "${JAAS_CONF_FILE}" ]; then
+  JASS_CONF_FILE_OPTION=
+else
+  JAAS_CONF_FILE_PATH=`abs_path`/../atlas-conf/${JAAS_CONF_FILE}
+  JASS_CONF_FILE_OPTION="-Djava.security.auth.login.config=${JAAS_CONF_FILE_PATH}"
+fi
+
+# Check some Hadoop-related environment variables.  The chain is
+# if/elif, so only the first nonempty variable in the order
+# HADOOP_PREFIX, HADOOP_CONF_DIR, HADOOP_CONF, HADOOP_HOME is used.
+if [ -n "${HADOOP_PREFIX:-}" ]; then
+    # Check Hadoop 2 first
+    if [ -d "$HADOOP_PREFIX"/etc/hadoop ]; then
+        CP="$CP:$HADOOP_PREFIX"/etc/hadoop
+    elif [ -d "$HADOOP_PREFIX"/conf ]; then
+        # Then try Hadoop 1
+        CP="$CP:$HADOOP_PREFIX"/conf
+    fi
+elif [ -n "${HADOOP_CONF_DIR:-}" ]; then
+    # These two name the configuration directory directly; it is added
+    # without checking that it exists.
+    CP="$CP:$HADOOP_CONF_DIR"
+elif [ -n "${HADOOP_CONF:-}" ]; then
+    CP="$CP:$HADOOP_CONF"
+elif [ -n "${HADOOP_HOME:-}" ]; then
+    # Check Hadoop 2 first
+    if [ -d "$HADOOP_HOME"/etc/hadoop ]; then
+        CP="$CP:$HADOOP_HOME"/etc/hadoop
+    elif [ -d "$HADOOP_HOME"/conf ]; then
+        # Then try Hadoop 1
+        CP="$CP:$HADOOP_HOME"/conf
+    fi
+fi
+
+# Convert from *NIX to Windows path convention if needed
+case `uname` in
+    CYGWIN*) CP=`cygpath -p -w "$CP"`
+esac
+
+# A CLASSPATH inherited from the environment stays in front of the
+# entries generated above.
+export CLASSPATH="${CLASSPATH:-}:$CP"
+
+# Find Java: use $JAVA_HOME/bin/java when JAVA_HOME is set, otherwise
+# whatever "java" resolves to on the PATH.  JAVA holds the command plus
+# the -server flag and is expanded unquoted at launch.
+if [ -z "${JAVA_HOME:-}" ]; then
+    JAVA="java -server"
+else
+    JAVA="$JAVA_HOME/bin/java -server"
+fi
+
+# Set default message threshold for Log4j Gremlin's console appender
+if [ -z "${GREMLIN_LOG_LEVEL:-}" -o "${GREMLIN_MR_LOG_LEVEL:-}" ]; then
+    GREMLIN_LOG_LEVEL=ERROR
+    GREMLIN_MR_LOG_LEVEL=INFO
+fi
+
+# Script debugging is disabled by default, but can be enabled with -l
+# TRACE or -l DEBUG or enabled by exporting
+# SCRIPT_DEBUG=nonemptystring to gremlin.sh's environment
+if [ -z "${SCRIPT_DEBUG:-}" ]; then
+    SCRIPT_DEBUG=
+fi
+
+# Process options
+MAIN_CLASS=com.thinkaurelius.titan.hadoop.tinkerpop.gremlin.Console
+
+while getopts "eilv" opt; do
+    case "$opt" in
+    e) MAIN_CLASS=com.thinkaurelius.titan.hadoop.tinkerpop.gremlin.ScriptExecutor
+       # For compatibility with behavior pre-Titan-0.5.0, stop
+       # processing gremlin.sh arguments as soon as the -e switch is
+       # seen; everything following -e becomes arguments to the
+       # ScriptExecutor main class
+       break;;
+    i) MAIN_CLASS=com.thinkaurelius.titan.hadoop.tinkerpop.gremlin.InlineScriptExecutor
+       # This class was brought in with Faunus/titan-hadoop. Like -e,
+       # everything after this option is treated as an argument to the
+       # main class.
+       break;;
+    l) eval GREMLIN_LOG_LEVEL=\$$OPTIND
+       GREMLIN_MR_LOG_LEVEL="$GREMLIN_LOG_LEVEL"
+       OPTIND="$(( $OPTIND + 1 ))"
+       if [ "$GREMLIN_LOG_LEVEL" = "TRACE" -o \
+            "$GREMLIN_LOG_LEVEL" = "DEBUG" ]; then
+	   SCRIPT_DEBUG=y
+       fi
+       ;;
+    v) MAIN_CLASS=com.tinkerpop.gremlin.Version
+    esac
+done
+
+# Discard shell script options; what remains in "$@" is passed to the
+# main class.
+shift $(( $OPTIND - 1 ))
+
+# Default VM options are only generated when the caller did not supply
+# JAVA_OPTIONS; a caller-supplied value is used as given.
+if [ -z "${JAVA_OPTIONS:-}" ]; then
+    JAVA_OPTIONS="-Dlog4j.configuration=log4j-gremlin.properties"
+    JAVA_OPTIONS="$JAVA_OPTIONS -Dgremlin.log4j.level=$GREMLIN_LOG_LEVEL"
+    JAVA_OPTIONS="$JAVA_OPTIONS -Dgremlin.mr.log4j.level=$GREMLIN_MR_LOG_LEVEL"
+fi
+
+# TITAN_JAVA_OPTS is appended in either case.
+if [ -n "${TITAN_JAVA_OPTS:-}" ]; then
+    JAVA_OPTIONS="$JAVA_OPTIONS $TITAN_JAVA_OPTS"
+fi
+
+# In debug mode show the effective classpath and trace every command
+# from here on, including the JVM invocation.
+if [ -n "$SCRIPT_DEBUG" ]; then
+    echo "CLASSPATH: $CLASSPATH"
+    set -x
+fi
+
+# Start the JVM.  JAVA, JAVA_OPTIONS and the JAAS option are expanded
+# unquoted on purpose so that each contained option becomes its own
+# argument; the classpath is supplied via the exported CLASSPATH.
+$JAVA $JAVA_OPTIONS ${JASS_CONF_FILE_OPTION} $MAIN_CLASS "$@"
diff --git a/tools/atlas-index-repair-kit/bin/atlas-index-repair.groovy b/tools/atlas-index-repair-kit/bin/atlas-index-repair.groovy
new file mode 100644
index 0000000..e02073e
--- /dev/null
+++ b/tools/atlas-index-repair-kit/bin/atlas-index-repair.groovy
@@ -0,0 +1,186 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Rebuilds the documents of a Titan mixed index from the graph, either
+ * for every vertex (split across worker threads) or for the single
+ * vertex that carries a given "__guid" property.
+ *
+ * Graph and TitanFactory are used without imports; presumably they are
+ * supplied by the default imports of the Gremlin console that loads
+ * this script.
+ */
+class GroovyIndexRepair {
+    /** Number of vertices re-indexed between two index commits. */
+    final long COMMIT_BATCH_SIZE    = 200
+    /** Attempts made to re-index one element before it is given up on. */
+    final int  MAX_TRIES_ON_FAILURE = 3
+    /** Number of worker threads used when a whole index is repaired. */
+    final int  NUM_THREADS          = Runtime.getRuntime().availableProcessors()
+
+    Graph g
+    def   indexSerializer
+    long  totalVertexCount
+
+    /**
+     * Opens the graph described by the given configuration file and
+     * records its index serializer and current vertex count.
+     *
+     * @param configPath path of the Titan properties file
+     */
+    def setup(String configPath) {
+        display("Initializing graph..")
+
+        g                = TitanFactory.open(configPath)
+        indexSerializer  = g.getIndexSerializer() as com.thinkaurelius.titan.graphdb.database.IndexSerializer
+        totalVertexCount = g.V.count()
+
+        display("Graph initialized!")
+    }
+
+    /**
+     * Re-indexes every vertex of the graph into the named index, using
+     * NUM_THREADS worker threads.
+     */
+    def processIndex(String indexName) {
+        processBatch(indexName, 0, totalVertexCount, NUM_THREADS)
+    }
+
+    /**
+     * Re-indexes the vertex with the given GUID into each of the named
+     * indexes, one index after the other.
+     */
+    def processIndex(List<String> indexNames, String guid) {
+        for (String indexName : indexNames) {
+            restore(indexName, guid)
+        }
+    }
+
+    /**
+     * Splits the vertex positions [startIndex, endIndex) into at most
+     * threadCount contiguous chunks and restores each chunk on its own
+     * thread, returning once all threads have finished.
+     *
+     * @param indexName   name of the mixed index to rebuild
+     * @param startIndex  first vertex position to process
+     * @param endIndex    position just past the last vertex to process
+     * @param threadCount number of chunks/threads; must be positive
+     */
+    def processBatch(String indexName, long startIndex, long endIndex, long threadCount) {
+        Date startTime = new Date()
+
+        // Ceiling division over the size of the range (not over endIndex),
+        // so the chunk size is also right when startIndex is not zero.
+        long span      = endIndex - startIndex
+        long batchSize = (span + threadCount - 1).intdiv(threadCount)
+
+        display("Processing index: " + indexName)
+        display("    Start Index : " + startIndex)
+        display("    End Index   : " + endIndex)
+        display("    Chunk Count : " + threadCount)
+        display("    Batch Size  : " + batchSize)
+
+        def threads = []
+        for (long batchStartIdx = startIndex; batchStartIdx < endIndex; batchStartIdx += batchSize) {
+            // Copies captured by the closure; the loop variable keeps changing.
+            final String lIdxName  = indexName
+            final long   lStartIdx = batchStartIdx
+            final long   lEndIdx   = (lStartIdx + batchSize) > endIndex ? endIndex : (lStartIdx + batchSize)
+
+            threads << new Thread({
+                restore(lIdxName, lStartIdx, lEndIdx)
+            })
+        }
+
+        threads.each { it.start() }
+        threads.each { it.join() }
+
+        Date endTime = new Date()
+
+        display("Restore complete: " + indexName + ". Time taken: " + (endTime.time - startTime.time) + "ms")
+    }
+
+    /**
+     * Re-indexes the single vertex whose "__guid" property equals guid
+     * into the named index.  Reports and returns without touching the
+     * index when no such vertex exists.
+     */
+    void restore(String indexName, String guid) {
+        Date startTime = new Date()
+
+        // next() on an exhausted pipeline throws instead of returning null,
+        // so probe with hasNext() before taking the vertex.
+        def matches = g.V().has("__guid", guid)
+        if (!matches.hasNext()) {
+            display(" Could not find: " + guid)
+            return
+        }
+        def vertex = matches.next()
+
+        def mgmt              = g.getManagementSystem()
+        def documentsPerStore = new java.util.HashMap()
+        def tx                = mgmt.getWrappedTx()
+        def mutator           = tx.getTxHandle()
+        def tIdx              = mgmt.getGraphIndex(indexName)
+        def indexType         = mgmt.getSchemaVertex(tIdx).asIndexType()
+        def mixedIndexType    = indexType as com.thinkaurelius.titan.graphdb.types.indextype.MixedIndexTypeWrapper
+
+        reindexWithRetry(vertex, mixedIndexType, documentsPerStore)
+
+        display(" commit")
+        mutator.getIndexTransaction(mixedIndexType.getBackingIndexName()).restore(documentsPerStore)
+
+        def endTime = new Date()
+        display(" thread end. Time taken: " + (endTime.time - startTime.time) + "ms")
+    }
+
+    /**
+     * Re-indexes the vertices at positions startIndex..endIndex of g.V
+     * into the named index, committing to the index backend after every
+     * COMMIT_BATCH_SIZE positions.
+     */
+    void restore(String indexName, long startIndex, long endIndex) {
+        Date   startTime = new Date()
+        String batchId   = "Batch Id: [" + startIndex + "-" + endIndex + "]"
+
+        display(batchId + " thread start")
+
+        def mgmt              = g.getManagementSystem()
+        def documentsPerStore = new java.util.HashMap()
+        def tx                = mgmt.getWrappedTx()
+        def mutator           = tx.getTxHandle()
+        def tIdx              = mgmt.getGraphIndex(indexName)
+        def indexType         = mgmt.getSchemaVertex(tIdx).asIndexType()
+        def mixedIndexType    = indexType as com.thinkaurelius.titan.graphdb.types.indextype.MixedIndexTypeWrapper
+
+        for (long batchStartIdx = startIndex; batchStartIdx < endIndex; batchStartIdx += COMMIT_BATCH_SIZE) {
+            long   batchEndIdx = (batchStartIdx + COMMIT_BATCH_SIZE) > endIndex ? endIndex : (batchStartIdx + COMMIT_BATCH_SIZE)
+            String displayStr  = batchId + ": " + "{" + batchStartIdx + "-" + batchEndIdx + "}"
+
+            // NOTE(review): the Gremlin range filter appears to be inclusive
+            // at both ends, in which case the vertex at batchEndIdx is also
+            // processed by the next batch. Re-indexing is repeatable, so
+            // this is left as is - confirm before tightening the range.
+            for (def vertex : g.V[batchStartIdx..batchEndIdx]) {
+                reindexWithRetry(vertex, mixedIndexType, documentsPerStore)
+            }
+
+            display(displayStr + " commit")
+
+            mutator.getIndexTransaction(mixedIndexType.getBackingIndexName()).restore(documentsPerStore)
+
+            // Start the next batch from an empty map; otherwise every commit
+            // would send all documents of the earlier batches again and the
+            // map would keep growing for the lifetime of the thread.
+            documentsPerStore.clear()
+        }
+        def endTime = new Date()
+
+        display(batchId + " thread end. Time taken: " + (endTime.time - startTime.time) + "ms")
+    }
+
+    /**
+     * Adds the index documents of one element to documentsPerStore,
+     * retrying up to MAX_TRIES_ON_FAILURE times with a pause that grows
+     * with each attempt.  A failure is reported but never propagated,
+     * so one bad element does not stop the repair.
+     *
+     * @return true when the element was re-indexed, false when every
+     *         attempt failed
+     */
+    private boolean reindexWithRetry(def element, def mixedIndexType, Map documentsPerStore) {
+        for (int attemptCount = 1; attemptCount <= MAX_TRIES_ON_FAILURE; attemptCount++) {
+            try {
+                indexSerializer.reindexElement(element, mixedIndexType, documentsPerStore)
+                return true
+            } catch (all) {
+                display("Exception: " + all)
+
+                // No point in pausing after the final attempt.
+                if (attemptCount < MAX_TRIES_ON_FAILURE) {
+                    display("Pausing before retry..")
+                    Thread.sleep(2000 * attemptCount)
+                }
+            }
+        }
+
+        display("Giving up after " + MAX_TRIES_ON_FAILURE + " attempts: " + element)
+        return false
+    }
+
+    /** Shuts the graph down. */
+    def close() {
+        g.shutdown()
+        display("Graph shutdown!")
+    }
+
+    /** Prints a message prefixed with the current timestamp. */
+    def display(String s) {
+        println(getTimestamp() + " " + s)
+    }
+
+    /** Returns the current time formatted as yyyy-MM-dd HH:mm:ss.SSS. */
+    def getTimestamp() {
+        new Date().format("yyyy-MM-dd HH:mm:ss.SSS")
+    }
+}
+
+/**
+ * Rebuilds the vertex_index, fulltext_index and edge_index for every
+ * vertex of the graph described by the given Titan configuration.
+ *
+ * @param configPath path of the Titan properties file
+ */
+def repairAtlasIndex(String configPath) {
+    // Deliberately not declared with "def": the instance is stored in the
+    // script binding, as before.
+    r = new GroovyIndexRepair()
+
+    r.setup(configPath)
+
+    // Shut the graph down even when one of the repairs throws.
+    try {
+        r.processIndex("vertex_index")
+        r.processIndex("fulltext_index")
+        r.processIndex("edge_index")
+    } finally {
+        r.close()
+    }
+}
+
+/**
+ * Rebuilds the vertex_index, fulltext_index and edge_index entries of
+ * the single entity identified by guid.
+ *
+ * @param configPath path of the Titan properties file
+ * @param guid       value of the entity vertex's "__guid" property
+ */
+def repairAtlasEntity(String configPath, String guid) {
+    // Deliberately not declared with "def": the instance is stored in the
+    // script binding, as before.
+    r = new GroovyIndexRepair()
+
+    r.setup(configPath)
+
+    // Shut the graph down even when the repair throws.
+    try {
+        r.processIndex(["vertex_index", "fulltext_index", "edge_index"], guid)
+    } finally {
+        r.close()
+    }
+}
diff --git a/tools/atlas-index-repair-kit/lib/titan-core-0.5.4.jar b/tools/atlas-index-repair-kit/lib/titan-core-0.5.4.jar
new file mode 100644
index 0000000..aeade01
Binary files /dev/null and b/tools/atlas-index-repair-kit/lib/titan-core-0.5.4.jar differ