You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@trafodion.apache.org by se...@apache.org on 2018/10/31 16:36:58 UTC
[1/2] trafodion git commit: [TRAFODION-3225] Obscure cores seen in
RMS and logger related code when Trafodion is stressed
Repository: trafodion
Updated Branches:
refs/heads/master 06b648612 -> d83b97efe
[TRAFODION-3225] Obscure cores seen in RMS and logger related code when Trafodion is stressed
Process de-registration is now logged to understand the spate of cores seen during
stress testing
Cleanup dangling semaphore incorrectly detected the problem due to pid recycling and
hence de-registered the process and its query fragment from the shared segment
When the RMS semaphore is held for more than 10 seconds, it is assumed to be in a
deadlock situation and hence the process holding the semaphore is core dumped for
further analysis.
Create a global logger instance instead of a function static variable
to help in debugging when problems with QRlogger is reported.
Project: http://git-wip-us.apache.org/repos/asf/trafodion/repo
Commit: http://git-wip-us.apache.org/repos/asf/trafodion/commit/6a25d5c1
Tree: http://git-wip-us.apache.org/repos/asf/trafodion/tree/6a25d5c1
Diff: http://git-wip-us.apache.org/repos/asf/trafodion/diff/6a25d5c1
Branch: refs/heads/master
Commit: 6a25d5c17ade2140759dab9a6580979147d0e26a
Parents: 06b6486
Author: selvaganesang <se...@esgyn.com>
Authored: Tue Oct 30 17:21:45 2018 +0000
Committer: selvaganesang <se...@esgyn.com>
Committed: Tue Oct 30 17:34:58 2018 +0000
----------------------------------------------------------------------
core/sql/bin/SqlciMain.cpp | 1 +
core/sql/cli/Globals.cpp | 6 +-
core/sql/executor/ex_esp_frag_dir.cpp | 6 +-
core/sql/qmscommon/QRLogger.cpp | 6 +-
core/sql/qmscommon/QRLogger.h | 2 +
core/sql/runtimestats/SqlStats.cpp | 113 +++++++++++++++++++----------
core/sql/runtimestats/SqlStats.h | 4 +-
core/sql/runtimestats/sscpipc.cpp | 4 +-
8 files changed, 99 insertions(+), 43 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/bin/SqlciMain.cpp
----------------------------------------------------------------------
diff --git a/core/sql/bin/SqlciMain.cpp b/core/sql/bin/SqlciMain.cpp
index fc02077..b45f108 100644
--- a/core/sql/bin/SqlciMain.cpp
+++ b/core/sql/bin/SqlciMain.cpp
@@ -216,6 +216,7 @@ _callable void removeProcess()
{
statsGlobals->removeProcess(cliGlobals->myPin());
statsGlobals->releaseStatsSemaphore(cliGlobals->getSemId(), cliGlobals->myPin());
+ statsGlobals->logProcessDeath(cliGlobals->myCpu(), cliGlobals->myPin(), "Normal process death");
}
else if (error == 4011)
{
http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/cli/Globals.cpp
----------------------------------------------------------------------
diff --git a/core/sql/cli/Globals.cpp b/core/sql/cli/Globals.cpp
index 6f498fb..014a2ef 100644
--- a/core/sql/cli/Globals.cpp
+++ b/core/sql/cli/Globals.cpp
@@ -206,6 +206,7 @@ void CliGlobals::init( NABoolean espProcess,
}
else
{
+ bool reincarnated;
error = statsGlobals_->getStatsSemaphore(semId_, myPin_);
statsHeap_ = (NAHeap *)statsGlobals_->
@@ -221,10 +222,12 @@ void CliGlobals::init( NABoolean espProcess,
NAHeap("Process Stats Heap", statsGlobals_->getStatsHeap(),
8192,
0);
- statsGlobals_->addProcess(myPin_, statsHeap_);
+ reincarnated = statsGlobals_->addProcess(myPin_, statsHeap_);
processStats_ = statsGlobals_->getExProcessStats(myPin_);
processStats_->setStartTime(myStartTime_);
statsGlobals_->releaseStatsSemaphore(semId_, myPin_);
+ if (reincarnated)
+ statsGlobals_->logProcessDeath(myCpu_, myPin_, "Process reincarnated before RIP");
}
}
// create a default context and make it the current context
@@ -298,6 +301,7 @@ CliGlobals::~CliGlobals()
error = statsGlobals_->getStatsSemaphore(semId_, myPin_);
statsGlobals_->removeProcess(myPin_);
statsGlobals_->releaseStatsSemaphore(semId_, myPin_);
+ statsGlobals_->logProcessDeath(myCpu_, myPin_, "Normal process death");
sem_close((sem_t *)semId_);
}
}
http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/executor/ex_esp_frag_dir.cpp
----------------------------------------------------------------------
diff --git a/core/sql/executor/ex_esp_frag_dir.cpp b/core/sql/executor/ex_esp_frag_dir.cpp
index 6825ec0..a880ae8 100644
--- a/core/sql/executor/ex_esp_frag_dir.cpp
+++ b/core/sql/executor/ex_esp_frag_dir.cpp
@@ -131,6 +131,7 @@ ExEspFragInstanceDir::ExEspFragInstanceDir(CliGlobals *cliGlobals,
}
else
{
+ bool reincarnated;
cliGlobals_->setStatsGlobals(statsGlobals_);
cliGlobals_->setSemId(semId_);
error = statsGlobals_->getStatsSemaphore(semId_, pid_);
@@ -142,12 +143,14 @@ ExEspFragInstanceDir::ExEspFragInstanceDir(CliGlobals *cliGlobals,
// We need to set up the cliGlobals, since addProcess will call getRTSSemaphore
// and it uses these members
cliGlobals_->setStatsHeap(statsHeap_);
- statsGlobals_->addProcess(pid_, statsHeap_);
+ reincarnated = statsGlobals_->addProcess(pid_, statsHeap_);
ExProcessStats *processStats =
statsGlobals_->getExProcessStats(pid_);
processStats->setStartTime(cliGlobals_->myStartTime());
cliGlobals_->setExProcessStats(processStats);
statsGlobals_->releaseStatsSemaphore(semId_, pid_);
+ if (reincarnated)
+ statsGlobals_->logProcessDeath(cpu_, pid_, "Process reincarnated before RIP");
}
}
cliGlobals_->setStatsHeap(statsHeap_);
@@ -191,6 +194,7 @@ ExEspFragInstanceDir::~ExEspFragInstanceDir()
int error = statsGlobals_->getStatsSemaphore(semId_, pid_);
statsGlobals_->removeProcess(pid_);
statsGlobals_->releaseStatsSemaphore(semId_, pid_);
+ statsGlobals_->logProcessDeath(cpu_, pid_, "Normal process death");
sem_close((sem_t *)semId_);
}
}
http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/qmscommon/QRLogger.cpp
----------------------------------------------------------------------
diff --git a/core/sql/qmscommon/QRLogger.cpp b/core/sql/qmscommon/QRLogger.cpp
index 850c7d0..bb32f4f 100644
--- a/core/sql/qmscommon/QRLogger.cpp
+++ b/core/sql/qmscommon/QRLogger.cpp
@@ -41,6 +41,7 @@
#include "seabed/fserr.h"
BOOL gv_QRLoggerInitialized_ = FALSE;
+QRLogger *gv_QRLoggerInstance_ = NULL;
using namespace log4cxx;
using namespace log4cxx::helpers;
@@ -99,8 +100,9 @@ QRLogger::QRLogger()
// **************************************************************************
QRLogger& QRLogger::instance()
{
- static QRLogger onlyInstance_;
- return onlyInstance_;
+ if (gv_QRLoggerInstance_ == NULL)
+ gv_QRLoggerInstance_ = new QRLogger();
+ return *gv_QRLoggerInstance_;
}
http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/qmscommon/QRLogger.h
----------------------------------------------------------------------
diff --git a/core/sql/qmscommon/QRLogger.h b/core/sql/qmscommon/QRLogger.h
index 6e8244c..cd3f399 100644
--- a/core/sql/qmscommon/QRLogger.h
+++ b/core/sql/qmscommon/QRLogger.h
@@ -28,6 +28,8 @@
#include "NAString.h"
#include "CommonLogger.h"
+class QRLogger;
+extern QRLogger *gv_QRLoggerInstance_;
// ----- these categories are currently not used
// qmscomon
extern std::string CAT_SQL_COMP_QR_COMMON;
http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/runtimestats/SqlStats.cpp
----------------------------------------------------------------------
diff --git a/core/sql/runtimestats/SqlStats.cpp b/core/sql/runtimestats/SqlStats.cpp
index 3fcf9d3..58f8eb4 100644
--- a/core/sql/runtimestats/SqlStats.cpp
+++ b/core/sql/runtimestats/SqlStats.cpp
@@ -194,17 +194,15 @@ const char *StatsGlobals::rmsEnvType(RTSEnvType envType)
}
}
-void StatsGlobals::addProcess(pid_t pid, NAHeap *heap)
+bool StatsGlobals::addProcess(pid_t pid, NAHeap *heap)
{
+ bool pidReincarnated = false;
if (pid >= configuredPidMax_)
- return;
+ return pidReincarnated;
char msg[256];;
if (statsArray_[pid].processStats_ != NULL)
{
- snprintf(msg, sizeof(msg),
- "Pid %d,%d got recycled soon or SSMP didn't receive the death message ",
- cpu_, pid);
- SQLMXLoggingArea::logExecRtInfo(__FILE__, __LINE__, msg, 0);
+ pidReincarnated = true;
removeProcess(pid, TRUE);
}
statsArray_[pid].processId_ = pid;
@@ -214,7 +212,7 @@ void StatsGlobals::addProcess(pid_t pid, NAHeap *heap)
incProcessStatsHeaps();
if (pid > maxPid_)
maxPid_ = pid;
- return;
+ return pidReincarnated;
}
void StatsGlobals::removeProcess(pid_t pid, NABoolean calledAtAdd)
@@ -304,6 +302,8 @@ void StatsGlobals::checkForDeadProcesses(pid_t myPid)
error = getStatsSemaphore(ssmpProcSemId_, myPid);
int pidRemainingToCheck = 20;
+ pid_t pidsRemoved[pidRemainingToCheck];
+ int numPidsRemoved = 0;
pid_t firstPid = pidToCheck_;
for (;maxPid_ > 0;)
{
@@ -326,11 +326,16 @@ void StatsGlobals::checkForDeadProcesses(pid_t myPid)
Int32 ln_error = msg_mon_get_process_name(
cpu_, statsArray_[pidToCheck_].processId_,
processName);
- if (ln_error == XZFIL_ERR_NOSUCHDEV)
- removeProcess(pidToCheck_);
+ if (ln_error == XZFIL_ERR_NOSUCHDEV) {
+ pidsRemoved[numPidsRemoved++];
+ removeProcess(pidToCheck_);
+ }
}
}
releaseStatsSemaphore(ssmpProcSemId_, myPid);
+ // death messages are logged outside of semaphore due to process hop
+ for (int i = 0 ; i < numPidsRemoved ; i++)
+ logProcessDeath(cpu_, pidsRemoved[i], "Dead Process");
}
// We expect a death message to be delivered to MXSSMP by the monitor
@@ -340,48 +345,55 @@ void StatsGlobals::checkForDeadProcesses(pid_t myPid)
// be unexpectedly holding the stats semaphore.
static Int32 SemHeldTooManySeconds = -1;
+static bool dumpRmsDeadLockProcess = true;
void StatsGlobals::cleanupDanglingSemaphore(NABoolean checkForSemaphoreHolders)
{
CliGlobals *cliGlobals = GetCliGlobals();
NABoolean cleanupSemaphore = FALSE;
+ char coreFile[512];
if (semPid_ == -1)
- return; // Nobody has the semaphore, nothing to do here.
+ return; // Nobody has the semaphore, nothing to do here.
if (NOT (cliGlobals->myPin() == getSsmpPid()
&& cliGlobals->myStartTime() == getSsmpTimestamp()))
- return; // Only ssmp is allowed to cleanup after another process.
+ return; // Only ssmp is allowed to cleanup after another process.
// Coverage notes - it would be too difficult to automate a test
// for this since usually a death message is used to clean up a
// generic SQL process' exit. But this code has been unit tested
// using gdb sessions on the generic process and on this MXSSMP
// process.
- if (checkForSemaphoreHolders)
- {
- if (SemHeldTooManySeconds < 0)
- {
- // call getenv once per process
- char *shtms = getenv("MXSSMP_SEM_RELEASE_SECONDS");
- if (shtms)
- {
- SemHeldTooManySeconds = str_atoi(shtms, str_len(shtms));
- if (SemHeldTooManySeconds < 1)
- SemHeldTooManySeconds = 10;
- }
- else
+ Int32 lockedTimeInSecs = 0;
+ timespec ts;
+ if (SemHeldTooManySeconds < 0)
+ {
+ // call getenv once per process
+ char *shtms = getenv("MXSSMP_SEM_RELEASE_SECONDS");
+ if (shtms)
+ {
+ SemHeldTooManySeconds = str_atoi(shtms, str_len(shtms));
+ if (SemHeldTooManySeconds < 1)
SemHeldTooManySeconds = 10;
- }
-
- timespec ts;
- clock_gettime(CLOCK_REALTIME, &ts);
- if ((ts.tv_sec - lockingTimestamp_.tv_sec) >= SemHeldTooManySeconds)
- cleanupSemaphore = TRUE;
- }
- else
- cleanupSemaphore = TRUE;
- if (cleanupSemaphore)
- {
+ }
+ else
+ SemHeldTooManySeconds = 10;
+ char *dumpRmsDeadLockProcessEnv = getenv("RMS_DEADLOCK_PROCESS_DUMP");
+ if ((dumpRmsDeadLockProcessEnv != NULL) &&
+ (strcmp(dumpRmsDeadLockProcessEnv, "N") == 0))
+ dumpRmsDeadLockProcess = false;
+ }
+ clock_gettime(CLOCK_REALTIME, &ts);
+ lockedTimeInSecs = ts.tv_sec - lockingTimestamp_.tv_sec;
+ if (checkForSemaphoreHolders)
+ {
+ if (lockedTimeInSecs >= SemHeldTooManySeconds)
+ cleanupSemaphore = TRUE;
+ }
+ else
+ cleanupSemaphore = TRUE;
+ if (cleanupSemaphore)
+ {
NAProcessHandle myPhandle;
myPhandle.getmine();
myPhandle.decompose();
@@ -398,6 +410,16 @@ void StatsGlobals::cleanupDanglingSemaphore(NABoolean checkForSemaphoreHolders)
}
else
{
+ char msg[256];
+ if (semPid_ != -1 && lockedTimeInSecs >= SemHeldTooManySeconds) {
+ snprintf(msg, sizeof(msg), "Pid %d, %d held semaphore for more than %d seconds ",
+ cpu_, semPid_, lockedTimeInSecs);
+ SQLMXLoggingArea::logExecRtInfo(__FILE__, __LINE__, msg, 0);
+ if (dumpRmsDeadLockProcess)
+ msg_mon_dump_process_id(NULL, cpu_, semPid_, coreFile);
+ }
+
+/*
// Is this the same incarnation of the process name?
// Do not be fooled by pid recycle.
MS_Mon_Process_Info_Type processInfo;
@@ -411,6 +433,7 @@ void StatsGlobals::cleanupDanglingSemaphore(NABoolean checkForSemaphoreHolders)
seabedPidRecycle_ = true;
semHoldingProcessExists = false;
}
+*/
}
if (!semHoldingProcessExists)
{
@@ -1483,16 +1506,22 @@ void StatsGlobals::cleanup_SQL(
removeProcess(pidToCleanup);
releaseStatsSemaphore(semId, myPid);
+ logProcessDeath(cpu_, pidToCleanup, "Clean dangling semaphore");
}
void StatsGlobals::verifyAndCleanup(pid_t pidThatDied, SB_Int64_Type seqNum)
{
+ bool processRemoved = false;
int error = getStatsSemaphore(ssmpProcSemId_, getSsmpPid());
if (statsArray_ &&
(statsArray_[pidThatDied].processId_ == pidThatDied) &&
- (statsArray_[pidThatDied].phandleSeqNum_ == seqNum))
- removeProcess(pidThatDied);
+ (statsArray_[pidThatDied].phandleSeqNum_ == seqNum)) {
+ removeProcess(pidThatDied);
+ processRemoved = true;
+ }
releaseStatsSemaphore(ssmpProcSemId_, getSsmpPid());
+ if (processRemoved)
+ logProcessDeath(cpu_, pidThatDied, "Received death message");
}
void StatsGlobals::updateMemStats(pid_t pid,
@@ -1631,6 +1660,16 @@ NABoolean StatsGlobals::getInitError(pid_t pid, NABoolean &reportError)
return retcode;
}
+void StatsGlobals::logProcessDeath(short cpu, pid_t pid, const char *reason)
+{
+ char msg[256];
+
+ snprintf(msg, sizeof(msg),
+ "Pid %d,%d de-registered from shared segment. Reason: %s ", cpu, pid, reason);
+ SQLMXLoggingArea::logExecRtInfo(__FILE__, __LINE__, msg, 0);
+ return;
+}
+
short getMasterCpu(char *uniqueStmtId, Lng32 uniqueStmtIdLen, char *nodeName, short maxLen, short &cpu)
{
Int32 nodeNumber = 0;
http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/runtimestats/SqlStats.h
----------------------------------------------------------------------
diff --git a/core/sql/runtimestats/SqlStats.h b/core/sql/runtimestats/SqlStats.h
index 1cfc18f..77721ec 100644
--- a/core/sql/runtimestats/SqlStats.h
+++ b/core/sql/runtimestats/SqlStats.h
@@ -329,7 +329,7 @@ class StatsGlobals
public:
StatsGlobals(void *baseAddr, short envType, Lng32 maxSegSize);
static void* operator new (size_t size, void* loc = 0);
- void addProcess(pid_t pid, NAHeap *heap);
+ bool addProcess(pid_t pid, NAHeap *heap);
void removeProcess(pid_t pid, NABoolean calledDuringAdd = FALSE);
ProcessStats *checkProcess(pid_t pid);
void setStatsArea(pid_t pid, ExStatisticsArea *stats)
@@ -346,6 +346,8 @@ public:
return NULL;
}
+ void logProcessDeath(short cpu, pid_t pid, const char *reason);
+
StmtStats *addQuery(pid_t pid, char *queryId, Lng32 queryIdLen, void *backRef,
Lng32 fragId, char *sourceStr = NULL, Lng32 sourceStrLen = 0,
NABoolean isMaster = FALSE);
http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/runtimestats/sscpipc.cpp
----------------------------------------------------------------------
diff --git a/core/sql/runtimestats/sscpipc.cpp b/core/sql/runtimestats/sscpipc.cpp
index ccb8dbb..8ec84ef 100755
--- a/core/sql/runtimestats/sscpipc.cpp
+++ b/core/sql/runtimestats/sscpipc.cpp
@@ -93,7 +93,9 @@ SscpGlobals::SscpGlobals(NAHeap *sscpheap, StatsGlobals *statsGlobals)
NAHeap("Process Stats Heap", statsGlobals_->getStatsHeap(),
8192,
0);
- statsGlobals_->addProcess(myPin_, statsHeap);
+ bool reincarnated = statsGlobals_->addProcess(myPin_, statsHeap);
+ if (reincarnated)
+ statsGlobals_->logProcessDeath(myCpu, myPin_, "Process reincarnated before RIP");
statsGlobals_->releaseStatsSemaphore(semId_, myPin_);
CliGlobals *cliGlobals = GetCliGlobals();
[2/2] trafodion git commit: Merge PR 1735 [TRAFODION-3225] Obscure
cores seen in RMS and logger related code when Trafodion is stressed
Posted by se...@apache.org.
Merge PR 1735 [TRAFODION-3225] Obscure cores seen in RMS and logger related code when Trafodion is stressed
Project: http://git-wip-us.apache.org/repos/asf/trafodion/repo
Commit: http://git-wip-us.apache.org/repos/asf/trafodion/commit/d83b97ef
Tree: http://git-wip-us.apache.org/repos/asf/trafodion/tree/d83b97ef
Diff: http://git-wip-us.apache.org/repos/asf/trafodion/diff/d83b97ef
Branch: refs/heads/master
Commit: d83b97efe015245f427255fe11bcec6edc304d87
Parents: 06b6486 6a25d5c
Author: selvaganesang <se...@apache.org>
Authored: Wed Oct 31 16:36:08 2018 +0000
Committer: selvaganesang <se...@apache.org>
Committed: Wed Oct 31 16:36:08 2018 +0000
----------------------------------------------------------------------
core/sql/bin/SqlciMain.cpp | 1 +
core/sql/cli/Globals.cpp | 6 +-
core/sql/executor/ex_esp_frag_dir.cpp | 6 +-
core/sql/qmscommon/QRLogger.cpp | 6 +-
core/sql/qmscommon/QRLogger.h | 2 +
core/sql/runtimestats/SqlStats.cpp | 113 +++++++++++++++++++----------
core/sql/runtimestats/SqlStats.h | 4 +-
core/sql/runtimestats/sscpipc.cpp | 4 +-
8 files changed, 99 insertions(+), 43 deletions(-)
----------------------------------------------------------------------