You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@trafodion.apache.org by se...@apache.org on 2018/10/31 16:36:58 UTC

[1/2] trafodion git commit: [TRAFODION-3225] Obscure cores seen in RMS and logger related code when Trafodion is stressed

Repository: trafodion
Updated Branches:
  refs/heads/master 06b648612 -> d83b97efe


[TRAFODION-3225] Obscure cores seen in RMS and logger related code when Trafodion is stressed

Process de-registration is now logged to understand the spate of cores seen during
stress testing

Cleanup dangling semaphore incorrectly detected the problem due to pid recycling and
hence de-registered the process and its query fragment from the shared segment

When the RMS semaphore is held for more than 10 seconds, it is assumed to be in a
deadlock situation and hence the process holding the semaphore is core dumped for
further analysis.

Create a global logger instance instead of a function static variable
to help in debugging when problems with QRlogger is reported.


Project: http://git-wip-us.apache.org/repos/asf/trafodion/repo
Commit: http://git-wip-us.apache.org/repos/asf/trafodion/commit/6a25d5c1
Tree: http://git-wip-us.apache.org/repos/asf/trafodion/tree/6a25d5c1
Diff: http://git-wip-us.apache.org/repos/asf/trafodion/diff/6a25d5c1

Branch: refs/heads/master
Commit: 6a25d5c17ade2140759dab9a6580979147d0e26a
Parents: 06b6486
Author: selvaganesang <se...@esgyn.com>
Authored: Tue Oct 30 17:21:45 2018 +0000
Committer: selvaganesang <se...@esgyn.com>
Committed: Tue Oct 30 17:34:58 2018 +0000

----------------------------------------------------------------------
 core/sql/bin/SqlciMain.cpp            |   1 +
 core/sql/cli/Globals.cpp              |   6 +-
 core/sql/executor/ex_esp_frag_dir.cpp |   6 +-
 core/sql/qmscommon/QRLogger.cpp       |   6 +-
 core/sql/qmscommon/QRLogger.h         |   2 +
 core/sql/runtimestats/SqlStats.cpp    | 113 +++++++++++++++++++----------
 core/sql/runtimestats/SqlStats.h      |   4 +-
 core/sql/runtimestats/sscpipc.cpp     |   4 +-
 8 files changed, 99 insertions(+), 43 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/bin/SqlciMain.cpp
----------------------------------------------------------------------
diff --git a/core/sql/bin/SqlciMain.cpp b/core/sql/bin/SqlciMain.cpp
index fc02077..b45f108 100644
--- a/core/sql/bin/SqlciMain.cpp
+++ b/core/sql/bin/SqlciMain.cpp
@@ -216,6 +216,7 @@ _callable void removeProcess()
     {
       statsGlobals->removeProcess(cliGlobals->myPin());
       statsGlobals->releaseStatsSemaphore(cliGlobals->getSemId(), cliGlobals->myPin());
+      statsGlobals->logProcessDeath(cliGlobals->myCpu(), cliGlobals->myPin(), "Normal process death");
     }
     else if (error == 4011)
     {

http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/cli/Globals.cpp
----------------------------------------------------------------------
diff --git a/core/sql/cli/Globals.cpp b/core/sql/cli/Globals.cpp
index 6f498fb..014a2ef 100644
--- a/core/sql/cli/Globals.cpp
+++ b/core/sql/cli/Globals.cpp
@@ -206,6 +206,7 @@ void CliGlobals::init( NABoolean espProcess,
       }
       else
       {
+        bool reincarnated;
         error = statsGlobals_->getStatsSemaphore(semId_, myPin_);
 
         statsHeap_ = (NAHeap *)statsGlobals_->
@@ -221,10 +222,12 @@ void CliGlobals::init( NABoolean espProcess,
 	  NAHeap("Process Stats Heap", statsGlobals_->getStatsHeap(),
 		 8192,
 		 0);
-	statsGlobals_->addProcess(myPin_, statsHeap_);
+	reincarnated = statsGlobals_->addProcess(myPin_, statsHeap_);
         processStats_ = statsGlobals_->getExProcessStats(myPin_);
         processStats_->setStartTime(myStartTime_);
 	statsGlobals_->releaseStatsSemaphore(semId_, myPin_);
+        if (reincarnated)
+           statsGlobals_->logProcessDeath(myCpu_, myPin_, "Process reincarnated before RIP");
       }
     }
     // create a default context and make it the current context
@@ -298,6 +301,7 @@ CliGlobals::~CliGlobals()
     error = statsGlobals_->getStatsSemaphore(semId_, myPin_);
     statsGlobals_->removeProcess(myPin_);
     statsGlobals_->releaseStatsSemaphore(semId_, myPin_);
+    statsGlobals_->logProcessDeath(myCpu_, myPin_, "Normal process death");
     sem_close((sem_t *)semId_);
   }
 }

http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/executor/ex_esp_frag_dir.cpp
----------------------------------------------------------------------
diff --git a/core/sql/executor/ex_esp_frag_dir.cpp b/core/sql/executor/ex_esp_frag_dir.cpp
index 6825ec0..a880ae8 100644
--- a/core/sql/executor/ex_esp_frag_dir.cpp
+++ b/core/sql/executor/ex_esp_frag_dir.cpp
@@ -131,6 +131,7 @@ ExEspFragInstanceDir::ExEspFragInstanceDir(CliGlobals *cliGlobals,
     }
     else
     {
+      bool reincarnated;
       cliGlobals_->setStatsGlobals(statsGlobals_);
       cliGlobals_->setSemId(semId_);
       error = statsGlobals_->getStatsSemaphore(semId_, pid_);
@@ -142,12 +143,14 @@ ExEspFragInstanceDir::ExEspFragInstanceDir(CliGlobals *cliGlobals,
       // We need to set up the cliGlobals, since addProcess will call getRTSSemaphore
       // and it uses these members
       cliGlobals_->setStatsHeap(statsHeap_);
-      statsGlobals_->addProcess(pid_, statsHeap_);
+      reincarnated = statsGlobals_->addProcess(pid_, statsHeap_);
       ExProcessStats *processStats = 
            statsGlobals_->getExProcessStats(pid_);
       processStats->setStartTime(cliGlobals_->myStartTime());
       cliGlobals_->setExProcessStats(processStats);
       statsGlobals_->releaseStatsSemaphore(semId_, pid_);
+      if (reincarnated)
+         statsGlobals_->logProcessDeath(cpu_, pid_, "Process reincarnated before RIP");
     }
   }
   cliGlobals_->setStatsHeap(statsHeap_);
@@ -191,6 +194,7 @@ ExEspFragInstanceDir::~ExEspFragInstanceDir()
     int error = statsGlobals_->getStatsSemaphore(semId_, pid_);
     statsGlobals_->removeProcess(pid_);
     statsGlobals_->releaseStatsSemaphore(semId_, pid_);
+    statsGlobals_->logProcessDeath(cpu_, pid_, "Normal process death");
     sem_close((sem_t *)semId_);
   }
 }

http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/qmscommon/QRLogger.cpp
----------------------------------------------------------------------
diff --git a/core/sql/qmscommon/QRLogger.cpp b/core/sql/qmscommon/QRLogger.cpp
index 850c7d0..bb32f4f 100644
--- a/core/sql/qmscommon/QRLogger.cpp
+++ b/core/sql/qmscommon/QRLogger.cpp
@@ -41,6 +41,7 @@
 #include "seabed/fserr.h"
 
 BOOL gv_QRLoggerInitialized_ = FALSE;
+QRLogger *gv_QRLoggerInstance_ = NULL;
 
 using namespace log4cxx;
 using namespace log4cxx::helpers;
@@ -99,8 +100,9 @@ QRLogger::QRLogger()
 // **************************************************************************
 QRLogger& QRLogger::instance()
 {
-  static QRLogger onlyInstance_;
-  return onlyInstance_;
+  if (gv_QRLoggerInstance_ == NULL)
+     gv_QRLoggerInstance_ = new QRLogger();
+  return *gv_QRLoggerInstance_;
 }
 
 

http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/qmscommon/QRLogger.h
----------------------------------------------------------------------
diff --git a/core/sql/qmscommon/QRLogger.h b/core/sql/qmscommon/QRLogger.h
index 6e8244c..cd3f399 100644
--- a/core/sql/qmscommon/QRLogger.h
+++ b/core/sql/qmscommon/QRLogger.h
@@ -28,6 +28,8 @@
 #include "NAString.h"
 #include "CommonLogger.h"
 
+class QRLogger;
+extern QRLogger *gv_QRLoggerInstance_;
 // -----  these categories are currently not used
 // qmscomon
 extern std::string CAT_SQL_COMP_QR_COMMON;

http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/runtimestats/SqlStats.cpp
----------------------------------------------------------------------
diff --git a/core/sql/runtimestats/SqlStats.cpp b/core/sql/runtimestats/SqlStats.cpp
index 3fcf9d3..58f8eb4 100644
--- a/core/sql/runtimestats/SqlStats.cpp
+++ b/core/sql/runtimestats/SqlStats.cpp
@@ -194,17 +194,15 @@ const char *StatsGlobals::rmsEnvType(RTSEnvType envType)
   }
 }
 
-void StatsGlobals::addProcess(pid_t pid, NAHeap *heap)
+bool StatsGlobals::addProcess(pid_t pid, NAHeap *heap)
 {
+  bool pidReincarnated = false;
   if (pid >= configuredPidMax_)
-     return;
+     return pidReincarnated;
   char msg[256];;
   if (statsArray_[pid].processStats_ != NULL)
   {
-    snprintf(msg, sizeof(msg),
-        "Pid %d,%d got recycled soon or SSMP didn't receive the death message ",
-           cpu_, pid);
-    SQLMXLoggingArea::logExecRtInfo(__FILE__, __LINE__, msg, 0);
+    pidReincarnated = true;
     removeProcess(pid, TRUE);
   }   
   statsArray_[pid].processId_ = pid;
@@ -214,7 +212,7 @@ void StatsGlobals::addProcess(pid_t pid, NAHeap *heap)
   incProcessStatsHeaps();
   if (pid > maxPid_)
      maxPid_ = pid;
-  return;
+  return pidReincarnated;
 }
 
 void StatsGlobals::removeProcess(pid_t pid, NABoolean calledAtAdd)
@@ -304,6 +302,8 @@ void StatsGlobals::checkForDeadProcesses(pid_t myPid)
   error = getStatsSemaphore(ssmpProcSemId_, myPid);
 
   int pidRemainingToCheck = 20;
+  pid_t pidsRemoved[pidRemainingToCheck]; 
+  int numPidsRemoved = 0;
   pid_t firstPid = pidToCheck_;
   for (;maxPid_ > 0;)
   {
@@ -326,11 +326,16 @@ void StatsGlobals::checkForDeadProcesses(pid_t myPid)
       Int32 ln_error = msg_mon_get_process_name(
                        cpu_, statsArray_[pidToCheck_].processId_, 
                        processName);
-      if (ln_error == XZFIL_ERR_NOSUCHDEV) 
-        removeProcess(pidToCheck_);
+      if (ln_error == XZFIL_ERR_NOSUCHDEV)  {
+         pidsRemoved[numPidsRemoved++]; 
+         removeProcess(pidToCheck_);
+      }
     }
   }
   releaseStatsSemaphore(ssmpProcSemId_, myPid);
+  // death messages are logged outside of semaphore due to process hop
+  for (int i = 0 ; i < numPidsRemoved ; i++) 
+      logProcessDeath(cpu_, pidsRemoved[i], "Dead Process");
 }
 
 // We expect a death message to be delivered to MXSSMP by the monitor
@@ -340,48 +345,55 @@ void StatsGlobals::checkForDeadProcesses(pid_t myPid)
 // be unexpectedly holding the stats semaphore.  
 
 static Int32 SemHeldTooManySeconds = -1;
+static bool dumpRmsDeadLockProcess = true;
 
 void StatsGlobals::cleanupDanglingSemaphore(NABoolean checkForSemaphoreHolders)
 {
   CliGlobals *cliGlobals = GetCliGlobals();
   NABoolean cleanupSemaphore = FALSE;
+  char coreFile[512];
   if (semPid_ == -1)
-    return; // Nobody has the semaphore, nothing to do here.
+     return; // Nobody has the semaphore, nothing to do here.
     
   if (NOT (cliGlobals->myPin() == getSsmpPid()
           && cliGlobals->myStartTime() == getSsmpTimestamp()))
-    return; // Only ssmp is allowed to cleanup after another process.
+     return; // Only ssmp is allowed to cleanup after another process.
 
   // Coverage notes - it would be too difficult to automate a test
   // for this since usually a death message is used to clean up a
   // generic SQL process' exit.  But this code has been unit tested
   // using gdb sessions on the generic process and on this MXSSMP
   // process.
-    if (checkForSemaphoreHolders)
-    {
-       if (SemHeldTooManySeconds < 0)
-       {
-          // call getenv once per process
-          char *shtms = getenv("MXSSMP_SEM_RELEASE_SECONDS");
-          if (shtms)
-          {
-             SemHeldTooManySeconds = str_atoi(shtms, str_len(shtms));
-             if (SemHeldTooManySeconds < 1)
-                SemHeldTooManySeconds = 10;
-           }
-           else
+  Int32 lockedTimeInSecs = 0;
+  timespec ts;
+  if (SemHeldTooManySeconds < 0)
+  {
+     // call getenv once per process
+     char *shtms = getenv("MXSSMP_SEM_RELEASE_SECONDS");
+     if (shtms)
+     {
+        SemHeldTooManySeconds = str_atoi(shtms, str_len(shtms));
+        if (SemHeldTooManySeconds < 1)
            SemHeldTooManySeconds = 10;
-        }
-
-        timespec ts;
-        clock_gettime(CLOCK_REALTIME, &ts);
-        if ((ts.tv_sec - lockingTimestamp_.tv_sec) >= SemHeldTooManySeconds)
-           cleanupSemaphore  = TRUE;
-    }
-    else
-        cleanupSemaphore = TRUE;
-    if (cleanupSemaphore)
-    {
+     }
+     else
+        SemHeldTooManySeconds = 10;
+     char *dumpRmsDeadLockProcessEnv = getenv("RMS_DEADLOCK_PROCESS_DUMP");
+     if ((dumpRmsDeadLockProcessEnv != NULL) &&
+         (strcmp(dumpRmsDeadLockProcessEnv, "N") == 0))
+           dumpRmsDeadLockProcess = false;
+  }
+  clock_gettime(CLOCK_REALTIME, &ts);
+  lockedTimeInSecs = ts.tv_sec - lockingTimestamp_.tv_sec;
+  if (checkForSemaphoreHolders) 
+  { 
+     if (lockedTimeInSecs >= SemHeldTooManySeconds)
+        cleanupSemaphore  = TRUE;
+  }
+  else
+     cleanupSemaphore = TRUE;
+  if (cleanupSemaphore) 
+  {
       NAProcessHandle myPhandle;
       myPhandle.getmine();
       myPhandle.decompose();
@@ -398,6 +410,16 @@ void StatsGlobals::cleanupDanglingSemaphore(NABoolean checkForSemaphoreHolders)
       }
       else
       {
+         char msg[256];
+         if (semPid_ != -1 && lockedTimeInSecs >= SemHeldTooManySeconds) {
+            snprintf(msg, sizeof(msg), "Pid %d, %d held semaphore for more than %d seconds ",
+                      cpu_, semPid_, lockedTimeInSecs); 
+            SQLMXLoggingArea::logExecRtInfo(__FILE__, __LINE__, msg, 0);
+            if (dumpRmsDeadLockProcess)
+                msg_mon_dump_process_id(NULL, cpu_, semPid_, coreFile);
+         }
+         
+/*
         // Is this the same incarnation of the process name?
         // Do not be fooled by pid recycle.
         MS_Mon_Process_Info_Type processInfo;
@@ -411,6 +433,7 @@ void StatsGlobals::cleanupDanglingSemaphore(NABoolean checkForSemaphoreHolders)
           seabedPidRecycle_ = true;
           semHoldingProcessExists = false;
         }
+*/
       }
       if (!semHoldingProcessExists)
       {
@@ -1483,16 +1506,22 @@ void StatsGlobals::cleanup_SQL(
 
   removeProcess(pidToCleanup);
   releaseStatsSemaphore(semId, myPid);
+  logProcessDeath(cpu_, pidToCleanup, "Clean dangling semaphore");
 }
 
 void StatsGlobals::verifyAndCleanup(pid_t pidThatDied, SB_Int64_Type seqNum)
 {
+  bool processRemoved = false;
   int error = getStatsSemaphore(ssmpProcSemId_, getSsmpPid());
   if (statsArray_ && 
       (statsArray_[pidThatDied].processId_ == pidThatDied) &&
-      (statsArray_[pidThatDied].phandleSeqNum_ == seqNum))
-    removeProcess(pidThatDied);
+      (statsArray_[pidThatDied].phandleSeqNum_ == seqNum)) {
+     removeProcess(pidThatDied);
+     processRemoved = true;
+  }
   releaseStatsSemaphore(ssmpProcSemId_, getSsmpPid());
+  if (processRemoved)
+     logProcessDeath(cpu_, pidThatDied, "Received death message");
 }
 
 void StatsGlobals::updateMemStats(pid_t pid, 
@@ -1631,6 +1660,16 @@ NABoolean StatsGlobals::getInitError(pid_t pid, NABoolean &reportError)
    return retcode; 
 }
 
+void StatsGlobals::logProcessDeath(short cpu, pid_t pid, const char *reason)
+{
+   char msg[256];
+
+   snprintf(msg, sizeof(msg),
+        "Pid %d,%d de-registered from shared segment. Reason: %s ", cpu, pid, reason);
+   SQLMXLoggingArea::logExecRtInfo(__FILE__, __LINE__, msg, 0);
+   return;
+}
+
 short getMasterCpu(char *uniqueStmtId, Lng32 uniqueStmtIdLen, char *nodeName, short maxLen, short &cpu)
 {
   Int32 nodeNumber = 0;

http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/runtimestats/SqlStats.h
----------------------------------------------------------------------
diff --git a/core/sql/runtimestats/SqlStats.h b/core/sql/runtimestats/SqlStats.h
index 1cfc18f..77721ec 100644
--- a/core/sql/runtimestats/SqlStats.h
+++ b/core/sql/runtimestats/SqlStats.h
@@ -329,7 +329,7 @@ class StatsGlobals
 public:
   StatsGlobals(void *baseAddr, short envType, Lng32 maxSegSize);
   static void* operator new (size_t size, void* loc = 0);
-  void addProcess(pid_t pid, NAHeap *heap);
+  bool addProcess(pid_t pid, NAHeap *heap);
   void removeProcess(pid_t pid, NABoolean calledDuringAdd = FALSE);
   ProcessStats *checkProcess(pid_t pid);
   void setStatsArea(pid_t pid, ExStatisticsArea *stats)
@@ -346,6 +346,8 @@ public:
       return NULL;
   }
 
+  void logProcessDeath(short cpu, pid_t pid, const char *reason);
+
   StmtStats *addQuery(pid_t pid, char *queryId, Lng32 queryIdLen, void *backRef,
                      Lng32 fragId, char *sourceStr = NULL, Lng32 sourceStrLen = 0,
                      NABoolean isMaster = FALSE);

http://git-wip-us.apache.org/repos/asf/trafodion/blob/6a25d5c1/core/sql/runtimestats/sscpipc.cpp
----------------------------------------------------------------------
diff --git a/core/sql/runtimestats/sscpipc.cpp b/core/sql/runtimestats/sscpipc.cpp
index ccb8dbb..8ec84ef 100755
--- a/core/sql/runtimestats/sscpipc.cpp
+++ b/core/sql/runtimestats/sscpipc.cpp
@@ -93,7 +93,9 @@ SscpGlobals::SscpGlobals(NAHeap *sscpheap, StatsGlobals *statsGlobals)
        NAHeap("Process Stats Heap", statsGlobals_->getStatsHeap(),
                8192,
                0);
-  statsGlobals_->addProcess(myPin_, statsHeap);
+  bool reincarnated = statsGlobals_->addProcess(myPin_, statsHeap);
+  if (reincarnated)
+     statsGlobals_->logProcessDeath(myCpu, myPin_, "Process reincarnated before RIP");
 
   statsGlobals_->releaseStatsSemaphore(semId_, myPin_);
   CliGlobals *cliGlobals = GetCliGlobals();


[2/2] trafodion git commit: Merge PR 1735 [TRAFODION-3225] Obscure cores seen in RMS and logger related code when Trafodion is stressed

Posted by se...@apache.org.
Merge PR 1735 [TRAFODION-3225] Obscure cores seen in RMS and logger related code when Trafodion is stressed 


Project: http://git-wip-us.apache.org/repos/asf/trafodion/repo
Commit: http://git-wip-us.apache.org/repos/asf/trafodion/commit/d83b97ef
Tree: http://git-wip-us.apache.org/repos/asf/trafodion/tree/d83b97ef
Diff: http://git-wip-us.apache.org/repos/asf/trafodion/diff/d83b97ef

Branch: refs/heads/master
Commit: d83b97efe015245f427255fe11bcec6edc304d87
Parents: 06b6486 6a25d5c
Author: selvaganesang <se...@apache.org>
Authored: Wed Oct 31 16:36:08 2018 +0000
Committer: selvaganesang <se...@apache.org>
Committed: Wed Oct 31 16:36:08 2018 +0000

----------------------------------------------------------------------
 core/sql/bin/SqlciMain.cpp            |   1 +
 core/sql/cli/Globals.cpp              |   6 +-
 core/sql/executor/ex_esp_frag_dir.cpp |   6 +-
 core/sql/qmscommon/QRLogger.cpp       |   6 +-
 core/sql/qmscommon/QRLogger.h         |   2 +
 core/sql/runtimestats/SqlStats.cpp    | 113 +++++++++++++++++++----------
 core/sql/runtimestats/SqlStats.h      |   4 +-
 core/sql/runtimestats/sscpipc.cpp     |   4 +-
 8 files changed, 99 insertions(+), 43 deletions(-)
----------------------------------------------------------------------