Posted to commits@spot.apache.org by ev...@apache.org on 2017/01/25 18:36:13 UTC

[01/49] incubator-spot git commit: Edited DNSWordCreation.scala, DomainProcessor.scala, DNSSuspiciousConnectsModel.scala, and SuspiciousConnectsArgumentParser.scala to allow a variable user domain designation. More work to do in order to have this information come in from spot.conf. [Forced Update!]

Repository: incubator-spot
Updated Branches:
  refs/heads/master 78387cf13 -> d30337d42 (forced update)


 Edited DNSWordCreation.scala, DomainProcessor.scala, DNSSuspiciousConnectsModel.scala, and SuspiciousConnectsArgumentParser.scala to allow a variable user domain designation. More work to do in order to have this information come in from spot.conf.
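
For reference, a minimal self-contained sketch (not part of this commit) of the classification rule being parameterized here, following the DomainProcessor.scala hunk below: the hard-coded "intel" check becomes a userDomain argument.

    // Illustrative sketch only; names mirror the diff below.
    object DomainClassifierSketch {
      // 2 = the spot user's own domain, 1 = a known top domain, 0 = everything else
      def topDomainClass(domain: String, topDomains: Set[String], userDomain: String): Int =
        if (domain == userDomain) 2
        else if (topDomains contains domain) 1
        else 0

      def main(args: Array[String]): Unit = {
        val topDomains = Set("google", "facebook")
        println(topDomainClass("intel", topDomains, userDomain = "intel"))   // 2
        println(topDomainClass("google", topDomains, userDomain = "intel"))  // 1
        println(topDomainClass("example", topDomains, userDomain = "intel")) // 0
      }
    }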


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/bc5744f6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/bc5744f6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/bc5744f6

Branch: refs/heads/master
Commit: bc5744f6f5e81521a3905ae072177218fdd6b05c
Parents: 83d157a
Author: Brandon Edwards <br...@intel.com>
Authored: Mon Dec 5 14:59:38 2016 -0800
Committer: Brandon Edwards <br...@intel.com>
Committed: Tue Dec 6 21:50:34 2016 -0800

----------------------------------------------------------------------
 .../spot/SuspiciousConnectsArgumentParser.scala | 25 ++++++++++++++++++++
 .../org/apache/spot/dns/DNSWordCreation.scala   |  6 +++--
 .../dns/model/DNSSuspiciousConnectsModel.scala  |  9 ++++++-
 .../apache/spot/utilities/DomainProcessor.scala |  5 ++--
 4 files changed, 40 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/bc5744f6/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
index 33186c5..632e0d8 100644
--- a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
@@ -11,6 +11,11 @@ object SuspiciousConnectsArgumentParser {
                                       feedbackFile: String = "",
                                       duplicationFactor: Int = 1,
                                       topicCount: Int = 20,
+                                      localPath: String = "",
+                                      localUser: String = "",
+                                      userDomain: String = "",
+                                      ldaPath: String = "",
+                                      nodes: String = "",
                                       hdfsScoredConnect: String = "",
                                       threshold: Double = 1.0d,
                                       maxResults: Int = -1,
@@ -45,6 +50,26 @@ object SuspiciousConnectsArgumentParser {
       action((x, c) => c.copy(topicCount = x.toInt)).
       text("topic count")
 
+    opt[String]("lpath").required().valueName("<local path>").
+      action((x, c) => c.copy(localPath = x)).
+      text("Local Path")
+
+    opt[String]("ldapath").required().valueName("<local path>").
+      action((x, c) => c.copy(ldaPath = x)).
+      text("LDA Path")
+
+    opt[String]("luser").required().valueName("<local path>").
+      action((x, c) => c.copy(localUser = x)).
+      text("Local user path")
+
+    opt[String]("userDomain").required().valueName("<user domain>").
+      action((x, c) => c.copy(userDomain = x)).
+      text("Domain of spot user (example: intel)")
+
+    opt[String]("nodes").required().valueName("<input param>").
+      action((x, c) => c.copy(nodes = x)).
+      text("Node list")
+
     opt[String]("scored").required().valueName("<hdfs path>").
       action((x, c) => c.copy(hdfsScoredConnect = x)).
       text("HDFS path for results")

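A usage sketch, assuming the scopt 3.x parser above: options are accumulated into a copy-updated case class, and parse returns Some(config) only when all required options are present. MiniConfig and the argument values here are hypothetical placeholders.

    import scopt.OptionParser

    case class MiniConfig(userDomain: String = "", nodes: String = "")

    object ParserSketch {
      def main(args: Array[String]): Unit = {
        val parser = new OptionParser[MiniConfig]("SuspiciousConnects") {
          opt[String]("userDomain").required().valueName("<user domain>").
            action((x, c) => c.copy(userDomain = x)).
            text("Domain of spot user (example: intel)")

          opt[String]("nodes").required().valueName("<input param>").
            action((x, c) => c.copy(nodes = x)).
            text("Node list")
        }

        // parse returns None (and prints usage) if a required option is missing
        parser.parse(Seq("--userDomain", "intel", "--nodes", "node1,node2"), MiniConfig())
          .foreach(config => println(s"user domain = ${config.userDomain}"))
      }
    }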
http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/bc5744f6/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala b/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala
index 2ffe12e..e4595e1 100644
--- a/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala
@@ -15,13 +15,15 @@ import org.apache.spot.utilities.Quantiles
   * @param entropyCuts
   * @param numberPeriodsCuts
   * @param topDomainsBC
+  * @param userDomain
   */
 class DNSWordCreation(frameLengthCuts: Array[Double],
                       timeCuts: Array[Double],
                       subdomainLengthCuts: Array[Double],
                       entropyCuts: Array[Double],
                       numberPeriodsCuts: Array[Double],
-                      topDomainsBC: Broadcast[Set[String]]) extends Serializable {
+                      topDomainsBC: Broadcast[Set[String]],
+                      userDomain: String) extends Serializable {
 
 
   /**
@@ -79,7 +81,7 @@ class DNSWordCreation(frameLengthCuts: Array[Double],
 
 
     val DomainInfo(domain, topDomain, subdomain, subdomainLength, subdomainEntropy, numPeriods) =
-      extractDomainInfo(queryName, topDomainsBC)
+      extractDomainInfo(queryName, topDomainsBC, userDomain)
 
     Seq(topDomain,
       Quantiles.bin(frameLength.toDouble, frameLengthCuts),

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/bc5744f6/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala
index 81c214f..953e1ec 100644
--- a/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala
@@ -158,6 +158,7 @@ object DNSSuspiciousConnectsModel {
 
     val countryCodesBC = sparkContext.broadcast(CountryCodes.CountryCodes)
     val topDomainsBC = sparkContext.broadcast(TopDomains.TopDomains)
+    val userDomain = config.userDomain
 
     // create quantile cut-offs
 
@@ -180,7 +181,13 @@ object DNSSuspiciousConnectsModel {
 
     // simplify DNS log entries into "words"
 
-    val dnsWordCreator = new DNSWordCreation(frameLengthCuts, timeCuts, subdomainLengthCuts, entropyCuts, numberPeriodsCuts, topDomainsBC)
+    val dnsWordCreator = new DNSWordCreation(frameLengthCuts,
+                                             timeCuts,
+                                             subdomainLengthCuts,
+                                             entropyCuts,
+                                             numberPeriodsCuts,
+                                             topDomainsBC,
+                                             userDomain)
 
     val dataWithWordDF = totalDataDF.withColumn(Word, dnsWordCreator.wordCreationUDF(modelColumns: _*))
 
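Quantiles.bin above is this repository's utility; as a rough illustration of quantile binning (assumed semantics, not the repo's exact implementation), a word component is the index of the first cut-off the value does not exceed:

    object QuantileBinSketch {
      // Assumed behavior: index of the first cut-off >= value, or cuts.length
      // when the value is above every cut-off.
      def bin(value: Double, cuts: Array[Double]): Double =
        cuts.indexWhere(value <= _) match {
          case -1 => cuts.length.toDouble
          case i  => i.toDouble
        }

      def main(args: Array[String]): Unit = {
        val frameLengthCuts = Array(20.0, 40.0, 60.0, 80.0)
        println(bin(35.0, frameLengthCuts))  // 1.0
        println(bin(999.0, frameLengthCuts)) // 4.0
      }
    }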

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/bc5744f6/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala b/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala
index 28595e4..334ae87 100644
--- a/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala
@@ -48,9 +48,10 @@ object DomainProcessor extends Serializable {
     * Extract domain info from a url.
     * @param url Incoming url.
     * @param topDomainsBC Broadcast variable containing the top domains set.
+    * @param userDomain Domain of the spot user (example:'intel').
     * @return New [[DomainInfo]] object containing extracted domain information.
     */
-  def extractDomainInfo(url: String, topDomainsBC: Broadcast[Set[String]]): DomainInfo = {
+  def extractDomainInfo(url: String, topDomainsBC: Broadcast[Set[String]], userDomain: String): DomainInfo = {
 
     val spliturl = url.split('.')
     val numParts = spliturl.length
@@ -64,7 +65,7 @@ object DomainProcessor extends Serializable {
       0
     }
 
-    val topDomainClass = if (domain == "intel") {
+    val topDomainClass = if (domain == userDomain) {
       2
     } else if (topDomainsBC.value contains domain) {
       1


[24/49] incubator-spot git commit: unit_test_cleanup

Posted by ev...@apache.org.
unit_test_cleanup

Added DNS and proxy unit tests

Got rid of Spark INFO logging spew

Stopped parallel test runs in sbt to avoid test errors (multiple suites each start a SparkContext; see the sketch below)
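
For background, a minimal sketch of the fixture pattern involved (hypothetical trait; TestingSparkContextFlatSpec in this repo plays a similar role): each suite owns one SparkContext, and since two live contexts in a single JVM conflict, sbt must run the suites one at a time.

    import org.apache.spark.{SparkConf, SparkContext}
    import org.scalatest.{BeforeAndAfterAll, FlatSpec}

    // One SparkContext per suite, torn down after the tests run. With
    // parallelExecution in Test := false, suites execute sequentially,
    // so two contexts never coexist in the JVM.
    trait SparkTestFixture extends FlatSpec with BeforeAndAfterAll {
      var sparkContext: SparkContext = _

      override def beforeAll(): Unit = {
        sparkContext = new SparkContext(
          new SparkConf().setMaster("local[2]").setAppName("unit-test"))
      }

      override def afterAll(): Unit = {
        sparkContext.stop()
      }
    }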


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/991fd0ef
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/991fd0ef
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/991fd0ef

Branch: refs/heads/master
Commit: 991fd0ef5e51dbaf0a9eb55b8aac6d3567f3e9a9
Parents: 0b1d46e
Author: nlsegerl <na...@intel.com>
Authored: Tue Dec 20 16:23:25 2016 -0800
Committer: nlsegerl <na...@intel.com>
Committed: Tue Dec 20 16:23:25 2016 -0800

----------------------------------------------------------------------
 spot-ml/build.sbt                               |   2 +
 .../apache/spot/lda/SpotLDAWrapperSchema.scala  |   6 +
 spot-ml/src/test/resources/log4j.properties     |   8 +
 .../org/apache/spot/SpotLDAWrapperTest.scala    | 172 +++++++++----------
 .../dns/DNSSuspiciousConnectsAnalysisTest.scala |   9 +-
 .../ProxySuspiciousConnectsAnalysisTest.scala   |  12 +-
 6 files changed, 103 insertions(+), 106 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/991fd0ef/spot-ml/build.sbt
----------------------------------------------------------------------
diff --git a/spot-ml/build.sbt b/spot-ml/build.sbt
index 4ed5884..62983d2 100644
--- a/spot-ml/build.sbt
+++ b/spot-ml/build.sbt
@@ -31,3 +31,5 @@ mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => {
 }
 }
 
+// super important with multiple tests running spark Contexts
+parallelExecution in Test := false
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/991fd0ef/spot-ml/src/main/scala/org/apache/spot/lda/SpotLDAWrapperSchema.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/lda/SpotLDAWrapperSchema.scala b/spot-ml/src/main/scala/org/apache/spot/lda/SpotLDAWrapperSchema.scala
index d5412f2..7b8da28 100644
--- a/spot-ml/src/main/scala/org/apache/spot/lda/SpotLDAWrapperSchema.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/lda/SpotLDAWrapperSchema.scala
@@ -1,10 +1,16 @@
 package org.apache.spot.lda
 
+import org.apache.spark.sql.types.{LongType, StringType, StructField}
+
 object SpotLDAWrapperSchema {
 
   // modelDF columns
   val DocumentName = "document_name"
+  val DocumentNameField = StructField(DocumentName, StringType)
+
   val DocumentNumber = "document_number"
+  val DocumentNumberField = StructField(DocumentNumber, LongType)
+
   val DocumentCount = "document_count"
   val DocumentNameWordNameWordCount = "document_word_count"
 
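A small standalone sketch (Spark 1.x API, hypothetical object name) of what the new StructFields enable: the updated tests below build the document dictionary with sqlContext.createDataFrame and an explicit schema instead of relying on toDF and its implicits.

    import org.apache.spark.SparkContext
    import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
    import org.apache.spark.sql.{Row, SQLContext}

    object SchemaSketch {
      val DocumentNameField = StructField("document_name", StringType)
      val DocumentNumberField = StructField("document_number", LongType)

      def main(args: Array[String]): Unit = {
        val sc = new SparkContext("local", "schema-sketch")
        val sqlContext = new SQLContext(sc)

        // zipWithIndex yields Long indices, matching DocumentNumberField's LongType
        val rows = sc.parallelize(Seq("192.168.1.1", "10.10.98.123"))
          .distinct
          .zipWithIndex
          .map { case (doc, idx) => Row(doc, idx) }

        val documentDictionary = sqlContext.createDataFrame(
          rows, StructType(List(DocumentNameField, DocumentNumberField)))

        documentDictionary.show()
        sc.stop()
      }
    }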

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/991fd0ef/spot-ml/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/resources/log4j.properties b/spot-ml/src/test/resources/log4j.properties
new file mode 100644
index 0000000..3fd5d2b
--- /dev/null
+++ b/spot-ml/src/test/resources/log4j.properties
@@ -0,0 +1,8 @@
+# Change this to set Spark log level
+log4j.logger.org.apache.spark=WARN
+
+# Silence akka remoting
+log4j.logger.Remoting=WARN
+
+# Ignore messages below warning level from Jetty, because it's a bit verbose
+log4j.logger.org.eclipse.jetty=WARN
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/991fd0ef/spot-ml/src/test/scala/org/apache/spot/SpotLDAWrapperTest.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/scala/org/apache/spot/SpotLDAWrapperTest.scala b/spot-ml/src/test/scala/org/apache/spot/SpotLDAWrapperTest.scala
index 70fb89c..d5c4089 100644
--- a/spot-ml/src/test/scala/org/apache/spot/SpotLDAWrapperTest.scala
+++ b/spot-ml/src/test/scala/org/apache/spot/SpotLDAWrapperTest.scala
@@ -3,7 +3,8 @@ package org.apache.spot
 import org.apache.log4j.{Level, LogManager}
 import org.apache.spark.mllib.linalg.{Matrices, Vector, Vectors}
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, SQLContext}
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.{DataFrame, Row, SQLContext}
 import org.apache.spot.lda.SpotLDAWrapperSchema._
 import org.apache.spot.lda.SpotLDAWrapper
 import org.apache.spot.lda.SpotLDAWrapper._
@@ -14,127 +15,114 @@ import scala.collection.immutable.Map
 
 class SpotLDAWrapperTest extends TestingSparkContextFlatSpec with Matchers {
 
-  val ldaAlpha = 1.02
-  val ldaBeta = 1.001
-  val ldaMaxiterations = 20
+    val ldaAlpha = 1.02
+    val ldaBeta = 1.001
+    val ldaMaxiterations = 20
 
-  "SparkLDA" should "handle an extremely unbalanced two word doc" in {
-    val logger = LogManager.getLogger("SuspiciousConnectsAnalysis")
-    logger.setLevel(Level.INFO)
-    val testSqlContext = new org.apache.spark.sql.SQLContext(sparkContext)
+    "SparkLDA" should "handle an extremely unbalanced two word doc" in {
+      val logger = LogManager.getLogger("SuspiciousConnectsAnalysis")
+      logger.setLevel(Level.WARN)
 
-    val catFancy = SpotLDAInput("pets", "cat", 1)
-    val dogWorld = SpotLDAInput("pets", "dog", 999)
+      val catFancy = SpotLDAInput("pets", "cat", 1)
+      val dogWorld = SpotLDAInput("pets", "dog", 999)
 
-    val data = sparkContext.parallelize(Seq(catFancy, dogWorld))
-    val out = SpotLDAWrapper.runLDA(sparkContext, testSqlContext, data, 2, logger, Some(0xdeadbeef), ldaAlpha, ldaBeta, ldaMaxiterations)
+      val data = sparkContext.parallelize(Seq(catFancy, dogWorld))
+      val out = SpotLDAWrapper.runLDA(sparkContext, sqlContext, data, 2, logger, Some(0xdeadbeef), ldaAlpha, ldaBeta, ldaMaxiterations)
 
-    val topicMixDF = out.docToTopicMix
+      val topicMixDF = out.docToTopicMix
 
-    var topicMix =
-      topicMixDF.filter(topicMixDF(DocumentName) === "pets").select(TopicProbabilityMix).first().toSeq(0).asInstanceOf[Seq[Double]].toArray
-    val catTopics = out.wordResults("cat")
-    val dogTopics = out.wordResults("dog")
+      var topicMix =
+        topicMixDF.filter(topicMixDF(DocumentName) === "pets").select(TopicProbabilityMix).first().toSeq(0).asInstanceOf[Seq[Double]].toArray
+      val catTopics = out.wordResults("cat")
+      val dogTopics = out.wordResults("dog")
 
-    Math.abs(topicMix(0) * catTopics(0) + topicMix(1) * catTopics(1)) should be < 0.01
-    Math.abs(0.999 - (topicMix(0) * dogTopics(0) + topicMix(1) * dogTopics(1))) should be < 0.01
-  }
-
-  "SparkLDA" should "handle distinct docs on distinct words" in {
-    val logger = LogManager.getLogger("SuspiciousConnectsAnalysis")
-    logger.setLevel(Level.INFO)
-    val testSqlContext = new org.apache.spark.sql.SQLContext(sparkContext)
+      Math.abs(topicMix(0) * catTopics(0) + topicMix(1) * catTopics(1)) should be < 0.01
+      Math.abs(0.999 - (topicMix(0) * dogTopics(0) + topicMix(1) * dogTopics(1))) should be < 0.01
+    }
 
-    val catFancy = SpotLDAInput("cat fancy", "cat", 1)
-    val dogWorld = SpotLDAInput("dog world", "dog", 1)
+    "SparkLDA" should "handle distinct docs on distinct words" in {
+      val logger = LogManager.getLogger("SuspiciousConnectsAnalysis")
+      logger.setLevel(Level.WARN)
+      val catFancy = SpotLDAInput("cat fancy", "cat", 1)
+      val dogWorld = SpotLDAInput("dog world", "dog", 1)
 
-    val data = sparkContext.parallelize(Seq(catFancy, dogWorld))
-    val out = SpotLDAWrapper.runLDA(sparkContext, testSqlContext, data, 2, logger, Some(0xdeadbeef), ldaAlpha, ldaBeta, ldaMaxiterations)
+      val data = sparkContext.parallelize(Seq(catFancy, dogWorld))
+      val out = SpotLDAWrapper.runLDA(sparkContext, sqlContext, data, 2, logger, Some(0xdeadbeef), ldaAlpha, ldaBeta, ldaMaxiterations)
 
-    val topicMixDF = out.docToTopicMix
-    var dogTopicMix: Array[Double] =
-      topicMixDF.filter(topicMixDF(DocumentName) === "dog world").select(TopicProbabilityMix).first().toSeq(0).asInstanceOf[Seq[Double]].toArray
+      val topicMixDF = out.docToTopicMix
+      var dogTopicMix: Array[Double] =
+        topicMixDF.filter(topicMixDF(DocumentName) === "dog world").select(TopicProbabilityMix).first().toSeq(0).asInstanceOf[Seq[Double]].toArray
 
-    val catTopicMix: Array[Double] =
-      topicMixDF.filter(topicMixDF(DocumentName) === "cat fancy").select(TopicProbabilityMix).first().toSeq(0).asInstanceOf[Seq[Double]].toArray
+      val catTopicMix: Array[Double] =
+        topicMixDF.filter(topicMixDF(DocumentName) === "cat fancy").select(TopicProbabilityMix).first().toSeq(0).asInstanceOf[Seq[Double]].toArray
 
-    val catTopics = out.wordResults("cat")
-    val dogTopics = out.wordResults("dog")
+      val catTopics = out.wordResults("cat")
+      val dogTopics = out.wordResults("dog")
 
-    Math.abs(1 - (catTopicMix(0) * catTopics(0) + catTopicMix(1) * catTopics(1))) should be < 0.01
-    Math.abs(1 - (dogTopicMix(0) * dogTopics(0) + dogTopicMix(1) * dogTopics(1))) should be < 0.01
-  }
+      Math.abs(1 - (catTopicMix(0) * catTopics(0) + catTopicMix(1) * catTopics(1))) should be < 0.01
+      Math.abs(1 - (dogTopicMix(0) * dogTopics(0) + dogTopicMix(1) * dogTopics(1))) should be < 0.01
+    }
 
-  "formatSparkLDAInput" should "return input in RDD[(Long, Vector)] (collected as Array for testing) format. The index " +
-    "is the docID, values are the vectors of word occurrences in that doc" in {
+    "formatSparkLDAInput" should "return input in RDD[(Long, Vector)] (collected as Array for testing) format. The index " +
+      "is the docID, values are the vectors of word occurrences in that doc" in {
 
-    val sqlContext = new SQLContext(sparkContext)
-    import sqlContext.implicits._
 
-    val documentWordData = sparkContext.parallelize(Array(SpotLDAInput("192.168.1.1", "333333_7.0_0.0_1.0", 8),
-      SpotLDAInput("10.10.98.123", "1111111_6.0_3.0_5.0", 4),
-      SpotLDAInput("66.23.45.11", "-1_43_7.0_2.0_6.0", 2),
-      SpotLDAInput("192.168.1.1", "-1_80_6.0_1.0_1.0", 5)))
+      val documentWordData = sparkContext.parallelize(Seq(SpotLDAInput("192.168.1.1", "333333_7.0_0.0_1.0", 8),
+        SpotLDAInput("10.10.98.123", "1111111_6.0_3.0_5.0", 4),
+        SpotLDAInput("66.23.45.11", "-1_43_7.0_2.0_6.0", 2),
+        SpotLDAInput("192.168.1.1", "-1_80_6.0_1.0_1.0", 5)))
 
-    val wordDictionary = Map("333333_7.0_0.0_1.0" -> 0, "1111111_6.0_3.0_5.0" -> 1, "-1_43_7.0_2.0_6.0" -> 2, "-1_80_6.0_1.0_1.0" -> 3)
+      val wordDictionary = Map("333333_7.0_0.0_1.0" -> 0, "1111111_6.0_3.0_5.0" -> 1, "-1_43_7.0_2.0_6.0" -> 2, "-1_80_6.0_1.0_1.0" -> 3)
 
-    val documentDictionary: DataFrame = {
-      documentWordData
-        .map({ case SpotLDAInput(doc, word, count) => doc })
-        .distinct
-        .zipWithIndex
-        .toDF(DocumentName, DocumentNumber)
-    }
+      val documentDictionary: DataFrame = sqlContext.createDataFrame(documentWordData
+          .map({ case SpotLDAInput(doc, word, count) => doc })
+          .distinct
+          .zipWithIndex.map({case (d,c) => Row(d,c)}), StructType(List(DocumentNameField, DocumentNumberField)))
 
-    val sparkLDAInput: RDD[(Long, Vector)] = SpotLDAWrapper.formatSparkLDAInput(documentWordData, documentDictionary, wordDictionary, sqlContext)
-    val sparkLDAInArr: Array[(Long, Vector)] = sparkLDAInput.collect()
 
-    sparkLDAInArr shouldBe Array((0, Vectors.sparse(4, Array(0, 3), Array(8.0, 5.0))), (2, Vectors.sparse(4, Array(2), Array(2.0))), (1, Vectors.sparse(4, Array(1), Array(4.0))))
-  }
+      val sparkLDAInput: RDD[(Long, Vector)] = SpotLDAWrapper.formatSparkLDAInput(documentWordData, documentDictionary, wordDictionary, sqlContext)
+      val sparkLDAInArr: Array[(Long, Vector)] = sparkLDAInput.collect()
 
-  "formatSparkLDADocTopicOuptut" should "return RDD[(String,Array(Double))] after converting doc results from vector: " +
-    "convert docID back to string, convert vector of probabilities to array" in {
+      sparkLDAInArr shouldBe Array((0, Vectors.sparse(4, Array(0, 3), Array(8.0, 5.0))), (2, Vectors.sparse(4, Array(2), Array(2.0))), (1, Vectors.sparse(4, Array(1), Array(4.0))))
+    }
 
-    val sqlContext = new SQLContext(sparkContext)
-    import sqlContext.implicits._
+    "formatSparkLDADocTopicOuptut" should "return RDD[(String,Array(Double))] after converting doc results from vector: " +
+      "convert docID back to string, convert vector of probabilities to array" in {
 
-    val documentWordData = sparkContext.parallelize(Array(SpotLDAInput("192.168.1.1", "333333_7.0_0.0_1.0", 8),
-      SpotLDAInput("10.10.98.123", "1111111_6.0_3.0_5.0", 4),
-      SpotLDAInput("66.23.45.11", "-1_43_7.0_2.0_6.0", 2),
-      SpotLDAInput("192.168.1.1", "-1_80_6.0_1.0_1.0", 5)))
 
-    val documentDictionary: DataFrame = {
-      documentWordData
-        .map({ case SpotLDAInput(doc, word, count) => doc })
-        .distinct
-        .zipWithIndex
-        .toDF(DocumentName, DocumentNumber)
-    }
+      val documentWordData = sparkContext.parallelize(Seq(SpotLDAInput("192.168.1.1", "333333_7.0_0.0_1.0", 8),
+        SpotLDAInput("10.10.98.123", "1111111_6.0_3.0_5.0", 4),
+        SpotLDAInput("66.23.45.11", "-1_43_7.0_2.0_6.0", 2),
+        SpotLDAInput("192.168.1.1", "-1_80_6.0_1.0_1.0", 5)))
 
-    val docTopicDist: RDD[(Long, Vector)] = sparkContext.parallelize(Array((0.toLong, Vectors.dense(0.15, 0.3, 0.5, 0.05)), (1.toLong,
-      Vectors.dense(0.25, 0.15, 0.4, 0.2)), (2.toLong, Vectors.dense(0.4, 0.1, 0.3, 0.2))))
+      val documentDictionary: DataFrame = sqlContext.createDataFrame(documentWordData
+          .map({ case SpotLDAInput(doc, word, count) => doc })
+          .distinct
+          .zipWithIndex.map({case (d,c) => Row(d,c)}), StructType(List(DocumentNameField, DocumentNumberField)))
 
-    val sparkDocRes: DataFrame = formatSparkLDADocTopicOutput(docTopicDist, documentDictionary, sqlContext)
+      val docTopicDist: RDD[(Long, Vector)] = sparkContext.parallelize(Array((0.toLong, Vectors.dense(0.15, 0.3, 0.5, 0.05)), (1.toLong,
+        Vectors.dense(0.25, 0.15, 0.4, 0.2)), (2.toLong, Vectors.dense(0.4, 0.1, 0.3, 0.2))))
 
-    val documents = sparkDocRes.select(DocumentName).map(documentName => documentName.toString.replaceAll("\\[", "").replaceAll("\\]", "")).collect()
+      val sparkDocRes: DataFrame = formatSparkLDADocTopicOutput(docTopicDist, documentDictionary, sqlContext)
 
-    documents(0) should be("10.10.98.123")
-    documents(1) should be("192.168.1.1")
-    documents(2) should be("66.23.45.11")
-  }
+      val documents = sparkDocRes.select(DocumentName).map(documentName => documentName.toString.replaceAll("\\[", "").replaceAll("\\]", "")).collect()
 
-  "formatSparkLDAWordOutput" should "return Map[Int,String] after converting word matrix to sequence, wordIDs back to strings, and sequence of probabilities to array" in {
-    val testMat = Matrices.dense(4, 4, Array(0.5, 0.2, 0.05, 0.25, 0.25, 0.1, 0.15, 0.5, 0.1, 0.4, 0.25, 0.25, 0.7, 0.2, 0.02, 0.08))
+      documents(0) should be("10.10.98.123")
+      documents(1) should be("192.168.1.1")
+      documents(2) should be("66.23.45.11")
+    }
 
-    val wordDictionary = Map("-1_23.0_7.0_7.0_4.0" -> 3, "23.0_7.0_7.0_4.0" -> 0, "333333.0_7.0_7.0_4.0" -> 2, "80.0_7.0_7.0_4.0" -> 1)
-    val revWordMap: Map[Int, String] = wordDictionary.map(_.swap)
+    "formatSparkLDAWordOutput" should "return Map[Int,String] after converting word matrix to sequence, wordIDs back to strings, and sequence of probabilities to array" in {
+      val testMat = Matrices.dense(4, 4, Array(0.5, 0.2, 0.05, 0.25, 0.25, 0.1, 0.15, 0.5, 0.1, 0.4, 0.25, 0.25, 0.7, 0.2, 0.02, 0.08))
 
-    val sparkWordRes = formatSparkLDAWordOutput(testMat, revWordMap)
+      val wordDictionary = Map("-1_23.0_7.0_7.0_4.0" -> 3, "23.0_7.0_7.0_4.0" -> 0, "333333.0_7.0_7.0_4.0" -> 2, "80.0_7.0_7.0_4.0" -> 1)
+      val revWordMap: Map[Int, String] = wordDictionary.map(_.swap)
 
-    sparkWordRes should contain key ("23.0_7.0_7.0_4.0")
-    sparkWordRes should contain key ("80.0_7.0_7.0_4.0")
-    sparkWordRes should contain key ("333333.0_7.0_7.0_4.0")
-    sparkWordRes should contain key ("-1_23.0_7.0_7.0_4.0")
-  }
+      val sparkWordRes = formatSparkLDAWordOutput(testMat, revWordMap)
 
+      sparkWordRes should contain key ("23.0_7.0_7.0_4.0")
+      sparkWordRes should contain key ("80.0_7.0_7.0_4.0")
+      sparkWordRes should contain key ("333333.0_7.0_7.0_4.0")
+      sparkWordRes should contain key ("-1_23.0_7.0_7.0_4.0")
+    }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/991fd0ef/spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala b/spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala
index 138f32e..d9ec94e 100644
--- a/spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala
+++ b/spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala
@@ -28,15 +28,12 @@ class DNSSuspiciousConnectsAnalysisTest  extends TestingSparkContextFlatSpec wit
   "dns supicious connects analysis" should "estimate correct probabilities in toy data with framelength anomaly" in {
 
     val logger = LogManager.getLogger("SuspiciousConnectsAnalysis")
-    logger.setLevel(Level.INFO)
-    val testSqlContext = new org.apache.spark.sql.SQLContext(sparkContext)
+    logger.setLevel(Level.WARN)
 
     val anomalousRecord = DNSInput("May 20 2016 02:10:25.970987000 PDT",	1463735425L,	1,	"172.16.9.132",	"turner.com.122.2o7.net",	"0x00000001",	1,	0)
     val typicalRecord   = DNSInput("May 20 2016 02:10:25.970987000 PDT",	1463735425L,	168,	"172.16.9.132",	"turner.com.122.2o7.net",	"0x00000001",	1,	0)
 
-    import testSqlContext.implicits._
-
-    val data = sparkContext.parallelize(Seq(anomalousRecord, typicalRecord, typicalRecord, typicalRecord, typicalRecord)).toDF
+    val data = sqlContext.createDataFrame(Seq(anomalousRecord, typicalRecord, typicalRecord, typicalRecord, typicalRecord))
 
     val scoredData = DNSSuspiciousConnectsAnalysis.detectDNSAnomalies(data, testConfig,
       sparkContext,
@@ -54,6 +51,4 @@ class DNSSuspiciousConnectsAnalysisTest  extends TestingSparkContextFlatSpec wit
     Math.abs(typicalScores(2) - 0.8d)  should be <= 0.01d
     Math.abs(typicalScores(3) - 0.8d)  should be <= 0.01d
   }
-
-
 }

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/991fd0ef/spot-ml/src/test/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysisTest.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysisTest.scala b/spot-ml/src/test/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysisTest.scala
index 9e06d6c..d98500b 100644
--- a/spot-ml/src/test/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysisTest.scala
+++ b/spot-ml/src/test/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysisTest.scala
@@ -30,7 +30,7 @@ class ProxySuspiciousConnectsAnalysisTest extends TestingSparkContextFlatSpec wi
 
 
 
-  val testConfig = SuspiciousConnectsConfig(analysis = "proxy",
+  val testConfigProxy = SuspiciousConnectsConfig(analysis = "proxy",
     inputPath = "",
     feedbackFile = "",
     duplicationFactor = 1,
@@ -48,8 +48,7 @@ class ProxySuspiciousConnectsAnalysisTest extends TestingSparkContextFlatSpec wi
   "proxy supicious connects analysis" should "estimate correct probabilities in toy data with top domain anomaly" in {
 
     val logger = LogManager.getLogger("SuspiciousConnectsAnalysis")
-    logger.setLevel(Level.INFO)
-    val testSqlContext = new org.apache.spark.sql.SQLContext(sparkContext)
+    logger.setLevel(Level.WARN)
 
     val anomalousRecord = ProxyInput("2016-10-03",	"04:57:36", "127.0.0.1",	"intel.com",	"PUT",
       "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36",
@@ -65,12 +64,11 @@ class ProxySuspiciousConnectsAnalysisTest extends TestingSparkContextFlatSpec wi
       "-",	"127.0.0.1",	338,	647,
       "maw.bronto.com/sites/c37i4q22szvir8ga3m8mtxaft7gwnm5fio8hfxo35mu81absi1/carts/4b3a313d-50f6-4117-8ffd-4e804fd354ef/fiddle")
 
-    import testSqlContext.implicits._
 
-    val data = sparkContext.parallelize(Seq(anomalousRecord, typicalRecord, typicalRecord, typicalRecord, typicalRecord,
-      typicalRecord, typicalRecord, typicalRecord, typicalRecord, typicalRecord)).toDF
+    val data = sqlContext.createDataFrame(Seq(anomalousRecord, typicalRecord, typicalRecord, typicalRecord, typicalRecord,
+      typicalRecord, typicalRecord, typicalRecord, typicalRecord, typicalRecord))
 
-    val scoredData = ProxySuspiciousConnectsAnalysis.detectProxyAnomalies(data, testConfig,
+    val scoredData = ProxySuspiciousConnectsAnalysis.detectProxyAnomalies(data, testConfigProxy,
       sparkContext,
       sqlContext,
       logger)


[13/49] incubator-spot git commit: Merge pull request #169 from NathanSegerlind/spot

Posted by ev...@apache.org.
Merge pull request #169 from NathanSegerlind/spot

spot

Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/ab5ba53c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/ab5ba53c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/ab5ba53c

Branch: refs/heads/master
Commit: ab5ba53cfd3efc16ca7372dbbfb4fe88e56a1d8e
Parents: 40a1a38 18a6967
Author: NathanSegerlind <na...@intel.com>
Authored: Mon Dec 12 16:06:44 2016 -0800
Committer: GitHub <no...@github.com>
Committed: Mon Dec 12 16:06:44 2016 -0800

----------------------------------------------------------------------
 spot-ml/README.md | 24 +++++-------------------
 1 file changed, 5 insertions(+), 19 deletions(-)
----------------------------------------------------------------------



[32/49] incubator-spot git commit: Netflow-Storyboard: Globe View refactor

Posted by ev...@apache.org.
Netflow-Storyboard: Globe View refactor

Take advantage of ContentLoaderMixin and ChartMixin

Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/6ebae478
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/6ebae478
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/6ebae478

Branch: refs/heads/master
Commit: 6ebae478e7bf4aa81b4c3e7ae63695876b6293d8
Parents: a58345e
Author: Diego Armando Ortiz Huerta <di...@intel.com>
Authored: Tue Jan 3 12:40:43 2017 -0600
Committer: GitHub <no...@github.com>
Committed: Tue Jan 3 12:40:43 2017 -0600

----------------------------------------------------------------------
 .../flow/js/components/GlobeViewPanel.react.js  | 633 +++++++++----------
 .../ui/flow/js/constants/NetflowConstants.js    |   1 +
 spot-oa/ui/flow/js/stores/GlobeViewStore.js     |  34 +-
 spot-oa/ui/package.json                         |   1 -
 4 files changed, 344 insertions(+), 325 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/6ebae478/spot-oa/ui/flow/js/components/GlobeViewPanel.react.js
----------------------------------------------------------------------
diff --git a/spot-oa/ui/flow/js/components/GlobeViewPanel.react.js b/spot-oa/ui/flow/js/components/GlobeViewPanel.react.js
index d96bdd7..a431d25 100755
--- a/spot-oa/ui/flow/js/components/GlobeViewPanel.react.js
+++ b/spot-oa/ui/flow/js/components/GlobeViewPanel.react.js
@@ -1,300 +1,313 @@
+const $ = require('jquery');
+const d3 = require('d3');
 const React = require('react');
 const ReactDOM = require('react-dom');
-const queue = require('d3-queue');
 const GlobeViewStore = require('../stores/GlobeViewStore');
 
-var m0, o0, fill, proj, sky, path, swoosh, links, svg, width, height, arcLines;
-var dataset, container;
-
-function buildGraph(root, ipsrc) {
-    container = $(ReactDOM.findDOMNode(this));
-    container.html("");
-    dataset = root;
-
-     //Event Handlers
-    d3.select(container[0])
-       .on("mousemove", mousemove)
-       .on("mouseup", mouseup);
-
-    fill = d3.scale.ordinal()
-           .domain(d3.range(4))
-           .range(["#ffda00", "#ed1c24", "#000000", "#fdb813"]);
-
-    width = container.width();
-    height = container.height();
-
-    proj = d3.geo.orthographic()
-        .translate([width / 2, height / 2])
-        .clipAngle(90)
-        .scale(Math.round(height/2.5)); // 2.5 is a magic number for styling purposes
-
-    sky = d3.geo.orthographic()
-        .translate([width / 1.75, height / 1.75])
-        .clipAngle(90)
-        .scale(Math.round(height / 2.5));
-
-    path = d3.geo.path().projection(proj).pointRadius(2);
-
-    swoosh = d3.svg.line()
-          .x(function (d) {
-              return d[0]
-          })
-          .y(function (d) {
-              return d[1]
-          })
-          .interpolate("cardinal")
-          .tension(0);
-
-    links = [];
-    arcLines = [];
-
-    svg = d3.select(container[0]).append("svg")
-        .attr("width", width)
-        .attr("height", height)
-        .on("mousedown", mousedown);
-
-    function resizeHandler() {
-        buildGraph.call(this, root, ipsrc);
-    }
-
-    window.removeEventListener('resize', resizeHandler.bind(this));
-    window.addEventListener('resize', resizeHandler.bind(this));
-    $('svg', ReactDOM.findDOMNode(this)).off('parentUpdate').on('parentUpdate', resizeHandler.bind(this));
+const fillScale = d3.scale.ordinal()
+    .domain(d3.range(4))
+    .range(['#ffda00', '#ed1c24', '#000000', '#fdb813']);
 
-    queue()
-        .defer(d3.json, "../flow/world-110m.json")
-        .defer(getRawData)
-        .await(ready);
+const swoosh = d3.svg.line()
+    .x((d) => d[0])
+    .y(d => d[1])
+    .interpolate('cardinal')
+    .tension(0);
 
+const location_along_arc = function location_along_arc(start, end, loc) {
+  return d3.geo.interpolate(start, end)(loc);
 }
 
-function getRawData(callback){
-    callback(null,dataset.children);
-}
+const ContentLoaderMixin = require('../../../js/components/ContentLoaderMixin.react');
+const ChartMixin = require('../../../js/components/ChartMixin.react');
 
-function ready(error, world, places) {
-    if (error != null && error != undefined) {
-        container.html('<p class="lead text-danger"> Oops! looks like the data for this incident is missing. </p>');
-        return;
-    }
-    var ocean_fill = svg.append("defs").append("radialGradient")
-          .attr("id", "ocean_fill")
-          .attr("cx", "75%")
-          .attr("cy", "25%");
-    ocean_fill.append("stop").attr("offset", "5%").attr("stop-color", "#fff");
-    ocean_fill.append("stop").attr("offset", "100%").attr("stop-color", "#ababab");
-
-    var globe_highlight = svg.append("defs").append("radialGradient")
-        .attr("id", "globe_highlight")
-        .attr("cx", "75%")
-        .attr("cy", "25%");
-    globe_highlight.append("stop")
-        .attr("offset", "5%").attr("stop-color", "#ffd")
-        .attr("stop-opacity", "0.6");
-    globe_highlight.append("stop")
-        .attr("offset", "100%").attr("stop-color", "#ba9")
-        .attr("stop-opacity", "0.2");
-
-    var globe_shading = svg.append("defs").append("radialGradient")
-          .attr("id", "globe_shading")
-          .attr("cx", "55%")
-          .attr("cy", "45%");
-    globe_shading.append("stop")
-        .attr("offset", "30%").attr("stop-color", "#fff")
-        .attr("stop-opacity", "0")
-    globe_shading.append("stop")
-        .attr("offset", "100%").attr("stop-color", "#505962")
-        .attr("stop-opacity", "0.3")
-
-    var drop_shadow = svg.append("defs")
-        .append("radialGradient")
-        .attr("id", "drop_shadow")
-        .attr("cx", "50%")
-        .attr("cy", "50%");
-    drop_shadow.append("stop")
-        .attr("offset", "20%").attr("stop-color", "#000")
-        .attr("stop-opacity", ".5")
-    drop_shadow.append("stop")
-        .attr("offset", "100%").attr("stop-color", "#000")
-        .attr("stop-opacity", "0")
-
-    svg.append("ellipse")
-        .attr("cx", width * 0.8).attr("cy", height * 0.87) // Locate the Ellipse at 80% of the width and 87% of the height
-        .attr("rx", proj.scale() * .90)
-        .attr("ry", proj.scale() * .25)
-        .attr("class", "noclicks")
-        .style("fill", "url(#drop_shadow)");
-
-    svg.append("circle")
-        .attr("cx", width / 2).attr("cy", height / 2)
-        .attr("r", proj.scale())
-        .attr("class", "noclicks")
-        .style("fill", "url(#ocean_fill)");
-
-    svg.append("path")
-        .datum(topojson.object(world, world.objects.land))
-        .attr("class", "land noclicks")
-        .attr("d", path);
-
-    svg.append("circle")
-        .attr("cx", width / 2).attr("cy", height / 2)
-        .attr("r", proj.scale())
-        .attr("class", "noclicks")
-        .style("fill", "url(#globe_highlight)");
-
-    svg.append("circle")
-        .attr("cx", width / 2).attr("cy", height / 2)
-        .attr("r", proj.scale())
-        .attr("class", "noclicks")
-        .style("fill", "url(#globe_shading)");
-
-    svg.append("g").attr("class", "points")
-        .selectAll("text").data(places.sourceips)
-        .enter().append("path")
-        .attr("class", "point")
-        .attr("d", path);
-
-    svg.append("g").attr("class", "points")
-        .selectAll("text").data(places.destips)
-        .enter().append("path")
-        .attr("class", "point")
-        .attr("d", path);
-
-    places.sourceips.forEach(function (a, j) {
-        places.destips.forEach(function (b, k) {
-            if (j == k) {
-                links.push({
-                    source: a.geometry.coordinates,
-                    target: b.geometry.coordinates,
-                    ltype: a.properties.type
-                });
-            }
-        });
-    });
-
-    // build geoJSON features from links array
-    links.forEach(function (e, i, a) {
-        var feature = { "type": "Feature", "geometry": { "type": "LineString", "coordinates": [e.source, e.target] } }
-        arcLines.push(feature)
-    })
-
-    svg.append("g").attr("class", "arcs")
-        .selectAll("path").data(arcLines)
-        .enter().append("path")
-        .attr("class", "arc")
-        .attr("d", path)
-
-    svg.append("g").attr("class", "flyers")
-        .selectAll("path").data(links)
-        .enter().append("path")
-        .attr("class", "flyer")
-        .style("stroke", function (d) {
-            return fill(d.ltype);
-        })
-        .attr("d", function (d) {
-            return swoosh(flying_arc(d));
-        })
-
-    refresh();
-}
+const GlobeViewPanel = React.createClass({
+    mixins: [ContentLoaderMixin, ChartMixin],
+    componentDidMount: function ()
+    {
+        GlobeViewStore.addChangeDataListener(this._onChange);
 
-function flying_arc(pts) {
-    var source = pts.source,
-        target = pts.target;
+        d3.select(ReactDOM.findDOMNode(this))
+            .on('mousemove', this.onMouseMove)
+            .on('mouseup', this.onMouseUp);
+    },
+    componentWillUnmount: function ()
+    {
+        GlobeViewStore.removeChangeDataListener(this._onChange);
 
-    var mid = location_along_arc(source, target, 1);
-    var result = [proj(source),
-                   sky(mid),
-                   proj(target)]
-    return result;
-}
+        d3.select(ReactDOM.findDOMNode(this))
+            .on('mousemove', null)
+            .on('mouseup', null);
 
-function refresh() {
-    svg.selectAll(".land").attr("d", path);
-    svg.selectAll(".point").attr("d", path);
-
-    svg.selectAll(".arc").attr("d", path)
-        .attr("opacity", function (d) {
-            return fade_at_edge(d)
-        })
-
-    svg.selectAll(".flyer")
-        .attr("d", function (d) {
-            return swoosh(flying_arc(d));
-        })
-        .attr("opacity", function (d) {
-            return fade_at_edge(d);
-        })
-}
+        d3.select(this.svg).on('mousedown', null);
+    },
+    buildChart() {
+        this.proj = d3.geo.orthographic().clipAngle(90);
+        this.sky = d3.geo.orthographic().clipAngle(90);
 
-function fade_at_edge(d) {
-    var centerPos = proj.invert([width / 2, height / 2]),
-        arc = d3.geo.greatArc(),
-        start, end;
-    // function is called on 2 different data structures..
-    if (d.source) {
-        start = d.source,
-        end = d.target;
-    }
-    else {
-        start = d.geometry.coordinates[0];
-        end = d.geometry.coordinates[1];
-    }
+        this.path = d3.geo.path().projection(this.proj).pointRadius(2);
 
-    var start_dist = 1.87 - arc.distance({ source: start, target: centerPos }), //1.57
-        end_dist = 1.87 - arc.distance({ source: end, target: centerPos });
+        let d3svg = d3.select(this.svg)
+            .style('width', '100%')
+            .style('height', '100%')
+            .on('mousedown', this.onMouseDown);
 
-    var fade = d3.scale.linear().domain([-.1, 0]).range([0, .1])
-    var dist = start_dist < end_dist ? start_dist : end_dist;
+        this.createDefiniions();
+        this.createMap();
 
-    return fade(dist)
-}
+        d3svg.append('g').attr('class', 'flyers');
+        d3svg.append('g').attr('class', 'arcs');
+        d3svg.append('g').attr('class', 'src-points');
+        d3svg.append('g').attr('class', 'dst-points');
+    },
+    createDefiniions() {
+        let d3svg = d3.select(this.svg);
+
+        // Create definitions
+        const defs = d3svg.append('defs');
+        const ocean_fill = defs.append('radialGradient')
+            .attr('id', 'ocean_fill')
+            .attr('cx', '75%')
+            .attr('cy', '25%');
+
+        ocean_fill.append('stop')
+            .attr('offset', '5%')
+            .attr('stop-color', '#fff');
+        ocean_fill.append('stop')
+            .attr('offset', '100%')
+            .attr('stop-color', '#ababab');
+
+        const globe_highlight = defs.append('radialGradient')
+            .attr('id', 'globe_highlight')
+            .attr('cx', '75%')
+            .attr('cy', '25%');
+
+        globe_highlight.append('stop')
+            .attr('offset', '5%')
+            .attr('stop-color', '#ffd')
+            .attr('stop-opacity', '0.6');
+        globe_highlight.append('stop')
+            .attr('offset', '100%')
+            .attr('stop-color', '#ba9')
+            .attr('stop-opacity', '0.2');
+
+        const globe_shading = defs.append('radialGradient')
+              .attr('id', 'globe_shading')
+              .attr('cx', '55%')
+              .attr('cy', '45%');
+
+        globe_shading.append('stop')
+            .attr('offset', '30%')
+            .attr('stop-color', '#fff')
+            .attr('stop-opacity', '0');
+        globe_shading.append('stop')
+            .attr('offset', '100%')
+            .attr('stop-color', '#505962')
+            .attr('stop-opacity', '0.3');
+
+        const drop_shadow = defs.append('radialGradient')
+            .attr('id', 'drop_shadow')
+            .attr('cx', '50%')
+            .attr('cy', '50%');
+
+        drop_shadow.append('stop')
+            .attr('offset', '20%')
+            .attr('stop-color', '#000')
+            .attr('stop-opacity', '.5')
+        drop_shadow.append('stop')
+            .attr('offset', '100%')
+            .attr('stop-color', '#000')
+            .attr('stop-opacity', '0');
+    },
+    createMap() {
+        let d3svg = d3.select(this.svg);
 
-function location_along_arc(start, end, loc) {
-    var interpolator = d3.geo.interpolate(start, end);
-    return interpolator(loc)
-}
+        this.ellipse = d3svg.append('ellipse')
+            .attr('class', 'noclicks')
+            .style('fill', 'url(#drop_shadow)');
 
-// modified from http://bl.ocks.org/1392560
-function mousedown() {
-    m0 = [d3.event.pageX, d3.event.pageY];
-    o0 = proj.rotate();
-    d3.event.preventDefault();
-}
-function mousemove() {
-    if (m0) {
-        var m1 = [d3.event.pageX, d3.event.pageY]
-          , o1 = [o0[0] + (m1[0] - m0[0]) / 6, o0[1] + (m0[1] - m1[1]) / 6];
-        o1[1] = o1[1] > 30 ? 30 :
-                o1[1] < -30 ? -30 :
-                o1[1];
-        proj.rotate(o1);
-        sky.rotate(o1);
-        refresh();
-    }
-}
-function mouseup() {
-    if (m0) {
-        mousemove();
-        m0 = null;
-    }
-}
+        d3svg.append('circle')
+            .attr('class', 'noclicks')
+            .style('fill', 'url(#ocean_fill)');
 
+        const worldData = GlobeViewStore.getWorldData();
 
-var GlobeViewPanel = React.createClass({
-    componentDidMount: function ()
-    {
-        GlobeViewStore.addChangeDataListener(this._onChange);
+        d3svg.append('path')
+            .datum(topojson.object(worldData, worldData.objects.land))
+            .attr('class', 'land noclicks');
+
+        d3svg.append('circle')
+            .attr('class', 'noclicks')
+            .style('fill', 'url(#globe_highlight)');
+
+        d3svg.append('circle')
+            .attr('class', 'noclicks')
+            .style('fill', 'url(#globe_shading)');
     },
-    componentWillUnmount: function ()
-    {
-        GlobeViewStore.removeChangeDataListener(this._onChange);
+    draw() {
+        let $rootNode = $(ReactDOM.findDOMNode(this));
+
+        const width = $rootNode.width();
+        const height = $rootNode.height();
+
+        this.proj
+            .translate([width / 2, height / 2])
+            .scale(Math.round(height/2.5)); // 2.5 is a magic number for styling purposes
+
+        this.sky
+            .translate([width / 1.75, height / 1.75])
+            .scale(Math.round(height / 2.5));  // 2.5 is a magic number for styling purposes
+
+        this.ellipse
+            .attr('cx', width * 0.8)
+            .attr('cy', height * 0.87) // Locate the Ellipse at 80% of the width and 87% of the height
+            .attr('rx', this.proj.scale() * .90)
+            .attr('ry', this.proj.scale() * .25);
+
+        let d3svg = d3.select(this.svg);
+
+        d3svg.selectAll('circle')
+            .attr('cx', width / 2)
+            .attr('cy', height / 2)
+            .attr('r', () => this.proj.scale());
+
+        this.drawLinks();
+        this.drawArcLines();
+        this.drawLocations();
+
+        d3svg.selectAll('.land').attr('d', this.path);
+    },
+    drawLocations() {
+        const d3SrcPoints = d3.select(this.svg).select('.src-points');
+
+        const srcSel = {};
+        srcSel.update = d3SrcPoints.selectAll('path').data(this.state.data.srcIps);
+        srcSel.enter = srcSel.update.enter();
+        srcSel.exit = srcSel.update.exit();
+
+        srcSel.exit.remove();
+
+        srcSel.enter.append('path').attr('class', 'point');
+
+        srcSel.update
+            .attr('d', this.path);
+
+        const d3DstPoints = d3.select(this.svg).select('.dst-points');
+
+        const dstSel = {};
+        dstSel.update = d3DstPoints.selectAll('path').data(this.state.data.dstIps);
+        dstSel.enter = dstSel.update.enter();
+        dstSel.exit = dstSel.update.exit();
+
+        dstSel.enter.append('path').attr('class', 'point');
+
+        dstSel.exit.remove();
+
+        dstSel.update
+            .attr('d', this.path);
+    },
+    drawLinks() {
+        const flyersCanvas = d3.select(this.svg).select('.flyers');
+        const flyerSel = {};
+
+        flyerSel.update = flyersCanvas.selectAll('path').data(this.state.data.links);
+        flyerSel.enter = flyerSel.update.enter();
+        flyerSel.exit = flyerSel.update.exit();
+
+        flyerSel.enter.append('path')
+            .attr('class', 'flyer')
+            .style('stroke', d => fillScale(d.ltype));
+
+        flyerSel.exit.remove();
+
+        flyerSel.update
+            .attr('d', d => swoosh(this.flying_arc(d)))
+            .style('opacity', this.fade_at_edge);
+    },
+    drawArcLines() {
+        const arcsCanvas = d3.select(this.svg).select('.arcs');
+        const arcsSel = {};
+
+        arcsSel.update = arcsCanvas.selectAll('path').data(this.state.data.arcLines);
+        arcsSel.enter = arcsSel.update.enter();
+        arcsSel.exit = arcsSel.update.exit();
+
+        arcsSel.enter.append('path').attr('class', 'arc');
+
+        arcsSel.exit.remove();
+
+        arcsSel.update
+            .attr('d', this.path)
+            .style('opacity', this.fade_at_edge);
+    },
+    flying_arc(pts) {
+        const source = pts.source;
+        const target = pts.target;
+
+        const mid = location_along_arc(source, target, 1);
+        const result = [
+            this.proj(source),
+            this.sky(mid),
+            this.proj(target)
+        ];
+
+        return result;
+    },
+    fade_at_edge(d) {
+        const $svg = $(this.svg);
+        const centerPos = this.proj.invert([$svg.width() / 2, $svg.height() / 2]);
+        const arc = d3.geo.greatArc();
+
+        let start, end;
+        // function is called on 2 different data structures..
+        if (d.source) {
+            start = d.source,
+            end = d.target;
+        }
+        else {
+            start = d.geometry.coordinates[0];
+            end = d.geometry.coordinates[1];
+        }
+
+        const start_dist = 1.87 - arc.distance({ source: start, target: centerPos });
+        const end_dist = 1.87 - arc.distance({ source: end, target: centerPos });
+
+        const fade = d3.scale.linear().domain([-.1, 0]).range([0, .1]);
+        const dist = start_dist < end_dist ? start_dist : end_dist;
+
+        return fade(dist)
+    },
+    // modified from http://bl.ocks.org/1392560
+    onMouseDown() {
+        this.m0 = [d3.event.pageX, d3.event.pageY];
+        this.o0 = this.proj.rotate();
+        d3.event.preventDefault();
+    },
+    onMouseMove() {
+        if (this.m0) {
+            let m1 = [d3.event.pageX, d3.event.pageY];
+            let o1 = [this.o0[0] + (m1[0] - this.m0[0]) / 6, this.o0[1] + (this.m0[1] - m1[1]) / 6];
+
+            o1[1] = o1[1] > 30 ? 30 :
+                    o1[1] < -30 ? -30 :
+                    o1[1];
+
+            this.proj.rotate(o1);
+            this.sky.rotate(o1);
+
+            this.draw();
+        }
+    },
+    onMouseUp() {
+        this.onMouseMove();
+        this.m0 = null;
     },
+    // END of modifications from http://bl.ocks.org/1392560
     _onChange: function ()
     {
         const storeData = GlobeViewStore.getData();
 
-        const state ={
+        const state = {
             loading: storeData.loading
         };
 
@@ -302,57 +315,43 @@ var GlobeViewPanel = React.createClass({
             state.error = storeData.error;
         }
         else if(!storeData.loading && storeData.data) {
-            state.root = {
-                name: GlobeViewStore.getIp(),
-                children: storeData.data
-            };
+            state.data = this.getStateFromData(storeData.data);
         }
 
         this.replaceState(state);
     },
-    getInitialState: function ()
-    {
-        return {loading: false};
-    },
-    render:function()
-    {
-        var content;
-
-        if (this.state.error)
-        {
-            content = (
-                <div className="text-center text-danger">
-                    {this.state.error}
-                </div>
-            );
-        }
-        else if (this.state.loading)
-        {
-          content = (
-              <div className="spot-loader">
-                  Loading <span className="spinner"></span>
-              </div>
-          );
-        }
-        else
-        {
-            content = '';
-        }
-        return (
-            <div>{content}</div>
-        )
-    },
-    componentDidUpdate: function ()
-    {
-        if (!this.state.loading && !this.state.error)
-        {
-            if (this.state.root) {
-                buildGraph.call(this, this.state.root);
-            }
-            else {
-                d3.select(ReactDOM.findDOMNode(this)).selectAll('*').remove();
+    getStateFromData(data) {
+        const state = {};
+
+        state.name = GlobeViewStore.getIp();
+        state.srcIps = data.sourceips;
+        state.dstIps = data.destips;
+
+        state.links = [];
+        data.sourceips.forEach(function (a, j) {
+            data.destips.forEach(function (b, k) {
+                if (j == k) {
+                    state.links.push({
+                        source: a.geometry.coordinates,
+                        target: b.geometry.coordinates,
+                        ltype: a.properties.type
+                    });
+                }
+            });
+        });
+
+        // build geoJSON features from links array
+        state.arcLines = state.links.map(e => {
+            return {
+                type: 'Feature',
+                geometry: {
+                    type: 'LineString',
+                    coordinates: [e.source, e.target]
+                }
             }
-        }
+        });
+
+        return state;
     }
 });
 

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/6ebae478/spot-oa/ui/flow/js/constants/NetflowConstants.js
----------------------------------------------------------------------
diff --git a/spot-oa/ui/flow/js/constants/NetflowConstants.js b/spot-oa/ui/flow/js/constants/NetflowConstants.js
index cb7ffb2..92da4b2 100755
--- a/spot-oa/ui/flow/js/constants/NetflowConstants.js
+++ b/spot-oa/ui/flow/js/constants/NetflowConstants.js
@@ -13,6 +13,7 @@ var NetflowConstants = {
   API_INGEST_SUMMARY: '../../data/flow/ingest_summary/is_${year}${month}.csv',
   API_IMPACT_ANALYSIS: '../../data/flow/${date}/stats-${ip}.json',
   API_GLOBE_VIEW: '../../data/flow/${date}/globe-${ip}.json',
+  API_WORLD_110M: '../flow/world-110m.json',
   API_TIMELINE: '../../data/flow/${date}/sbdet-${ip}.tsv',
 };
 

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/6ebae478/spot-oa/ui/flow/js/stores/GlobeViewStore.js
----------------------------------------------------------------------
diff --git a/spot-oa/ui/flow/js/stores/GlobeViewStore.js b/spot-oa/ui/flow/js/stores/GlobeViewStore.js
index 7a5d4e7..cab3447 100755
--- a/spot-oa/ui/flow/js/stores/GlobeViewStore.js
+++ b/spot-oa/ui/flow/js/stores/GlobeViewStore.js
@@ -1,19 +1,21 @@
-var assign = require('object-assign');
+const assign = require('object-assign');
+const d3 = require('d3');
 
-var SpotDispatcher = require('../../../js/dispatchers/SpotDispatcher');
-var FlowConstants = require('../constants/NetflowConstants');
-var SpotConstants = require('../../../js/constants/SpotConstants');
-var JsonStore = require('../../../js/stores/JsonStore');
+const SpotDispatcher = require('../../../js/dispatchers/SpotDispatcher');
+const NetflowConstants = require('../constants/NetflowConstants');
+const SpotConstants = require('../../../js/constants/SpotConstants');
+const JsonStore = require('../../../js/stores/JsonStore');
 
 const IP_FILTER_NAME = 'ip';
+let WORLD_DATA = null;
 
-var GlobeViewStore = assign(new JsonStore(FlowConstants.API_GLOBE_VIEW), {
+const GlobeViewStore = assign(new JsonStore(NetflowConstants.API_GLOBE_VIEW), {
     errorMessages: {
         404: 'Please choose a different date, no data has been found'
     },
     setDate: function (date)
     {
-        this.setEndpoint(FlowConstants.API_GLOBE_VIEW.replace('${date}', date.replace(/-/g, '')));
+        this.setEndpoint(NetflowConstants.API_GLOBE_VIEW.replace('${date}', date.replace(/-/g, '')));
     },
     setIp: function (value)
     {
@@ -28,6 +30,24 @@ var GlobeViewStore = assign(new JsonStore(FlowConstants.API_GLOBE_VIEW), {
         this._data = data;
 
         this.emitChangeData();
+    },
+    getWorldData() {
+        return WORLD_DATA;
+    },
+    reload() {
+        if (WORLD_DATA instanceof Object) {
+            Object.getPrototypeOf(GlobeViewStore).reload.call(this);
+        }
+        else if (WORLD_DATA===true) {
+            // Do nothing, already loading world data
+        }
+        else {
+            WORLD_DATA = true; // Signal world data is loading
+            d3.json(NetflowConstants.API_WORLD_110M, (error, response) => {
+                WORLD_DATA = response;
+                Object.getPrototypeOf(GlobeViewStore).reload.call(this);
+            });
+        }
     }
 });
 

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/6ebae478/spot-oa/ui/package.json
----------------------------------------------------------------------
diff --git a/spot-oa/ui/package.json b/spot-oa/ui/package.json
index a07a106..2f40acb 100644
--- a/spot-oa/ui/package.json
+++ b/spot-oa/ui/package.json
@@ -10,7 +10,6 @@
     "bootstrap-jquery": "^3.3.2",
     "d3": "^3.5.17",
     "d3-interpolate": "^1.1.0",
-    "d3-queue": "^1.0.1",
     "d3-tip": "^0.6.7",
     "event-drops": "^0.1.1",
     "flux": "^2.0.0",


[21/49] incubator-spot git commit: Merge branch 'spot' into test_flow

Posted by ev...@apache.org.
Merge branch 'spot' into test_flow


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/a83208ea
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/a83208ea
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/a83208ea

Branch: refs/heads/master
Commit: a83208ea4e7fa93a41237de77b6f4070d6314254
Parents: deeed03 ab5ba53
Author: nlsegerl <na...@intel.com>
Authored: Mon Dec 19 13:48:42 2016 -0800
Committer: nlsegerl <na...@intel.com>
Committed: Mon Dec 19 13:48:42 2016 -0800

----------------------------------------------------------------------
 .../netflow/FlowSuspiciousConnectsAnalysisTest.scala  | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/a83208ea/spot-ml/src/test/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysisTest.scala
----------------------------------------------------------------------
diff --cc spot-ml/src/test/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysisTest.scala
index 0000000,0000000..73a7913
new file mode 100644
--- /dev/null
+++ b/spot-ml/src/test/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysisTest.scala
@@@ -1,0 -1,0 +1,14 @@@
++package org.apache.spot.netflow
++
++import org.scalatest.FunSuite
++
++/**
++  * Created by nlsegerl on 12/13/16.
++  */
++class FlowSuspiciousConnectsAnalysisTest extends FunSuite {
++
++  test("testDetectFlowAnomalies") {
++
++  }
++
++}


[15/49] incubator-spot git commit: Merge branch 'spot' into test_dns_topdomain

Posted by ev...@apache.org.
Merge branch 'spot' into test_dns_topdomain


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/ac44bf0a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/ac44bf0a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/ac44bf0a

Branch: refs/heads/master
Commit: ac44bf0a247485aa4702b89d2b076f398e7fbdcb
Parents: b1b5d74 ab5ba53
Author: nlsegerl <na...@intel.com>
Authored: Tue Dec 13 10:46:45 2016 -0800
Committer: nlsegerl <na...@intel.com>
Committed: Tue Dec 13 10:46:45 2016 -0800

----------------------------------------------------------------------

----------------------------------------------------------------------



[31/49] incubator-spot git commit: Merge pull request #172 from brandon-edwards/Shouldbe_good_domain_fix

Posted by ev...@apache.org.
Merge pull request #172 from brandon-edwards/Shouldbe_good_domain_fix

Fixed a bug related to an empty environment variable in a parameter command in ml_ops.sh

Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/a58345ec
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/a58345ec
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/a58345ec

Branch: refs/heads/master
Commit: a58345ec2d79bb6d7a681f6d2638135ee151773d
Parents: 760dbf3 06900bb
Author: NathanSegerlind <na...@intel.com>
Authored: Thu Dec 22 17:07:14 2016 -0800
Committer: GitHub <no...@github.com>
Committed: Thu Dec 22 17:07:14 2016 -0800

----------------------------------------------------------------------
 spot-ml/ml_ops.sh | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)
----------------------------------------------------------------------



[35/49] incubator-spot git commit: Merge branch 'spot' into unit_test_cleanup

Posted by ev...@apache.org.
Merge branch 'spot' into unit_test_cleanup

# Conflicts:
#	spot-ml/src/main/scala/org/apache/spot/SuspiciousConnects.scala
#	spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
#	spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala
#	spot-ml/src/main/scala/org/apache/spot/netflow/FlowWordCreator.scala
#	spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala
#	spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala
#	spot-ml/src/test/scala/org/apache/spot/dns/DNSWordCreationTest.scala
#	spot-ml/src/test/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysisTest.scala


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/24b3a37b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/24b3a37b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/24b3a37b

Branch: refs/heads/master
Commit: 24b3a37bfaf8bab21015cb9da40eb2897e83f9df
Parents: 9ac5a8c
Author: nlsegerl <na...@intel.com>
Authored: Wed Jan 4 15:32:32 2017 -0800
Committer: nlsegerl <na...@intel.com>
Committed: Wed Jan 4 15:32:32 2017 -0800

----------------------------------------------------------------------
 .../spot/SuspiciousConnectsScoreFunction.scala  |  14 ++-
 .../scala/org/apache/spot/dns/DNSSchema.scala   |   5 -
 .../dns/DNSSuspiciousConnectsAnalysis.scala     |  21 ++--
 .../org/apache/spot/dns/DNSWordCreation.scala   |  16 +--
 .../dns/model/DNSSuspiciousConnectsModel.scala  |  70 +++++------
 .../FlowSuspiciousConnectsAnalysis.scala        |  19 ++-
 .../apache/spot/netflow/FlowWordCreator.scala   |  10 +-
 .../spot/netflow/model/FlowScoreFunction.scala  |  69 +++++------
 .../model/FlowSuspiciousConnectsModel.scala     |  64 ++++++-----
 .../proxy/ProxySuspiciousConnectsModel.scala    |  30 ++---
 .../utilities/data/InputOutputDataHandler.scala |  34 +++---
 .../dns/DNSSuspiciousConnectsAnalysisTest.scala |  49 ++++----
 .../apache/spot/dns/DNSWordCreationTest.scala   |   1 -
 .../FlowSuspiciousConnectsAnalysisTest.scala    | 115 ++++++++++++++++++-
 14 files changed, 314 insertions(+), 203 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/24b3a37b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsScoreFunction.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsScoreFunction.scala b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsScoreFunction.scala
index 04db60e..92cfea8 100644
--- a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsScoreFunction.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsScoreFunction.scala
@@ -3,8 +3,15 @@ package org.apache.spot
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spot.utilities.data.validation.InvalidDataHandler
 
-
-
+/**
+  * Base class for scoring suspicious connects models.
+  * Assumes that distribution of words is independent of the IP when conditioned on the topic
+  * and performs a simple sum over a partition of the space by topic.
+  *
+  * @param topicCount Number of topics produced by the topic modelling analysis.
+  * @param ipToTopicMixBC Broadcast of map assigning IPs to topic mixes.
+  * @param wordToPerTopicProbBC Broadcast of map assigning words to per-topic conditional probability.
+  */
 class SuspiciousConnectsScoreFunction(topicCount: Int,
                                       ipToTopicMixBC: Broadcast[Map[String, Array[Double]]],
                                       wordToPerTopicProbBC: Broadcast[Map[String, Array[Double]]]) extends Serializable {
@@ -25,5 +32,4 @@ class SuspiciousConnectsScoreFunction(topicCount: Int,
         .sum
     }
   }
-
-}
+}
\ No newline at end of file
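The scaladoc added above pins down the scoring rule: with words independent of the IP given the topic, P(word | ip) reduces to the dot product of the IP's topic mix and the word's per-topic probabilities. A minimal, self-contained sketch of that sum (all names and values here are illustrative, not taken from a real Spot model):

    // Sketch: P(word | ip) = sum over topics t of P(t | ip) * P(word | t).
    object ScoreFunctionSketch {
      def score(topicMix: Array[Double], wordProbs: Array[Double]): Double =
        topicMix.zip(wordProbs)
          .map { case (pTopicGivenIp, pWordGivenTopic) => pTopicGivenIp * pWordGivenTopic }
          .sum

      def main(args: Array[String]): Unit = {
        val topicMix  = Array(0.7, 0.2, 0.1)  // hypothetical P(topic | ip)
        val wordProbs = Array(0.01, 0.5, 0.2) // hypothetical P(word | topic)
        println(score(topicMix, wordProbs))   // 0.7*0.01 + 0.2*0.5 + 0.1*0.2 = 0.127
      }
    }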

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/24b3a37b/spot-ml/src/main/scala/org/apache/spot/dns/DNSSchema.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/DNSSchema.scala b/spot-ml/src/main/scala/org/apache/spot/dns/DNSSchema.scala
index 020fc46..2244085 100644
--- a/spot-ml/src/main/scala/org/apache/spot/dns/DNSSchema.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/dns/DNSSchema.scala
@@ -39,7 +39,6 @@ object DNSSchema {
   val AnswerAddress = "dns_a"
   val AnswerAddressField = StructField(AnswerAddress, StringType, nullable= true)
 
-
   // intermediate and derived fields
 
   val Domain = "domain"
@@ -67,8 +66,4 @@ object DNSSchema {
 
   val Score = "score"
   val ScoreField = StructField(Score, DoubleType)
-
-
-  val ScoreSchema = StructType(List(ScoreField))
-
 }

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/24b3a37b/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala b/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
index 929b69e..1688bed 100644
--- a/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
@@ -8,14 +8,13 @@ import org.apache.spark.sql.{DataFrame, SQLContext}
 import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
 import org.apache.spot.dns.DNSSchema._
 import org.apache.spot.dns.model.DNSSuspiciousConnectsModel
-import org.apache.log4j.Logger
 import org.apache.spot.dns.model.DNSSuspiciousConnectsModel.ModelSchema
 import org.apache.spot.proxy.ProxySchema.Score
 import org.apache.spot.utilities.data.validation.{InvalidDataHandler => dataValidation}
 
 /**
   * The suspicious connections analysis of DNS log data develops a probabilistic model of the DNS queries
-  * made by each client IP and flags
+  * made by each client IP and flags those assigned a low probability as "suspicious".
   */
 
 object DNSSuspiciousConnectsAnalysis {
@@ -37,13 +36,9 @@ object DNSSuspiciousConnectsAnalysis {
 
     logger.info("Starting DNS suspicious connects analysis.")
 
-    val userDomain = config.userDomain
-
     val cleanDNSRecords = filterAndSelectCleanDNSRecords(inputDNSRecords)
 
-    logger.info("Training the model")
-
-    val scoredDNSRecords = detectDNSAnomalies(cleanDNSRecords, config, sparkContext, sqlContext, logger)
+    val scoredDNSRecords = scoreDNSRecords(cleanDNSRecords, config, sparkContext, sqlContext, logger)
 
     val filteredDNSRecords = filterScoredDNSRecords(scoredDNSRecords, config.threshold)
 
@@ -55,6 +50,7 @@ object DNSSuspiciousConnectsAnalysis {
 
     logger.info("DNS  suspicious connects analysis completed.")
     logger.info("Saving results to : " + config.hdfsScoredConnect)
+
     outputDNSRecords.map(_.mkString(config.outputDelimiter)).saveAsTextFile(config.hdfsScoredConnect)
 
     val invalidDNSRecords = filterAndSelectInvalidDNSRecords(inputDNSRecords)
@@ -76,18 +72,17 @@ object DNSSuspiciousConnectsAnalysis {
     * @return
     */
 
-  def detectDNSAnomalies(data: DataFrame, config: SuspiciousConnectsConfig,
-                         sparkContext: SparkContext,
-                         sqlContext: SQLContext,
-                         logger: Logger) : DataFrame = {
+  def scoreDNSRecords(data: DataFrame, config: SuspiciousConnectsConfig,
+                      sparkContext: SparkContext,
+                      sqlContext: SQLContext,
+                      logger: Logger) : DataFrame = {
 
-    val userDomain = config.userDomain
     logger.info("Fitting probabilistic model to data")
     val model =
       DNSSuspiciousConnectsModel.trainNewModel(sparkContext, sqlContext, logger, config, data, config.topicCount)
 
     logger.info("Identifying outliers")
-    model.score(sparkContext, sqlContext, data, userDomain)
+    model.score(sparkContext, sqlContext, data, config.userDomain)
   }
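With detectDNSAnomalies renamed to scoreDNSRecords, the DNS analysis reads as a straight pipeline: drop malformed records, fit the model and score, keep only the low-probability records. A hedged sketch of how the stages above compose (the input DataFrame, config, and logger are assumed to come from the caller):

    import org.apache.log4j.Logger
    import org.apache.spark.SparkContext
    import org.apache.spark.sql.{DataFrame, SQLContext}
    import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
    import org.apache.spot.dns.DNSSuspiciousConnectsAnalysis

    // Compose the three stages shown in the diff; each returns a DataFrame.
    def analyzeDNS(inputDNSRecords: DataFrame,
                   config: SuspiciousConnectsConfig,
                   sparkContext: SparkContext,
                   sqlContext: SQLContext,
                   logger: Logger): DataFrame = {
      val clean  = DNSSuspiciousConnectsAnalysis.filterAndSelectCleanDNSRecords(inputDNSRecords)
      val scored = DNSSuspiciousConnectsAnalysis.scoreDNSRecords(clean, config, sparkContext, sqlContext, logger)
      DNSSuspiciousConnectsAnalysis.filterScoredDNSRecords(scored, config.threshold)
    }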
 
 

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/24b3a37b/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala b/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala
index 383eb2f..e080b3e 100644
--- a/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala
@@ -6,19 +6,19 @@ import org.apache.spot.utilities.DomainProcessor.{DomainInfo, extractDomainInfo}
 import org.apache.spot.utilities.Quantiles
 import org.apache.spot.utilities.data.validation.InvalidDataHandler
 
-import scala.util.{Failure, Success, Try}
+import scala.util.{Success, Try}
 
 
 /**
   * Convert DNS log entries into "words" for topic modelling analyses.
   *
-  * @param frameLengthCuts
-  * @param timeCuts
-  * @param subdomainLengthCuts
-  * @param entropyCuts
-  * @param numberPeriodsCuts
-  * @param topDomainsBC
-  * @param userDomain
+  * @param frameLengthCuts Quantile cut-offs for discretizing frame length in word construction.
+  * @param timeCuts Quantile cut-offs for discretizing the time of day in word construction.
+  * @param subdomainLengthCuts Quantile cut-offs for discretizing subdomain length in word construction.
+  * @param entropyCuts Quantile cut-offs for discretizing entropy in word construction.
+  * @param numberPeriodsCuts Quantile cut-offs for discretizing the number of periods in word construction.
+  * @param topDomainsBC List of most popular top level domain names.
+  * @param userDomain User's domain for internal network.
   */
 class DNSWordCreation(frameLengthCuts: Array[Double],
                       timeCuts: Array[Double],
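Each of the *Cuts parameters documented above is an array of quantile boundaries that maps a raw value to a small integer bin, which then becomes one component of the word. A hypothetical stand-in for that binning (Spot's actual Quantiles.bin may handle boundaries differently):

    // Bin index = position of the first cut the value does not exceed;
    // values above every cut land in the last bin.
    def bin(value: Double, cuts: Array[Double]): Int = {
      val idx = cuts.indexWhere(value <= _)
      if (idx == -1) cuts.length else idx
    }

    val subdomainLengthCuts = Array(3.0, 6.0, 10.0, 18.0) // illustrative boundaries
    println(bin(8.0, subdomainLengthCuts))                // 2 -- between 6.0 and 10.0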

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/24b3a37b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala
index 47c32a7..f1893e9 100644
--- a/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala
@@ -73,7 +73,6 @@ class DNSSuspiciousConnectsModel(inTopicCount: Int,
     */
   def score(sc: SparkContext, sqlContext: SQLContext, inDF: DataFrame, userDomain: String): DataFrame = {
 
-    val countryCodesBC = sc.broadcast(CountryCodes.CountryCodes)
     val topDomainsBC = sc.broadcast(TopDomains.TopDomains)
     val ipToTopicMixBC = sc.broadcast(ipToTopicMix)
     val wordToPerTopicProbBC = sc.broadcast(wordToPerTopicProb)
@@ -139,10 +138,10 @@ object DNSSuspiciousConnectsModel {
     * @param sparkContext
     * @param sqlContext
     * @param logger
-    * @param config     Analysis configuration object containing CLI parameters.
-    *                   Contains the path to the feedback file in config.scoresFile
-    * @param inputRecords       Data used to train the model.
-    * @param topicCount Number of topics (traffic profiles) used to build the model.
+    * @param config       Analysis configuration object containing CLI parameters.
+    *                     Contains the path to the feedback file in config.scoresFile
+    * @param inputRecords Data used to train the model.
+    * @param topicCount   Number of topics (traffic profiles) used to build the model.
     * @return A new [[DNSSuspiciousConnectsModel]] instance trained on the dataframe and feedback file.
     */
   def trainNewModel(sparkContext: SparkContext,
@@ -171,11 +170,12 @@ object DNSSuspiciousConnectsModel {
       Quantiles.computeDeciles(totalRecords
         .select(UnixTimestamp)
         .rdd
-        .flatMap({ case Row(unixTimeStamp: Long) => {
-          Try {unixTimeStamp.toDouble} match {
-              case Failure(_) => Seq()
-              case Success(timestamp) => Seq(timestamp)
-            }
+        .flatMap({ case Row(unixTimeStamp: Long) =>
+          Try {
+            unixTimeStamp.toDouble
+          } match {
+            case Failure(_) => Seq()
+            case Success(timestamp) => Seq(timestamp)
           }
         }))
 
@@ -183,11 +183,12 @@ object DNSSuspiciousConnectsModel {
       Quantiles.computeDeciles(totalRecords
         .select(FrameLength)
         .rdd
-        .flatMap({case Row(frameLen: Int) => {
-            Try{frameLen.toDouble} match{
-              case Failure(_) => Seq()
-              case Success(frameLen) => Seq(frameLen)
-            }
+        .flatMap({ case Row(frameLen: Int) =>
+          Try {
+            frameLen.toDouble
+          } match {
+            case Failure(_) => Seq()
+            case Success(frameLength) => Seq(frameLength)
           }
         }))
 
@@ -198,11 +199,12 @@ object DNSSuspiciousConnectsModel {
         .filter(domainStatsRecords(SubdomainLength).gt(0))
         .select(SubdomainLength)
         .rdd
-        .flatMap({ case Row(subdomainLength: Int) => {
-            Try{subdomainLength.toDouble} match {
-              case Failure(_) => Seq()
-              case Success(subdomainLength) => Seq(subdomainLength)
-            }
+        .flatMap({ case Row(subdomainLength: Int) =>
+          Try {
+            subdomainLength.toDouble
+          } match {
+            case Failure(_) => Seq()
+            case Success(subdomainLength) => Seq(subdomainLength)
           }
         }))
 
@@ -211,11 +213,12 @@ object DNSSuspiciousConnectsModel {
         .filter(domainStatsRecords(SubdomainEntropy).gt(0))
         .select(SubdomainEntropy)
         .rdd
-        .flatMap({ case Row(subdomainEntropy: Double) => {
-          Try{subdomainEntropy.toDouble} match {
+        .flatMap({ case Row(subdomainEntropy: Double) =>
+          Try {
+            subdomainEntropy.toDouble
+          } match {
             case Failure(_) => Seq()
             case Success(subdomainEntropy) => Seq(subdomainEntropy)
-            }
           }
         }))
 
@@ -224,23 +227,24 @@ object DNSSuspiciousConnectsModel {
         .filter(domainStatsRecords(NumPeriods).gt(0))
         .select(NumPeriods)
         .rdd
-        .flatMap({ case Row(numberPeriods: Int) => {
-          Try {numberPeriods.toDouble} match {
+        .flatMap({ case Row(numberPeriods: Int) =>
+          Try {
+            numberPeriods.toDouble
+          } match {
             case Failure(_) => Seq()
             case Success(numberPeriods) => Seq(numberPeriods)
-            }
           }
         }))
 
     // simplify DNS log entries into "words"
 
     val dnsWordCreator = new DNSWordCreation(frameLengthCuts,
-                                             timeCuts,
-                                             subdomainLengthCuts,
-                                             entropyCuts,
-                                             numberPeriodsCuts,
-                                             topDomainsBC,
-                                             userDomain)
+      timeCuts,
+      subdomainLengthCuts,
+      entropyCuts,
+      numberPeriodsCuts,
+      topDomainsBC,
+      userDomain)
 
     val dataWithWord = totalRecords.withColumn(Word, dnsWordCreator.wordCreationUDF(modelColumns: _*))
 
@@ -267,7 +271,7 @@ object DNSSuspiciousConnectsModel {
     // Since DNS is still broadcasting ip to topic mix, we need to convert data frame to Map[String, Array[Double]]
     val ipToTopicMix = ipToTopicMixDF
       .rdd
-      .map({ case (ipToTopicMixRow: Row) => (ipToTopicMixRow.toSeq.toArray) })
+      .map({ case (ipToTopicMixRow: Row) => ipToTopicMixRow.toSeq.toArray })
       .map({
         case (ipToTopicMixSeq) => (ipToTopicMixSeq(0).asInstanceOf[String], ipToTopicMixSeq(1).asInstanceOf[Seq[Double]]
           .toArray)
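Every quantile computation in this file now uses the same cleansing idiom: flatMap each Row through Try so that values that fail to convert contribute nothing, instead of throwing and killing the job. The idiom in isolation, on plain Scala collections with illustrative data:

    import scala.util.{Failure, Success, Try}

    // Unparseable entries map to an empty Seq and silently disappear,
    // so the downstream decile computation only ever sees valid doubles.
    val raw = Seq("42", "oops", "7.5")
    val parsed: Seq[Double] = raw.flatMap { s =>
      Try(s.toDouble) match {
        case Failure(_)     => Seq()
        case Success(value) => Seq(value)
      }
    }
    // parsed == Seq(42.0, 7.5)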

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/24b3a37b/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala b/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala
index aad3e66..bb0a951 100644
--- a/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala
@@ -33,7 +33,7 @@ object FlowSuspiciousConnectsAnalysis {
     val orderedFlowRecords = filteredFlowRecords.orderBy(Score)
 
     val mostSuspiciousFlowRecords =
-      if(config.maxResults > 0 ) orderedFlowRecords.limit(config.maxResults) else orderedFlowRecords
+      if (config.maxResults > 0) orderedFlowRecords.limit(config.maxResults) else orderedFlowRecords
 
     val outputFlowRecords = mostSuspiciousFlowRecords.select(OutSchema: _*)
 
@@ -60,9 +60,9 @@ object FlowSuspiciousConnectsAnalysis {
     */
   def detectFlowAnomalies(data: DataFrame,
                           config: SuspiciousConnectsConfig,
-                         sparkContext: SparkContext,
-                         sqlContext: SQLContext,
-                         logger: Logger) : DataFrame = {
+                          sparkContext: SparkContext,
+                          sqlContext: SQLContext,
+                          logger: Logger): DataFrame = {
 
 
     logger.info("Fitting probabilistic model to data")
@@ -78,7 +78,7 @@ object FlowSuspiciousConnectsAnalysis {
     * @param inputFlowRecords raw flow records
     * @return
     */
-  def filterAndSelectCleanFlowRecords(inputFlowRecords: DataFrame): DataFrame ={
+  def filterAndSelectCleanFlowRecords(inputFlowRecords: DataFrame): DataFrame = {
 
     val cleanFlowRecordsFilter = inputFlowRecords(Hour).between(0, 23) &&
       inputFlowRecords(Minute).between(0, 59) &&
@@ -104,9 +104,9 @@ object FlowSuspiciousConnectsAnalysis {
     */
   def filterAndSelectInvalidFlowRecords(inputFlowRecords: DataFrame): DataFrame = {
 
-    val invalidFlowRecordsFilter = inputFlowRecords(Hour).between(0,23) &&
-      inputFlowRecords(Minute).between(0,59) &&
-      inputFlowRecords(Second).between(0,59) &&
+    val invalidFlowRecordsFilter = inputFlowRecords(Hour).between(0, 23) &&
+      inputFlowRecords(Minute).between(0, 59) &&
+      inputFlowRecords(Second).between(0, 59) &&
       inputFlowRecords(TimeReceived).isNull ||
       inputFlowRecords(SourceIP).isNull ||
       inputFlowRecords(DestinationIP).isNull ||
@@ -123,7 +123,7 @@ object FlowSuspiciousConnectsAnalysis {
   /**
     *
     * @param scoredFlowRecords scored flow records.
-    * @param threshold score tolerance.
+    * @param threshold         score tolerance.
     * @return
     */
   def filterScoredFlowRecords(scoredFlowRecords: DataFrame, threshold: Double): DataFrame = {
@@ -150,7 +150,6 @@ object FlowSuspiciousConnectsAnalysis {
   }
 
 
-
   val InSchema = StructType(List(TimeReceivedField,
     YearField,
     MonthField,

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/24b3a37b/spot-ml/src/main/scala/org/apache/spot/netflow/FlowWordCreator.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/netflow/FlowWordCreator.scala b/spot-ml/src/main/scala/org/apache/spot/netflow/FlowWordCreator.scala
index bd94df5..b825671 100644
--- a/spot-ml/src/main/scala/org/apache/spot/netflow/FlowWordCreator.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/netflow/FlowWordCreator.scala
@@ -4,7 +4,7 @@ import org.apache.spark.sql.functions._
 import org.apache.spot.utilities.Quantiles
 import org.apache.spot.utilities.data.validation.InvalidDataHandler
 
-import scala.util.{Failure, Success, Try}
+import scala.util.{Success, Try}
 
 
 /**
@@ -30,6 +30,7 @@ class FlowWordCreator(timeCuts: Array[Double],
 
   /**
     * Spark SQL UDF for calculating the word summarizing a netflow transaction at the source IP
+    *
     * @return String "word" summarizing a netflow connection.
     */
   def srcWordUDF = udf((hour: Int,
@@ -46,6 +47,7 @@ class FlowWordCreator(timeCuts: Array[Double],
 
   /**
     * Spark SQL UDF for calculating the word summarizing a netflow transaction at the destination IP
+    *
     * @return String "word" summarizing a netflow connection.
     */
   def dstWordUDF = udf((hour: Int,
@@ -96,7 +98,7 @@ class FlowWordCreator(timeCuts: Array[Double],
 
       } else if (srcPort == 0 && dstPort > 0) {
 
-        val baseWord = Array(dstPort.toString(), timeBin, ibytBin, ipktBin).mkString("_")
+        val baseWord = Array(dstPort.toString, timeBin, ibytBin, ipktBin).mkString("_")
         FlowWords(srcWord = baseWord, dstWord = "-1_" + baseWord)
 
       } else if (srcPort <= 1024 && dstPort <= 1024) {
@@ -106,12 +108,12 @@ class FlowWordCreator(timeCuts: Array[Double],
 
       } else if (srcPort <= 1024 && dstPort > 1024) {
 
-        val baseWord = Array(srcPort.toString(), timeBin, ibytBin, ipktBin).mkString("_")
+        val baseWord = Array(srcPort.toString, timeBin, ibytBin, ipktBin).mkString("_")
         FlowWords(srcWord = "-1_" + baseWord, dstWord = baseWord)
 
       } else if (srcPort > 1024 && dstPort <= 1024) {
 
-        val baseWord = Array(dstPort.toString(), timeBin, ibytBin, ipktBin).mkString("_")
+        val baseWord = Array(dstPort.toString, timeBin, ibytBin, ipktBin).mkString("_")
         FlowWords(srcWord = baseWord, dstWord = "-1_" + baseWord)
 
       } else {
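For the branch visible above (source port above 1024, destination port at or below 1024), the word is the service port joined with the three bins, and the "-1_" prefix marks the non-service side. Illustratively, with made-up bin values:

    // Bin values are hypothetical; in the real code they come from Quantiles.
    val (dstPort, timeBin, ibytBin, ipktBin) = (80, 2, 4, 1)
    val baseWord = Array(dstPort.toString, timeBin, ibytBin, ipktBin).mkString("_")
    val srcWord  = baseWord          // "80_2_4_1"
    val dstWord  = "-1_" + baseWord  // "-1_80_2_4_1"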

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/24b3a37b/spot-ml/src/main/scala/org/apache/spot/netflow/model/FlowScoreFunction.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/netflow/model/FlowScoreFunction.scala b/spot-ml/src/main/scala/org/apache/spot/netflow/model/FlowScoreFunction.scala
index b82cd3f..b37f2f1 100644
--- a/spot-ml/src/main/scala/org/apache/spot/netflow/model/FlowScoreFunction.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/netflow/model/FlowScoreFunction.scala
@@ -1,29 +1,27 @@
 package org.apache.spot.netflow.model
 
 import org.apache.spark.broadcast.Broadcast
-import org.apache.spot.SuspiciousConnectsScoreFunction
 import org.apache.spot.netflow.{FlowWordCreator, FlowWords}
 import org.apache.spot.utilities.data.validation.InvalidDataHandler
 
 
 /**
   * Estimate the probabilities of network events using a [[FlowSuspiciousConnectsModel]]
-  * @param timeCuts
-  * @param ibytCuts
-  * @param ipktCuts
-  * @param topicCount
-  * @param wordToPerTopicProbBC
+  *
+  * @param timeCuts Quantile cut-offs for binning time-of-day values when forming words from netflow records.
+  * @param ibytCuts Quantile cut-offs for binning ibyt values when forming words from netflow records.
+  * @param ipktCuts Quantile cut-offs for binning ipkt values when forming words from netflow records.
+  * @param topicCount Number of topics used in the topic modelling analysis.
+  * @param wordToPerTopicProbBC Broadcast map assigning to each word its per-topic probabilities,
+  *                           i.e. Prob[word | t] for t = 0 to topicCount - 1.
   */
 
 
-
 class FlowScoreFunction(timeCuts: Array[Double],
                         ibytCuts: Array[Double],
                         ipktCuts: Array[Double],
-                       topicCount: Int,
-                       wordToPerTopicProbBC: Broadcast[Map[String, Array[Double]]]) extends Serializable {
-
-
+                        topicCount: Int,
+                        wordToPerTopicProbBC: Broadcast[Map[String, Array[Double]]]) extends Serializable {
 
 
   val flowWordCreator = new FlowWordCreator(timeCuts, ibytCuts, ipktCuts)
@@ -31,17 +29,18 @@ class FlowScoreFunction(timeCuts: Array[Double],
   /**
     * Estimate the probability of a netflow connection as distributed from the source IP and from the destination IP
     * and assign it the least of these two values.
-    * @param hour
-    * @param minute
-    * @param second
-    * @param srcIP
-    * @param dstIP
-    * @param srcPort
-    * @param dstPort
-    * @param ipkt
-    * @param ibyt
-    * @param srcTopicMix
-    * @param dstTopicMix
+    *
+    * @param hour Hour of flow record.
+    * @param minute Minute of flow record.
+    * @param second Second of flow record.
+    * @param srcIP Source IP of flow record.
+    * @param dstIP Destination IP of flow record.
+    * @param srcPort Source port of flow record.
+    * @param dstPort Destination port of flow record.
+    * @param ipkt ipkt entry of flow record.
+    * @param ibyt ibyt entry of flow record.
+    * @param srcTopicMix Topic mix assigned to the source IP.
+    * @param dstTopicMix Topic mix assigned to the destination IP.
     * @return Minimum of the probability of this word from the source IP and the probability of this word from the dest IP.
     */
   def score(hour: Int,
@@ -60,28 +59,30 @@ class FlowScoreFunction(timeCuts: Array[Double],
     val FlowWords(srcWord, dstWord) = flowWordCreator.flowWords(hour: Int, minute: Int, second: Int,
       srcPort: Int, dstPort: Int, ipkt: Long, ibyt: Long)
 
-    val zeroProb = Array.fill(topicCount) { 0.0 }
+    val zeroProb = Array.fill(topicCount) {
+      0.0
+    }
 
     /** WordError indicates there was a problem creating a word and should not be used for scoring.
-
-      A null value for srcTopicMix or dstTopicMix indicated the ip (source or dest respectively)
+      *
+      * A null value for srcTopicMix or dstTopicMix indicates the IP (source or dest, respectively)
       * was not seen in training.
       */
-    if(srcWord == InvalidDataHandler.WordError || dstWord == InvalidDataHandler.WordError){
+    if (srcWord == InvalidDataHandler.WordError || dstWord == InvalidDataHandler.WordError) {
       InvalidDataHandler.ScoreError
     } else if (srcTopicMix == null || dstTopicMix == null) {
-       0.0
+      0.0
     } else {
 
-       val scoreOfConnectionFromSrcIP = srcTopicMix.zip(wordToPerTopicProbBC.value.getOrElse(srcWord, zeroProb))
-         .map({ case (pWordGivenTopic, pTopicGivenDoc) => pWordGivenTopic * pTopicGivenDoc })
-         .sum
+      val scoreOfConnectionFromSrcIP = srcTopicMix.zip(wordToPerTopicProbBC.value.getOrElse(srcWord, zeroProb))
+        .map({ case (pWordGivenTopic, pTopicGivenDoc) => pWordGivenTopic * pTopicGivenDoc })
+        .sum
 
-       val scoreOfConnectionsFromDstIP = dstTopicMix.zip(wordToPerTopicProbBC.value.getOrElse(dstWord, zeroProb))
-         .map({ case (pWordGivenTopic, pTopicGivenDoc) => pWordGivenTopic * pTopicGivenDoc })
-         .sum
+      val scoreOfConnectionsFromDstIP = dstTopicMix.zip(wordToPerTopicProbBC.value.getOrElse(dstWord, zeroProb))
+        .map({ case (pWordGivenTopic, pTopicGivenDoc) => pWordGivenTopic * pTopicGivenDoc })
+        .sum
 
-       Math.min(scoreOfConnectionFromSrcIP, scoreOfConnectionsFromDstIP)
+      Math.min(scoreOfConnectionFromSrcIP, scoreOfConnectionsFromDstIP)
 
     }
   }
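The reformatted body above can be read in three steps: look up each word's per-topic probabilities (falling back to an all-zeros vector for words the model never saw), dot each lookup with the corresponding topic mix, and keep the smaller of the two sides. A toy sketch with hypothetical values:

    def sideScore(topicMix: Array[Double], wordProbs: Array[Double]): Double =
      topicMix.zip(wordProbs).map { case (pT, pW) => pT * pW }.sum

    val topicCount  = 3
    val zeroProb    = Array.fill(topicCount)(0.0)
    val wordToProb  = Map("80_2_4_1" -> Array(0.3, 0.1, 0.05)) // hypothetical model output
    val srcTopicMix = Array(0.6, 0.3, 0.1)
    val dstTopicMix = Array(0.1, 0.1, 0.8)

    val srcScore = sideScore(srcTopicMix, wordToProb.getOrElse("80_2_4_1", zeroProb))
    val dstScore = sideScore(dstTopicMix, wordToProb.getOrElse("missing_word", zeroProb))
    val score    = Math.min(srcScore, dstScore) // 0.0 -- the dest side never saw its word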

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/24b3a37b/spot-ml/src/main/scala/org/apache/spot/netflow/model/FlowSuspiciousConnectsModel.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/netflow/model/FlowSuspiciousConnectsModel.scala b/spot-ml/src/main/scala/org/apache/spot/netflow/model/FlowSuspiciousConnectsModel.scala
index 4f07fba..2ab682e 100644
--- a/spot-ml/src/main/scala/org/apache/spot/netflow/model/FlowSuspiciousConnectsModel.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/netflow/model/FlowSuspiciousConnectsModel.scala
@@ -22,23 +22,23 @@ import scala.util.{Failure, Success, Try}
   *
   * The model uses a topic-modelling approach that:
   * 1. Simplifies netflow records into words, one word at the source IP and another (possibly different) at the
-  *    destination IP.
+  * destination IP.
  * 2. The netflow words about each IP are treated as collections of these words.
   * 3. A topic modelling approach is used to infer a collection of "topics" that represent common profiles
-  *    of network traffic. These "topics" are probability distributions on words.
+  * of network traffic. These "topics" are probability distributions on words.
   * 4. Each IP has a mix of topics corresponding to its behavior.
   * 5. The probability of a word appearing in the traffic about an IP is estimated by simplifying its netflow record
-  *    into a word, and then combining the word probabilities per topic using the topic mix of the particular IP.
+  * into a word, and then combining the word probabilities per topic using the topic mix of the particular IP.
   *
   * Create these models using the  factory in the companion object.
   *
-  * @param topicCount Number of topics (profiles of common traffic patterns) used in the topic modelling routine.
-  * @param ipToTopicMix DataFrame assigning a distribution on topics to each document or IP.
+  * @param topicCount         Number of topics (profiles of common traffic patterns) used in the topic modelling routine.
+  * @param ipToTopicMix       DataFrame assigning a distribution on topics to each document or IP.
  * @param wordToPerTopicProb Map assigning to each word its per-topic probabilities,
  *                           i.e. Prob[word | t] for t = 0 to topicCount - 1.
-  * @param timeCuts Quantile cut-offs for binning time-of-day values when forming words from netflow records.
-  * @param ibytCuts Quantile cut-offs for binning ibyt values when forming words from netflow records.
-  * @param ipktCuts Quantile cut-offs for binning ipkt values when forming words from netflow records.
+  * @param timeCuts           Quantile cut-offs for binning time-of-day values when forming words from netflow records.
+  * @param ibytCuts           Quantile cut-offs for binning ibyt values when forming words from netflow records.
+  * @param ipktCuts           Quantile cut-offs for binning ipkt values when forming words from netflow records.
   */
 
 class FlowSuspiciousConnectsModel(topicCount: Int,
@@ -66,15 +66,15 @@ class FlowSuspiciousConnectsModel(topicCount: Int,
 
       val recordsWithIPTopicMixes = dataWithSrcIpProb.join(ipToTopicMix,
         dataWithSrcIpProb(DestinationIP) === ipToTopicMix(DocumentName), "left_outer")
-      val schema = dataWithSrcIpProb.schema.fieldNames :+  TopicProbabilityMix
-        recordsWithIPTopicMixes.selectExpr(schema: _*).withColumnRenamed(TopicProbabilityMix, DstIpTopicMix)
+      val schema = dataWithSrcIpProb.schema.fieldNames :+ TopicProbabilityMix
+      recordsWithIPTopicMixes.selectExpr(schema: _*).withColumnRenamed(TopicProbabilityMix, DstIpTopicMix)
     }
 
-    val scoreFunction =  new FlowScoreFunction(timeCuts,
-        ibytCuts,
-        ipktCuts,
-        topicCount,
-        wordToPerTopicProbBC)
+    val scoreFunction = new FlowScoreFunction(timeCuts,
+      ibytCuts,
+      ipktCuts,
+      topicCount,
+      wordToPerTopicProbBC)
 
 
     val scoringUDF = udf((hour: Int,
@@ -149,11 +149,12 @@ object FlowSuspiciousConnectsModel {
     val timeCuts = Quantiles.computeDeciles(totalRecords
       .select(Hour, Minute, Second)
       .rdd
-      .flatMap({ case Row(hours: Int, minutes: Int, seconds: Int) => {
-          Try {  (3600 * hours + 60 * minutes + seconds).toDouble } match{
-            case Failure(_) => Seq()
-            case Success(time) => Seq(time)
-          }
+      .flatMap({ case Row(hours: Int, minutes: Int, seconds: Int) =>
+        Try {
+          (3600 * hours + 60 * minutes + seconds).toDouble
+        } match {
+          case Failure(_) => Seq()
+          case Success(time) => Seq(time)
         }
       }))
 
@@ -164,11 +165,12 @@ object FlowSuspiciousConnectsModel {
     val ibytCuts = Quantiles.computeDeciles(totalRecords
       .select(Ibyt)
       .rdd
-      .flatMap({ case Row(ibyt: Long) => {
-          Try {  ibyt.toDouble } match{
-            case Failure(_) => Seq()
-            case Success(ibyt) => Seq(ibyt)
-          }
+      .flatMap({ case Row(ibyt: Long) =>
+        Try {
+          ibyt.toDouble
+        } match {
+          case Failure(_) => Seq()
+          case Success(ibyt) => Seq(ibyt)
         }
       }))
 
@@ -179,11 +181,12 @@ object FlowSuspiciousConnectsModel {
     val ipktCuts = Quantiles.computeQuintiles(totalRecords
       .select(Ipkt)
       .rdd
-      .flatMap({ case Row(ipkt: Long) => {
-          Try { ipkt.toDouble } match {
-            case Failure(_) => Seq()
-            case Success(ipkt) => Seq(ipkt)
-          }
+      .flatMap({ case Row(ipkt: Long) =>
+        Try {
+          ipkt.toDouble
+        } match {
+          case Failure(_) => Seq()
+          case Success(ipkt) => Seq(ipkt)
         }
       }))
 
@@ -232,5 +235,4 @@ object FlowSuspiciousConnectsModel {
       ibytCuts,
       ipktCuts)
   }
-
 }
\ No newline at end of file
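The left_outer join reformatted above is what attaches each destination IP's topic mix to its flow records; IPs unseen in training come back with a null mix, which the score function later maps to 0.0. A hedged sketch of that shape, using illustrative lowercase column names in place of the FlowSchema constants:

    import org.apache.spark.sql.DataFrame

    // Join records to per-IP topic mixes, keep the original columns plus the
    // mix, and rename it so the source- and dest-side mixes can coexist.
    def withDstTopicMix(records: DataFrame, ipToTopicMix: DataFrame): DataFrame = {
      val joined = records.join(ipToTopicMix,
        records("ip_dst") === ipToTopicMix("document_name"), "left_outer")
      val columns = records.schema.fieldNames :+ "topic_prob_mix"
      joined.selectExpr(columns: _*).withColumnRenamed("topic_prob_mix", "dst_ip_topic_mix")
    }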

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/24b3a37b/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsModel.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsModel.scala b/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsModel.scala
index c38ed93..edfa341 100644
--- a/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsModel.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsModel.scala
@@ -7,11 +7,11 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.{DataFrame, Row, SQLContext}
 import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
-import org.apache.spot.proxy.ProxySchema._
-import org.apache.spot.utilities._
 import org.apache.spot.SuspiciousConnectsScoreFunction
 import org.apache.spot.lda.SpotLDAWrapper
 import org.apache.spot.lda.SpotLDAWrapper.{SpotLDAInput, SpotLDAOutput}
+import org.apache.spot.proxy.ProxySchema._
+import org.apache.spot.utilities._
 import org.apache.spot.utilities.data.validation.InvalidDataHandler
 
 import scala.util.{Failure, Success, Try}
@@ -88,7 +88,7 @@ object ProxySuspiciousConnectsModel {
     * @param sqlContext   SQL context.
    * @param logger       Logger object.
    * @param config       SuspiciousConnectsArgumentParser.Config object containing CLI arguments.
-    * @param inputRecords         Dataframe for training data, with columns Host, Time, ReqMethod, FullURI, ResponseContentType,
+    * @param inputRecords Dataframe for training data, with columns Host, Time, ReqMethod, FullURI, ResponseContentType,
     *                     UserAgent, RespCode (as defined in ProxySchema object).
     * @return ProxySuspiciousConnectsModel
     */
@@ -108,23 +108,27 @@ object ProxySuspiciousConnectsModel {
       Quantiles.computeDeciles(selectedRecords
         .select(Time)
         .rdd
-        .flatMap({ case Row(t: String) => {
-            Try {TimeUtilities.getTimeAsDouble(t)} match {
-              case Failure(_) => Seq()
-              case Success(time) =>  Seq(time)
-            }
+        .flatMap({ case Row(t: String) =>
+          Try {
+            TimeUtilities.getTimeAsDouble(t)
+          } match {
+            case Failure(_) => Seq()
+            case Success(time) => Seq(time)
+
           }
         }))
 
     val entropyCuts = Quantiles.computeQuintiles(selectedRecords
       .select(FullURI)
       .rdd
-      .flatMap({ case Row(uri: String) => {
-          Try {Entropy.stringEntropy(uri)} match {
-            case Failure(_) => Seq()
-            case Success(entropy) => Seq(entropy)
-          }
+      .flatMap({ case Row(uri: String) =>
+        Try {
+          Entropy.stringEntropy(uri)
+        } match {
+          case Failure(_) => Seq()
+          case Success(entropy) => Seq(entropy)
         }
+
       }))
 
     val agentToCount: Map[String, Long] =
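The quintile cuts above discretize the Shannon entropy of the full URI. A from-scratch sketch of character-level entropy (this is not Spot's Entropy.stringEntropy, but it is consistent with the DNSWordCreationTest expectation later in this mail, since seven equally frequent characters give log2(7) ≈ 2.8074):

    // Character-level Shannon entropy in bits: -sum(p * log2 p) over the
    // relative frequency p of each distinct character.
    def stringEntropySketch(s: String): Double = {
      val n = s.length.toDouble
      s.groupBy(identity).values
        .map(_.length / n)
        .map(p => -p * math.log(p) / math.log(2))
        .sum
    }

    println(stringEntropySketch("abcdefg")) // ~2.807354922057604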

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/24b3a37b/spot-ml/src/main/scala/org/apache/spot/utilities/data/InputOutputDataHandler.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/utilities/data/InputOutputDataHandler.scala b/spot-ml/src/main/scala/org/apache/spot/utilities/data/InputOutputDataHandler.scala
index e7934f4..fbbd085 100644
--- a/spot-ml/src/main/scala/org/apache/spot/utilities/data/InputOutputDataHandler.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/utilities/data/InputOutputDataHandler.scala
@@ -1,7 +1,7 @@
 package org.apache.spot.utilities.data
 
-import org.apache.log4j.Logger
 import org.apache.hadoop.fs.{LocatedFileStatus, Path, RemoteIterator, FileUtil => fileUtil}
+import org.apache.log4j.Logger
 import org.apache.spark.SparkContext
 import org.apache.spark.sql.{DataFrame, SQLContext}
 
@@ -15,27 +15,25 @@ object InputOutputDataHandler {
   /**
     *
     * @param sqlContext Application SqlContext.
-    * @param inputPath HDFS input folder for every execution; flow, dns or proxy.
-    * @param logger Application logger.
+    * @param inputPath  HDFS input folder for every execution; flow, dns or proxy.
+    * @param logger     Application logger.
     * @return raw data frame.
     */
-  def getInputDataFrame(sqlContext: SQLContext, inputPath: String, logger: Logger): Option[DataFrame] ={
+  def getInputDataFrame(sqlContext: SQLContext, inputPath: String, logger: Logger): Option[DataFrame] = {
     try {
       logger.info("Loading data from: " + inputPath)
       Some(sqlContext.read.parquet(inputPath))
     } catch {
-      case _ : Throwable => {
-        None
-      }
+      case _: Throwable => None
     }
   }
 
   /**
     *
-    * @param sparkContext Application SparkContext.
+    * @param sparkContext      Application SparkContext.
     * @param hdfsScoredConnect HDFS output folder. The location where results were saved; flow, dns or proxy.
-    * @param analysis Data type to analyze.
-    * @param logger Application Logger.
+    * @param analysis          Data type to analyze.
+    * @param logger            Application Logger.
     */
   def mergeResultsFiles(sparkContext: SparkContext, hdfsScoredConnect: String, analysis: String, logger: Logger) {
     val hadoopConfiguration = sparkContext.hadoopConfiguration
@@ -43,21 +41,21 @@ object InputOutputDataHandler {
 
     val exists = fileSystem.exists(new org.apache.hadoop.fs.Path(hdfsScoredConnect))
 
-    if(exists){
+    if (exists) {
       val srcDir = new Path(hdfsScoredConnect)
-      val dstFile = new Path(hdfsScoredConnect+"/"+analysis+"_results.csv")
-      fileUtil.copyMerge(fileSystem,srcDir, fileSystem, dstFile, false, hadoopConfiguration, "")
+      val dstFile = new Path(hdfsScoredConnect + "/" + analysis + "_results.csv")
+      fileUtil.copyMerge(fileSystem, srcDir, fileSystem, dstFile, false, hadoopConfiguration, "")
 
       val files: RemoteIterator[LocatedFileStatus] = fileSystem.listFiles(srcDir, false)
-      while (files.hasNext){
-        val filePath = files.next().getPath()
-        if(filePath.toString.contains("part-")){
+      while (files.hasNext) {
+        val filePath = files.next.getPath
+        if (filePath.toString.contains("part-")) {
           fileSystem.delete(filePath, false)
         }
       }
     }
     else logger.info(s"Couldn't find results in $hdfsScoredConnect." +
-        s"Please check previous logs to see if there were errors.")
-   }
+      s"Please check previous logs to see if there were errors.")
+  }
 
 }
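Since getInputDataFrame now swallows the read failure and returns an Option, callers decide what a missing or unreadable input path means. A hedged usage sketch (the path handling is illustrative):

    import org.apache.log4j.Logger
    import org.apache.spark.sql.SQLContext
    import org.apache.spot.utilities.data.InputOutputDataHandler

    def loadOrReport(sqlContext: SQLContext, inputPath: String, logger: Logger): Unit =
      InputOutputDataHandler.getInputDataFrame(sqlContext, inputPath, logger) match {
        case Some(df) => logger.info(s"Loaded ${df.count()} records from $inputPath")
        case None     => logger.error(s"Could not read parquet data at $inputPath")
      }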

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/24b3a37b/spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala b/spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala
index b106e6e..6e03585 100644
--- a/spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala
+++ b/spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala
@@ -2,32 +2,31 @@ package org.apache.spot.dns
 
 
 import org.apache.log4j.{Level, LogManager}
-import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.{Row, SQLContext}
 import org.apache.spark.sql.types.StructType
-import org.apache.spark.sql.Row
+import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
 import org.apache.spot.dns.DNSSchema._
 import org.apache.spot.testutils.TestingSparkContextFlatSpec
 import org.scalatest.Matchers
 
 
-case class DNSInput(frame_time:String, unix_tstamp:Long, frame_len:Int, ip_dst: String, dns_qry_name:String, dns_qry_class:String, dns_qry_type: Int, dns_qry_rcode: Int)
+case class DNSInput(frame_time: String, unix_tstamp: Long, frame_len: Int, ip_dst: String, dns_qry_name: String, dns_qry_class: String, dns_qry_type: Int, dns_qry_rcode: Int)
 
-class DNSSuspiciousConnectsAnalysisTest  extends TestingSparkContextFlatSpec with Matchers {
+class DNSSuspiciousConnectsAnalysisTest extends TestingSparkContextFlatSpec with Matchers {
 
   val testConfig = SuspiciousConnectsConfig(analysis = "dns",
-  inputPath = "",
-  feedbackFile = "",
-  duplicationFactor = 1,
-  topicCount = 20,
-  hdfsScoredConnect = "",
-  threshold = 1.0d,
-  maxResults = 1000,
-  outputDelimiter = "\t",
-  ldaPRGSeed = None,
-  ldaMaxiterations = 20,
-  ldaAlpha = 1.02,
-  ldaBeta = 1.001)
+    inputPath = "",
+    feedbackFile = "",
+    duplicationFactor = 1,
+    topicCount = 20,
+    hdfsScoredConnect = "",
+    threshold = 1.0d,
+    maxResults = 1000,
+    outputDelimiter = "\t",
+    ldaPRGSeed = None,
+    ldaMaxiterations = 20,
+    ldaAlpha = 1.02,
+    ldaBeta = 1.001)
 
 
   "dns supicious connects analysis" should "estimate correct probabilities in toy data with framelength anomaly" in {
@@ -35,12 +34,12 @@ class DNSSuspiciousConnectsAnalysisTest  extends TestingSparkContextFlatSpec wit
     val logger = LogManager.getLogger("SuspiciousConnectsAnalysis")
     logger.setLevel(Level.WARN)
 
-    val anomalousRecord = DNSInput("May 20 2016 02:10:25.970987000 PDT",	1463735425L,	1,	"172.16.9.132",	"turner.com.122.2o7.net",	"0x00000001",	1,	0)
-    val typicalRecord   = DNSInput("May 20 2016 02:10:25.970987000 PDT",	1463735425L,	168,	"172.16.9.132",	"turner.com.122.2o7.net",	"0x00000001",	1,	0)
+    val anomalousRecord = DNSInput("May 20 2016 02:10:25.970987000 PDT", 1463735425L, 1, "172.16.9.132", "turner.com.122.2o7.net", "0x00000001", 1, 0)
+    val typicalRecord = DNSInput("May 20 2016 02:10:25.970987000 PDT", 1463735425L, 168, "172.16.9.132", "turner.com.122.2o7.net", "0x00000001", 1, 0)
 
     val data = sqlContext.createDataFrame(Seq(anomalousRecord, typicalRecord, typicalRecord, typicalRecord, typicalRecord))
 
-    val scoredData = DNSSuspiciousConnectsAnalysis.detectDNSAnomalies(data, testConfig,
+    val scoredData = DNSSuspiciousConnectsAnalysis.scoreDNSRecords(data, testConfig,
       sparkContext,
       sqlContext,
       logger)
@@ -49,12 +48,12 @@ class DNSSuspiciousConnectsAnalysisTest  extends TestingSparkContextFlatSpec wit
     val anomalyScore = scoredData.filter(scoredData(FrameLength) === 1).first().getAs[Double](Score)
     val typicalScores = scoredData.filter(scoredData(FrameLength) === 168).collect().map(_.getAs[Double](Score))
 
-    Math.abs(anomalyScore - 0.2d)  should be <= 0.01d
+    Math.abs(anomalyScore - 0.2d) should be <= 0.01d
     typicalScores.length shouldBe 4
-    Math.abs(typicalScores(0) - 0.8d)  should be <= 0.01d
-    Math.abs(typicalScores(1) - 0.8d)  should be <= 0.01d
-    Math.abs(typicalScores(2) - 0.8d)  should be <= 0.01d
-    Math.abs(typicalScores(3) - 0.8d)  should be <= 0.01d
+    Math.abs(typicalScores(0) - 0.8d) should be <= 0.01d
+    Math.abs(typicalScores(1) - 0.8d) should be <= 0.01d
+    Math.abs(typicalScores(2) - 0.8d) should be <= 0.01d
+    Math.abs(typicalScores(3) - 0.8d) should be <= 0.01d
   }
 
 

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/24b3a37b/spot-ml/src/test/scala/org/apache/spot/dns/DNSWordCreationTest.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/scala/org/apache/spot/dns/DNSWordCreationTest.scala b/spot-ml/src/test/scala/org/apache/spot/dns/DNSWordCreationTest.scala
index fa233a2..1422bb6 100644
--- a/spot-ml/src/test/scala/org/apache/spot/dns/DNSWordCreationTest.scala
+++ b/spot-ml/src/test/scala/org/apache/spot/dns/DNSWordCreationTest.scala
@@ -13,5 +13,4 @@ class DNSWordCreationTest extends TestingSparkContextFlatSpec with Matchers {
 
     result shouldBe 2.807354922057604
   }
-
 }

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/24b3a37b/spot-ml/src/test/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysisTest.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysisTest.scala b/spot-ml/src/test/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysisTest.scala
index d59635d..8f9f89f 100644
--- a/spot-ml/src/test/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysisTest.scala
+++ b/spot-ml/src/test/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysisTest.scala
@@ -1,8 +1,8 @@
 package org.apache.spot.netflow
 
-import org.apache.log4j.{Level, LogManager, Logger}
-import org.apache.spark.SparkContext
-import org.apache.spark.sql.{DataFrame, SQLContext}
+import org.apache.log4j.{Level, LogManager}
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.{DataFrame, Row, SQLContext}
 import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
 import org.apache.spot.netflow.FlowSchema._
 import org.apache.spot.testutils.TestingSparkContextFlatSpec
@@ -56,7 +56,7 @@ class FlowSuspiciousConnectsAnalysisTest extends TestingSparkContextFlatSpec wit
       typicalRecord, typicalRecord, typicalRecord, typicalRecord))
 
 
-    val scoredData : DataFrame = FlowSuspiciousConnectsAnalysis.detectFlowAnomalies(data,
+    val scoredData: DataFrame = FlowSuspiciousConnectsAnalysis.detectFlowAnomalies(data,
       testConfig,
       sparkContext,
       sqlContext,
@@ -80,7 +80,114 @@ class FlowSuspiciousConnectsAnalysisTest extends TestingSparkContextFlatSpec wit
     Math.abs(typicalScores(8) - 0.9d) should be < 0.01
 
 
+  }
+  "filterAndSelectCleanFlowRecords" should "return data set without garbage" in {
+
+    val cleanedFlowRecords = FlowSuspiciousConnectsAnalysis
+      .filterAndSelectCleanFlowRecords(testFlowRecords.inputFlowRecordsDF)
+
+    cleanedFlowRecords.count should be(5)
+    cleanedFlowRecords.schema.size should be(17)
+  }
+
+  "filterAndSelectInvalidFlowRecords" should "return invalid records" in {
+
+    val invalidFlowRecords = FlowSuspiciousConnectsAnalysis
+      .filterAndSelectInvalidFlowRecords(testFlowRecords.inputFlowRecordsDF)
+
+    invalidFlowRecords.count should be(7)
+    invalidFlowRecords.schema.size should be(17)
+  }
 
+  "filterScoredFlowRecords" should "return records with score less or equal to threshold" in {
+
+    val threshold = 10e-5
+
+    val scoredFlowRecords = FlowSuspiciousConnectsAnalysis
+      .filterScoredFlowRecords(testFlowRecords.scoredFlowRecordsDF, threshold)
+
+    scoredFlowRecords.count should be(2)
+  }
+
+  "filterAndSelectCorruptFlowRecords" should "return records where Score is equal to -1" in {
+
+    val corruptFlowRecords = FlowSuspiciousConnectsAnalysis
+      .filterAndSelectCorruptFlowRecords(testFlowRecords.scoredFlowRecordsDF)
+
+    corruptFlowRecords.count should be(1)
+    corruptFlowRecords.schema.size should be(18)
   }
 
+  def testFlowRecords = new {
+    val sqlContext = new SQLContext(sparkContext)
+
+    val inputFlowRecordsRDD = sparkContext.parallelize(wrapRefArray(Array(
+      Seq("2016-05-05 13:54:58", 2016, 5, 5, 24, 54, 58, 0.972, "172.16.0.129", "10.0.2.202", 1024, 80, "TCP", 39l, 12522l, 0, 0),
+      Seq("2016-05-05 13:54:58", 2016, 5, 5, 13, 54, 60, 0.972, "172.16.0.129", "10.0.2.202", 1024, 80, "TCP", 39l, 12522l, 0, 0),
+      Seq("2016-05-05 13:54:58", 2016, 5, 5, 13, 60, 58, 0.972, "172.16.0.129", "10.0.2.202", 1024, 80, "TCP", 39l, 12522l, 0, 0),
+      Seq(null, 2016, 5, 5, 13, 54, 58, 0.972, "172.16.0.129", "10.0.2.202", 1024, 80, "TCP", 39l, 12522l, 0, 0),
+      Seq("2016-05-05 13:54:58", 2016, 5, 5, 13, 54, 58, 0.972, null, "10.0.2.202", 1024, 80, "TCP", 39l, 12522l, 0, 0),
+      Seq("2016-05-05 13:54:58", 2016, 5, 5, 13, 54, 58, 0.972, "172.16.0.129", null, 1024, 80, "TCP", 39l, 12522l, 0, 0),
+      Seq("2016-05-05 13:54:58", 2016, 5, 5, 13, 54, 58, 0.972, "172.16.0.129", "10.0.2.202", null, 80, "TCP", 39l, 12522l, 0, 0),
+      Seq("2016-05-05 13:54:58", 2016, 5, 5, 13, 54, 58, 0.972, "172.16.0.129", "10.0.2.202", 1024, null, "TCP", 39l, 12522l, 0, 0),
+      Seq("2016-05-05 13:54:58", 2016, 5, 5, 13, 54, 58, 0.972, "172.16.0.129", "10.0.2.202", 1024, 80, "TCP", null, 12522l, 0, 0),
+      Seq("2016-05-05 13:54:58", 2016, 5, 5, 13, 54, 58, 0.972, "172.16.0.129", "10.0.2.202", 1024, 80, "TCP", 39l, null, 0, 0),
+      Seq("2016-05-05 13:54:58", 2016, 5, 5, 13, 54, 58, 0.972, "172.16.0.129", "10.0.2.202", 1024, 80, "TCP", 39l, 12522l, null, 0),
+      Seq("2016-05-05 13:54:58", 2016, 5, 5, 13, 54, 58, 0.972, "172.16.0.129", "10.0.2.202", 1024, 80, "TCP", 39l, 12522l, 0, null),
+      Seq("2016-05-05 13:54:58", 2016, 5, 5, 13, 54, 58, 0.972, "172.16.0.129", "10.0.2.202", 1024, 80, "TCP", 39l, 12522l, 0, 0),
+      Seq("2016-05-05 13:54:58", 2016, 5, 5, 13, 54, 58, 0.972, "172.16.0.129", "10.0.2.202", 1024, 80, "TCP", 39l, 12522l, 0, 0),
+      Seq("2016-05-05 13:54:58", 2016, 5, 5, 13, 54, 58, 0.972, "172.16.0.129", "10.0.2.202", 1024, 80, "TCP", 39l, 12522l, 0, 0))
+      .map(row => Row.fromSeq(row))))
+
+    val inputFlowRecordsSchema = StructType(
+      Array(TimeReceivedField,
+        YearField,
+        MonthField,
+        DayField,
+        HourField,
+        MinuteField,
+        SecondField,
+        DurationField,
+        SourceIPField,
+        DestinationIPField,
+        SourcePortField,
+        DestinationPortField,
+        ProtocolField,
+        IpktField,
+        IbytField,
+        OpktField,
+        ObytField))
+
+    val inputFlowRecordsDF = sqlContext.createDataFrame(inputFlowRecordsRDD, inputFlowRecordsSchema)
+
+    val scoredFlowRecordsRDD = sparkContext.parallelize(wrapRefArray(Array(
+      Seq("2016-05-05 13:54:58", 2016, 5, 5, 13, 54, 58, 0.972, "172.16.0.129", "10.0.2.202", 1024, 80, "TCP", 39l, 12522l, 0, 0, -1d),
+      Seq("2016-05-05 13:54:58", 2016, 5, 5, 13, 54, 58, 0.972, "172.16.0.129", "10.0.2.202", 1024, 80, "TCP", 39l, 12522l, 0, 0, 1d),
+      Seq("2016-05-05 13:54:58", 2016, 5, 5, 13, 54, 58, 0.972, "172.16.0.129", "10.0.2.202", 1024, 80, "TCP", 39l, 12522l, 0, 0, 0.0000005),
+      Seq("2016-05-05 13:54:58", 2016, 5, 5, 13, 54, 58, 0.972, "172.16.0.129", "10.0.2.202", 1024, 80, "TCP", 39l, 12522l, 0, 0, 0.05),
+      Seq("2016-05-05 13:54:58", 2016, 5, 5, 13, 54, 58, 0.972, "172.16.0.129", "10.0.2.202", 1024, 80, "TCP", 39l, 12522l, 0, 0, 0.0001))
+      .map(row => Row.fromSeq(row))))
+
+    val scoredFlowRecordsSchema = StructType(
+      Array(TimeReceivedField,
+        YearField,
+        MonthField,
+        DayField,
+        HourField,
+        MinuteField,
+        SecondField,
+        DurationField,
+        SourceIPField,
+        DestinationIPField,
+        SourcePortField,
+        DestinationPortField,
+        ProtocolField,
+        IpktField,
+        IbytField,
+        OpktField,
+        ObytField,
+        ScoreField))
+
+    val scoredFlowRecordsDF = sqlContext.createDataFrame(scoredFlowRecordsRDD, scoredFlowRecordsSchema)
+  }
 }


[27/49] incubator-spot git commit: unit_test_cleanup

Posted by ev...@apache.org.
unit_test_cleanup

removal of unnecessary toString() calls in FlowWordCreator


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/986cebf9
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/986cebf9
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/986cebf9

Branch: refs/heads/master
Commit: 986cebf973e91d40a61f3384f2c956a5589ad514
Parents: 5a75bc5
Author: nlsegerl <na...@intel.com>
Authored: Thu Dec 22 11:08:59 2016 -0800
Committer: nlsegerl <na...@intel.com>
Committed: Thu Dec 22 11:08:59 2016 -0800

----------------------------------------------------------------------
 .../main/scala/org/apache/spot/netflow/FlowWordCreator.scala   | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/986cebf9/spot-ml/src/main/scala/org/apache/spot/netflow/FlowWordCreator.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/netflow/FlowWordCreator.scala b/spot-ml/src/main/scala/org/apache/spot/netflow/FlowWordCreator.scala
index f82d270..bbea010 100644
--- a/spot-ml/src/main/scala/org/apache/spot/netflow/FlowWordCreator.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/netflow/FlowWordCreator.scala
@@ -74,9 +74,9 @@ class FlowWordCreator(timeCuts: Array[Double],
 
     val timeOfDay: Double = hour.toDouble + minute.toDouble / 60 + second.toDouble / 3600
 
-    val timeBin = Quantiles.bin(timeOfDay, timeCuts).toString()
-    val ibytBin = Quantiles.bin(ibyt, ibytCuts).toString()
-    val ipktBin = Quantiles.bin(ipkt, ipktCuts).toString()
+    val timeBin = Quantiles.bin(timeOfDay, timeCuts)
+    val ibytBin = Quantiles.bin(ibyt, ibytCuts)
+    val ipktBin = Quantiles.bin(ipkt, ipktCuts)
 
 
     val LowToLowPortEncoding = 111111
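
The dropped toString() calls were redundant: mkString, used elsewhere in FlowWordCreator to build words such as "23_5_2_0", stringifies every element itself, so Int bins render exactly as their String forms did. A minimal standalone sketch (hypothetical values, not project code):

    object MkStringSketch {
      def main(args: Array[String]): Unit = {
        val timeBin = 5   // the bins are now Ints rather than Strings
        val ibytBin = 2
        val ipktBin = 0
        // mkString calls toString on each element, so no explicit conversion is needed
        val word = Array("23", timeBin, ibytBin, ipktBin).mkString("_")
        println(word)     // prints 23_5_2_0
      }
    }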


[02/49] incubator-spot git commit: Inserted lines into spot.conf and ml_ops.sh for input of user domain string.

Posted by ev...@apache.org.
Inserted lines into spot.conf and ml_ops.sh for input of user domain string.


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/41ffbc3c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/41ffbc3c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/41ffbc3c

Branch: refs/heads/master
Commit: 41ffbc3c3744eadc7baac4633f160188bcad731a
Parents: bc5744f
Author: Brandon Edwards <br...@intel.com>
Authored: Tue Dec 6 09:03:02 2016 -0800
Committer: Brandon Edwards <br...@intel.com>
Committed: Tue Dec 6 21:59:29 2016 -0800

----------------------------------------------------------------------
 spot-ml/ml_ops.sh    | 3 ++-
 spot-setup/spot.conf | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/41ffbc3c/spot-ml/ml_ops.sh
----------------------------------------------------------------------
diff --git a/spot-ml/ml_ops.sh b/spot-ml/ml_ops.sh
index 244bcc4..cf1bc31 100755
--- a/spot-ml/ml_ops.sh
+++ b/spot-ml/ml_ops.sh
@@ -97,6 +97,7 @@ time spark-submit --class "org.apache.spot.SuspiciousConnects" \
   --dupfactor ${DUPFACTOR} \
   --feedback ${FEEDBACK_PATH} \
   --ldatopiccount ${TOPIC_COUNT} \
+  --userdomain ${USER_DOMAIN}\
   --scored ${HDFS_SCORED_CONNECTS} \
   --threshold ${TOL} \
   --maxresults ${MAXRESULTS} \
@@ -107,4 +108,4 @@ wait
 # move results to hdfs.
 cd ${LPATH}
 hadoop fs -getmerge ${HDFS_SCORED_CONNECTS}/part-* ${DSOURCE}_results.csv && hadoop fs -moveFromLocal \
-    ${DSOURCE}_results.csv  ${HDFS_SCORED_CONNECTS}/${DSOURCE}_results.csv
\ No newline at end of file
+    ${DSOURCE}_results.csv  ${HDFS_SCORED_CONNECTS}/${DSOURCE}_results.csv

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/41ffbc3c/spot-setup/spot.conf
----------------------------------------------------------------------
diff --git a/spot-setup/spot.conf b/spot-setup/spot.conf
index 8f60eb1..433afa8 100755
--- a/spot-setup/spot.conf
+++ b/spot-setup/spot.conf
@@ -17,6 +17,7 @@ HPATH=${HUSER}/${DSOURCE}/scored_results/${FDATE}
 #impala config
 IMPALA_DEM='node04'
 
+#kerberos config
 KRB_AUTH=false
 KINITPATH=
 KINITOPTS=
@@ -30,6 +31,9 @@ RPATH=${LUSER}/ipython/user/${FDATE}
 LDAPATH=${LUSER}/ml/oni-lda-c
 LIPATH=${LUSER}/ingest
 
+#domain associated to network data to be analyzed
+USER_DOMAIN='intel'
+
 SPK_EXEC='400'
 SPK_EXEC_MEM='2048m'
 SPK_DRIVER_MEM=''
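
The new USER_DOMAIN value flows from spot.conf through the --userdomain flag in ml_ops.sh and into the Scala argument parser. A minimal sketch of the receiving end, assuming scopt and a config case class with a userDomain field (the actual wiring lives in SuspiciousConnectsArgumentParser.scala):

    import scopt.OptionParser

    case class Config(userDomain: String = "")

    object UserDomainParserSketch {
      // parses e.g. --userdomain intel into Config(userDomain = "intel")
      val parser = new OptionParser[Config]("suspicious-connects") {
        opt[String]("userdomain").valueName("<user domain>").
          action((x, c) => c.copy(userDomain = x)).
          text("domain associated with the network data to be analyzed")
      }

      def main(args: Array[String]): Unit =
        parser.parse(args, Config()).foreach(c => println(s"user domain: ${c.userDomain}"))
    }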


[39/49] incubator-spot git commit: Reverted changes to split into two different PRs

Posted by ev...@apache.org.
Reverted changes to split into two different PRs


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/a958cb4e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/a958cb4e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/a958cb4e

Branch: refs/heads/master
Commit: a958cb4e5f8b2c3976bb7a32731bcfb969927028
Parents: c9e27ba
Author: LedaLima <ga...@intel.com>
Authored: Fri Jan 20 14:28:18 2017 -0600
Committer: Everardo Lopez Sandoval (Intel) <el...@elopezsa-mac02.ra.intel.com>
Committed: Fri Jan 20 17:01:02 2017 -0800

----------------------------------------------------------------------
 spot-oa/oa/dns/dns_oa.py | 22 ++++------------------
 spot-oa/oa/utils.py      |  5 ++---
 2 files changed, 6 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/a958cb4e/spot-oa/oa/dns/dns_oa.py
----------------------------------------------------------------------
diff --git a/spot-oa/oa/dns/dns_oa.py b/spot-oa/oa/dns/dns_oa.py
index ffcc839..abb8cac 100644
--- a/spot-oa/oa/dns/dns_oa.py
+++ b/spot-oa/oa/dns/dns_oa.py
@@ -143,31 +143,17 @@ class OA(object):
         dns_scores_csv = "{0}/dns_scores.csv".format(self._data_path)
         dns_scores_final =  self._move_time_stamp(self._dns_scores)
         dns_scores_final.insert(0,self._dns_scores_headers)
-        Util.create_csv_file(dns_scores_csv,dns_scores_final,',',0)   
+        Util.create_csv_file(dns_scores_csv,dns_scores_final)   
 
         # create bk file
         dns_scores_bu_csv = "{0}/dns_scores_bu.csv".format(self._data_path)
-        Util.create_csv_file(dns_scores_bu_csv,dns_scores_final,',',0)     
+        Util.create_csv_file(dns_scores_bu_csv,dns_scores_final)     
 
 
     def _add_tld_column(self):
-        qry_name_col = self._conf['dns_results_fields']['dns_qry_name']
-        self._dns_scores = [conn + [ self._get_valid_tld(str(conn[qry_name_col])) ] for conn in self._dns_scores ]
-         
+        qry_name_col = self._conf['dns_results_fields']['dns_qry_name'] 
+        self._dns_scores = [conn + [ get_tld("http://" + str(conn[qry_name_col]), fail_silently=True) if "http://" not in str(conn[qry_name_col]) else get_tld(str(conn[qry_name_col]), fail_silently=True)] for conn in self._dns_scores ] 
   
-    def _get_valid_tld(self, qry_name):
-        tld = ""
-        try:
-            if "http://" not in qry_name: 
-                tld = get_tld("http://" + qry_name)
-            else:
-                tld = get_tld(qry_name)
-        except ValueError:
-            self._logger.error("Unable to get top level domain from query: {0}".format(qry_name))
-            tld = "UNKNOWN"
-        return tld
-    
-
     def _add_reputation(self):
 
         # read configuration.

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/a958cb4e/spot-oa/oa/utils.py
----------------------------------------------------------------------
diff --git a/spot-oa/oa/utils.py b/spot-oa/oa/utils.py
index 52b251a..d08b26e 100644
--- a/spot-oa/oa/utils.py
+++ b/spot-oa/oa/utils.py
@@ -98,10 +98,9 @@ class Util(object):
 	
 	
 	@classmethod
-	def create_csv_file(cls,full_path_file,content,delimiter=',',set_quoting=3):  
-		#set_quoting: 0 - MINIMAL, 1 - ALL, 3 - NONE
+	def create_csv_file(cls,full_path_file,content,delimiter=','):   
 		with open(full_path_file, 'w+') as u_file:
-			writer = csv.writer(u_file, quoting=set_quoting, quotechar='"', delimiter=delimiter)
+			writer = csv.writer(u_file, quoting=csv.QUOTE_NONE, delimiter=delimiter)
 			writer.writerows(content)
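
The revert swaps the explicit guard (_get_valid_tld, which caught ValueError, logged the bad query, and fell back to "UNKNOWN") for get_tld's fail_silently=True, which yields None on failure rather than a labeled fallback. The guarded shape, transposed to Scala for consistency with the rest of this digest (extractTld is a hypothetical stand-in, not project code):

    import scala.util.Try

    object TldGuardSketch {
      // hypothetical extractor standing in for Python's get_tld
      def extractTld(url: String): String = {
        val host = url.stripPrefix("http://")
        if (host.contains(".")) host.substring(host.lastIndexOf('.') + 1)
        else throw new IllegalArgumentException(s"no TLD in $url")
      }

      // the pattern being reverted away: guard the call, fall back to "UNKNOWN"
      def validTld(qryName: String): String = {
        val url = if (qryName.startsWith("http://")) qryName else "http://" + qryName
        Try(extractTld(url)).getOrElse("UNKNOWN")
      }

      def main(args: Array[String]): Unit = {
        println(validTld("turner.com.122.2o7.net"))  // net
        println(validTld("localhost"))               // UNKNOWN
      }
    }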
 
 


[34/49] incubator-spot git commit: Merge branch 'spot' into unit_test_cleanup

Posted by ev...@apache.org.
Merge branch 'spot' into unit_test_cleanup

# Conflicts:
#	spot-ml/src/main/scala/org/apache/spot/SuspiciousConnects.scala
#	spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
#	spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala
#	spot-ml/src/main/scala/org/apache/spot/netflow/FlowWordCreator.scala
#	spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala
#	spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala
#	spot-ml/src/test/scala/org/apache/spot/dns/DNSWordCreationTest.scala
#	spot-ml/src/test/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysisTest.scala


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/9ac5a8c7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/9ac5a8c7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/9ac5a8c7

Branch: refs/heads/master
Commit: 9ac5a8c763e50a047eeb38a3a6b93aad5a30cc5b
Parents: c638e2f a58345e
Author: nlsegerl <na...@intel.com>
Authored: Wed Jan 4 15:29:31 2017 -0800
Committer: nlsegerl <na...@intel.com>
Committed: Wed Jan 4 15:29:31 2017 -0800

----------------------------------------------------------------------
 spot-ml/install_ml.sh                           |  12 --
 spot-ml/ml_ops.sh                               |  40 +---
 spot-ml/ml_test.sh                              |  30 +--
 .../org/apache/spot/SuspiciousConnects.scala    |   5 +-
 .../spot/SuspiciousConnectsScoreFunction.scala  |  19 +-
 .../scala/org/apache/spot/dns/DNSSchema.scala   |   2 -
 .../dns/DNSSuspiciousConnectsAnalysis.scala     |   8 +-
 .../org/apache/spot/dns/DNSWordCreation.scala   |  28 ++-
 .../dns/model/DNSSuspiciousConnectsModel.scala  |  99 +++++++--
 .../FlowSuspiciousConnectsAnalysis.scala        |   3 +
 .../apache/spot/netflow/FlowWordCreator.scala   |  66 +++---
 .../spot/netflow/model/FlowScoreFunction.scala  |   9 +-
 .../model/FlowSuspiciousConnectsModel.scala     |  73 ++++---
 .../org/apache/spot/proxy/ProxySchema.scala     |  49 +++++
 .../proxy/ProxySuspiciousConnectsAnalysis.scala |  31 ++-
 .../proxy/ProxySuspiciousConnectsModel.scala    |  65 ++++--
 .../apache/spot/proxy/ProxyWordCreation.scala   |  27 ++-
 .../utilities/data/InputOutputDataHandler.scala |  63 ++++++
 .../data/validation/InvalidDataHandler.scala    |  56 +++++
 .../org/apache/spot/DNSWordCreationTest.scala   |  19 --
 .../org/apache/spot/FlowWordCreatorTest.scala   | 216 -------------------
 .../dns/DNSSuspiciousConnectsAnalysisTest.scala | 105 +++++++++
 .../apache/spot/dns/DNSWordCreationTest.scala   |  17 ++
 .../spot/netflow/FlowWordCreatorTest.scala      | 214 ++++++++++++++++++
 .../ProxySuspiciousConnectsAnalysisTest.scala   | 123 +++++++++++
 spot-oa/oa/components/iana/dns-qclass.csv       |   1 +
 spot-oa/oa/components/iana/dns-qtype.csv        |   1 +
 spot-oa/oa/components/iana/dns-rcode.csv        |   1 +
 spot-setup/spot.conf                            |  13 +-
 29 files changed, 934 insertions(+), 461 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/9ac5a8c7/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
----------------------------------------------------------------------
diff --cc spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
index 5db5c50,f444dfe..929b69e
--- a/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
@@@ -21,10 -19,8 +20,11 @@@ import org.apache.spot.utilities.data.v
  
  object DNSSuspiciousConnectsAnalysis {
  
 +
++
    /**
      * Run suspicious connections analysis on DNS log data.
 +    * Saves the most suspicious connections to a CSV file on HDFS.
      *
      * @param config Object encapsulating runtime parameters and CLI options.
      * @param sparkContext
@@@ -64,30 -63,6 +64,33 @@@
      dataValidation.showAndSaveCorruptRecords(corruptDNSRecords, config.hdfsScoredConnect, logger)
    }
  
++
 +  /**
 +    * Identify anomalous DNS log entries in the provided data frame.
 +    *
 +    * @param data Data frame of DNS entries
 +    * @param config
 +    * @param sparkContext
 +    * @param sqlContext
 +    * @param logger
 +    * @return
 +    */
++
 +  def detectDNSAnomalies(data: DataFrame, config: SuspiciousConnectsConfig,
 +                         sparkContext: SparkContext,
 +                         sqlContext: SQLContext,
 +                         logger: Logger) : DataFrame = {
 +
 +    val userDomain = config.userDomain
 +    logger.info("Fitting probabilistic model to data")
 +    val model =
 +      DNSSuspiciousConnectsModel.trainNewModel(sparkContext, sqlContext, logger, config, data, config.topicCount)
 +
 +    logger.info("Identifying outliers")
 +    model.score(sparkContext, sqlContext, data, userDomain)
 +  }
 +
++
    /**
      *
      * @param inputDNSRecords raw DNS records.
@@@ -121,6 -96,6 +124,7 @@@
        .na.fill(DefaultQueryResponseCode, Seq(QueryResponseCode))
    }
  
++
    /**
      *
      * @param inputDNSRecords raw DNS records.
@@@ -151,6 -126,6 +155,7 @@@
        .select(InSchema: _*)
    }
  
++
    /**
      *
      * @param scoredDNSRecords scored DNS records.
@@@ -159,6 -134,6 +164,7 @@@
      */
    def filterScoredDNSRecords(scoredDNSRecords: DataFrame, threshold: Double): DataFrame ={
  
++
      val filteredDNSRecordsFilter = scoredDNSRecords(Score).leq(threshold) &&
        scoredDNSRecords(Score).gt(dataValidation.ScoreError)
  

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/9ac5a8c7/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala
----------------------------------------------------------------------
diff --cc spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala
index 127b2a7,2ff1383..aad3e66
--- a/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala
@@@ -18,8 -17,8 +18,9 @@@ import org.apache.spot.utilities.data.v
  
  object FlowSuspiciousConnectsAnalysis {
  
 -  def run(config: SuspiciousConnectsConfig, sparkContext: SparkContext, sqlContext: SQLContext,
 -          logger: Logger, inputFlowRecords: DataFrame) = {
++
 +  def run(config: SuspiciousConnectsConfig, sparkContext: SparkContext, sqlContext: SQLContext, logger: Logger,
 +          inputFlowRecords: DataFrame) = {
  
      logger.info("Starting flow suspicious connects analysis.")
  
@@@ -148,6 -129,6 +149,8 @@@
  
    }
  
++
++
    val InSchema = StructType(List(TimeReceivedField,
      YearField,
      MonthField,

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/9ac5a8c7/spot-ml/src/main/scala/org/apache/spot/netflow/FlowWordCreator.scala
----------------------------------------------------------------------
diff --cc spot-ml/src/main/scala/org/apache/spot/netflow/FlowWordCreator.scala
index bbea010,50e4f71..bd94df5
--- a/spot-ml/src/main/scala/org/apache/spot/netflow/FlowWordCreator.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/netflow/FlowWordCreator.scala
@@@ -71,55 -74,59 +74,58 @@@ class FlowWordCreator(timeCuts: Array[D
      */
    def flowWords(hour: Int, minute: Int, second: Int, srcPort: Int, dstPort: Int, ipkt: Long, ibyt: Long): FlowWords = {
  
+     Try {
+       val timeOfDay: Double = hour.toDouble + minute.toDouble / 60 + second.toDouble / 3600
  
-     val timeOfDay: Double = hour.toDouble + minute.toDouble / 60 + second.toDouble / 3600
- 
-     val timeBin = Quantiles.bin(timeOfDay, timeCuts)
-     val ibytBin = Quantiles.bin(ibyt, ibytCuts)
-     val ipktBin = Quantiles.bin(ipkt, ipktCuts)
+       val timeBin = Quantiles.bin(timeOfDay, timeCuts)
+       val ibytBin = Quantiles.bin(ibyt, ibytCuts)
+       val ipktBin = Quantiles.bin(ipkt, ipktCuts)
  
 -
+       val LowToLowPortEncoding = 111111
+       val HighToHighPortEncoding = 333333
  
-     val LowToLowPortEncoding = 111111
-     val HighToHighPortEncoding = 333333
+       if (dstPort == 0 && srcPort == 0) {
  
-     if (dstPort == 0 && srcPort == 0) {
+         val baseWord = Array("0", timeBin, ibytBin, ipktBin).mkString("_")
+         FlowWords(srcWord = baseWord, dstWord = baseWord)
  
-       val baseWord = Array("0", timeBin, ibytBin, ipktBin).mkString("_")
-       FlowWords(srcWord = baseWord, dstWord = baseWord)
+       } else if (dstPort == 0 && srcPort > 0) {
  
-     } else if (dstPort == 0 && srcPort > 0) {
+         val baseWord = Array(srcPort.toString(), timeBin, ibytBin, ipktBin).mkString("_")
+         FlowWords(srcWord = "-1_" + baseWord, dstWord = baseWord)
  
-       val baseWord = Array(srcPort.toString(), timeBin, ibytBin, ipktBin).mkString("_")
-       FlowWords(srcWord = "-1_" + baseWord, dstWord = baseWord)
+       } else if (srcPort == 0 && dstPort > 0) {
  
-     } else if (srcPort == 0 && dstPort > 0) {
+         val baseWord = Array(dstPort.toString(), timeBin, ibytBin, ipktBin).mkString("_")
+         FlowWords(srcWord = baseWord, dstWord = "-1_" + baseWord)
  
-       val baseWord = Array(dstPort.toString(), timeBin, ibytBin, ipktBin).mkString("_")
-       FlowWords(srcWord = baseWord, dstWord = "-1_" + baseWord)
+       } else if (srcPort <= 1024 && dstPort <= 1024) {
  
-     } else if (srcPort <= 1024 && dstPort <= 1024) {
+         val baseWord = Array(LowToLowPortEncoding, timeBin, ibytBin, ipktBin).mkString("_")
+         FlowWords(srcWord = baseWord, dstWord = baseWord)
  
-       val baseWord = Array(LowToLowPortEncoding, timeBin, ibytBin, ipktBin).mkString("_")
-       FlowWords(srcWord = baseWord, dstWord = baseWord)
+       } else if (srcPort <= 1024 && dstPort > 1024) {
  
-     } else if (srcPort <= 1024 && dstPort > 1024) {
+         val baseWord = Array(srcPort.toString(), timeBin, ibytBin, ipktBin).mkString("_")
+         FlowWords(srcWord = "-1_" + baseWord, dstWord = baseWord)
  
-       val baseWord = Array(srcPort.toString(), timeBin, ibytBin, ipktBin).mkString("_")
-       FlowWords(srcWord = "-1_" + baseWord, dstWord = baseWord)
+       } else if (srcPort > 1024 && dstPort <= 1024) {
  
-     } else if (srcPort > 1024 && dstPort <= 1024) {
+         val baseWord = Array(dstPort.toString(), timeBin, ibytBin, ipktBin).mkString("_")
+         FlowWords(srcWord = baseWord, dstWord = "-1_" + baseWord)
  
-       val baseWord = Array(dstPort.toString(), timeBin, ibytBin, ipktBin).mkString("_")
-       FlowWords(srcWord = baseWord, dstWord = "-1_" + baseWord)
+       } else {
  
-     } else {
+         // this is the srcPort > 1024 && dstPort > 1024 case
  
-       // this is the srcPort > 1024 && dstPort > 1024 case
+         val baseWord = Array(HighToHighPortEncoding, timeBin, ibytBin, ipktBin).mkString("_")
+         FlowWords(srcWord = baseWord, dstWord = baseWord)
+       }
  
-       val baseWord = Array(HighToHighPortEncoding, timeBin, ibytBin, ipktBin).mkString("_")
-       FlowWords(srcWord = baseWord, dstWord = baseWord)
+     } match {
+       case Success(flowWords) => flowWords
+       case _ => FlowWords(InvalidDataHandler.WordError, InvalidDataHandler.WordError)
      }
- 
    }
  
  }
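
The merge wraps the whole word computation in Try, so any failure maps to an InvalidDataHandler.WordError word instead of aborting the job. The guard in isolation, with an assumed binning rule and a stand-in error constant (neither is the project's exact Quantiles.bin or InvalidDataHandler):

    import scala.util.{Success, Try}

    object SafeBinSketch {
      val WordError = "word_error"   // stand-in for InvalidDataHandler.WordError

      // bin a value against quantile cut points; any exception becomes WordError
      def binOrError(value: Double, cuts: Array[Double]): String =
        Try {
          cuts.indexWhere(value <= _) match {
            case -1  => cuts.length.toString   // beyond the last cut
            case bin => bin.toString
          }
        } match {
          case Success(word) => word
          case _             => WordError
        }

      def main(args: Array[String]): Unit =
        println(binOrError(12.99, Array(2.4, 4.8, 7.2, 9.6, 12.0, 14.4)))  // prints 5
    }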

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/9ac5a8c7/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala
----------------------------------------------------------------------
diff --cc spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala
index 290f101,cc2319f..746fdd1
--- a/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala
@@@ -28,18 -29,22 +29,18 @@@ object ProxySuspiciousConnectsAnalysis 
  
      val cleanProxyRecords = filterAndSelectCleanProxyRecords(inputProxyRecords)
  
- 
 -    logger.info("Training the model")
 -    val model =
 -      ProxySuspiciousConnectsModel.trainNewModel(sparkContext, sqlContext, logger, config, cleanProxyRecords)
 +    val scoredProxyRecords = detectProxyAnomalies(cleanProxyRecords, config, sparkContext, sqlContext, logger)
  
 -    logger.info("Scoring")
 -    val scoredProxyRecords = model.score(sparkContext, cleanProxyRecords)
+ 
      // take the maxResults least probable events of probability below the threshold and sort
  
      val filteredProxyRecords = filterScoredProxyRecords(scoredProxyRecords, config.threshold)
  
      val orderedProxyRecords = filteredProxyRecords.orderBy(Score)
  
--    val mostSuspiciousProxyRecords = if(config.maxResults > 0)  orderedProxyRecords.limit(config.maxResults) else orderedProxyRecords
++    val mostSuspiciousProxyRecords = if (config.maxResults > 0) orderedProxyRecords.limit(config.maxResults) else orderedProxyRecords
  
--    val outputProxyRecords = mostSuspiciousProxyRecords.select(OutSchema:_*)
++    val outputProxyRecords = mostSuspiciousProxyRecords.select(OutSchema: _*)
  
      logger.info("Proxy suspicious connects analysis completed")
      logger.info("Saving results to: " + config.hdfsScoredConnect)
@@@ -52,39 -57,14 +53,38 @@@
      dataValidation.showAndSaveCorruptRecords(corruptProxyRecords, config.hdfsScoredConnect, logger)
    }
  
- 
    /**
 +    * Identify anomalous proxy log entries in the provided data frame.
 +    *
 +    * @param data Data frame of proxy entries
 +    * @param config
 +    * @param sparkContext
 +    * @param sqlContext
 +    * @param logger
 +    * @return
 +    */
 +  def detectProxyAnomalies(data: DataFrame,
-                           config: SuspiciousConnectsConfig,
-                           sparkContext: SparkContext,
-                           sqlContext: SQLContext,
-                           logger: Logger) : DataFrame = {
++                           config: SuspiciousConnectsConfig,
++                           sparkContext: SparkContext,
++                           sqlContext: SQLContext,
++                           logger: Logger): DataFrame = {
 +
 +
 +    logger.info("Fitting probabilistic model to data")
 +    val model = ProxySuspiciousConnectsModel.trainNewModel(sparkContext, sqlContext, logger, config, data)
 +    logger.info("Identifying outliers")
 +
 +    model.score(sparkContext, data)
 +  }
 +
 +  /**
      *
      * @param inputProxyRecords raw proxy records.
      * @return
      */
--  def filterAndSelectCleanProxyRecords(inputProxyRecords: DataFrame): DataFrame ={
++  def filterAndSelectCleanProxyRecords(inputProxyRecords: DataFrame): DataFrame = {
  
--    val cleanProxyRecordsFilter =  inputProxyRecords(Date).isNotNull &&
++    val cleanProxyRecordsFilter = inputProxyRecords(Date).isNotNull &&
        inputProxyRecords(Time).isNotNull &&
        inputProxyRecords(ClientIP).isNotNull &&
        inputProxyRecords(Host).isNotNull &&
@@@ -92,7 -72,7 +92,7 @@@
  
      inputProxyRecords
        .filter(cleanProxyRecordsFilter)
--      .select(InSchema:_*)
++      .select(InSchema: _*)
        .na.fill(DefaultUserAgent, Seq(UserAgent))
        .na.fill(DefaultResponseContentType, Seq(ResponseContentType))
    }
@@@ -102,7 -82,7 +102,7 @@@
      * @param inputProxyRecords raw proxy records.
      * @return
      */
--  def filterAndSelectInvalidProxyRecords(inputProxyRecords: DataFrame): DataFrame ={
++  def filterAndSelectInvalidProxyRecords(inputProxyRecords: DataFrame): DataFrame = {
  
      val invalidProxyRecordsFilter = inputProxyRecords(Date).isNull ||
        inputProxyRecords(Time).isNull ||
@@@ -118,10 -98,10 +118,10 @@@
    /**
      *
      * @param scoredProxyRecords scored proxy records.
--    * @param threshold score tolerance.
++    * @param threshold          score tolerance.
      * @return
      */
--  def filterScoredProxyRecords(scoredProxyRecords: DataFrame, threshold: Double): DataFrame ={
++  def filterScoredProxyRecords(scoredProxyRecords: DataFrame, threshold: Double): DataFrame = {
  
      val filteredProxyRecordsFilter = scoredProxyRecords(Score).leq(threshold) &&
        scoredProxyRecords(Score).gt(dataValidation.ScoreError)
@@@ -134,7 -114,7 +134,7 @@@
      * @param scoredProxyRecords scored proxy records.
      * @return
      */
--  def filterAndSelectCorruptProxyRecords(scoredProxyRecords: DataFrame): DataFrame ={
++  def filterAndSelectCorruptProxyRecords(scoredProxyRecords: DataFrame): DataFrame = {
  
      val corruptProxyRecordsFilter = scoredProxyRecords(Score).equalTo(dataValidation.ScoreError)
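
detectProxyAnomalies mirrors detectDNSAnomalies earlier in this merge: fit a model on the cleaned frame, then score that same frame. The shared shape, with a toy stand-in model (this is the pattern only, not the project's API):

    trait Model { def score(x: Double): Double }

    object DetectSketch {
      // toy "training": likelihood peaks at the data mean
      def trainNewModel(data: Seq[Double]): Model = new Model {
        private val mean = data.sum / data.size
        def score(x: Double): Double = math.exp(-math.abs(x - mean))
      }

      def detectAnomalies(data: Seq[Double]): Seq[(Double, Double)] = {
        val model = trainNewModel(data)       // "Fitting probabilistic model to data"
        data.map(x => (x, model.score(x)))    // "Identifying outliers"
      }

      def main(args: Array[String]): Unit =
        detectAnomalies(Seq(1d, 168d, 168d, 168d, 168d)).foreach(println)
    }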
  

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/9ac5a8c7/spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala
----------------------------------------------------------------------
diff --cc spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala
index d9ec94e,08200be..b106e6e
--- a/spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala
+++ b/spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala
@@@ -1,54 -1,114 +1,159 @@@
  package org.apache.spot.dns
  
++
 +import org.apache.log4j.{Level, LogManager}
 +import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
+ import org.apache.spark.sql.SQLContext
+ import org.apache.spark.sql.types.StructType
+ import org.apache.spark.sql.Row
  import org.apache.spot.dns.DNSSchema._
  import org.apache.spot.testutils.TestingSparkContextFlatSpec
  import org.scalatest.Matchers
  
 -/**
 -  * Created by rabarona on 12/15/16.
 -  */
 -class DNSSuspiciousConnectsAnalysisTest  extends TestingSparkContextFlatSpec with Matchers{
++
 +case class DNSInput(frame_time:String, unix_tstamp:Long, frame_len:Int, ip_dst: String, dns_qry_name:String, dns_qry_class:String, dns_qry_type: Int, dns_qry_rcode: Int)
 +
 +class DNSSuspiciousConnectsAnalysisTest  extends TestingSparkContextFlatSpec with Matchers {
 +
 +  val testConfig = SuspiciousConnectsConfig(analysis = "dns",
 +  inputPath = "",
 +  feedbackFile = "",
 +  duplicationFactor = 1,
 +  topicCount = 20,
 +  hdfsScoredConnect = "",
 +  threshold = 1.0d,
 +  maxResults = 1000,
 +  outputDelimiter = "\t",
 +  ldaPRGSeed = None,
 +  ldaMaxiterations = 20,
 +  ldaAlpha = 1.02,
 +  ldaBeta = 1.001)
 +
 +
 +  "dns supicious connects analysis" should "estimate correct probabilities in toy data with framelength anomaly" in {
 +
 +    val logger = LogManager.getLogger("SuspiciousConnectsAnalysis")
 +    logger.setLevel(Level.WARN)
 +
 +    val anomalousRecord = DNSInput("May 20 2016 02:10:25.970987000 PDT",	1463735425L,	1,	"172.16.9.132",	"turner.com.122.2o7.net",	"0x00000001",	1,	0)
 +    val typicalRecord   = DNSInput("May 20 2016 02:10:25.970987000 PDT",	1463735425L,	168,	"172.16.9.132",	"turner.com.122.2o7.net",	"0x00000001",	1,	0)
 +
 +    val data = sqlContext.createDataFrame(Seq(anomalousRecord, typicalRecord, typicalRecord, typicalRecord, typicalRecord))
 +
 +    val scoredData = DNSSuspiciousConnectsAnalysis.detectDNSAnomalies(data, testConfig,
 +      sparkContext,
 +      sqlContext,
 +      logger)
 +
 +
 +    val anomalyScore = scoredData.filter(scoredData(FrameLength) === 1).first().getAs[Double](Score)
 +    val typicalScores = scoredData.filter(scoredData(FrameLength) === 168).collect().map(_.getAs[Double](Score))
 +
 +    Math.abs(anomalyScore - 0.2d)  should be <= 0.01d
 +    typicalScores.length shouldBe 4
 +    Math.abs(typicalScores(0) - 0.8d)  should be <= 0.01d
 +    Math.abs(typicalScores(1) - 0.8d)  should be <= 0.01d
 +    Math.abs(typicalScores(2) - 0.8d)  should be <= 0.01d
 +    Math.abs(typicalScores(3) - 0.8d)  should be <= 0.01d
 +  }
++
+ 
+   "filterAndSelectCleanDNSRecords" should "return data set without garbage" in {
+ 
+     val cleanedDNSRecords = DNSSuspiciousConnectsAnalysis.filterAndSelectCleanDNSRecords(testDNSRecords.inputDNSRecordsDF)
+ 
+     cleanedDNSRecords.count should be(8)
+     cleanedDNSRecords.schema.size should be(8)
+   }
+ 
+   "filterAndSelectInvalidDNSRecords" should "return invalid records" in {
+ 
+     val invalidDNSRecords = DNSSuspiciousConnectsAnalysis.filterAndSelectInvalidDNSRecords(testDNSRecords.inputDNSRecordsDF)
+ 
+     invalidDNSRecords.count should be(15)
+     invalidDNSRecords.schema.size should be(8)
+   }
+ 
+   "filterScoredDNSRecords" should "return records with score less or equal to threshold" in {
+ 
+     val threshold = 10e-5
+     val scoredDNSRecords = DNSSuspiciousConnectsAnalysis
+       .filterScoredDNSRecords(testDNSRecords.scoredDNSRecordsDF, threshold)
+ 
+     scoredDNSRecords.count should be(2)
+   }
+ 
+   "filterAndSelectCorruptDNSRecords" should "return records where Score is equal to -1" in {
+ 
+     val corruptDNSRecords = DNSSuspiciousConnectsAnalysis
+       .filterAndSelectCorruptDNSRecords(testDNSRecords.scoredDNSRecordsDF)
+ 
+     corruptDNSRecords.count should be(1)
+     corruptDNSRecords.schema.size should be(9)
+   }
+ 
+   def testDNSRecords = new {
+ 
+     val sqlContext = new SQLContext(sparkContext)
+ 
+     val inputDNSRecordsRDD = sparkContext.parallelize(wrapRefArray(Array(
+       Seq(null, 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", 1, 0),
+       Seq("", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", 1, 0),
+       Seq("-", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", 1, 0),
+       Seq("May 20 2016 02:10:25.970987000 PDT", null, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", 1, 0),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, null, "172.16.9.132", "turner.com.122.2o...", "0x00000001", 1, 0),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", null, "0x00000001", 1, 0),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "", "0x00000001", 1, 0),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "-", "0x00000001", 1, 0),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "(empty)", "0x00000001", 1, 0),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, null, "turner.com.122.2o...", "0x00000001", 1, 0),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "", "turner.com.122.2o...", "0x00000001", 1, 0),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "-", "turner.com.122.2o...", "0x00000001", 1, 0),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", null, null, null),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "", null, null),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "-", null, null),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", null, 1, 0),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "", 1, 0),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "-", 1, 0),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", null, 0),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", 1, null),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", null, 1, null),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", null, 1, 0),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", null, null))
+       .map(row => Row.fromSeq(row))))
+ 
+     val inputDNSRecordsSchema = StructType(
+       Array(TimestampField,
+         UnixTimestampField,
+         FrameLengthField,
+         ClientIPField,
+         QueryNameField,
+         QueryClassField,
+         QueryTypeField,
+         QueryResponseCodeField))
+ 
+     val inputDNSRecordsDF = sqlContext.createDataFrame(inputDNSRecordsRDD, inputDNSRecordsSchema)
+ 
+     val scoredDNSRecordsRDD = sparkContext.parallelize(wrapRefArray(Array(
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", 1, 0, 1d),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", 1, 0, 0.0000005),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", 1, 0, 0.05),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", 1, 0, -1d),
+       Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", 1, 0, 0.0001))
+       .map(row => Row.fromSeq(row))))
+ 
+     val scoredDNSRecordsSchema = StructType(
+       Array(TimestampField,
+         UnixTimestampField,
+         FrameLengthField,
+         ClientIPField,
+         QueryNameField,
+         QueryClassField,
+         QueryTypeField,
+         QueryResponseCodeField,
+         ScoreField))
+ 
+     val scoredDNSRecordsDF = sqlContext.createDataFrame(scoredDNSRecordsRDD, scoredDNSRecordsSchema)
+ 
+   }
 -
  }

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/9ac5a8c7/spot-ml/src/test/scala/org/apache/spot/netflow/FlowWordCreatorTest.scala
----------------------------------------------------------------------
diff --cc spot-ml/src/test/scala/org/apache/spot/netflow/FlowWordCreatorTest.scala
index 0000000,00d14bc..3208395
mode 000000,100644..100644
--- a/spot-ml/src/test/scala/org/apache/spot/netflow/FlowWordCreatorTest.scala
+++ b/spot-ml/src/test/scala/org/apache/spot/netflow/FlowWordCreatorTest.scala
@@@ -1,0 -1,214 +1,214 @@@
+ package org.apache.spot.netflow
+ 
+ import org.scalatest.{FlatSpec, Matchers}
+ 
+ 
+ class FlowWordCreatorTest extends FlatSpec with Matchers {
+ 
+   // Replace ports in index 10 and 11
+   val srcIP = "10.0.2.115"
+   val dstIP = "172.16.0.107"
+   val hour = 12
+   val minute = 59
+   val second = 32
+ 
+   val ibyts = 222L
+   val ipkts = 3L
+ 
+   val timeCuts = Array(2.4, 4.8, 7.2, 9.6, 12.0, 14.4, 16.8, 19.2, 21.6, 24.0)
+   val ipktCuts = Array(10d, 20d, 30d, 40d, 50d, 60d, 70d, 80d, 90d, 100d)
+   val ibytCuts = Array(100d, 200d, 300d, 400d, 500d)
+ 
+   val expectedIpktBin = 0
+   val expectedIbytBin = 2
+   val expectedTimeBin = 5
+ 
+ 
+   val flowWordCreator = new FlowWordCreator(timeCuts, ibytCuts, ipktCuts)
+ 
+ 
+   // 1. Test when sip is less than dip and sip is not 0 and dport is <= 1024 & sport > 1024 and min(dport, sport) !=0 +
+   "flowWords" should "create word with ip_pair as sourceIp-destIp, port is dport and dest_word direction is -1" in {
+     val srcPort = 2132
+     val dstPort = 23
+ 
+     val FlowWords(srcWord, dstWord) =
+       flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+ 
+ 
+     dstWord shouldBe "-1_23_5_2_0"
 -    srcWord shouldBe "23_5_2_0"
++    srcWord shouldBe  "23_5_2_0"
+ 
+   }
+ 
+   // 2. Test when sip is less than dip and sip is not 0 and sport is <= 1024 & dport > 1024 and min(dport, sport) !=0 +
+   it should "create word with ip_pair as sourceIp-destIp, port is sport and src_word direction is -1" in {
+ 
+     val srcPort = 23
+     val dstPort = 2132
+ 
+     val FlowWords(srcWord, dstWord) =
+       flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+ 
+     dstWord shouldBe "23_5_2_0"
+     srcWord shouldBe "-1_23_5_2_0"
+   }
+ 
+   // 3. Test when sip is less than dip and sip is not 0 and dport and sport are > 1024 +
+   it should "create word with ip_pair as sourceIp-destIp, port is 333333 and both words direction is 1 (not showing)" in {
+     val srcPort = 8392
+     val dstPort = 9874
+ 
+     val FlowWords(srcWord, dstWord) =
+       flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+ 
+     dstWord shouldBe "333333_5_2_0"
+     srcWord shouldBe "333333_5_2_0"
+   }
+ 
+   // 4. Test when sip is less than dip and sip is not 0 and dport is 0 but sport is not +
+   it should "create word with ip_pair as sourceIp-destIp, port is sport and source_word direction is -1" in {
+     val srcPort = 80
+     val dstPort = 0
+ 
+     val FlowWords(srcWord, dstWord) =
+       flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+ 
+ 
+     dstWord shouldBe "80_5_2_0"
+     srcWord shouldBe "-1_80_5_2_0"
+   }
+ 
+   // 5. Test when sip is less than dip and sip is not 0 and sport is 0 but dport is not +
+   it should "create word with ip_pair as sourceIp-destIp, port is dport and dest_word direction is -1 II" in {
+ 
+     val srcPort = 0
+     val dstPort = 43
+ 
+ 
+     val FlowWords(srcWord, dstWord) =
+       flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+ 
+ 
+     dstWord shouldBe "-1_43_5_2_0"
+     srcWord shouldBe "43_5_2_0"
+   }
+ 
+   // 6. Test when sip is less than dip and sip is not 0 and sport and dport are less than or equal to 1024 +
+   it should "create word with ip_pair as sourceIp-destIp, port is 111111 and both words direction is 1 (not showing)" in {
+     val srcPort = 1024
+     val dstPort = 80
+ 
+ 
+     val FlowWords(srcWord, dstWord) =
+       flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+ 
+     dstWord shouldBe "111111_5_2_0"
+     srcWord shouldBe "111111_5_2_0"
+   }
+ 
+   // 7. Test when sip is less than dip and sip is not 0 and sport and dport are 0+
+   it should "create word with ip_pair as sourceIp-destIp, port is max(0,0) and both words direction is 1 (not showing)" in {
+     val srcPort = 0
+     val dstPort = 0
+ 
+ 
+     val FlowWords(srcWord, dstWord) =
+       flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+ 
+     dstWord shouldBe "0_5_2_0"
+     srcWord shouldBe "0_5_2_0"
+   }
+ 
+   // 8. Test when sip is not less than dip and dport is <= 1024 & sport > 1024 and min(dport, sport) !=0+
+   it should "create word with ip_pair as destIp-sourceIp, port is dport and dest_word direction is -1" in {
+     val srcPort = 3245
+     val dstPort = 43
+ 
+ 
+     val FlowWords(srcWord, dstWord) =
+       flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+ 
+     dstWord shouldBe "-1_43_5_2_0"
+     srcWord shouldBe "43_5_2_0"
+ 
+   }
+ 
+   // 9. Test when sip is not less than dip and sport is <= 1024 & dport > 1024 and min(dport, sport) !=0 +
+   it should "create word with ip_pair as destIp-sourceIp, port is sport and src_word direction is -1" in {
+     val srcPort = 80
+     val dstPort = 2435
+ 
+     val FlowWords(srcWord, dstWord) =
+       flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+ 
+     dstWord shouldBe "80_5_2_0"
+     srcWord shouldBe "-1_80_5_2_0"
+ 
+   }
+ 
+   // 10. Test when sip is not less than dip and dport and sport are > 1024 +
+   it should "create word with ip_pair as destIp-sourceIp, port is 333333 and both words direction is 1 (not showing)" in {
+     val srcPort = 2354
+     val dstPort = 2435
+ 
+ 
+     val FlowWords(srcWord, dstWord) =
+       flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+ 
+     dstWord shouldBe "333333_5_2_0"
+     srcWord shouldBe "333333_5_2_0"
+   }
+ 
+   // 11. Test when sip is not less than dip and dport is 0 but sport is not +
+   it should "create word with ip_pair as destIp-sourceIp, port is sport and src_word direction is -1 II" in {
+     val srcPort = 80
+     val dstPort = 0
+ 
+ 
+     val FlowWords(srcWord, dstWord) =
+       flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+ 
+     dstWord shouldBe "80_5_2_0"
+     srcWord shouldBe "-1_80_5_2_0"
+   }
+ 
+   // 12. Test when sip is not less than dip and sport is 0 but dport is not +
+   it should "create word with ip_pair as destIp-sourceIp, port is dport and dest_word direction is -1 II" in {
+     val srcPort = 0
+     val dstPort = 2435
+ 
+ 
+     val FlowWords(srcWord, dstWord) =
+       flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+ 
+     dstWord shouldBe "-1_2435_5_2_0"
+     srcWord shouldBe "2435_5_2_0"
+   }
+ 
+   // 13. Test when sip is not less than dip and sport and dport are less than or equal to 1024
+   it should "create word with ip_pair as destIp-sourceIp, port 111111 and both words direction is 1 (not showing)" in {
+     val srcPort = 80
+     val dstPort = 1024
+ 
+ 
+     val FlowWords(srcWord, dstWord) =
+       flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+ 
+     dstWord shouldBe "111111_5_2_0"
+     srcWord shouldBe "111111_5_2_0"
+   }
+ 
+   // 14. Test when sip is not less than dip and sport and dport are 0
+   it should "create word with ip_pair as destIp-sourceIp, port is max(0,0) and both words direction is 1 (not showing)" in {
+     val srcPort = 0
+     val dstPort = 0
+ 
+ 
+     val FlowWords(srcWord, dstWord) =
+       flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+ 
+     dstWord shouldBe "0_5_2_0"
+     srcWord shouldBe "0_5_2_0"
+   }
+ }
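
For reference, the expected bin constants above follow from the cuts, assuming the bin is the index of the first cut the value does not exceed (an assumption about Quantiles.bin, which is not shown here): timeOfDay = 12 + 59/60 + 32/3600 = 12.992..., landing at index 5 of timeCuts; ibyt = 222 lands at index 2 of ibytCuts (200 < 222 <= 300); ipkt = 3 lands at index 0 of ipktCuts (3 <= 10). A quick standalone check:

    object ExpectedBinsCheck {
      def bin(value: Double, cuts: Array[Double]): Int =
        cuts.indexWhere(value <= _) match {
          case -1 => cuts.length
          case i  => i
        }

      def main(args: Array[String]): Unit = {
        val timeCuts = Array(2.4, 4.8, 7.2, 9.6, 12.0, 14.4, 16.8, 19.2, 21.6, 24.0)
        val ibytCuts = Array(100d, 200d, 300d, 400d, 500d)
        val ipktCuts = Array(10d, 20d, 30d, 40d, 50d, 60d, 70d, 80d, 90d, 100d)
        println(bin(12d + 59d / 60 + 32d / 3600, timeCuts)) // 5
        println(bin(222d, ibytCuts))                        // 2
        println(bin(3d, ipktCuts))                          // 0
      }
    }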

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/9ac5a8c7/spot-ml/src/test/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysisTest.scala
----------------------------------------------------------------------
diff --cc spot-ml/src/test/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysisTest.scala
index d98500b,bc17751..fbb0a9c
--- a/spot-ml/src/test/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysisTest.scala
+++ b/spot-ml/src/test/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysisTest.scala
@@@ -1,95 -1,135 +1,218 @@@
  package org.apache.spot.proxy
  
 +import org.apache.log4j.{Level, LogManager}
 +import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
+ import org.apache.spark.sql.{Row, SQLContext}
+ import org.apache.spark.sql.types.StructType
  import org.apache.spot.proxy.ProxySchema._
  import org.apache.spot.testutils.TestingSparkContextFlatSpec
  import org.scalatest.Matchers
  
 -/**
 -  * Created by rabarona on 12/15/16.
 -  */
 -class ProxySuspiciousConnectsAnalysisTest extends TestingSparkContextFlatSpec with Matchers{
 +case class ProxyInput(p_date:String,
 +                      p_time:String,
 +                      clientip:String,
 +                      host:String,
 +                      reqmethod:String,
 +                      useragent:String,
 +                      resconttype:String,
 +                      duration:Int,
 +                      username:String,
 +                      webcat:String,
 +                      referer:String,
 +                      respcode:String,
 +                      uriport:Int,
 +                      uripath:String,
 +                      uriquery:String,
 +                      serverip:String,
 +                      scbytes:Int,
 +                      csbytes:Int,
 +                      fulluri:String)
 +
 +class ProxySuspiciousConnectsAnalysisTest extends TestingSparkContextFlatSpec with Matchers {
 +
 +
 +
 +  val testConfigProxy = SuspiciousConnectsConfig(analysis = "proxy",
 +    inputPath = "",
 +    feedbackFile = "",
 +    duplicationFactor = 1,
 +    topicCount = 20,
 +    hdfsScoredConnect = "",
 +    threshold = 1.0d,
 +    maxResults = 1000,
 +    outputDelimiter = "\t",
 +    ldaPRGSeed = None,
 +    ldaMaxiterations = 20,
 +    ldaAlpha = 1.02,
 +    ldaBeta = 1.001)
 +
 +
 +  "proxy supicious connects analysis" should "estimate correct probabilities in toy data with top domain anomaly" in {
 +
 +    val logger = LogManager.getLogger("SuspiciousConnectsAnalysis")
 +    logger.setLevel(Level.WARN)
 +
 +    val anomalousRecord = ProxyInput("2016-10-03",	"04:57:36", "127.0.0.1",	"intel.com",	"PUT",
 +      "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36",
 +      "text/plain", 230,	"-", 	"Technology/Internet",	"http://www.spoonflower.com/tags/color",	"202",	80,
 +      "/sites/c37i4q22szvir8ga3m8mtxaft7gwnm5fio8hfxo35mu81absi1/carts/4b3a313d-50f6-4117-8ffd-4e804fd354ef/fiddle",
 +      "-",	"127.0.0.1",	338,	647,
 +      "maw.bronto.com/sites/c37i4q22szvir8ga3m8mtxaft7gwnm5fio8hfxo35mu81absi1/carts/4b3a313d-50f6-4117-8ffd-4e804fd354ef/fiddle")
 +
 +    val typicalRecord   = ProxyInput("2016-10-03",	"04:57:36", "127.0.0.1",	"maw.bronto.com",	"PUT",
 +      "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36",
 +      "text/plain", 230,	"-", 	"Technology/Internet",	"http://www.spoonflower.com/tags/color",	"202",	80,
 +      "/sites/c37i4q22szvir8ga3m8mtxaft7gwnm5fio8hfxo35mu81absi1/carts/4b3a313d-50f6-4117-8ffd-4e804fd354ef/fiddle",
 +      "-",	"127.0.0.1",	338,	647,
 +      "maw.bronto.com/sites/c37i4q22szvir8ga3m8mtxaft7gwnm5fio8hfxo35mu81absi1/carts/4b3a313d-50f6-4117-8ffd-4e804fd354ef/fiddle")
 +
 +
 +    val data = sqlContext.createDataFrame(Seq(anomalousRecord, typicalRecord, typicalRecord, typicalRecord, typicalRecord,
 +      typicalRecord, typicalRecord, typicalRecord, typicalRecord, typicalRecord))
 +
 +    val scoredData = ProxySuspiciousConnectsAnalysis.detectProxyAnomalies(data, testConfigProxy,
 +      sparkContext,
 +      sqlContext,
 +      logger)
 +
 +
 +
 +    val anomalyScore = scoredData.filter(scoredData(Host) ===  "intel.com").first().getAs[Double](Score)
 +    val typicalScores = scoredData.filter(scoredData(Host) === "maw.bronto.com").collect().map(_.getAs[Double](Score))
 +
 +    Math.abs(anomalyScore - 0.1d)  should be <= 0.01d
 +    typicalScores.length shouldBe 9
 +    Math.abs(typicalScores(0) - 0.9d)  should be <= 0.01d
 +    Math.abs(typicalScores(1) - 0.9d)  should be <= 0.01d
 +    Math.abs(typicalScores(2) - 0.9d)  should be <= 0.01d
 +    Math.abs(typicalScores(3) - 0.9d)  should be <= 0.01d
 +    Math.abs(typicalScores(4) - 0.9d)  should be <= 0.01d
 +    Math.abs(typicalScores(5) - 0.9d)  should be <= 0.01d
 +    Math.abs(typicalScores(6) - 0.9d)  should be <= 0.01d
 +    Math.abs(typicalScores(7) - 0.9d)  should be <= 0.01d
 +    Math.abs(typicalScores(8) - 0.9d)  should be <= 0.01d
 +  }
 +
  
+   "filterAndSelectCleanProxyRecords" should "return data without garbage" in {
+ 
+     val cleanedProxyRecords = ProxySuspiciousConnectsAnalysis
+       .filterAndSelectCleanProxyRecords(testProxyRecords.inputProxyRecordsDF)
+ 
+     cleanedProxyRecords.count should be(1)
+     cleanedProxyRecords.schema.size should be(19)
+   }
+ 
+   "filterAndSelectInvalidProxyRecords" should "return invalir records" in {
+ 
+     val invalidProxyRecords = ProxySuspiciousConnectsAnalysis
+       .filterAndSelectInvalidProxyRecords(testProxyRecords.inputProxyRecordsDF)
+ 
+     invalidProxyRecords.count should be(5)
+     invalidProxyRecords.schema.size should be(19)
+ 
+   }
+ 
+   "filterScoredProxyRecords" should "return records with score less or equal to threshold" in {
+ 
+     val threshold = 10e-5
+ 
+     val scoredProxyRecords = ProxySuspiciousConnectsAnalysis
+       .filterScoredProxyRecords(testProxyRecords.scoredProxyRecordsDF, threshold)
+ 
+     scoredProxyRecords.count should be(2)
+ 
+   }
+ 
+   "filterAndSelectCorruptProxyRecords" should "return records where Score is equal to -1" in {
+ 
+     val corruptProxyRecords = ProxySuspiciousConnectsAnalysis
+       .filterAndSelectCorruptProxyRecords(testProxyRecords.scoredProxyRecordsDF)
+ 
+     corruptProxyRecords.count should be(1)
+     corruptProxyRecords.schema.size should be(21)
+   }
+ 
+   def testProxyRecords = new {
+ 
+     val sqlContext = new SQLContext(sparkContext)
+ 
+     val inputProxyRecordsRDD = sparkContext.parallelize(wrapRefArray(Array(
+       Seq(null,"00:09:13","10.239.160.152","cn.archive.ubuntu...","GET","Debian APT-HTTP/...","text/html",448,"-",
+         "-","-","404","80","/ubuntu/dists/tru...","-","10.239.4.160",2864,218,"cn.archive.ubuntu..."),
+       Seq("2016-10-03",null,"10.239.160.152","cn.archive.ubuntu...","GET","Debian APT-HTTP/...","text/html",448,"-",
+         "-","-","404","80","/ubuntu/dists/tru...","-","10.239.4.160",2864,218,"cn.archive.ubuntu..."),
+       Seq("2016-10-03","00:09:13",null,"cn.archive.ubuntu...","GET","Debian APT-HTTP/...","text/html",448,"-",
+         "-","-","404","80","/ubuntu/dists/tru...","-","10.239.4.160",2864,218,"cn.archive.ubuntu..."),
+       Seq("2016-10-03","00:09:13","10.239.160.152",null,"GET","Debian APT-HTTP/...","text/html",448,"-",
+         "-","-","404","80","/ubuntu/dists/tru...","-","10.239.4.160",2864,218,"cn.archive.ubuntu..."),
+       Seq("2016-10-03","00:09:13","10.239.160.152","cn.archive.ubuntu...","GET","Debian APT-HTTP/...","text/html",448,"-",
+         "-","-","404","80","/ubuntu/dists/tru...","-","10.239.4.160",2864,218,null),
+       Seq("2016-10-03","00:09:13","10.239.160.152","cn.archive.ubuntu...","GET","Debian APT-HTTP/...","text/html",448,"-",
+         "-","-","404","80","/ubuntu/dists/tru...","-","10.239.4.160",2864,218,"cn.archive.ubuntu..."))
+       .map(row => Row.fromSeq(row))))
+ 
+     val inputProxyRecordsSchema = StructType(
+       Array(DateField,
+         TimeField,
+         ClientIPField,
+         HostField,
+         ReqMethodField,
+         UserAgentField,
+         ResponseContentTypeField,
+         DurationField,
+         UserNameField,
+         WebCatField,
+         RefererField,
+         RespCodeField,
+         URIPortField,
+         URIPathField,
+         URIQueryField,
+         ServerIPField,
+         SCBytesField,
+         CSBytesField,
+         FullURIField))
+ 
+     val inputProxyRecordsDF = sqlContext.createDataFrame(inputProxyRecordsRDD, inputProxyRecordsSchema)
+ 
+     val scoredProxyRecordsRDD = sparkContext.parallelize(wrapRefArray(Array(
+       Seq("2016-10-03","00:09:13","10.239.160.152","cn.archive.ubuntu...","GET","Debian APT-HTTP/...","text/html",448,"-",
+         "-","-","404","80","/ubuntu/dists/tru...","-","10.239.4.160",2864,218,"cn.archive.ubuntu...", "a word", -1d),
+       Seq("2016-10-03","00:09:13","10.239.160.152","cn.archive.ubuntu...","GET","Debian APT-HTTP/...","text/html",448,"-",
+         "-","-","404","80","/ubuntu/dists/tru...","-","10.239.4.160",2864,218,"cn.archive.ubuntu...", "a word", 1d),
+       Seq("2016-10-03","00:09:13","10.239.160.152","cn.archive.ubuntu...","GET","Debian APT-HTTP/...","text/html",448,"-",
+         "-","-","404","80","/ubuntu/dists/tru...","-","10.239.4.160",2864,218,"cn.archive.ubuntu...", "a word", 0.0000005),
+       Seq("2016-10-03","00:09:13","10.239.160.152","cn.archive.ubuntu...","GET","Debian APT-HTTP/...","text/html",448,"-",
+         "-","-","404","80","/ubuntu/dists/tru...","-","10.239.4.160",2864,218,"cn.archive.ubuntu...", "a word", 0.05),
+       Seq("2016-10-03","00:09:13","10.239.160.152","cn.archive.ubuntu...","GET","Debian APT-HTTP/...","text/html",448,"-",
+         "-","-","404","80","/ubuntu/dists/tru...","-","10.239.4.160",2864,218,"cn.archive.ubuntu...", "a word", 0.0001)
+     ).map(row => Row.fromSeq(row))))
+ 
+     val scoredProxyRecordsSchema = StructType(
+       Array(DateField,
+         TimeField,
+         ClientIPField,
+         HostField,
+         ReqMethodField,
+         UserAgentField,
+         ResponseContentTypeField,
+         DurationField,
+         UserNameField,
+         WebCatField,
+         RefererField,
+         RespCodeField,
+         URIPortField,
+         URIPathField,
+         URIQueryField,
+         ServerIPField,
+         SCBytesField,
+         CSBytesField,
+         FullURIField,
+         WordField,
+         ScoreField))
+ 
+     val scoredProxyRecordsDF = sqlContext.createDataFrame(scoredProxyRecordsRDD, scoredProxyRecordsSchema)
+ 
+   }
+ 
  }



[46/49] incubator-spot git commit: Ingest summary links updated

Posted by ev...@apache.org.
Ingest summary links updated


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/79028579
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/79028579
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/79028579

Branch: refs/heads/master
Commit: 79028579484ec8a1a9a8a68cb7b7923038fcb16f
Parents: 8a69509
Author: Diego Ortiz Huerta <di...@intel.com>
Authored: Wed Dec 7 15:22:02 2016 -0800
Committer: Everardo Lopez Sandoval (Intel) <el...@elopezsa-mac02.ra.intel.com>
Committed: Fri Jan 20 17:01:02 2017 -0800

----------------------------------------------------------------------
 spot-oa/ui/dns/storyboard.html             | 5 +++--
 spot-oa/ui/dns/suspicious.html             | 5 +++--
 spot-oa/ui/dns/threat-investigation.html   | 5 +++--
 spot-oa/ui/flow/storyboard.html            | 5 +++--
 spot-oa/ui/flow/suspicious.html            | 5 +++--
 spot-oa/ui/flow/threat-investigation.html  | 5 +++--
 spot-oa/ui/proxy/storyboard.html           | 5 +++--
 spot-oa/ui/proxy/suspicious.html           | 5 +++--
 spot-oa/ui/proxy/threat-investigation.html | 5 +++--
 9 files changed, 27 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/79028579/spot-oa/ui/dns/storyboard.html
----------------------------------------------------------------------
diff --git a/spot-oa/ui/dns/storyboard.html b/spot-oa/ui/dns/storyboard.html
index 5f9f725..c5dbc0f 100755
--- a/spot-oa/ui/dns/storyboard.html
+++ b/spot-oa/ui/dns/storyboard.html
@@ -69,8 +69,6 @@
                             <li>
                                 <a data-href="../flow/storyboard.html#date=${date}">Storyboard</a>
                             </li>
-                            <li>
-                                <a data-href="../flow/ingest-summary.html#end-date=${date}">Ingest Summary</a>
                         </ul>
                     </li>
                     <li class="dropdown">
@@ -107,6 +105,9 @@
                             </li>
                         </ul>
                     </li>
+                    <li>
+                        <a data-href="../ingest-summary.html#end-date=${date}">Ingest Summary</a>
+                    </li>
                 </ul>
             </div>
             <div id="search-box" class="row text-right">

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/79028579/spot-oa/ui/dns/suspicious.html
----------------------------------------------------------------------
diff --git a/spot-oa/ui/dns/suspicious.html b/spot-oa/ui/dns/suspicious.html
index 6d008b3..845e5e2 100755
--- a/spot-oa/ui/dns/suspicious.html
+++ b/spot-oa/ui/dns/suspicious.html
@@ -79,8 +79,6 @@
                             <li>
                                 <a data-href="../flow/storyboard.html#date=${date}">Storyboard</a>
                             </li>
-                            <li>
-                                <a data-href="../flow/ingest-summary.html#end-date=${date}">Ingest Summary</a>
                         </ul>
                     </li>
                     <li class="dropdown">
@@ -117,6 +115,9 @@
                             </li>
                         </ul>
                     </li>
+                    <li>
+                        <a data-href="../ingest-summary.html#end-date=${date}">Ingest Summary</a>
+                    </li>
                 </ul>
             </div>
             <div id="search-box" class="row text-right">

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/79028579/spot-oa/ui/dns/threat-investigation.html
----------------------------------------------------------------------
diff --git a/spot-oa/ui/dns/threat-investigation.html b/spot-oa/ui/dns/threat-investigation.html
index cf5966d..08218fd 100755
--- a/spot-oa/ui/dns/threat-investigation.html
+++ b/spot-oa/ui/dns/threat-investigation.html
@@ -67,8 +67,6 @@
                             <li>
                                 <a data-href="../flow/storyboard.html#date=${date}">Storyboard</a>
                             </li>
-                            <li>
-                                <a data-href="../flow/ingest-summary.html#end-date=${date}">Ingest Summary</a>
                         </ul>
                     </li>
                     <li class="dropdown">
@@ -105,6 +103,9 @@
                             </li>
                         </ul>
                     </li>
+                    <li>
+                        <a data-href="../ingest-summary.html#end-date=${date}">Ingest Summary</a>
+                    </li>
                 </ul>
             </div>
             <div id="search-box" class="row text-right">

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/79028579/spot-oa/ui/flow/storyboard.html
----------------------------------------------------------------------
diff --git a/spot-oa/ui/flow/storyboard.html b/spot-oa/ui/flow/storyboard.html
index 8596eea..f0dccb7 100755
--- a/spot-oa/ui/flow/storyboard.html
+++ b/spot-oa/ui/flow/storyboard.html
@@ -69,8 +69,6 @@
                             <li>
                                 <a data-href="storyboard.html#date=${date}">Storyboard</a>
                             </li>
-                            <li>
-                                <a data-href="ingest-summary.html#end-date=${date}">Ingest Summary</a>
                         </ul>
                     </li>
                     <li class="dropdown">
@@ -107,6 +105,9 @@
                             </li>
                         </ul>
                     </li>
+                    <li>
+                        <a data-href="../ingest-summary.html#end-date=${date}">Ingest Summary</a>
+                    </li>
                 </ul>
             </div>
             <div id="search-box" class="row text-right">

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/79028579/spot-oa/ui/flow/suspicious.html
----------------------------------------------------------------------
diff --git a/spot-oa/ui/flow/suspicious.html b/spot-oa/ui/flow/suspicious.html
index 9bd47c8..d755474 100755
--- a/spot-oa/ui/flow/suspicious.html
+++ b/spot-oa/ui/flow/suspicious.html
@@ -73,8 +73,6 @@
                             <li>
                                 <a data-href="storyboard.html#date=${date}">Storyboard</a>
                             </li>
-                            <li>
-                                <a data-href="ingest-summary.html#end-date=${date}">Ingest Summary</a>
                         </ul>
                     </li>
                     <li class="dropdown">
@@ -111,6 +109,9 @@
                             </li>
                         </ul>
                     </li>
+                    <li>
+                        <a data-href="../ingest-summary.html#end-date=${date}">Ingest Summary</a>
+                    </li>
                 </ul>
             </div>
             <div id="search-box" class="row text-right">

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/79028579/spot-oa/ui/flow/threat-investigation.html
----------------------------------------------------------------------
diff --git a/spot-oa/ui/flow/threat-investigation.html b/spot-oa/ui/flow/threat-investigation.html
index d80ccc6..b5bb053 100755
--- a/spot-oa/ui/flow/threat-investigation.html
+++ b/spot-oa/ui/flow/threat-investigation.html
@@ -67,8 +67,6 @@
                             <li>
                                 <a data-href="storyboard.html#date=${date}">Storyboard</a>
                             </li>
-                            <li>
-                                <a data-href="ingest-summary.html#end-date=${date}">Ingest Summary</a>
                         </ul>
                     </li>
                     <li class="dropdown">
@@ -105,6 +103,9 @@
                             </li>
                         </ul>
                     </li>
+                    <li>
+                        <a data-href="../ingest-summary.html#end-date=${date}">Ingest Summary</a>
+                    </li>
                 </ul>
             </div>
             <div id="search-box" class="row text-right">

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/79028579/spot-oa/ui/proxy/storyboard.html
----------------------------------------------------------------------
diff --git a/spot-oa/ui/proxy/storyboard.html b/spot-oa/ui/proxy/storyboard.html
index 6968566..2ed4a08 100755
--- a/spot-oa/ui/proxy/storyboard.html
+++ b/spot-oa/ui/proxy/storyboard.html
@@ -147,8 +147,6 @@
                             <li>
                                 <a data-href="../flow/storyboard.html#date=${date}">Storyboard</a>
                             </li>
-                            <li>
-                                <a data-href="../flow/ingest-summary.html#end-date=${date}">Ingest Summary</a>
                         </ul>
                     </li>
                     <li class="dropdown">
@@ -185,6 +183,9 @@
                             </li>
                         </ul>
                     </li>
+                    <li>
+                        <a data-href="../ingest-summary.html#end-date=${date}">Ingest Summary</a>
+                    </li>
                 </ul>
             </div>
             <div id="search-box" class="row text-right">

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/79028579/spot-oa/ui/proxy/suspicious.html
----------------------------------------------------------------------
diff --git a/spot-oa/ui/proxy/suspicious.html b/spot-oa/ui/proxy/suspicious.html
index 828a42c..74145f3 100755
--- a/spot-oa/ui/proxy/suspicious.html
+++ b/spot-oa/ui/proxy/suspicious.html
@@ -137,8 +137,6 @@
                             <li>
                                 <a data-href="../flow/storyboard.html#date=${date}">Storyboard</a>
                             </li>
-                            <li>
-                                <a data-href="../flow/ingest-summary.html#end-date=${date}">Ingest Summary</a>
                         </ul>
                     </li>
                     <li class="dropdown">
@@ -175,6 +173,9 @@
                             </li>
                         </ul>
                     </li>
+                    <li>
+                        <a data-href="../ingest-summary.html#end-date=${date}">Ingest Summary</a>
+                    </li>
                 </ul>
             </div>
             <div id="search-box" class="row text-right">

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/79028579/spot-oa/ui/proxy/threat-investigation.html
----------------------------------------------------------------------
diff --git a/spot-oa/ui/proxy/threat-investigation.html b/spot-oa/ui/proxy/threat-investigation.html
index ae740a9..45c5326 100755
--- a/spot-oa/ui/proxy/threat-investigation.html
+++ b/spot-oa/ui/proxy/threat-investigation.html
@@ -66,8 +66,6 @@
                             <li>
                                 <a data-href="../flow/storyboard.html#date=${date}">Storyboard</a>
                             </li>
-                            <li>
-                                <a data-href="../flow/ingest-summary.html#end-date=${date}">Ingest Summary</a>
                         </ul>
                     </li>
                     <li class="dropdown">
@@ -104,6 +102,9 @@
                             </li>
                         </ul>
                     </li>
+                    <li>
+                        <a data-href="../ingest-summary.html#end-date=${date}">Ingest Summary</a>
+                    </li>
                 </ul>
             </div>
             <div id="search-box" class="row text-right">


[37/49] incubator-spot git commit: Merge pull request #174 from NathanSegerlind/unit_test_cleanup

Posted by ev...@apache.org.
Merge pull request #174 from NathanSegerlind/unit_test_cleanup

Unit test extravaganza

Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/5901e064
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/5901e064
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/5901e064

Branch: refs/heads/master
Commit: 5901e06424c00b25759728eeebf6ea61c3e45d37
Parents: 6ebae47 a70dbf0
Author: NathanSegerlind <na...@intel.com>
Authored: Thu Jan 5 10:09:25 2017 -0800
Committer: GitHub <no...@github.com>
Committed: Thu Jan 5 10:09:25 2017 -0800

----------------------------------------------------------------------
 spot-ml/build.sbt                               |   2 +
 .../org/apache/spot/SuspiciousConnects.scala    |   1 -
 .../spot/SuspiciousConnectsScoreFunction.scala  |  14 +-
 .../scala/org/apache/spot/dns/DNSSchema.scala   |   5 -
 .../dns/DNSSuspiciousConnectsAnalysis.scala     |  50 +++--
 .../org/apache/spot/dns/DNSWordCreation.scala   |  16 +-
 .../dns/model/DNSSuspiciousConnectsModel.scala  |  70 +++----
 .../apache/spot/lda/SpotLDAWrapperSchema.scala  |   6 +
 .../FlowSuspiciousConnectsAnalysis.scala        |  54 ++++--
 .../apache/spot/netflow/FlowWordCreator.scala   |  11 +-
 .../spot/netflow/model/FlowScoreFunction.scala  |  69 +++----
 .../model/FlowSuspiciousConnectsModel.scala     |  64 +++---
 .../proxy/ProxySuspiciousConnectsAnalysis.scala |  48 +++--
 .../proxy/ProxySuspiciousConnectsModel.scala    |  30 +--
 .../apache/spot/utilities/DataFrameUtils.scala  |  39 ----
 .../utilities/data/InputOutputDataHandler.scala |  34 ++--
 spot-ml/src/test/resources/log4j.properties     |   8 +
 .../org/apache/spot/SpotLDAWrapperTest.scala    | 172 ++++++++---------
 .../dns/DNSSuspiciousConnectsAnalysisTest.scala |  58 +++++-
 .../apache/spot/dns/DNSWordCreationTest.scala   |   1 -
 .../FlowSuspiciousConnectsAnalysisTest.scala    | 193 +++++++++++++++++++
 .../FlowSuspiciousCoonectsAnalysis.scala        | 125 ------------
 .../spot/netflow/FlowWordCreatorTest.scala      |   2 +-
 .../ProxySuspiciousConnectsAnalysisTest.scala   |  91 ++++++++-
 24 files changed, 701 insertions(+), 462 deletions(-)
----------------------------------------------------------------------



[42/49] incubator-spot git commit: Removing lda-c code and submodule; the new Spark LDA version has been added

Posted by ev...@apache.org.
Removing lda-c code and submodule; the new Spark LDA version has been added


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/7a7d91d8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/7a7d91d8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/7a7d91d8

Branch: refs/heads/master
Commit: 7a7d91d8c676fe307de10889e448a1ec1faee899
Parents: 6ae14f5
Author: Everardo Lopez Sandoval (Intel) <el...@elopezsa-mac02.local>
Authored: Wed Jan 4 10:53:22 2017 -0600
Committer: Everardo Lopez Sandoval (Intel) <el...@elopezsa-mac02.ra.intel.com>
Committed: Fri Jan 20 17:01:02 2017 -0800

----------------------------------------------------------------------
 .gitmodules        | 3 ---
 spot-ml/spot-lda-c | 1 -
 2 files changed, 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/7a7d91d8/.gitmodules
----------------------------------------------------------------------
diff --git a/.gitmodules b/.gitmodules
index 8b89ba7..e69de29 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +0,0 @@
-[submodule "spot-ml/spot-lda-c"]
-	path = spot-ml/spot-lda-c
-	url = https://github.com/Open-Network-Insight/spot-lda-c.git

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/7a7d91d8/spot-ml/spot-lda-c
----------------------------------------------------------------------
diff --git a/spot-ml/spot-lda-c b/spot-ml/spot-lda-c
deleted file mode 160000
index 7891541..0000000
--- a/spot-ml/spot-lda-c
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 7891541f27109ea943320473811c9fb357c7fede


[05/49] incubator-spot git commit: And yet more editing of SuspiciousConnectsArgumentParser.scala to remove old argument references.

Posted by ev...@apache.org.
And yet more editing of SuspiciousConnectsArgumentParser.scala to remove old argument references.


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/f7596ca8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/f7596ca8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/f7596ca8

Branch: refs/heads/master
Commit: f7596ca8f13629e09d88a4d257ecb6f8d10f33de
Parents: f3b3652
Author: Brandon Edwards <br...@intel.com>
Authored: Wed Dec 7 16:46:41 2016 -0800
Committer: Brandon Edwards <br...@intel.com>
Committed: Wed Dec 7 16:46:41 2016 -0800

----------------------------------------------------------------------
 .../scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala | 4 ----
 1 file changed, 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/f7596ca8/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
index e6f5c1c..4647dbf 100644
--- a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
@@ -50,10 +50,6 @@ object SuspiciousConnectsArgumentParser {
       action((x, c) => c.copy(userDomain = x)).
       text("Domain of spot user (example: intel)")
 
-    opt[String]("nodes").required().valueName("<input param>").
-      action((x, c) => c.copy(nodes = x)).
-      text("Node list")
-
     opt[String]("scored").required().valueName("<hdfs path>").
       action((x, c) => c.copy(hdfsScoredConnect = x)).
       text("HDFS path for results")


[23/49] incubator-spot git commit: Merge branch 'test_dns_topdomain' into test_proxy

Posted by ev...@apache.org.
Merge branch 'test_dns_topdomain' into test_proxy


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/0b1d46ee
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/0b1d46ee
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/0b1d46ee

Branch: refs/heads/master
Commit: 0b1d46eebb0e0526d894683589c5d96f80eec082
Parents: deeed03 ac44bf0
Author: nlsegerl <na...@intel.com>
Authored: Mon Dec 19 15:46:23 2016 -0800
Committer: nlsegerl <na...@intel.com>
Committed: Mon Dec 19 15:46:23 2016 -0800

----------------------------------------------------------------------

----------------------------------------------------------------------



[20/49] incubator-spot git commit: Changed USER_DOMAIN_PARSER_CMD to USER_DOMAIN_CMD

Posted by ev...@apache.org.
Changed USER_DOMAIN_PARSER_CMD to USER_DOMAIN_CMD


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/1a0269f2
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/1a0269f2
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/1a0269f2

Branch: refs/heads/master
Commit: 1a0269f2e0f9a1297e1084c01b1991362f0a74ae
Parents: 452fca3
Author: Brandon Edwards <br...@intel.com>
Authored: Wed Dec 14 08:35:14 2016 -0800
Committer: Brandon Edwards <br...@intel.com>
Committed: Wed Dec 14 08:35:14 2016 -0800

----------------------------------------------------------------------
 spot-ml/ml_ops.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/1a0269f2/spot-ml/ml_ops.sh
----------------------------------------------------------------------
diff --git a/spot-ml/ml_ops.sh b/spot-ml/ml_ops.sh
index ece60cd..02a9c1d 100755
--- a/spot-ml/ml_ops.sh
+++ b/spot-ml/ml_ops.sh
@@ -48,9 +48,9 @@ fi
 # pass the user domain designation if not empty
 
 if [ ! -z $USER_DOMAIN ] ; then
-    USER_DOMAIN_PARSER_CMD="--userdomain $USER_DOMAIN"
+    USER_DOMAIN_CMD="--userdomain $USER_DOMAIN"
 else
-    USER_DOMAIN_PARSER_CMD=''
+    USER_DOMAIN_CMD=''
 fi
 
 FEEDBACK_PATH=${LPATH}/${DSOURCE}_scores.csv
@@ -109,7 +109,7 @@ time spark-submit --class "org.apache.spot.SuspiciousConnects" \
   --threshold ${TOL} \
   --maxresults ${MAXRESULTS} \
   --ldamaxiterations 20 \
-  $USER_DOMAIN_PARSER_CMD
+  $USER_DOMAIN_CMD
 
 wait
 


[04/49] incubator-spot git commit: More editing on SuspiciousConnectsArgumentParser to eliminate old (dropped) arguments.

Posted by ev...@apache.org.
More editing on SuspiciousConnectsArgumentParser to eliminate old (dropped) arguments.


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/f3b36526
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/f3b36526
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/f3b36526

Branch: refs/heads/master
Commit: f3b3652673713fd36152605c64df9a95e140e818
Parents: d7d6ae0
Author: Brandon Edwards <br...@intel.com>
Authored: Wed Dec 7 16:39:18 2016 -0800
Committer: Brandon Edwards <br...@intel.com>
Committed: Wed Dec 7 16:39:18 2016 -0800

----------------------------------------------------------------------
 .../spot/SuspiciousConnectsArgumentParser.scala     | 16 ----------------
 1 file changed, 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/f3b36526/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
index be0db30..e6f5c1c 100644
--- a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
@@ -11,11 +11,7 @@ object SuspiciousConnectsArgumentParser {
                                       feedbackFile: String = "",
                                       duplicationFactor: Int = 1,
                                       topicCount: Int = 20,
-                                      localPath: String = "",
-                                      localUser: String = "",
                                       userDomain: String = "",
-                                      ldaPath: String = "",
-                                      nodes: String = "",
                                       hdfsScoredConnect: String = "",
                                       threshold: Double = 1.0d,
                                       maxResults: Int = -1,
@@ -50,18 +46,6 @@ object SuspiciousConnectsArgumentParser {
       action((x, c) => c.copy(topicCount = x.toInt)).
       text("topic count")
 
-    opt[String]("lpath").required().valueName("<local path>").
-      action((x, c) => c.copy(localPath = x)).
-      text("Local Path")
-
-    opt[String]("ldapath").required().valueName("<local path>").
-      action((x, c) => c.copy(ldaPath = x)).
-      text("LDA Path")
-
-    opt[String]("luser").required().valueName("<local path>").
-      action((x, c) => c.copy(localUser = x)).
-      text("Local user path")
-
     opt[String]("userdomain").required().valueName("<user domain>").
       action((x, c) => c.copy(userDomain = x)).
       text("Domain of spot user (example: intel)")


[08/49] incubator-spot git commit: Changed the userDomain input to be optional in the argument parser and changed the default value of this argument to be the empty string in spot.conf.

Posted by ev...@apache.org.
Changed the userDomain input to be optional in the argument parser and changed the default value of this argument to be the empty string in spot.conf.


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/55b0497b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/55b0497b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/55b0497b

Branch: refs/heads/master
Commit: 55b0497bf3a6dbe87e659cadffcb70784cb9c1af
Parents: f6c2b2b
Author: Brandon Edwards <br...@intel.com>
Authored: Fri Dec 9 13:56:44 2016 -0800
Committer: Brandon Edwards <br...@intel.com>
Committed: Fri Dec 9 13:56:44 2016 -0800

----------------------------------------------------------------------
 spot-ml/INSTALL.md                                               | 1 +
 .../scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala | 2 +-
 .../main/scala/org/apache/spot/utilities/DomainProcessor.scala   | 2 +-
 spot-setup/spot.conf                                             | 4 ++--
 4 files changed, 5 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/55b0497b/spot-ml/INSTALL.md
----------------------------------------------------------------------
diff --git a/spot-ml/INSTALL.md b/spot-ml/INSTALL.md
index 505f35a..0ca8ef2 100644
--- a/spot-ml/INSTALL.md
+++ b/spot-ml/INSTALL.md
@@ -14,6 +14,7 @@ Names and language that we will use from the configuration variables for Spot (t
 - MLNODE The node from which the spot-ml routines are invoked
 - HUSER An HDFS user path that will be the base path for the solution; this is usually the same user that you created to run the solution
 - HPATH Location for storing intermediate results of the analysis on HDFS.
+- USER_DOMAIN Web domain associated with the user's network (for the DNS suspicious connects analysis). For example: USER_DOMAIN='intel'.
 
 ### Prepare data for input 
 

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/55b0497b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
index 4647dbf..9b0ac07 100644
--- a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
@@ -46,7 +46,7 @@ object SuspiciousConnectsArgumentParser {
       action((x, c) => c.copy(topicCount = x.toInt)).
       text("topic count")
 
-    opt[String]("userdomain").required().valueName("<user domain>").
+    opt[String]("userdomain").valueName("<user domain>").
       action((x, c) => c.copy(userDomain = x)).
       text("Domain of spot user (example: intel)")
 

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/55b0497b/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala b/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala
index c5f0d73..a60b1fb 100644
--- a/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala
@@ -65,7 +65,7 @@ object DomainProcessor extends Serializable {
       0
     }
 
-    val topDomainClass = if (domain == userDomain) {
+    val topDomainClass = if (userDomain != "" && domain == userDomain) {
       2
     } else if (topDomainsBC.value contains domain) {
       1

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/55b0497b/spot-setup/spot.conf
----------------------------------------------------------------------
diff --git a/spot-setup/spot.conf b/spot-setup/spot.conf
index 433afa8..3b217ed 100755
--- a/spot-setup/spot.conf
+++ b/spot-setup/spot.conf
@@ -31,8 +31,8 @@ RPATH=${LUSER}/ipython/user/${FDATE}
 LDAPATH=${LUSER}/ml/oni-lda-c
 LIPATH=${LUSER}/ingest
 
-#domain associated to network data to be analyzed
-USER_DOMAIN='intel'
+#dns suspicious connects config
+USER_DOMAIN=''
 
 SPK_EXEC='400'
 SPK_EXEC_MEM='2048m'
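
The behavioral effect of the DomainProcessor change can be sketched in isolation (illustrative helper; the real logic lives in extractDomainInfo):

// 2 = matches the configured user domain, 1 = known popular top domain,
// 0 = everything else. The userDomain != "" guard keeps a blank
// USER_DOMAIN from matching the blank domain extracted from malformed
// input such as "ab..com" (see the DomainProcessorTest case that follows).
def topDomainClass(domain: String, userDomain: String, topDomains: Set[String]): Int =
  if (userDomain != "" && domain == userDomain) 2
  else if (topDomains.contains(domain)) 1
  else 0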


[09/49] incubator-spot git commit: Added a test to DomainProcessorTest.scala to ensure that an empty string for userDomain would not be taken as the legitimate user domain when the extracted domain is also an empty string.

Posted by ev...@apache.org.
Added a test to DomainProcessorTest.scala to ensure that an empty string for userDomain would not be taken as the legitimate user domain when the extracted domain is also an empty string.


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/13fc7186
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/13fc7186
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/13fc7186

Branch: refs/heads/master
Commit: 13fc71863558d8d27512ca7c710fda585812a63a
Parents: 55b0497
Author: Brandon Edwards <br...@intel.com>
Authored: Fri Dec 9 17:02:50 2016 -0800
Committer: Brandon Edwards <br...@intel.com>
Committed: Fri Dec 9 17:02:50 2016 -0800

----------------------------------------------------------------------
 .../org/apache/spot/utilities/DomainProcessorTest.scala   | 10 ++++++++++
 1 file changed, 10 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/13fc7186/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala b/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala
index 7e2cae6..5d7281e 100644
--- a/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala
+++ b/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala
@@ -115,4 +115,14 @@ class DomainProcessorTest extends TestingSparkContextFlatSpec with Matchers {
 
     result shouldBe DomainInfo(domain = "amazon", subdomain = "None", topDomain = 1, subdomainLength = 0, subdomainEntropy = 0, numPeriods = 2)
   }
+  it should "not identify the domain as the users domain when both are empty strings" in {
+    val url = "ab..com"
+    val countryCodes = sparkContext.broadcast(countryCodesSet)
+    val topDomains = sparkContext.broadcast(TopDomains.TopDomains)
+    val userDomain = ""
+
+    val result = extractDomainInfo(url, topDomains, userDomain)
+
+    result shouldBe DomainInfo(domain = "", subdomain = "ab", topDomain = 0, subdomainLength = 2, subdomainEntropy = 1, numPeriods = 3)
+  }
 }


[38/49] incubator-spot git commit: flow/js/stores/IngestSummaryStore.js is not needed anymore

Posted by ev...@apache.org.
flow/js/stores/IngestSummaryStore.js is not needed anymore


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/5ecde790
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/5ecde790
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/5ecde790

Branch: refs/heads/master
Commit: 5ecde79070bec2954d91390b2ca5b99d27016aa0
Parents: 4647064
Author: Diego Ortiz Huerta <di...@intel.com>
Authored: Mon Dec 12 10:20:15 2016 -0800
Committer: Everardo Lopez Sandoval (Intel) <el...@elopezsa-mac02.ra.intel.com>
Committed: Fri Jan 20 17:01:02 2017 -0800

----------------------------------------------------------------------
 spot-oa/ui/flow/js/stores/IngestSummaryStore.js | 161 -------------------
 1 file changed, 161 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/5ecde790/spot-oa/ui/flow/js/stores/IngestSummaryStore.js
----------------------------------------------------------------------
diff --git a/spot-oa/ui/flow/js/stores/IngestSummaryStore.js b/spot-oa/ui/flow/js/stores/IngestSummaryStore.js
deleted file mode 100755
index 55f2106..0000000
--- a/spot-oa/ui/flow/js/stores/IngestSummaryStore.js
+++ /dev/null
@@ -1,161 +0,0 @@
-const assign = require('object-assign');
-const d3 = require('d3');
-
-const SpotDispatcher = require('../../../js/dispatchers/SpotDispatcher');
-const SpotConstants = require('../../../js/constants/SpotConstants');
-const NetflowConstants = require('../constants/NetflowConstants');
-const DateUtils = require('../../../js/utils/DateUtils');
-const RestStore = require('../../../js/stores/RestStore');
-
-const START_DATE_FILTER = SpotConstants.START_DATE;
-const END_DATE_FILTER = SpotConstants.END_DATE;
-const CURRENT_DATE_FILTER = 'current_date';
-
-const requestQueue = [];
-const requestErrors = [];
-
-const IngestSummaryStore = assign(new RestStore(NetflowConstants.API_INGEST_SUMMARY), {
-    errorMessages: {
-        404: 'No details available'
-    },
-    setStartDate: function (date) {
-        this.setRestFilter(START_DATE_FILTER, date);
-    },
-    getStartDate: function () {
-        return this.getRestFilter(START_DATE_FILTER);
-    },
-    setEndDate: function (date) {
-        this.setRestFilter(END_DATE_FILTER, date);
-    },
-    getEndDate: function () {
-        return this.getRestFilter(END_DATE_FILTER);
-    },
-    /**
-     *  Start asking the server for CSV data to create the chart
-     **/
-    requestSummary: function () {
-        var startDate, endDate, date, delta, startRequests, i, month;
-
-        startDate = DateUtils.parseDate(this.getRestFilter(START_DATE_FILTER));
-        endDate = DateUtils.parseDate(this.getRestFilter(END_DATE_FILTER));
-
-        // Find out how many requests need to be made
-        delta = (endDate.getFullYear() - startDate.getFullYear()) * 12 + (endDate.getMonth() - startDate.getMonth());
-
-        startRequests = requestQueue.length == 0;
-
-        // Go to first day in month
-        date = new Date(startDate);
-        date.setDate(1);
-
-        // Queue date requests
-        requestQueue.push(date);
-        for (i = 1; i <= delta; i++) {
-            requestQueue.push(DateUtils.calcDate(date, i, 'month'));
-        }
-
-        // dequeue if no request is running
-        startRequests && this.dequeue();
-    },
-    dequeue: function () {
-        var date, year, month;
-
-        if (requestQueue.length == 0) return;
-
-        date = requestQueue.shift();
-        this.setRestFilter(CURRENT_DATE_FILTER, date);
-        year = date.getFullYear();
-        month = date.getMonth() + 1 + "";
-        month = month.length == 1 ? "0" + month : month;
-
-        this.setEndpoint(NetflowConstants.API_INGEST_SUMMARY.replace('${year}', year).replace('${month}', month));
-
-        this.reload();
-    },
-    setData: function (data) {
-        var startDate, endDate, date, dayFilter, parse;
-
-        // Does the loading indicator need to be displayed?
-        if (data.loading) {
-            if (!this._data.loading) {
-                this._data = data;
-                this.emitChangeData();
-            }
-
-            // Do nothing when loading is in progress
-            return;
-        }
-
-        // Store errors for later usage
-        if (data.error) {
-            requestErrors.push(data);
-        }
-        else if (data.data) {
-            parse = d3.time.format("%Y-%m-%d %H:%M").parse; // Date formatting parser
-            startDate = DateUtils.parseDate(this.getRestFilter(START_DATE_FILTER));
-            endDate = DateUtils.parseDate(this.getRestFilter(END_DATE_FILTER));
-            date = DateUtils.parseDate(this.getRestFilter(CURRENT_DATE_FILTER));
-
-            if (date.getFullYear() == startDate.getFullYear() && date.getMonth() == startDate.getMonth()) {
-                dayFilter = startDate.getDate();
-                data.data = data.data.filter(function (row) {
-                    return DateUtils.parseDate(row.date, true).getDate() >= dayFilter
-                });
-            }
-
-            if (date.getFullYear() == endDate.getFullYear() && date.getMonth() == endDate.getMonth()) {
-                dayFilter = endDate.getDate();
-                data.data = data.data.filter(function (row) {
-                    return DateUtils.parseDate(row.date, true).getDate() <= dayFilter
-                });
-            }
-
-            // Parse dates and numbers.
-            data.data.forEach(function (d) {
-                d.date = parse(d.date);
-                d.flows = +d.flows;
-            });
-
-            // Sort the data by date ASC
-            data.data.sort(function (a, b) {
-                return a.date - b.date;
-            });
-
-            if (!this._data.data) this._data.data = [];
-            this._data.data.push(data.data);
-        }
-
-        this._data.loading = requestQueue.length > 0;
-
-        if (!this._data.loading) {
-            if (this._data.data && this._data.data.length==0) {
-                // Broadcast first found error
-                this._data = requestErrors[0];
-            }
-            this.emitChangeData();
-        }
-        else {
-            setTimeout(this.dequeue.bind(this), 1);
-        }
-    }
-});
-
-SpotDispatcher.register(function (action) {
-    switch (action.actionType) {
-        case SpotConstants.UPDATE_DATE:
-            switch (action.name) {
-                case SpotConstants.START_DATE:
-                    IngestSummaryStore.setStartDate(action.date);
-                    break;
-                case SpotConstants.END_DATE:
-                    IngestSummaryStore.setEndDate(action.date);
-                    break;
-            }
-            break;
-        case SpotConstants.RELOAD_INGEST_SUMMARY:
-            IngestSummaryStore.requestSummary();
-            break;
-    }
-});
-
-module.exports = IngestSummaryStore;
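
For context on what is being retired: the store queued one request per calendar month between the start and end dates. The month arithmetic reduces to a one-liner (Scala sketch of the removed JavaScript logic; the YearMonth-based helper is illustrative):

import java.time.YearMonth

// delta = (endYear - startYear) * 12 + (endMonth - startMonth), and the
// queue holds delta + 1 requests because both endpoints are included.
def monthlyRequests(start: YearMonth, end: YearMonth): Int =
  (end.getYear - start.getYear) * 12 + (end.getMonthValue - start.getMonthValue) + 1

monthlyRequests(YearMonth.of(2016, 11), YearMonth.of(2017, 1)) // 3 (Nov, Dec, Jan)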


[28/49] incubator-spot git commit: Spot-ml various validations and cleaning (#171)

Posted by ev...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala b/spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala
new file mode 100644
index 0000000..08200be
--- /dev/null
+++ b/spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala
@@ -0,0 +1,114 @@
+package org.apache.spot.dns
+
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.Row
+import org.apache.spot.dns.DNSSchema._
+import org.apache.spot.testutils.TestingSparkContextFlatSpec
+import org.scalatest.Matchers
+
+/**
+  * Created by rabarona on 12/15/16.
+  */
+class DNSSuspiciousConnectsAnalysisTest  extends TestingSparkContextFlatSpec with Matchers{
+
+  "filterAndSelectCleanDNSRecords" should "return data set without garbage" in {
+
+    val cleanedDNSRecords = DNSSuspiciousConnectsAnalysis.filterAndSelectCleanDNSRecords(testDNSRecords.inputDNSRecordsDF)
+
+    cleanedDNSRecords.count should be(8)
+    cleanedDNSRecords.schema.size should be(8)
+  }
+
+  "filterAndSelectInvalidDNSRecords" should "return invalid records" in {
+
+    val invalidDNSRecords = DNSSuspiciousConnectsAnalysis.filterAndSelectInvalidDNSRecords(testDNSRecords.inputDNSRecordsDF)
+
+    invalidDNSRecords.count should be(15)
+    invalidDNSRecords.schema.size should be(8)
+  }
+
+  "filterScoredDNSRecords" should "return records with score less or equal to threshold" in {
+
+    val threshold = 10e-5
+    val scoredDNSRecords = DNSSuspiciousConnectsAnalysis
+      .filterScoredDNSRecords(testDNSRecords.scoredDNSRecordsDF, threshold)
+
+    scoredDNSRecords.count should be(2)
+  }
+
+  "filterAndSelectCorruptDNSRecords" should "return records where Score is equal to -1" in {
+
+    val corruptDNSRecords = DNSSuspiciousConnectsAnalysis
+      .filterAndSelectCorruptDNSRecords(testDNSRecords.scoredDNSRecordsDF)
+
+    corruptDNSRecords.count should be(1)
+    corruptDNSRecords.schema.size should be(9)
+  }
+
+  def testDNSRecords = new {
+
+    val sqlContext = new SQLContext(sparkContext)
+
+    val inputDNSRecordsRDD = sparkContext.parallelize(wrapRefArray(Array(
+      Seq(null, 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", 1, 0),
+      Seq("", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", 1, 0),
+      Seq("-", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", 1, 0),
+      Seq("May 20 2016 02:10:25.970987000 PDT", null, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", 1, 0),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, null, "172.16.9.132", "turner.com.122.2o...", "0x00000001", 1, 0),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", null, "0x00000001", 1, 0),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "", "0x00000001", 1, 0),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "-", "0x00000001", 1, 0),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "(empty)", "0x00000001", 1, 0),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, null, "turner.com.122.2o...", "0x00000001", 1, 0),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "", "turner.com.122.2o...", "0x00000001", 1, 0),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "-", "turner.com.122.2o...", "0x00000001", 1, 0),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", null, null, null),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "", null, null),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "-", null, null),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", null, 1, 0),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "", 1, 0),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "-", 1, 0),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", null, 0),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", 1, null),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", null, 1, null),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", null, 1, 0),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", null, null))
+      .map(row => Row.fromSeq(row))))
+
+    val inputDNSRecordsSchema = StructType(
+      Array(TimestampField,
+        UnixTimestampField,
+        FrameLengthField,
+        ClientIPField,
+        QueryNameField,
+        QueryClassField,
+        QueryTypeField,
+        QueryResponseCodeField))
+
+    val inputDNSRecordsDF = sqlContext.createDataFrame(inputDNSRecordsRDD, inputDNSRecordsSchema)
+
+    val scoredDNSRecordsRDD = sparkContext.parallelize(wrapRefArray(Array(
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", 1, 0, 1d),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", 1, 0, 0.0000005),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", 1, 0, 0.05),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", 1, 0, -1d),
+      Seq("May 20 2016 02:10:25.970987000 PDT", 1463735425l, 168, "172.16.9.132", "turner.com.122.2o...", "0x00000001", 1, 0, 0.0001))
+      .map(row => Row.fromSeq(row))))
+
+    val scoredDNSRecordsSchema = StructType(
+      Array(TimestampField,
+        UnixTimestampField,
+        FrameLengthField,
+        ClientIPField,
+        QueryNameField,
+        QueryClassField,
+        QueryTypeField,
+        QueryResponseCodeField,
+        ScoreField))
+
+    val scoredDNSRecordsDF = sqlContext.createDataFrame(scoredDNSRecordsRDD, scoredDNSRecordsSchema)
+
+  }
+
+}
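
Given the five scores in the fixture (1.0, 0.0000005, 0.05, -1.0, 0.0001) and a threshold of 10e-5, a count of 2 implies that the comparison is inclusive and that corrupt rows (sentinel score -1) are excluded. A sketch of filter logic consistent with those counts (assuming the score column is named "score"; the actual implementation is in DNSSuspiciousConnectsAnalysis.scala):

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col

def filterScored(records: DataFrame, threshold: Double): DataFrame =
  records.filter(col("score") <= threshold && col("score") > -1)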

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/test/scala/org/apache/spot/dns/DNSWordCreationTest.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/scala/org/apache/spot/dns/DNSWordCreationTest.scala b/spot-ml/src/test/scala/org/apache/spot/dns/DNSWordCreationTest.scala
new file mode 100644
index 0000000..fa233a2
--- /dev/null
+++ b/spot-ml/src/test/scala/org/apache/spot/dns/DNSWordCreationTest.scala
@@ -0,0 +1,17 @@
+package org.apache.spot.dns
+
+import org.apache.spot.testutils.TestingSparkContextFlatSpec
+import org.apache.spot.utilities.Entropy
+import org.scalatest.Matchers
+
+class DNSWordCreationTest extends TestingSparkContextFlatSpec with Matchers {
+
+    "entropy" should "return 2.807354922057603 with value abcdefg" in {
+    val value = "abcdefg"
+
+    val result = Entropy.stringEntropy(value)
+
+    result shouldBe 2.807354922057604
+  }
+
+}
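
As a sanity check on the expected value: "abcdefg" has seven distinct, equally frequent characters, so its Shannon entropy is log2(7) = 2.807354922057604 bits. A minimal sketch of such an entropy function (illustrative; the project's implementation is org.apache.spot.utilities.Entropy):

def stringEntropy(s: String): Double = {
  val n = s.length.toDouble
  s.groupBy(identity).values                  // character frequency groups
    .map(_.length / n)                        // relative frequencies
    .map(p => -p * math.log(p) / math.log(2)) // bits contributed per symbol
    .sum
}

stringEntropy("abcdefg") // 2.807354922057604 (= log2(7))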

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/test/scala/org/apache/spot/netflow/FlowSuspiciousCoonectsAnalysis.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/scala/org/apache/spot/netflow/FlowSuspiciousCoonectsAnalysis.scala b/spot-ml/src/test/scala/org/apache/spot/netflow/FlowSuspiciousCoonectsAnalysis.scala
new file mode 100644
index 0000000..abaeb21
--- /dev/null
+++ b/spot-ml/src/test/scala/org/apache/spot/netflow/FlowSuspiciousCoonectsAnalysis.scala
@@ -0,0 +1,125 @@
+package org.apache.spot.netflow
+
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types.StructType
+import org.apache.spot.netflow.FlowSchema._
+import org.apache.spot.testutils.TestingSparkContextFlatSpec
+import org.scalatest.Matchers
+
+/**
+  * Created by rabarona on 12/15/16.
+  */
+class FlowSuspiciousCoonectsAnalysisTest extends TestingSparkContextFlatSpec with Matchers{
+
+  "filterAndSelectCleanFlowRecords" should "return data set without garbage" in {
+
+    val cleanedFlowRecords = FlowSuspiciousConnectsAnalysis
+      .filterAndSelectCleanFlowRecords(testFlowRecords.inputFlowRecordsDF)
+
+    cleanedFlowRecords.count should be(5)
+    cleanedFlowRecords.schema.size should be(17)
+  }
+
+  "filterAndSelectInvalidFlowRecords" should "return invalid records" in {
+
+    val invalidFlowRecords = FlowSuspiciousConnectsAnalysis
+      .filterAndSelectInvalidFlowRecords(testFlowRecords.inputFlowRecordsDF)
+
+    invalidFlowRecords.count should be(7)
+    invalidFlowRecords.schema.size should be(17)
+  }
+
+  "filterScoredFlowRecords" should "return records with score less or equal to threshold" in {
+
+    val threshold = 10e-5
+
+    val scoredFlowRecords = FlowSuspiciousConnectsAnalysis
+      .filterScoredFlowRecords(testFlowRecords.scoredFlowRecordsDF, threshold)
+
+    scoredFlowRecords.count should be(2)
+  }
+
+  "filterAndSelectCorruptFlowRecords" should "return records where Score is equal to -1" in {
+
+    val corruptFlowRecords = FlowSuspiciousConnectsAnalysis
+      .filterAndSelectCorruptFlowRecords(testFlowRecords.scoredFlowRecordsDF)
+
+    corruptFlowRecords.count should be(1)
+    corruptFlowRecords.schema.size should be(18)
+  }
+
+  def testFlowRecords = new {
+    val sqlContext = new SQLContext(sparkContext)
+
+    val inputFlowRecordsRDD = sparkContext.parallelize(wrapRefArray(Array(
+      Seq("2016-05-05 13:54:58",2016,5,5,24,54,58,0.972,"172.16.0.129","10.0.2.202",1024,80,"TCP",39l,12522l,0,0),
+      Seq("2016-05-05 13:54:58",2016,5,5,13,54,60,0.972,"172.16.0.129","10.0.2.202",1024,80,"TCP",39l,12522l,0,0),
+      Seq("2016-05-05 13:54:58",2016,5,5,13,60,58,0.972,"172.16.0.129","10.0.2.202",1024,80,"TCP",39l,12522l,0,0),
+      Seq(null,2016,5,5,13,54,58,0.972,"172.16.0.129","10.0.2.202",1024,80,"TCP",39l,12522l,0,0),
+      Seq("2016-05-05 13:54:58",2016,5,5,13,54,58,0.972,null,"10.0.2.202",1024,80,"TCP",39l,12522l,0,0),
+      Seq("2016-05-05 13:54:58",2016,5,5,13,54,58,0.972,"172.16.0.129",null,1024,80,"TCP",39l,12522l,0,0),
+      Seq("2016-05-05 13:54:58",2016,5,5,13,54,58,0.972,"172.16.0.129","10.0.2.202",null,80,"TCP",39l,12522l,0,0),
+      Seq("2016-05-05 13:54:58",2016,5,5,13,54,58,0.972,"172.16.0.129","10.0.2.202",1024,null,"TCP",39l,12522l,0,0),
+      Seq("2016-05-05 13:54:58",2016,5,5,13,54,58,0.972,"172.16.0.129","10.0.2.202",1024,80,"TCP",null,12522l,0,0),
+      Seq("2016-05-05 13:54:58",2016,5,5,13,54,58,0.972,"172.16.0.129","10.0.2.202",1024,80,"TCP",39l,null,0,0),
+      Seq("2016-05-05 13:54:58",2016,5,5,13,54,58,0.972,"172.16.0.129","10.0.2.202",1024,80,"TCP",39l,12522l,null,0),
+      Seq("2016-05-05 13:54:58",2016,5,5,13,54,58,0.972,"172.16.0.129","10.0.2.202",1024,80,"TCP",39l,12522l,0,null),
+      Seq("2016-05-05 13:54:58",2016,5,5,13,54,58,0.972,"172.16.0.129","10.0.2.202",1024,80,"TCP",39l,12522l,0,0),
+      Seq("2016-05-05 13:54:58",2016,5,5,13,54,58,0.972,"172.16.0.129","10.0.2.202",1024,80,"TCP",39l,12522l,0,0),
+      Seq("2016-05-05 13:54:58",2016,5,5,13,54,58,0.972,"172.16.0.129","10.0.2.202",1024,80,"TCP",39l,12522l,0,0))
+        .map(row => Row.fromSeq(row))))
+
+    val inputFlowRecordsSchema = StructType(
+      Array(TimeReceivedField,
+        YearField,
+        MonthField,
+        DayField,
+        HourField,
+        MinuteField,
+        SecondField,
+        DurationField,
+        SourceIPField,
+        DestinationIPField,
+        SourcePortField,
+        DestinationPortField,
+        ProtocolField,
+        IpktField,
+        IbytField,
+        OpktField,
+        ObytField))
+
+    val inputFlowRecordsDF = sqlContext.createDataFrame(inputFlowRecordsRDD, inputFlowRecordsSchema)
+
+    val scoredFlowRecordsRDD = sparkContext.parallelize(wrapRefArray(Array(
+      Seq("2016-05-05 13:54:58",2016,5,5,13,54,58,0.972,"172.16.0.129","10.0.2.202",1024,80,"TCP",39l,12522l,0,0, -1d),
+      Seq("2016-05-05 13:54:58",2016,5,5,13,54,58,0.972,"172.16.0.129","10.0.2.202",1024,80,"TCP",39l,12522l,0,0, 1d),
+      Seq("2016-05-05 13:54:58",2016,5,5,13,54,58,0.972,"172.16.0.129","10.0.2.202",1024,80,"TCP",39l,12522l,0,0, 0.0000005),
+      Seq("2016-05-05 13:54:58",2016,5,5,13,54,58,0.972,"172.16.0.129","10.0.2.202",1024,80,"TCP",39l,12522l,0,0, 0.05),
+      Seq("2016-05-05 13:54:58",2016,5,5,13,54,58,0.972,"172.16.0.129","10.0.2.202",1024,80,"TCP",39l,12522l,0,0,0.0001))
+      .map(row => Row.fromSeq(row))))
+
+    val scoredFlowRecordsSchema = StructType(
+      Array(TimeReceivedField,
+        YearField,
+        MonthField,
+        DayField,
+        HourField,
+        MinuteField,
+        SecondField,
+        DurationField,
+        SourceIPField,
+        DestinationIPField,
+        SourcePortField,
+        DestinationPortField,
+        ProtocolField,
+        IpktField,
+        IbytField,
+        OpktField,
+        ObytField,
+        ScoreField))
+
+    val scoredFlowRecordsDF = sqlContext.createDataFrame(scoredFlowRecordsRDD, scoredFlowRecordsSchema)
+  }
+
+}
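
The corrupt-record expectation (count 1 for the single -1 score) implies a selection on the sentinel value; a sketch under the same column-name assumption as above (the real code is in FlowSuspiciousConnectsAnalysis.scala):

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col

// Rows that failed scoring upstream carry the sentinel score -1.
def filterCorrupt(records: DataFrame): DataFrame =
  records.filter(col("score") === -1)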

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/test/scala/org/apache/spot/netflow/FlowWordCreatorTest.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/scala/org/apache/spot/netflow/FlowWordCreatorTest.scala b/spot-ml/src/test/scala/org/apache/spot/netflow/FlowWordCreatorTest.scala
new file mode 100644
index 0000000..00d14bc
--- /dev/null
+++ b/spot-ml/src/test/scala/org/apache/spot/netflow/FlowWordCreatorTest.scala
@@ -0,0 +1,214 @@
+package org.apache.spot.netflow
+
+import org.scalatest.{FlatSpec, Matchers}
+
+
+class FlowWordCreatorTest extends FlatSpec with Matchers {
+
+  // Replace ports in index 10 and 11
+  val srcIP = "10.0.2.115"
+  val dstIP = "172.16.0.107"
+  val hour = 12
+  val minute = 59
+  val second = 32
+
+  val ibyts = 222L
+  val ipkts = 3L
+
+  val timeCuts = Array(2.4, 4.8, 7.2, 9.6, 12.0, 14.4, 16.8, 19.2, 21.6, 24.0)
+  val ipktCuts = Array(10d, 20d, 30d, 40d, 50d, 60d, 70d, 80d, 90d, 100d)
+  val ibytCuts = Array(100d, 200d, 300d, 400d, 500d)
+
+  val expectedIpktBin = 0
+  val expectedIbytBin = 2
+  val expectedTimeBin = 5
+
+
+  val flowWordCreator = new FlowWordCreator(timeCuts, ibytCuts, ipktCuts)
+
+
+  // 1. Test when sip is less than dip and sip is not 0 and dport is <= 1024 & sport > 1024 and min(dport, sport) !=0 +
+  "flowWords" should "create word with ip_pair as sourceIp-destIp, port is dport and dest_word direction is -1" in {
+    val srcPort = 2132
+    val dstPort = 23
+
+    val FlowWords(srcWord, dstWord) =
+      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+
+
+    dstWord shouldBe "-1_23_5_2_0"
+    srcWord shouldBe "23_5_2_0"
+
+  }
+
+  // 2. Test when sip is less than dip and sip is not 0 and sport is <= 1024 & dport > 1024 and min(dport, sport) !=0 +
+  it should "create word with ip_pair as sourceIp-destIp, port is sport and src_word direction is -1" in {
+
+    val srcPort = 23
+    val dstPort = 2132
+
+    val FlowWords(srcWord, dstWord) =
+      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+
+    dstWord shouldBe "23_5_2_0"
+    srcWord shouldBe "-1_23_5_2_0"
+  }
+
+  // 3. Test when sip is less than dip and sip is not 0 and dport and sport are > 1024 +
+  it should "create word with ip_pair as sourceIp-destIp, port is 333333 and both words direction is 1 (not showing)" in {
+    val srcPort = 8392
+    val dstPort = 9874
+
+    val FlowWords(srcWord, dstWord) =
+      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+
+    dstWord shouldBe "333333_5_2_0"
+    srcWord shouldBe "333333_5_2_0"
+  }
+
+  // 4. Test when sip is less than dip and sip is not 0 and dport is 0 but sport is not +
+  it should "create word with ip_pair as sourceIp-destIp, port is sport and source_word direction is -1" in {
+    val srcPort = 80
+    val dstPort = 0
+
+    val FlowWords(srcWord, dstWord) =
+      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+
+
+    dstWord shouldBe "80_5_2_0"
+    srcWord shouldBe "-1_80_5_2_0"
+  }
+
+  // 5. Test when sip is less than dip and sip is not 0 and sport is 0 but dport is not +
+  it should "create word with ip_pair as sourceIp-destIp, port is dport and dest_word direction is -1 II" in {
+
+    val srcPort = 0
+    val dstPort = 43
+
+
+    val FlowWords(srcWord, dstWord) =
+      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+
+
+    dstWord shouldBe "-1_43_5_2_0"
+    srcWord shouldBe "43_5_2_0"
+  }
+
+  // 6. Test when sip is less than dip and sip is not 0 and sport and dport are less or equal than 1024 +
+  it should "create word with ip_pair as sourceIp-destIp, port is 111111 and both words direction is 1 (not showing)" in {
+    val srcPort = 1024
+    val dstPort = 80
+
+
+    val FlowWords(srcWord, dstWord) =
+      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+
+    dstWord shouldBe "111111_5_2_0"
+    srcWord shouldBe "111111_5_2_0"
+  }
+
+  // 7. Test when sip is less than dip and sip is not 0 and sport and dport are 0+
+  it should "create word with ip_pair as sourceIp-destIp, port is max(0,0) and both words direction is 1 (not showing)" in {
+    val srcPort = 0
+    val dstPort = 0
+
+
+    val FlowWords(srcWord, dstWord) =
+      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+
+    dstWord shouldBe "0_5_2_0"
+    srcWord shouldBe "0_5_2_0"
+  }
+
+  // 8. Test when sip is not less than dip and dport is <= 1024 & sport > 1024 and min(dport, sport) !=0+
+  it should "create word with ip_pair as destIp-sourceIp, port is dport and dest_word direction is -1" in {
+    val srcPort = 3245
+    val dstPort = 43
+
+
+    val FlowWords(srcWord, dstWord) =
+      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+
+    dstWord shouldBe "-1_43_5_2_0"
+    srcWord shouldBe "43_5_2_0"
+
+  }
+
+  // 9. Test when sip is not less than dip and sport is <= 1024 & dport > 1024 and min(dport, sport) !=0 +
+  it should "create word with ip_pair as destIp-sourceIp, port is sport and src_word direction is -1" in {
+    val srcPort = 80
+    val dstPort = 2435
+
+    val FlowWords(srcWord, dstWord) =
+      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+
+    dstWord shouldBe "80_5_2_0"
+    srcWord shouldBe "-1_80_5_2_0"
+
+  }
+
+  // 10. Test when sip is not less than dip and dport and sport are > 1024 +
+  it should "create word with ip_pair as destIp-sourceIp, port is 333333 and both words direction is 1 (not showing)" in {
+    val srcPort = 2354
+    val dstPort = 2435
+
+
+    val FlowWords(srcWord, dstWord) =
+      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+
+    dstWord shouldBe "333333_5_2_0"
+    srcWord shouldBe "333333_5_2_0"
+  }
+
+  // 11. Test when sip is not less than dip and dport is 0 but sport is not +
+  it should "create word with ip_pair as destIp-sourceIp, port is sport and src_word direction is -1 II" in {
+    val srcPort = 80
+    val dstPort = 0
+
+
+    val FlowWords(srcWord, dstWord) =
+      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+
+    dstWord shouldBe "80_5_2_0"
+    srcWord shouldBe "-1_80_5_2_0"
+  }
+
+  // 12. Test when sip is not less than dip and sport is 0 but dport is not +
+  it should "create word with ip_pair as destIp-sourceIp, port is dport and dest_word direction is -1 II" in {
+    val srcPort = 0
+    val dstPort = 2435
+
+
+    val FlowWords(srcWord, dstWord) =
+      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+
+    dstWord shouldBe "-1_2435_5_2_0"
+    srcWord shouldBe "2435_5_2_0"
+  }
+
+  // 13. Test when sip is not less than dip and sport and dport are less or equal than 1024
+  it should "create word with ip_pair as destIp-sourceIp, port 111111 and both words direction is 1 (not showing)" in {
+    val srcPort = 80
+    val dstPort = 1024
+
+
+    val FlowWords(srcWord, dstWord) =
+      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+
+    dstWord shouldBe "111111_5_2_0"
+    srcWord shouldBe "111111_5_2_0"
+  }
+
+  // 14. Test when sip is not less than dip and sport and dport are 0
+  it should "create word with ip_pair as destIp-sourceIp, port is max(0,0) and both words direction is 1 (not showing)" in {
+    val srcPort = 0
+    val dstPort = 0
+
+
+    val FlowWords(srcWord, dstWord) =
+      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
+
+    dstWord shouldBe "0_5_2_0"
+    srcWord shouldBe "0_5_2_0"
+  }
+}
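
All of the expected bins above follow one simple rule: a value's bin is the number of cut points strictly below it, and the bins are then joined into words such as "80_5_2_0" (port, time bin, ibyt bin, ipkt bin). A sketch inferred from the fixture values (the authoritative binning lives in FlowWordCreator.scala):

def bin(value: Double, cuts: Array[Double]): Int = cuts.count(_ < value)

val timeOfDay = 12 + 59 / 60.0 + 32 / 3600.0 // ~12.992 hours
bin(timeOfDay, Array(2.4, 4.8, 7.2, 9.6, 12.0, 14.4, 16.8, 19.2, 21.6, 24.0)) // 5
bin(222.0, Array(100d, 200d, 300d, 400d, 500d))                               // 2
bin(3.0, Array(10d, 20d, 30d, 40d, 50d, 60d, 70d, 80d, 90d, 100d))            // 0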

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/test/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysisTest.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysisTest.scala b/spot-ml/src/test/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysisTest.scala
new file mode 100644
index 0000000..bc17751
--- /dev/null
+++ b/spot-ml/src/test/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysisTest.scala
@@ -0,0 +1,135 @@
+package org.apache.spot.proxy
+
+import org.apache.spark.sql.{Row, SQLContext}
+import org.apache.spark.sql.types.StructType
+import org.apache.spot.proxy.ProxySchema._
+import org.apache.spot.testutils.TestingSparkContextFlatSpec
+import org.scalatest.Matchers
+
+/**
+  * Created by rabarona on 12/15/16.
+  */
+class ProxySuspiciousConnectsAnalysisTest extends TestingSparkContextFlatSpec with Matchers {
+
+  "filterAndSelectCleanProxyRecords" should "return data without garbage" in {
+
+    val cleanedProxyRecords = ProxySuspiciousConnectsAnalysis
+      .filterAndSelectCleanProxyRecords(testProxyRecords.inputProxyRecordsDF)
+
+    cleanedProxyRecords.count should be(1)
+    cleanedProxyRecords.schema.size should be(19)
+  }
+
+  "filterAndSelectInvalidProxyRecords" should "return invalir records" in {
+
+    val invalidProxyRecords = ProxySuspiciousConnectsAnalysis
+      .filterAndSelectInvalidProxyRecords(testProxyRecords.inputProxyRecordsDF)
+
+    invalidProxyRecords.count should be(5)
+    invalidProxyRecords.schema.size should be(19)
+
+  }
+
+  "filterScoredProxyRecords" should "return records with score less or equal to threshold" in {
+
+    val threshold = 10e-5
+
+    val scoredProxyRecords = ProxySuspiciousConnectsAnalysis
+      .filterScoredProxyRecords(testProxyRecords.scoredProxyRecordsDF, threshold)
+
+    scoredProxyRecords.count should be(2)
+
+  }
+
+  "filterAndSelectCorruptProxyRecords" should "return records where Score is equal to -1" in {
+
+    val corruptProxyRecords = ProxySuspiciousConnectsAnalysis
+      .filterAndSelectCorruptProxyRecords(testProxyRecords.scoredProxyRecordsDF)
+
+    corruptProxyRecords.count should be(1)
+    corruptProxyRecords.schema.size should be(21)
+  }
+
+  def testProxyRecords = new {
+
+    val sqlContext = new SQLContext(sparkContext)
+
+    val inputProxyRecordsRDD = sparkContext.parallelize(wrapRefArray(Array(
+      Seq(null,"00:09:13","10.239.160.152","cn.archive.ubuntu...","GET","Debian APT-HTTP/...","text/html",448,"-",
+        "-","-","404","80","/ubuntu/dists/tru...","-","10.239.4.160",2864,218,"cn.archive.ubuntu..."),
+      Seq("2016-10-03",null,"10.239.160.152","cn.archive.ubuntu...","GET","Debian APT-HTTP/...","text/html",448,"-",
+        "-","-","404","80","/ubuntu/dists/tru...","-","10.239.4.160",2864,218,"cn.archive.ubuntu..."),
+      Seq("2016-10-03","00:09:13",null,"cn.archive.ubuntu...","GET","Debian APT-HTTP/...","text/html",448,"-",
+        "-","-","404","80","/ubuntu/dists/tru...","-","10.239.4.160",2864,218,"cn.archive.ubuntu..."),
+      Seq("2016-10-03","00:09:13","10.239.160.152",null,"GET","Debian APT-HTTP/...","text/html",448,"-",
+        "-","-","404","80","/ubuntu/dists/tru...","-","10.239.4.160",2864,218,"cn.archive.ubuntu..."),
+      Seq("2016-10-03","00:09:13","10.239.160.152","cn.archive.ubuntu...","GET","Debian APT-HTTP/...","text/html",448,"-",
+        "-","-","404","80","/ubuntu/dists/tru...","-","10.239.4.160",2864,218,null),
+      Seq("2016-10-03","00:09:13","10.239.160.152","cn.archive.ubuntu...","GET","Debian APT-HTTP/...","text/html",448,"-",
+        "-","-","404","80","/ubuntu/dists/tru...","-","10.239.4.160",2864,218,"cn.archive.ubuntu..."))
+      .map(row => Row.fromSeq(row))))
+
+    val inputProxyRecordsSchema = StructType(
+      Array(DateField,
+        TimeField,
+        ClientIPField,
+        HostField,
+        ReqMethodField,
+        UserAgentField,
+        ResponseContentTypeField,
+        DurationField,
+        UserNameField,
+        WebCatField,
+        RefererField,
+        RespCodeField,
+        URIPortField,
+        URIPathField,
+        URIQueryField,
+        ServerIPField,
+        SCBytesField,
+        CSBytesField,
+        FullURIField))
+
+    val inputProxyRecordsDF = sqlContext.createDataFrame(inputProxyRecordsRDD, inputProxyRecordsSchema)
+
+    val scoredProxyRecordsRDD = sparkContext.parallelize(wrapRefArray(Array(
+      Seq("2016-10-03","00:09:13","10.239.160.152","cn.archive.ubuntu...","GET","Debian APT-HTTP/...","text/html",448,"-",
+        "-","-","404","80","/ubuntu/dists/tru...","-","10.239.4.160",2864,218,"cn.archive.ubuntu...", "a word", -1d),
+      Seq("2016-10-03","00:09:13","10.239.160.152","cn.archive.ubuntu...","GET","Debian APT-HTTP/...","text/html",448,"-",
+        "-","-","404","80","/ubuntu/dists/tru...","-","10.239.4.160",2864,218,"cn.archive.ubuntu...", "a word", 1d),
+      Seq("2016-10-03","00:09:13","10.239.160.152","cn.archive.ubuntu...","GET","Debian APT-HTTP/...","text/html",448,"-",
+        "-","-","404","80","/ubuntu/dists/tru...","-","10.239.4.160",2864,218,"cn.archive.ubuntu...", "a word", 0.0000005),
+      Seq("2016-10-03","00:09:13","10.239.160.152","cn.archive.ubuntu...","GET","Debian APT-HTTP/...","text/html",448,"-",
+        "-","-","404","80","/ubuntu/dists/tru...","-","10.239.4.160",2864,218,"cn.archive.ubuntu...", "a word", 0.05),
+      Seq("2016-10-03","00:09:13","10.239.160.152","cn.archive.ubuntu...","GET","Debian APT-HTTP/...","text/html",448,"-",
+        "-","-","404","80","/ubuntu/dists/tru...","-","10.239.4.160",2864,218,"cn.archive.ubuntu...", "a word", 0.0001)
+    ).map(row => Row.fromSeq(row))))
+
+    val scoredProxyRecordsSchema = StructType(
+      Array(DateField,
+        TimeField,
+        ClientIPField,
+        HostField,
+        ReqMethodField,
+        UserAgentField,
+        ResponseContentTypeField,
+        DurationField,
+        UserNameField,
+        WebCatField,
+        RefererField,
+        RespCodeField,
+        URIPortField,
+        URIPathField,
+        URIQueryField,
+        ServerIPField,
+        SCBytesField,
+        CSBytesField,
+        FullURIField,
+        WordField,
+        ScoreField))
+
+    val scoredProxyRecordsDF = sqlContext.createDataFrame(scoredProxyRecordsRDD, scoredProxyRecordsSchema)
+
+  }
+
+}
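
The fixtures above exercise four filter helpers end to end. A hedged usage sketch follows; rawDF and scoredDF are placeholders for DataFrames shaped like inputProxyRecordsDF and scoredProxyRecordsDF, and only the method names and the 10e-5 threshold come from the tests themselves:

    // Sketch only; rawDF and scoredDF are assumed inputs:
    //   val cleanDF   = ProxySuspiciousConnectsAnalysis.filterAndSelectCleanProxyRecords(rawDF)
    //   val invalidDF = ProxySuspiciousConnectsAnalysis.filterAndSelectInvalidProxyRecords(rawDF)
    //   val corruptDF = ProxySuspiciousConnectsAnalysis.filterAndSelectCorruptProxyRecords(scoredDF)
    //   val flaggedDF = ProxySuspiciousConnectsAnalysis.filterScoredProxyRecords(scoredDF, 10e-5)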

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-oa/oa/components/iana/dns-qclass.csv
----------------------------------------------------------------------
diff --git a/spot-oa/oa/components/iana/dns-qclass.csv b/spot-oa/oa/components/iana/dns-qclass.csv
index 59a7813..86f31df 100644
--- a/spot-oa/oa/components/iana/dns-qclass.csv
+++ b/spot-oa/oa/components/iana/dns-qclass.csv
@@ -6,5 +6,6 @@ Decimal,Hexadecimal,Name,Reference
 Technology Artificial Intelligence Laboratory, June 1981.]"
 4,0x00000004,Hesiod (HS),"[Dyer, S., and F. Hsu, ""Hesiod"", Project Athena Technical
 Plan - Name Service, April 1987.]"
+5,unknown,UNKNOWN,
 254,0x000000FE,QCLASS NONE,[RFC2136]
 255,0x000000FF,QCLASS * (ANY),[RFC1035]

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-oa/oa/components/iana/dns-qtype.csv
----------------------------------------------------------------------
diff --git a/spot-oa/oa/components/iana/dns-qtype.csv b/spot-oa/oa/components/iana/dns-qtype.csv
index 959e306..f6b9dcf 100644
--- a/spot-oa/oa/components/iana/dns-qtype.csv
+++ b/spot-oa/oa/components/iana/dns-qtype.csv
@@ -1,4 +1,5 @@
 TYPE,Value,Meaning,Reference,Template,Registration Date
+0,-1,UNKNOWN,,,
 A,1,a host address,[RFC1035],,
 NS,2,an authoritative name server,[RFC1035],,
 MD,3,a mail destination (OBSOLETE - use MX),[RFC1035],,

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-oa/oa/components/iana/dns-rcode.csv
----------------------------------------------------------------------
diff --git a/spot-oa/oa/components/iana/dns-rcode.csv b/spot-oa/oa/components/iana/dns-rcode.csv
index 4edbb59..2012e1a 100644
--- a/spot-oa/oa/components/iana/dns-rcode.csv
+++ b/spot-oa/oa/components/iana/dns-rcode.csv
@@ -1,4 +1,5 @@
 RCODE,Name,Description,Reference
+-1,UNKNOWN,UNKNOWN,
 0,NoError,No Error,[RFC1035]
 1,FormErr,Format Error,[RFC1035]
 2,ServFail,Server Failure,[RFC1035]
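
All three IANA tables above gain a catch-all UNKNOWN row, so lookups against codes the tables do not list can resolve to a sentinel instead of failing. A minimal sketch of that fallback, written in Scala for consistency with the other examples in this log rather than as the spot-oa component code; the map literal is a tiny excerpt, not the full table:

    // Illustrative fallback lookup; codes absent from the table resolve to the
    // new -1/UNKNOWN sentinel row.
    val rcodeNames = Map(-1 -> "UNKNOWN", 0 -> "NoError", 1 -> "FormErr", 2 -> "ServFail")
    def rcodeName(code: Int): String = rcodeNames.getOrElse(code, rcodeNames(-1))
    // rcodeName(2)  == "ServFail"
    // rcodeName(99) == "UNKNOWN"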

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-setup/spot.conf
----------------------------------------------------------------------
diff --git a/spot-setup/spot.conf b/spot-setup/spot.conf
index 3b217ed..65fbd74 100755
--- a/spot-setup/spot.conf
+++ b/spot-setup/spot.conf
@@ -1,5 +1,4 @@
 #node configuration
-NODES=('node-01' 'node-02')
 UINODE='node03'
 MLNODE='node04'
 GWNODE='node16'
@@ -7,8 +6,6 @@ DBNAME='spot'
 
 #hdfs - base user and data source config
 HUSER='/user/spot'
-DSOURCES='flow'
-DFOLDERS=('binary' 'csv' 'hive' 'stage')
 DNS_PATH=${HUSER}/${DSOURCE}/hive/y=${YR}/m=${MH}/d=${DY}/
 PROXY_PATH=${HUSER}/${DSOURCE}/hive/y=${YR}/m=${MH}/d=${DY}/
 FLOW_PATH=${HUSER}/${DSOURCE}/hive/y=${YR}/m=${MH}/d=${DY}/
@@ -28,19 +25,19 @@ KRB_USER=
 LUSER='/home/spot'
 LPATH=${LUSER}/ml/${DSOURCE}/${FDATE}
 RPATH=${LUSER}/ipython/user/${FDATE}
-LDAPATH=${LUSER}/ml/oni-lda-c
 LIPATH=${LUSER}/ingest
 
 #dns suspicious connects config
 USER_DOMAIN=''
 
-SPK_EXEC='400'
-SPK_EXEC_MEM='2048m'
+SPK_EXEC=''
+SPK_EXEC_MEM=''
 SPK_DRIVER_MEM=''
 SPK_DRIVER_MAX_RESULTS=''
 SPK_EXEC_CORES=''
 SPK_DRIVER_MEM_OVERHEAD=''
-SPAK_EXEC_MEM_OVERHEAD=''
+SPK_EXEC_MEM_OVERHEAD=''
 TOL='1e-6'
 
-
+TOPIC_COUNT=20
+DUPFACTOR=1000


[07/49] incubator-spot git commit: Trying to resolve conflicts by merging in an updated version of spot branch Merge branch 'spot' into Shouldbe_good_domain_fix

Posted by ev...@apache.org.
Trying to resolve conflicts by merging in an updated version of spot branch
Merge branch 'spot' into Shouldbe_good_domain_fix


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/f6c2b2ba
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/f6c2b2ba
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/f6c2b2ba

Branch: refs/heads/master
Commit: f6c2b2ba754832a3aff04baec6af791e77b53221
Parents: f7596ca 8f0988a
Author: Brandon Edwards <br...@intel.com>
Authored: Thu Dec 8 14:55:12 2016 -0800
Committer: Brandon Edwards <br...@intel.com>
Committed: Thu Dec 8 14:55:12 2016 -0800

----------------------------------------------------------------------
 ISSUES.md                                       | 53 --------------------
 README.md                                       | 51 ++++---------------
 spot-ingest/README.md                           |  4 --
 .../js/components/DetailsTablePanel.react.js    |  4 +-
 .../dns/js/components/NetworkViewPanel.react.js |  2 +-
 .../dns/js/components/SuspiciousPanel.react.js  |  2 +-
 .../js/components/DetailsTablePanel.react.js    |  2 +-
 .../js/components/NetworkViewPanel.react.js     |  2 +-
 .../flow/js/components/SuspiciousPanel.react.js |  2 +-
 .../ui/js/components/GridPanelMixin.react.js    | 20 ++++----
 .../components/PolloNetworkViewMixin.react.js   |  5 ++
 .../js/components/NetworkViewPanel.react.js     |  6 +--
 .../js/components/SuspiciousPanel.react.js      |  2 +-
 13 files changed, 36 insertions(+), 119 deletions(-)
----------------------------------------------------------------------



[18/49] incubator-spot git commit: Changed the position of the optional parameter to be at the end of all other parameters

Posted by ev...@apache.org.
Changed the position of the optional parameter to be at the end of all other parameters


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/3bf290dd
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/3bf290dd
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/3bf290dd

Branch: refs/heads/master
Commit: 3bf290dd0a33368486abea1d803163bbc7cd7736
Parents: b9cc67d
Author: Brandon Edwards <br...@intel.com>
Authored: Tue Dec 13 16:21:47 2016 -0800
Committer: Brandon Edwards <br...@intel.com>
Committed: Tue Dec 13 16:21:47 2016 -0800

----------------------------------------------------------------------
 spot-ml/ml_ops.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/3bf290dd/spot-ml/ml_ops.sh
----------------------------------------------------------------------
diff --git a/spot-ml/ml_ops.sh b/spot-ml/ml_ops.sh
index a951fe1..c5d6e43 100755
--- a/spot-ml/ml_ops.sh
+++ b/spot-ml/ml_ops.sh
@@ -105,11 +105,11 @@ time spark-submit --class "org.apache.spot.SuspiciousConnects" \
   --dupfactor ${DUPFACTOR} \
   --feedback ${FEEDBACK_PATH} \
   --ldatopiccount ${TOPIC_COUNT} \
-  $USER_DOMAIN_PARSER_CMD \
   --scored ${HDFS_SCORED_CONNECTS} \
   --threshold ${TOL} \
   --maxresults ${MAXRESULTS} \
-  --ldamaxiterations 20
+  --ldamaxiterations 20 \
+  $USER_DOMAIN_PARSER_CMD
 
 wait
 


[11/49] incubator-spot git commit: test_dns_topdomain

Posted by ev...@apache.org.
test_dns_topdomain

some unit testing necessitated refactoring


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/0f1a6c5e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/0f1a6c5e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/0f1a6c5e

Branch: refs/heads/master
Commit: 0f1a6c5e3c35247afce9c4df7da548f274d2b015
Parents: 83d157a
Author: nlsegerl <na...@intel.com>
Authored: Mon Dec 12 14:19:56 2016 -0800
Committer: nlsegerl <na...@intel.com>
Committed: Mon Dec 12 14:19:56 2016 -0800

----------------------------------------------------------------------
 .../dns/DNSSuspiciousConnectsAnalysis.scala     | 42 ++++++++++----
 .../FlowSuspiciousConnectsAnalysis.scala        | 33 +++++++++--
 .../proxy/ProxySuspiciousConnectsAnalysis.scala | 44 ++++++++++-----
 .../apache/spot/utilities/DataFrameUtils.scala  | 39 -------------
 .../org/apache/spot/DNSWordCreationTest.scala   |  4 +-
 .../org/apache/spot/FlowWordCreatorTest.scala   |  2 +-
 .../dns/DNSSuspiciousConnectsAnalysisTest.scala | 59 ++++++++++++++++++++
 7 files changed, 149 insertions(+), 74 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/0f1a6c5e/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala b/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
index d0e6da1..f2ce7a4 100644
--- a/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
@@ -1,5 +1,6 @@
 package org.apache.spot.dns
 
+import org.apache.log4j.Logger
 import org.apache.spark.SparkContext
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types._
@@ -7,8 +8,6 @@ import org.apache.spark.sql.{DataFrame, SQLContext}
 import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
 import org.apache.spot.dns.DNSSchema._
 import org.apache.spot.dns.model.DNSSuspiciousConnectsModel
-import org.apache.log4j.Logger
-
 import org.apache.spot.dns.model.DNSSuspiciousConnectsModel.ModelSchema
 
 /**
@@ -42,6 +41,7 @@ object DNSSuspiciousConnectsAnalysis {
 
   /**
     * Run suspicious connections analysis on DNS log data.
+    * Saves the most suspicious connections to a CSV file on HDFS.
     *
     * @param config Object encapsulating runtime parameters and CLI options.
     * @param sparkContext
@@ -58,22 +58,42 @@ object DNSSuspiciousConnectsAnalysis {
       .filter(Timestamp + " is not null and " + UnixTimestamp + " is not null")
       .select(inColumns:_*)
 
-    logger.info("Training the model")
 
-    val model =
-      DNSSuspiciousConnectsModel.trainNewModel(sparkContext, sqlContext, logger, config, rawDataDF, config.topicCount)
 
-    logger.info("Scoring")
-    val scoredDF = model.score(sparkContext, sqlContext, rawDataDF)
+    val scoredDF = detectDNSAnomalies(rawDataDF, config, sparkContext, sqlContext, logger)
 
 
     val filteredDF = scoredDF.filter(Score + " <= " + config.threshold)
     val mostSusipiciousDF: DataFrame = filteredDF.orderBy(Score).limit(config.maxResults)
 
-    val outputDF = mostSusipiciousDF.select(OutColumns:_*).sort(Score)
-
-    logger.info("DNS  suspcicious connects analysis completed.")
+    val outputDF = mostSusipiciousDF.select(OutColumns:_*).sort(Score)
     logger.info("Saving results to : " + config.hdfsScoredConnect)
-    outputDF.map(_.mkString(config.outputDelimiter)).saveAsTextFile(config.hdfsScoredConnect)
+
+
+    outputDF.map(_.mkString(config.outputDelimiter)).saveAsTextFile(config.hdfsScoredConnect)
+  }
+
+  /**
+    * Identify anomalous DNS log entries in the provided data frame.
+    *
+    * @param data Data frame of DNS entries.
+    * @param config Object encapsulating runtime parameters and CLI options.
+    * @param sparkContext Spark context.
+    * @param sqlContext Spark SQL context.
+    * @param logger Logger for progress reporting.
+    * @return Data frame of the input records annotated with anomaly scores.
+    */
+  def detectDNSAnomalies(data: DataFrame, config: SuspiciousConnectsConfig,
+                         sparkContext: SparkContext,
+                         sqlContext: SQLContext,
+                         logger: Logger) : DataFrame = {
+
+
+    logger.info("Fitting probabilistic model to data")
+    val model =
+      DNSSuspiciousConnectsModel.trainNewModel(sparkContext, sqlContext, logger, config, data, config.topicCount)
+
+    logger.info("Identifying outliers")
+    model.score(sparkContext, sqlContext, data)
   }
 }
\ No newline at end of file
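
The point of this refactor is that scoring is now callable without run's I/O, which is what the new unit tests below rely on. A hedged sketch of the call pattern; every value is a placeholder assumed to be in scope, as in run above, and the flow and proxy variants further down follow the same shape:

    // Sketch only; rawDataDF, config, sparkContext, sqlContext and logger are
    // assumed to exist, as in run() above.
    //   val scoredDF = DNSSuspiciousConnectsAnalysis.detectDNSAnomalies(
    //     rawDataDF, config, sparkContext, sqlContext, logger)
    //   val topN = scoredDF.filter(Score + " <= " + config.threshold)
    //     .orderBy(Score).limit(config.maxResults)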

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/0f1a6c5e/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala b/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala
index 32c0f6e..098a787 100644
--- a/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala
@@ -17,7 +17,7 @@ import org.apache.spot.netflow.model.FlowSuspiciousConnectsModel
 
 object FlowSuspiciousConnectsAnalysis {
 
-  def run(config: SuspiciousConnectsConfig, sparkContext: SparkContext, sqlContext: SQLContext, logger: Logger)(implicit outputDelimiter: String) = {
+  def run(config: SuspiciousConnectsConfig, sparkContext: SparkContext, sqlContext: SQLContext, logger: Logger) = {
 
     logger.info("Loading data")
 
@@ -28,11 +28,7 @@ object FlowSuspiciousConnectsAnalysis {
 
     logger.info("Training the model")
 
-    val model =
-      FlowSuspiciousConnectsModel.trainNewModel(sparkContext, sqlContext, logger, config, rawDataDF, config.topicCount)
-
-    logger.info("Scoring")
-    val scoredDF = model.score(sparkContext, sqlContext, rawDataDF)
+    val scoredDF = detectFlowAnomalies(rawDataDF, config, sparkContext, sqlContext, logger)
 
     val filteredDF = scoredDF.filter(Score + " <= " + config.threshold)
 
@@ -46,6 +42,31 @@ object FlowSuspiciousConnectsAnalysis {
     outputDF.map(_.mkString(config.outputDelimiter)).saveAsTextFile(config.hdfsScoredConnect)
   }
 
+  /**
+    * Identify anomalous netflow log entries in the provided data frame.
+    *
+    * @param data Data frame of netflow entries.
+    * @param config Object encapsulating runtime parameters and CLI options.
+    * @param sparkContext Spark context.
+    * @param sqlContext Spark SQL context.
+    * @param logger Logger for progress reporting.
+    * @return Data frame of the input records annotated with anomaly scores.
+    */
+  def detectFlowAnomalies(data: DataFrame,
+                          config: SuspiciousConnectsConfig,
+                         sparkContext: SparkContext,
+                         sqlContext: SQLContext,
+                         logger: Logger) : DataFrame = {
+
+
+    logger.info("Fitting probabilistic model to data")
+    val model =
+      FlowSuspiciousConnectsModel.trainNewModel(sparkContext, sqlContext, logger, config, data, config.topicCount)
+
+    logger.info("Identifying outliers")
+    model.score(sparkContext, sqlContext, data)
+  }
+
   val inSchema = StructType(List(TimeReceivedField,
     YearField,
     MonthField,

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/0f1a6c5e/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala b/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala
index 1131406..38150ca 100644
--- a/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala
@@ -2,10 +2,9 @@ package org.apache.spot.proxy
 
 import org.apache.log4j.Logger
 import org.apache.spark.SparkContext
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.{DataFrame, SQLContext}
 import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
 import org.apache.spot.proxy.ProxySchema._
-import org.apache.spot.utilities.DataFrameUtils
 
 /**
   * Run suspicious connections analysis on proxy data.
@@ -31,23 +30,40 @@ object ProxySuspiciousConnectsAnalysis {
       select(Date, Time, ClientIP, Host, ReqMethod, UserAgent, ResponseContentType, Duration, UserName,
         WebCat, Referer, RespCode, URIPort, URIPath, URIQuery, ServerIP, SCBytes, CSBytes, FullURI)
 
-    logger.info("Training the model")
-    val model =
-      ProxySuspiciousConnectsModel.trainNewModel(sparkContext, sqlContext, logger, config, rawDataDF)
+    val scoredDF = detectProxyAnomalies(rawDataDF, config, sparkContext, sqlContext, logger)
 
-    logger.info("Scoring")
-    val scoredDF = model.score(sparkContext, rawDataDF)
-
-    // take the maxResults least probable events of probability below the threshold and sort
 
     val filteredDF = scoredDF.filter(Score +  " <= " + config.threshold)
-    val topRows = DataFrameUtils.dfTakeOrdered(filteredDF, "score", config.maxResults)
-    val scoreIndex = scoredDF.schema.fieldNames.indexOf("score")
-    val outputRDD = sparkContext.parallelize(topRows).sortBy(row => row.getDouble(scoreIndex))
+    val mostSuspiciousDF: DataFrame = filteredDF.orderBy(Score).limit(config.maxResults)
 
-    logger.info("Persisting data")
-    outputRDD.map(_.mkString(config.outputDelimiter)).saveAsTextFile(config.hdfsScoredConnect)
+    logger.info("Persisting data to hdfs: " + config.hdfsScoredConnect)
+    mostSuspiciousDF.map(_.mkString(config.outputDelimiter)).saveAsTextFile(config.hdfsScoredConnect)
 
     logger.info("Proxy suspcicious connects completed")
   }
+
+
+  /**
+    * Identify anomalous proxy log entries in the provided data frame.
+    *
+    * @param data Data frame of proxy entries.
+    * @param config Object encapsulating runtime parameters and CLI options.
+    * @param sparkContext Spark context.
+    * @param sqlContext Spark SQL context.
+    * @param logger Logger for progress reporting.
+    * @return Data frame of the input records annotated with anomaly scores.
+    */
+  def detectProxyAnomalies(data: DataFrame,
+                          config: SuspiciousConnectsConfig,
+                          sparkContext: SparkContext,
+                          sqlContext: SQLContext,
+                          logger: Logger) : DataFrame = {
+
+
+    logger.info("Fitting probabilistic model to data")
+    val model = ProxySuspiciousConnectsModel.trainNewModel(sparkContext, sqlContext, logger, config, data)
+    logger.info("Identifying outliers")
+
+    model.score(sparkContext, data)
+  }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/0f1a6c5e/spot-ml/src/main/scala/org/apache/spot/utilities/DataFrameUtils.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/utilities/DataFrameUtils.scala b/spot-ml/src/main/scala/org/apache/spot/utilities/DataFrameUtils.scala
deleted file mode 100644
index d5af6ee..0000000
--- a/spot-ml/src/main/scala/org/apache/spot/utilities/DataFrameUtils.scala
+++ /dev/null
@@ -1,39 +0,0 @@
-package org.apache.spot.utilities
-
-import org.apache.spark.sql.{DataFrame, Row}
-
-/**
-  * Some handy operations on dataframes not provided by Apache Spark.
-  */
-object  DataFrameUtils {
-
-  /**
-    * Returns the rows of a dataframe whose values in a provided column are in the first k
-    * (least to greatest) values.  If strictly fewer than k rows are in the dataframe, all rows are returned.
-    *
-    * Dataframe analog to takeOrdered.
-    *
-    * @param df Input dataframe.
-    * @param colName Column to consider.
-    * @param k Maximum number of rows to return.
-    * @return Array of (at most k) rows.
-    */
-  def dfTakeOrdered(df: DataFrame, colName: String, k: Int) : Array[Row] = {
-    val count = df.count
-
-    val takeCount  = if (k == -1 || count < k) {
-      count.toInt
-    } else {
-      k
-    }
-
-    val colIndex = df.schema.fieldNames.indexOf(colName)
-
-    class DataOrdering() extends Ordering[Row] {
-      def compare(row1: Row, row2: Row) = row1.getDouble(colIndex).compare(row2.getDouble(colIndex))
-    }
-
-    implicit val rowOrdering = new DataOrdering()
-    df.rdd.takeOrdered(takeCount)
-  }
-}
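
The deleted helper has no one-for-one replacement; the refactored analyses above call DataFrame operators directly, as in the proxy diff's orderBy(Score).limit(config.maxResults). A minimal sketch of the substitute idiom, assuming k >= 0 (dfTakeOrdered's k == -1 take-everything case would need its own branch) and noting it yields a DataFrame rather than an Array[Row]:

    // Replacement idiom used by the refactored analyses (sketch, not repo code).
    def takeOrdered(df: org.apache.spark.sql.DataFrame, k: Int): org.apache.spark.sql.DataFrame =
      df.orderBy("score").limit(k)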

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/0f1a6c5e/spot-ml/src/test/scala/org/apache/spot/DNSWordCreationTest.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/scala/org/apache/spot/DNSWordCreationTest.scala b/spot-ml/src/test/scala/org/apache/spot/DNSWordCreationTest.scala
index 1b02333..b756358 100644
--- a/spot-ml/src/test/scala/org/apache/spot/DNSWordCreationTest.scala
+++ b/spot-ml/src/test/scala/org/apache/spot/DNSWordCreationTest.scala
@@ -1,11 +1,9 @@
 package org.apache.spot
 
 
-import javax.swing.text.Utilities
 
-import org.apache.spot.dns.{DNSSuspiciousConnectsAnalysis, DNSWordCreation}
 import org.apache.spot.testutils.TestingSparkContextFlatSpec
-import org.apache.spot.utilities.{CountryCodes, Entropy, TopDomains}
+import org.apache.spot.utilities.Entropy
 import org.scalatest.Matchers
 
 class DNSWordCreationTest extends TestingSparkContextFlatSpec with Matchers {

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/0f1a6c5e/spot-ml/src/test/scala/org/apache/spot/FlowWordCreatorTest.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/scala/org/apache/spot/FlowWordCreatorTest.scala b/spot-ml/src/test/scala/org/apache/spot/FlowWordCreatorTest.scala
index f3cf715..832bbd1 100644
--- a/spot-ml/src/test/scala/org/apache/spot/FlowWordCreatorTest.scala
+++ b/spot-ml/src/test/scala/org/apache/spot/FlowWordCreatorTest.scala
@@ -39,7 +39,7 @@ class FlowWordCreatorTest extends FlatSpec with Matchers {
 
 
     dstWord shouldBe "-1_23_5_2_0"
-    srcWord shouldBe "23_5_2_0"
+    srcWord shouldBe  "23_5_2_0"
 
   }
 

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/0f1a6c5e/spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala b/spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala
new file mode 100644
index 0000000..138f32e
--- /dev/null
+++ b/spot-ml/src/test/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysisTest.scala
@@ -0,0 +1,59 @@
+package org.apache.spot.dns
+
+import org.apache.log4j.{Level, LogManager}
+import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
+import org.apache.spot.dns.DNSSchema._
+import org.apache.spot.testutils.TestingSparkContextFlatSpec
+import org.scalatest.Matchers
+
+case class DNSInput(frame_time:String, unix_tstamp:Long, frame_len:Int, ip_dst: String, dns_qry_name:String, dns_qry_class:String, dns_qry_type: Int, dns_qry_rcode: Int)
+
+class DNSSuspiciousConnectsAnalysisTest  extends TestingSparkContextFlatSpec with Matchers {
+
+  val testConfig = SuspiciousConnectsConfig(analysis = "dns",
+  inputPath = "",
+  feedbackFile = "",
+  duplicationFactor = 1,
+  topicCount = 20,
+  hdfsScoredConnect = "",
+  threshold = 1.0d,
+  maxResults = 1000,
+  outputDelimiter = "\t",
+  ldaPRGSeed = None,
+  ldaMaxiterations = 20,
+  ldaAlpha = 1.02,
+  ldaBeta = 1.001)
+
+
+  "dns supicious connects analysis" should "estimate correct probabilities in toy data with framelength anomaly" in {
+
+    val logger = LogManager.getLogger("SuspiciousConnectsAnalysis")
+    logger.setLevel(Level.INFO)
+    val testSqlContext = new org.apache.spark.sql.SQLContext(sparkContext)
+
+    val anomalousRecord = DNSInput("May 20 2016 02:10:25.970987000 PDT",	1463735425L,	1,	"172.16.9.132",	"turner.com.122.2o7.net",	"0x00000001",	1,	0)
+    val typicalRecord   = DNSInput("May 20 2016 02:10:25.970987000 PDT",	1463735425L,	168,	"172.16.9.132",	"turner.com.122.2o7.net",	"0x00000001",	1,	0)
+
+    import testSqlContext.implicits._
+
+    val data = sparkContext.parallelize(Seq(anomalousRecord, typicalRecord, typicalRecord, typicalRecord, typicalRecord)).toDF
+
+    val scoredData = DNSSuspiciousConnectsAnalysis.detectDNSAnomalies(data, testConfig,
+      sparkContext,
+      sqlContext,
+      logger)
+
+
+    val anomalyScore = scoredData.filter(scoredData(FrameLength) === 1).first().getAs[Double](Score)
+    val typicalScores = scoredData.filter(scoredData(FrameLength) === 168).collect().map(_.getAs[Double](Score))
+
+    Math.abs(anomalyScore - 0.2d)  should be <= 0.01d
+    typicalScores.length shouldBe 4
+    Math.abs(typicalScores(0) - 0.8d)  should be <= 0.01d
+    Math.abs(typicalScores(1) - 0.8d)  should be <= 0.01d
+    Math.abs(typicalScores(2) - 0.8d)  should be <= 0.01d
+    Math.abs(typicalScores(3) - 0.8d)  should be <= 0.01d
+  }
+
+
+}
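
The asserted values follow from the toy data: one record in five carries the anomalous frame length, so a well-calibrated model should score it near 1/5 and the four typical records near 4/5. That reading is an inference from the assertions, not documented model behaviour:

    // Back-of-envelope check of the asserted tolerances (inference, not project code).
    val anomalyExpected = 1.0 / 5  // 0.2, matching the 0.2d check above
    val typicalExpected = 4.0 / 5  // 0.8, matching the 0.8d checks above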


[25/49] incubator-spot git commit: Merge branch 'test_flow' into unit_test_cleanup

Posted by ev...@apache.org.
Merge branch 'test_flow' into unit_test_cleanup


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/8d13d3f4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/8d13d3f4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/8d13d3f4

Branch: refs/heads/master
Commit: 8d13d3f466977f2cf4d1db0a84e9842b30ae8bba
Parents: 991fd0e ac866a0
Author: nlsegerl <na...@intel.com>
Authored: Tue Dec 20 16:24:00 2016 -0800
Committer: nlsegerl <na...@intel.com>
Committed: Tue Dec 20 16:24:00 2016 -0800

----------------------------------------------------------------------
 .../FlowSuspiciousConnectsAnalysisTest.scala    | 87 ++++++++++++++++++++
 1 file changed, 87 insertions(+)
----------------------------------------------------------------------



[30/49] incubator-spot git commit: merging in recent changes to spot branch to remove merge conflicts Merge branch 'spot' of https://github.com/Open-Network-Insight/open-network-insight into User_Domain_Designation_Fix

Posted by ev...@apache.org.
merging in recent changes to spot branch to remove merge conflicts
Merge branch 'spot' of https://github.com/Open-Network-Insight/open-network-insight into User_Domain_Designation_Fix


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/06900bb6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/06900bb6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/06900bb6

Branch: refs/heads/master
Commit: 06900bb643ce3656498a103b2b5510b04f3711de
Parents: 1a0269f 760dbf3
Author: Brandon Edwards <br...@intel.com>
Authored: Thu Dec 22 15:12:11 2016 -0800
Committer: Brandon Edwards <br...@intel.com>
Committed: Thu Dec 22 15:12:11 2016 -0800

----------------------------------------------------------------------
 spot-ml/README.md                               |  24 +--
 spot-ml/install_ml.sh                           |  12 --
 spot-ml/ml_ops.sh                               |  30 +--
 spot-ml/ml_test.sh                              |  30 +--
 .../org/apache/spot/SuspiciousConnects.scala    |  19 +-
 .../spot/SuspiciousConnectsScoreFunction.scala  |  19 +-
 .../scala/org/apache/spot/dns/DNSSchema.scala   |   2 -
 .../dns/DNSSuspiciousConnectsAnalysis.scala     | 172 +++++++++++----
 .../org/apache/spot/dns/DNSWordCreation.scala   |  28 ++-
 .../dns/model/DNSSuspiciousConnectsModel.scala  |  99 +++++++--
 .../FlowSuspiciousConnectsAnalysis.scala        | 119 ++++++++--
 .../apache/spot/netflow/FlowWordCreator.scala   |  65 +++---
 .../spot/netflow/model/FlowScoreFunction.scala  |   9 +-
 .../model/FlowSuspiciousConnectsModel.scala     |  73 ++++---
 .../org/apache/spot/proxy/ProxySchema.scala     |  49 +++++
 .../proxy/ProxySuspiciousConnectsAnalysis.scala | 155 +++++++++++--
 .../proxy/ProxySuspiciousConnectsModel.scala    |  65 ++++--
 .../apache/spot/proxy/ProxyWordCreation.scala   |  27 ++-
 .../utilities/data/InputOutputDataHandler.scala |  63 ++++++
 .../data/validation/InvalidDataHandler.scala    |  56 +++++
 .../org/apache/spot/DNSWordCreationTest.scala   |  21 --
 .../org/apache/spot/FlowWordCreatorTest.scala   | 216 -------------------
 .../dns/DNSSuspiciousConnectsAnalysisTest.scala | 114 ++++++++++
 .../apache/spot/dns/DNSWordCreationTest.scala   |  17 ++
 .../FlowSuspiciousCoonectsAnalysis.scala        | 125 +++++++++++
 .../spot/netflow/FlowWordCreatorTest.scala      | 214 ++++++++++++++++++
 .../ProxySuspiciousConnectsAnalysisTest.scala   | 135 ++++++++++++
 spot-oa/oa/components/iana/dns-qclass.csv       |   1 +
 spot-oa/oa/components/iana/dns-qtype.csv        |   1 +
 spot-oa/oa/components/iana/dns-rcode.csv        |   1 +
 spot-setup/spot.conf                            |  13 +-
 31 files changed, 1435 insertions(+), 539 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/06900bb6/spot-ml/ml_ops.sh
----------------------------------------------------------------------
diff --cc spot-ml/ml_ops.sh
index 02a9c1d,a3406e8..1ff129e
--- a/spot-ml/ml_ops.sh
+++ b/spot-ml/ml_ops.sh
@@@ -45,31 -45,9 +45,17 @@@ els
      RAWDATA_PATH=${PROXY_PATH}
  fi
  
 +# pass the user domain designation if not empty
 +
 +if [ ! -z $USER_DOMAIN ] ; then
 +    USER_DOMAIN_CMD="--userdomain $USER_DOMAIN"
 +else
 +    USER_DOMAIN_CMD=''
 +fi
 +
  FEEDBACK_PATH=${LPATH}/${DSOURCE}_scores.csv
- DUPFACTOR=1000
- 
- PREPROCESS_STEP=${DSOURCE}_pre_lda
- POSTPROCESS_STEP=${DSOURCE}_post_lda
- 
- HDFS_WORDCOUNTS=${HPATH}/word_counts
- 
- # paths for intermediate files
- HDFS_DOCRESULTS=${HPATH}/doc_results.csv
- LOCAL_DOCRESULTS=${LPATH}/doc_results.csv
- 
- HDFS_WORDRESULTS=${HPATH}/word_results.csv
- LOCAL_WORDRESULTS=${LPATH}/word_results.csv
  
  HDFS_SCORED_CONNECTS=${HPATH}/scores
- HDFS_MODEL=${HPATH}/model
  
  LDA_OUTPUT_DIR=${DSOURCE}/${FDATE}
  
@@@ -108,12 -78,5 +86,6 @@@ time spark-submit --class "org.apache.s
    --scored ${HDFS_SCORED_CONNECTS} \
    --threshold ${TOL} \
    --maxresults ${MAXRESULTS} \
 -  --ldamaxiterations 20
 +  --ldamaxiterations 20 \
 +  $USER_DOMAIN_CMD
 +
- wait
- 
- # move results to hdfs.
- cd ${LPATH}
- hadoop fs -getmerge ${HDFS_SCORED_CONNECTS}/part-* ${DSOURCE}_results.csv && hadoop fs -moveFromLocal \
-     ${DSOURCE}_results.csv  ${HDFS_SCORED_CONNECTS}/${DSOURCE}_results.csv


[45/49] incubator-spot git commit: Ingest summary supporting 3 use cases, Netflow, DNS and Proxy

Posted by ev...@apache.org.
Ingest summary supporting 3 use cases, Netflow, DNS and Proxy


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/46470640
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/46470640
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/46470640

Branch: refs/heads/master
Commit: 46470640ef882297df1109d4f3484f017cf63d95
Parents: f0619ae
Author: Diego Ortiz Huerta <di...@intel.com>
Authored: Mon Dec 12 10:08:56 2016 -0800
Committer: Everardo Lopez Sandoval (Intel) <el...@elopezsa-mac02.ra.intel.com>
Committed: Fri Jan 20 17:01:02 2017 -0800

----------------------------------------------------------------------
 .../ui/flow/js/constants/NetflowConstants.js    |   1 -
 spot-oa/ui/ingest-summary.html                  |  37 +-
 .../js/components/IngestSummaryPanel.react.js   | 418 +++++++++----------
 spot-oa/ui/js/components/OptionPicker.react.js  |  43 ++
 spot-oa/ui/js/constants/SpotConstants.js        |  10 +-
 spot-oa/ui/js/ingest-summary.js                 |  66 +--
 spot-oa/ui/js/stores/IngestSummaryStore.js      | 107 +++--
 spot-oa/ui/package.json                         |   2 +-
 8 files changed, 378 insertions(+), 306 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/46470640/spot-oa/ui/flow/js/constants/NetflowConstants.js
----------------------------------------------------------------------
diff --git a/spot-oa/ui/flow/js/constants/NetflowConstants.js b/spot-oa/ui/flow/js/constants/NetflowConstants.js
index c0fc7a8..4de19e1 100755
--- a/spot-oa/ui/flow/js/constants/NetflowConstants.js
+++ b/spot-oa/ui/flow/js/constants/NetflowConstants.js
@@ -5,7 +5,6 @@ var NetflowConstants = {
   API_VISUAL_DETAILS: '../../data/flow/${date}/chord-${ip}.tsv',
   API_COMMENTS: '../../data/flow/${date}/threats.csv',
   API_INCIDENT_PROGRESSION: '../../data/flow/${date}/threat-dendro-${ip}.json',
-  API_INGEST_SUMMARY: '../data/flow/ingest_summary/is_${year}${month}.csv',
   API_IMPACT_ANALYSIS: '../../data/flow/${date}/stats-${ip}.json',
   API_GLOBE_VIEW: '../../data/flow/${date}/globe-${ip}.json',
   API_WORLD_110M: '../flow/world-110m.json',

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/46470640/spot-oa/ui/ingest-summary.html
----------------------------------------------------------------------
diff --git a/spot-oa/ui/ingest-summary.html b/spot-oa/ui/ingest-summary.html
index e694609..1c4a100 100755
--- a/spot-oa/ui/ingest-summary.html
+++ b/spot-oa/ui/ingest-summary.html
@@ -34,32 +34,43 @@
             height: 100%;
         }
 
-        #spot-is-header {
+        .is-chart svg {
             width: 100%;
-            position: absolute;
-            top: 0;
-            left: 0;
-            z-index: 2;
-            height: auto;
+            height: 100%;
         }
 
-        #spot-is, #spot-is-summary {
-            height: 100%;
+        .is-chart svg .header text {
+            text-anchor: middle;
+            fill: #82837e;
+        }
+
+        .is-chart svg .header text tspan.bold {
+            font-weight: bold;
         }
 
-        .axis {
+        .is-chart .axis {
             shape-rendering: crispEdges;
         }
 
-        .axis path, .axis line {
+        .is-chart .axis path, .is-chart .axis line {
             fill: none;
         }
 
-        rect.pane {
-            cursor: e-resize;
-            fill: none;
+        .is-chart .pipeline {
             pointer-events: all;
         }
+
+        .is-chart .pipeline.zoom-in {
+            cursor: zoom-in;
+        }
+
+        .is-chart .pipeline.zoom-out {
+            cursor: zoom-out;
+        }
+
+        .is-chart .pipeline.e-resize {
+            cursor: e-resize;
+        }
     </style>
 </head>
 <body>

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/46470640/spot-oa/ui/js/components/IngestSummaryPanel.react.js
----------------------------------------------------------------------
diff --git a/spot-oa/ui/js/components/IngestSummaryPanel.react.js b/spot-oa/ui/js/components/IngestSummaryPanel.react.js
index cc951ad..06c83bd 100755
--- a/spot-oa/ui/js/components/IngestSummaryPanel.react.js
+++ b/spot-oa/ui/js/components/IngestSummaryPanel.react.js
@@ -3,234 +3,216 @@ const d3 = require('d3');
 const React = require('react');
 const ReactDOM = require('react-dom');
 
+const ContentLoaderMixin = require('./ContentLoaderMixin.react');
+const ChartMixin = require('./ChartMixin.react');
 const DateUtils = require('../utils/DateUtils');
 const InSumActions = require('../actions/InSumActions');
 
-const NetflowIngestSummaryStore = require('../../flow/js/stores/IngestSummaryStore');
-
-function initialDraw() {
-  var rootNode, format, x, y, xAxis, yAxis, area, svg, rect, total, minDate, maxDate, maxFlows, numberFormat;
-
-  rootNode = d3.select(ReactDOM.findDOMNode(this));
-
-  // graph dimensions
-  var m = [100, 50, 50, 80], // Margin
-      w = $(rootNode.node()).width() - m[1] - m[3], // Width
-      h = $(rootNode.node()).height() - m[0] - m[2]; // Height
-
-  format = d3.time.format("%Y-%m-%d %H:%M");
-
-  // Scales.
-  x = d3.time.scale().range([0, w]); // get X function
-  y = d3.scale.linear().range([h, 0]); // get Y function
-  xAxis = d3.svg.axis().scale(x).orient("bottom"); // Get the X axis (Time)
-  yAxis = d3.svg.axis().scale(y).orient("left"); // Get Y Axis (Netflows)
-
-  // An area generator.
-  area = d3.svg.area()
-        .x(function (d) {
-            return x(d.date);
-        })
-        .y0(h)
-        .y1(function (d) {
-            if (!isNaN(d.total))
-                return y(d.total);
-            else
-                return y(0);
-        });
+const IngestSummaryStore = require('../stores/IngestSummaryStore');
 
-  rootNode.select('svg').remove();
-
-  // define the Main SVG
-  svg = rootNode.select('#' + this.props.id + '-summary').append("svg")
-    .attr("width", w + m[1] + m[3])
-    .attr("height", h + m[0] + m[2])
-        .append("g")
-        .attr("transform", "translate(" + m[3] + "," + m[0] + ")")
-
-  // Append the clipPath to avoid the Area overlapping
-  svg.append("clipPath")
-        .attr("id", "clip")
-        .append("rect")
-          .attr("x", x(0))
-          .attr("y", y(1))
-          .attr("width", x(1) - x(0))
-          .attr("height", y(0) - y(1));
-
-  // Append the Y Axis group
-  svg.append("g")
-    .attr("class", "y axis");
-
-  // Append the X axis group
-  svg.append("g")
-    .attr("class", "x axis")
-    .attr("transform", "translate(0," + h + ")");
-
-  // Append a pane rect, which will help us to add the zoom functionality
-  rect = svg.append("rect")
-        .attr("class", "pane")
-        .attr("width", w)
-        .attr("height", h);
-
-  this.state.data.forEach(function (dataSet)
-  {
-    var a;
-
-    a = [{date: minDate}];
-    a.push.apply(a, dataSet);
-    minDate = d3.min(a, function (d) { return d.date; });
-    a[0] = {date: maxDate, flows: maxFlows};
-    maxDate = d3.max(a, function (d) { return d.date; });
-    maxFlows = d3.max(a, function (d) { return d.total; })
-  });
-
-  !minDate && (minDate = DateUtils.parseDate(NetflowIngestSummaryStore.getStartDate()));
-  !maxDate && (maxDate = DateUtils.parseDate(NetflowIngestSummaryStore.getEndDate()));
-
-  // bind the data to the X and Y generators
-  x.domain([minDate, maxDate]);
-  y.domain([0, maxFlows]);
-
-  // Bind the data to our path element.
-  svg.selectAll("path.area").data(this.state.data).enter().insert('path', 'g')
-                                                .attr('class', 'area')
-                                                .attr('clip-path', 'url(#clip)')
-                                                .style('fill', '#0071c5')
-                                                .attr('d', function (d) {
-                                                    return area(d);
-                                                });
-
-  //Add the pane rect the zoom behavior
-  rect.call(d3.behavior.zoom().x(x)
-      .scaleExtent([0.3, 2300]) // these are magic numbers to avoid the grap be zoomable in/out to the infinity
-      .on("zoom", zoom.bind(this)));
-
-  function draw () {
-    var total, minDate, maxDate, numberFormat;
-
-    svg.select("g.x.axis").call(xAxis);
-    svg.select("g.y.axis").call(yAxis);
-    svg.selectAll("path.area").attr("d", function (d) { return area(d); });
-    numberFormat = d3.format(",d"); // number formatter (comma separated number i.e. 100,000,000)
-
-    rootNode.select('#' + this.props.id + '-range').html("Seeing total flows <strong>from:</strong> " + x.domain().map(format).join(" <strong>to:</strong> "));
-
-    //Calculate the total flows between the displayed date range
-
-    total = 0;
-    minDate = x.domain()[0];
-    maxDate = x.domain()[1];
-
-    // Go to the first millisecond on dates
-    minDate.setSeconds(0);minDate.setMilliseconds(0);
-    maxDate.setSeconds(59);maxDate.setMilliseconds(0);
-
-    svg.selectAll("path.area").data().forEach(function (pathData)
-    {
-      pathData.forEach(function (record)
-      {
-        // Discard records outside displayed date range
-        if (record.date >= minDate && record.date <= maxDate) {
-          total += +record.total;
-        }
-      });
-    });
-
-    rootNode.select('#' + this.props.id + '-total').html("<strong>Total netflows in range:</strong> " + numberFormat(total));
-  }
-
-  /*
-      Zoom event handler
-  */
-  function zoom() {
-    if (d3.event.sourceEvent.type == "wheel") {
-      if (d3.event.sourceEvent.wheelDelta < 0)
-         rect.style("cursor", "zoom-out");
-      else
-         rect.style("cursor", "zoom-in");
-    }
-    else if (d3.event.sourceEvent.type == "mousemove") {
-      rect.style("cursor", "e-resize");
-    }
+const MARGIN = [80, 50, 50, 100];
+const TIME_FORMATER = d3.time.format('%Y-%m-%d %H:%M');
+const NUMBER_FORMATER = d3.format(',d');
 
-    draw.call(this);
-  }
+const IngestSummaryPanel = React.createClass({
+    mixins: [ContentLoaderMixin, ChartMixin],
+    buildChart() {
+        // Scales
+        this.xScale = d3.time.scale();
+        this.yScale = d3.scale.linear();
 
-  draw.call(this);
-}
+        // Axis
+        this.xAxis = d3.svg.axis().scale(this.xScale).orient('bottom'); // Time
+        this.yAxis = d3.svg.axis().scale(this.yScale).orient('left'); // Totals
 
-var IngestSummaryPanel = React.createClass({
-  propTypes: {
-    id: React.PropTypes.string
-  },
-  getDefaultProperties: function () {
-    return {
-      id: 'spot-is'
-    };
-  },
-  getInitialState: function ()
-  {
-    return {loading: true};
-  },
-  render:function()
-  {
-    var content;
-
-    if (this.state.error)
-    {
-      content = (
-        <div className="text-center text-danger">
-          {this.state.error}
-        </div>
-      );
-    }
-    else if (this.state.loading)
-    {
-      content = (
-        <div className="spot-loader">
-            Loading <span className="spinner"></span>
-        </div>
-      );
-    }
-    else
-    {
-      content = (
-        <div id={this.props.id} className="text-center">
-          <div id={this.props.id + '-header'}>
-            <p id={this.props.id + '-range'}></p>
-            <p id={this.props.id + '-total'}></p>
-            <p id={this.props.id + '-istructions'} className="small">** Zoom in/out using mouse wheel or two fingers in track pad <br /> ** Move across the x-axis by clicking anywhere in the graph and dragging to left or right</p>
-          </div>
-          <div id={this.props.id + '-summary'}></div>
-        </div>
-      );
-    }
+        // An area generator.
+        this.area = d3.svg.area()
+              .x(d => this.xScale(d.date))
+              .y1(d => (isNaN(d.total) ? this.yScale(0) : this.yScale(d.total)));
 
-    return (
-      <div>{content}</div>
-    )
-  },
-  componentDidMount: function()
-  {
-    NetflowIngestSummaryStore.addChangeDataListener(this._onChange);
-    window.addEventListener('resize', this.buildGraph);
-  },
-  componentWillUnmount: function ()
-  {
-    NetflowIngestSummaryStore.removeChangeDataListener(this._onChange);
-    window.removeEventListener('resize', this.buildGraph);
+        let d3svg = d3.select(this.svg);
+
+        const d3header = d3svg.append('g').attr('class', 'header');
+
+        d3header.append('text')
+            .attr('transform', 'translate(0,15)')
+            .html('Seeing data <tspan class="bold">from</tspan> <tspan class="min-date" /> <tspan class="bold"> to </tspan> <tspan class="max-date" />');
+
+        d3header.append('text')
+            .attr('transform', 'translate(0,30)')
+            .html('<tspan class="bold">Total</tspan> records ingested: <tspan class="total" />');
+
+        d3header.append('text')
+            .attr('transform', 'translate(0,45)')
+            .text('** Zoom in/out using mouse wheel or two fingers in track pad');
+        d3header.append('text')
+            .attr('transform', 'translate(0,60)')
+            .text('** Move across the x-axis by clicking anywhere in the graph and dragging left or right');
+
+        this.updateLegends(this.state.minDate, this.state.maxDate, this.state.total);
+
+        this.canvas = d3svg.append('g')
+            .attr('transform', `translate(${MARGIN[3]},${MARGIN[0]})`);
+
+        // Append the clipPath to avoid drawing not seen data
+        this.clipRect = d3svg.append('defs')
+            .append('clipPath')
+                .attr('id', 'clip')
+                .append('rect')
+                    .attr('x',0)
+                    .attr('y', 0);
+
+        this.d3xAxis = this.canvas.append('g').attr('class', 'x axis');
+        this.d3yAxis = this.canvas.append('g').attr('class', 'y axis');
+        this.pipelineCanvas = this.canvas.append('g').attr('class', 'pipeline');
+
+        this.d3zoom = d3.behavior.zoom()
+            .scaleExtent([0.3, 2300]) // magic numbers that keep the graph from zooming in/out indefinitely
+            .on('zoom', this.zoom)
+
+        this.pipelineCanvas.call(this.d3zoom);
+
+        this.pipelineColor = d3.scale.category10().domain(Object.keys(IngestSummaryStore.PIPELINES));
+    },
+    draw() {
+        let $svg = $(this.svg);
+
+        this.width = $svg.width() - MARGIN[1] - MARGIN[3];
+        this.height = $svg.height() - MARGIN[0] - MARGIN[2];
+
+        d3.select(this.svg).select('.header').attr('transform', `translate(${this.width/2},0)`);
+
+        this.xScale.range([0, this.width]).domain([this.state.minDate, this.state.maxDate]);
+        this.yScale.range([this.height, 0]).domain([0, this.state.maxTotal]);
+
+        this.d3zoom.x(this.xScale)
+
+        this.area.y0(this.height);
+
+        this.clipRect
+            .attr('width', this.width)
+            .attr('height', this.height);
+
+        this.d3xAxis.attr('transform', `translate(0,${this.height})`);
+
+        this.drawPaths();
+    },
+    drawPaths() {
+        this.d3xAxis.call(this.xAxis);
+        this.d3yAxis.call(this.yAxis);
+
+        let total = 0;
+        const [minDate, maxDate] = this.xScale.domain();
+
+        // Go to the first millisecond on dates
+        minDate.setSeconds(0);minDate.setMilliseconds(0);
+        maxDate.setSeconds(59);maxDate.setMilliseconds(0);
+
+        const pipelineData = this.state.data.map(currentMonthData => {
+            // Filter records outside current date range
+            return currentMonthData.filter(record => {
+                const included = record.date>=minDate && record.date<=maxDate;
+
+                // Sum records included in range only
+                if (included) total += record.total;
+
+                return included;
+            });
+        }).filter(monthData => monthData.length>0); // Filter out empty months
+
+        this.drawPipeline(pipelineData);
+
+        this.updateLegends(minDate, maxDate, total);
+    },
+    drawPipeline(data) {
+        const pipelineSel = {};
+
+        pipelineSel.update = this.pipelineCanvas.selectAll('path.area').data(data);
+
+        pipelineSel.enter = pipelineSel.update.enter();
+        pipelineSel.exit = pipelineSel.update.exit();
+
+        pipelineSel.enter.append('path')
+            .attr('class', 'area')
+            .style('fill', this.pipelineColor(IngestSummaryStore.getPipeline()));
+
+        pipelineSel.update.attr('d', d => this.area(d));
+
+        pipelineSel.exit.remove();
+    },
+    updateLegends(minDate, maxDate, total) {
+        const minDateStr = TIME_FORMATER(minDate);
+        const maxDateStr = TIME_FORMATER(maxDate);
+        const totalStr = NUMBER_FORMATER(total);
+
+        const d3header = d3.select(this.svg).select('.header');
+
+        d3header.select('.min-date').text(minDateStr);
+        d3header.select('.max-date').text(maxDateStr);
+        d3header.select('.total').text(totalStr);
+    },
+    zoom() {
+        if (d3.event.sourceEvent.type == 'wheel') {
+            this.pipelineCanvas.classed('zoom-out', d3.event.sourceEvent.wheelDelta < 0);
+            this.pipelineCanvas.classed('zoom-in', d3.event.sourceEvent.wheelDelta >= 0);
+            this.pipelineCanvas.classed('e-resize', false);
+      }
+      else if (d3.event.sourceEvent.type == 'mousemove') {
+        this.pipelineCanvas.classed('e-resize', true);
+        this.pipelineCanvas.classed('zoom-out', false);
+        this.pipelineCanvas.classed('zoom-in', false);
+      }
+
+      this.drawPaths();
   },
-  componentDidUpdate: function ()
-  {
-    if (!this.state.loading && !this.state.error && this.state.data)
-    {
-      this.buildGraph();
+    componentDidMount() {
+        IngestSummaryStore.addChangeDataListener(this._onChange);
+    },
+    componentWillUnmount() {
+        IngestSummaryStore.removeChangeDataListener(this._onChange);
+    },
+    _onChange() {
+        const storeData = IngestSummaryStore.getData();
+
+        if (storeData.error) {
+            this.replaceState({error: storeData.error});
+        }
+        else if (!storeData.loading && storeData.data) {
+            this.replaceState(this._getStateFromData(storeData.data));
+        }
+        else {
+            this.replaceState({loading: storeData.loading});
+        }
+    },
+    _getStateFromData(data) {
+        let total, maxTotal, minDate, maxDate;
+
+        total = 0;
+        maxTotal = 0;
+        minDate = null;
+        maxDate = null;
+
+        data.forEach(function (monthData) {
+          monthData.forEach(function (record) {
+              minDate = d3.min([minDate, record.date]);
+              maxDate = d3.max([maxDate, record.date]);
+              maxTotal = d3.max([maxTotal, +record.total]);
+              total += +record.total;
+          });
+        });
+
+        !minDate && (minDate = DateUtils.parseDate(IngestSummaryStore.getStartDate()));
+        !maxDate && (maxDate = DateUtils.parseDate(IngestSummaryStore.getEndDate()));
+
+        return {
+            loading: false,
+            total,
+            maxTotal,
+            minDate,
+            maxDate,
+            data: data
+        };
     }
-  },
-  buildGraph: initialDraw,
-  _onChange: function () {
-    this.replaceState(NetflowIngestSummaryStore.getData());
-  }
 });
 
 module.exports = IngestSummaryPanel;

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/46470640/spot-oa/ui/js/components/OptionPicker.react.js
----------------------------------------------------------------------
diff --git a/spot-oa/ui/js/components/OptionPicker.react.js b/spot-oa/ui/js/components/OptionPicker.react.js
new file mode 100644
index 0000000..f8e748d
--- /dev/null
+++ b/spot-oa/ui/js/components/OptionPicker.react.js
@@ -0,0 +1,43 @@
+const React = require('react');
+
+const RadioPicker = React.createClass({
+    propTypes: {
+        id: React.PropTypes.string,
+        name: React.PropTypes.string,
+        options: React.PropTypes.arrayOf(React.PropTypes.string).isRequired,
+        value: React.PropTypes.string
+    },
+    getDefaultProps() {
+        return {
+            id: null,
+            name: null,
+            value: null
+        };
+    },
+    getInitialState() {
+        const state = {};
+
+        state.value = this.props.value || (this.props.options.length>0 ? this.props.options[0] : null);
+
+        return state;
+    },
+    render() {
+        const options = Object.keys(this.props.options).map(option => {
+            return <option value={option} selected={this.state.value==option}>
+                {this.props.options[option]}
+            </option>;
+        });
+
+        return <select id={this.props.id} className="form-control" name={this.props.name} onChange={this.onChange}>
+            {options}
+        </select>;
+    },
+    onChange(e) {
+        const value = e.target.value;
+        this.setState({value});
+
+        this.props.onChange && this.props.onChange(value);
+    }
+});
+
+module.exports = RadioPicker;

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/46470640/spot-oa/ui/js/constants/SpotConstants.js
----------------------------------------------------------------------
diff --git a/spot-oa/ui/js/constants/SpotConstants.js b/spot-oa/ui/js/constants/SpotConstants.js
index 35e23c4..e4a711d 100755
--- a/spot-oa/ui/js/constants/SpotConstants.js
+++ b/spot-oa/ui/js/constants/SpotConstants.js
@@ -1,4 +1,7 @@
-var SpotConstants = {
+const SpotConstants = {
+  PIPELINE_NETFLOW: 'flow',
+  PIPELINE_DNS: 'dns',
+  PIPELINE_PROXY: 'proxy',
   // Search Actions
   UPDATE_FILTER: 'UPDATE_FILTER',
   UPDATE_DATE: 'UPDATE_DATE',
@@ -18,6 +21,7 @@ var SpotConstants = {
   IMPACT_ANALYSIS_PANEL:'Impact Analysis',
   GLOBE_VIEW_PANEL:'Map View | Globe',
   TIMELINE_PANEL:'Timeline',
+  INGEST_SUMMARY_PANEL:'Ingest Summary',
   // Edge Investigation
   MAX_SUSPICIOUS_ROWS: 250,
   RELOAD_SUSPICIOUS: 'RELOAD_SUSPICIOUS',
@@ -31,10 +35,10 @@ var SpotConstants = {
   RELOAD_COMMENTS: 'RELOAD_COMMENTS',
   SELECT_COMMENT: 'SELECT_COMMENT',
   // INGEST SUMMARY
+  API_INGEST_SUMMARY: '../data/${pipeline}/ingest_summary/is_${year}${month}.csv',
+  RELOAD_INGEST_SUMMARY: 'RELOAD_INGEST_SUMMARY',
   START_DATE: 'start-date',
   END_DATE: 'end-date',
-  // Ingest summary Actions
-  RELOAD_INGEST_SUMMARY: 'RELOAD_INGEST_SUMMARY',
   // Server Paths
   NOTEBOOKS_PATH: '/notebooks/ipynb'
 };

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/46470640/spot-oa/ui/js/ingest-summary.js
----------------------------------------------------------------------
diff --git a/spot-oa/ui/js/ingest-summary.js b/spot-oa/ui/js/ingest-summary.js
index c42ed41..04f2872 100755
--- a/spot-oa/ui/js/ingest-summary.js
+++ b/spot-oa/ui/js/ingest-summary.js
@@ -3,12 +3,14 @@ const ReactDOM = require('react-dom');
 
 const SpotActions = require('./actions/SpotActions');
 const InSumActions = require('./actions/InSumActions');
+const IngestSummaryStore = require('./stores/IngestSummaryStore');
 const SpotConstants = require('./constants/SpotConstants');
 const SpotUtils = require('./utils/SpotUtils');
 const DateUtils = require('./utils/DateUtils');
 
 // Build and Render Toolbar
 const DateInput = require('./components/DateInput.react');
+const OptionPicker = require('./components/OptionPicker.react');
 
 // Find out period
 var startDate, endDate, today;
@@ -41,32 +43,48 @@ if (endDate < startDate)
   endDate = today;
 }
 
+const PIPELINES = IngestSummaryStore.PIPELINES;
+const DEFAULT_PIPELINE = Object.keys(PIPELINES)[0];
+
+const loadPipeline = function loadPipeline(pipeline) {
+    IngestSummaryStore.setPipeline(pipeline);
+    InSumActions.reloadSummary();
+}
+
 ReactDOM.render(
-  (
     <form className="form-inline">
-      <div className="form-group">
-        <label htmlFor="startDatePicker">Period:</label>
-        <div className="input-group input-group-xs">
-          <div className="input-group-addon">
-            <span className="glyphicon glyphicon-calendar" aria-hidden="true"></span>
-          </div>
-          <DateInput id="startDatePicker" name={SpotConstants.START_DATE} value={startDate}/>
+        <div className="form-group">
+            <label htmlFor="pipeline-picker">Source: </label>
+            <div className="input-group input-group-xs">
+                <OptionPicker
+                    id="pipeline-picker"
+                    options={PIPELINES}
+                    value={DEFAULT_PIPELINE}
+                    onChange={loadPipeline} />
+            </div>
+        </div>
+        <div className="form-group">
+            <label htmlFor="startDatePicker">Period:</label>
+            <div className="input-group input-group-xs">
+                <div className="input-group-addon">
+                    <span className="glyphicon glyphicon-calendar" aria-hidden="true"></span>
+                </div>
+                <DateInput id="startDatePicker" name={SpotConstants.START_DATE} value={startDate}/>
+            </div>
         </div>
-      </div>
-      <div className="form-group">
-        <label htmlFor="endDatePicker"> - </label>
-        <div className="input-group input-group-xs">
-          <DateInput id="endDatePicker" name={SpotConstants.END_DATE} value={endDate} />
-          <div className="input-group-btn">
-            <button className="btn btn-default" type="button" title="Reload" onClick={InSumActions.reloadSummary}>
-              <span className="glyphicon glyphicon-repeat" aria-hidden="true"></span>
-            </button>
-          </div>
+        <div className="form-group">
+            <label htmlFor="endDatePicker"> - </label>
+            <div className="input-group input-group-xs">
+                <DateInput id="endDatePicker" name={SpotConstants.END_DATE} value={endDate} />
+                <div className="input-group-btn">
+                    <button className="btn btn-default" type="button" title="Reload" onClick={InSumActions.reloadSummary}>
+                        <span className="glyphicon glyphicon-repeat" aria-hidden="true"></span>
+                    </button>
+                </div>
+            </div>
         </div>
-      </div>
-    </form>
-  ),
-  document.getElementById('nav_form')
+    </form>,
+    document.getElementById('nav_form')
 );
 
 // Build and Render Edge Investigation's panels
@@ -79,7 +97,7 @@ ReactDOM.render(
   <div id="spot-content">
     <PanelRow maximized>
       <Panel title="Ingest Summary" container header={false} className="col-md-12">
-        <IngestSummaryPanel id="spot-is" />
+        <IngestSummaryPanel className="is-chart" />
       </Panel>
     </PanelRow>
   </div>,
@@ -91,4 +109,4 @@ SpotActions.setDate(startDate, SpotConstants.START_DATE);
 SpotActions.setDate(endDate, SpotConstants.END_DATE);
 
 // Load data
-InSumActions.reloadSummary();
+loadPipeline(DEFAULT_PIPELINE);

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/46470640/spot-oa/ui/js/stores/IngestSummaryStore.js
----------------------------------------------------------------------
diff --git a/spot-oa/ui/js/stores/IngestSummaryStore.js b/spot-oa/ui/js/stores/IngestSummaryStore.js
index 9e4ccb5..ca8a439 100755
--- a/spot-oa/ui/js/stores/IngestSummaryStore.js
+++ b/spot-oa/ui/js/stores/IngestSummaryStore.js
@@ -1,34 +1,55 @@
-var assign = require('object-assign');
-var d3 = require('d3');
-
-var SpotDispatcher = require('../../../js/dispatchers/SpotDispatcher');
-var SpotConstants = require('../../../js/constants/SpotConstants');
-var NetflowConstants = require('../constants/NetflowConstants');
-var DateUtils = require('../../../js/utils/DateUtils');
-var RestStore = require('../../../js/stores/RestStore');
-
-var START_DATE_FILTER = NetflowConstants.START_DATE;
-var END_DATE_FILTER = NetflowConstants.END_DATE;
-var CURRENT_DATE_FILTER = 'current_date';
-
-var requestQueue = [];
-var requestErrors = [];
-
-var IngestSummaryStore = assign(new RestStore(NetflowConstants.API_INGEST_SUMMARY), {
+const assign = require('object-assign');
+const d3 = require('d3');
+
+const SpotDispatcher = require('../dispatchers/SpotDispatcher');
+const SpotConstants = require('../constants/SpotConstants');
+const DateUtils = require('../utils/DateUtils');
+const RestStore = require('../stores/RestStore');
+
+const PIPELINE_FILTER = 'pipeline';
+const CURRENT_YEAR_FILTER = 'year';
+const CURRENT_MONTH_FILTER = 'month';
+
+const requestQueue = [];
+const requestErrors = [];
+
+const IngestSummaryStore = assign(new RestStore(SpotConstants.API_INGEST_SUMMARY), {
+    PIPELINES: {
+        [SpotConstants.PIPELINE_NETFLOW]: 'Netflow',
+        [SpotConstants.PIPELINE_DNS]: 'DNS',
+        [SpotConstants.PIPELINE_PROXY]: 'Proxy'
+    },
     errorMessages: {
         404: 'No details available'
     },
-    setStartDate: function (date) {
-        this.setRestFilter(START_DATE_FILTER, date);
+    setStartDate(date) {
+        this._startDate = date;
+    },
+    getStartDate() {
+        return this._startDate;
+    },
+    setEndDate(date) {
+        this._endDate = date;
     },
-    getStartDate: function () {
-        return this.getRestFilter(START_DATE_FILTER);
+    getEndDate() {
+        return this._endDate;
     },
-    setEndDate: function (date) {
-        this.setRestFilter(END_DATE_FILTER, date);
+    setPipeline(pipeline) {
+        this.setRestFilter(PIPELINE_FILTER, pipeline);
     },
-    getEndDate: function () {
-        return this.getRestFilter(END_DATE_FILTER);
+    getPipeline() {
+        return this.getRestFilter(PIPELINE_FILTER);
+    },
+    setCurrentDate(date) {
+        this.setRestFilter(CURRENT_YEAR_FILTER, date.getFullYear())
+
+        const month = date.getMonth() + 1 + "";
+        this.setRestFilter(CURRENT_MONTH_FILTER, month.length==1 ? `0${month}`:month);
+
+        this._currentDate = date;
+    },
+    getCurrentDate() {
+        return this._currentDate;
     },
     /**
      *  Start asking the server for CSV data to create the chart
@@ -36,8 +57,8 @@ var IngestSummaryStore = assign(new RestStore(NetflowConstants.API_INGEST_SUMMAR
     requestSummary: function () {
         var startDate, endDate, date, delta, startRequests, i, month;
 
-        startDate = DateUtils.parseDate(this.getRestFilter(START_DATE_FILTER));
-        endDate = DateUtils.parseDate(this.getRestFilter(END_DATE_FILTER));
+        startDate = DateUtils.parseDate(this.getStartDate());
+        endDate = DateUtils.parseDate(this.getEndDate());
 
         // Find out how many request need to be made
         delta = (endDate.getFullYear() - startDate.getFullYear()) * 12 + (endDate.getMonth() - startDate.getMonth());
@@ -58,17 +79,10 @@ var IngestSummaryStore = assign(new RestStore(NetflowConstants.API_INGEST_SUMMAR
         startRequests && this.dequeue();
     },
     dequeue: function () {
-        var date, year, month;
-
         if (requestQueue.length == 0) return;
 
-        date = requestQueue.shift();
-        this.setRestFilter(CURRENT_DATE_FILTER, date);
-        year = date.getFullYear();
-        month = date.getMonth() + 1 + "";
-        month = month.length == 1 ? "0" + month : month;
-
-        this.setEndpoint(NetflowConstants.API_INGEST_SUMMARY.replace('${year}', year).replace('${month}', month));
+        const date = requestQueue.shift();
+        this.setCurrentDate(date);
 
         this.reload();
     },
@@ -91,10 +105,10 @@ var IngestSummaryStore = assign(new RestStore(NetflowConstants.API_INGEST_SUMMAR
             requestErrors.push(data);
         }
         else if (data.data) {
-            parse = d3.time.format("%Y-%m-%d %H:%M").parse; // Date formatting parser
-            startDate = DateUtils.parseDate(this.getRestFilter(START_DATE_FILTER));
-            endDate = DateUtils.parseDate(this.getRestFilter(END_DATE_FILTER));
-            date = DateUtils.parseDate(this.getRestFilter(CURRENT_DATE_FILTER));
+            parse = d3.time.format("%Y-%m-%d %H:%M:%S%Z").parse; // Date formatting parser
+            startDate = DateUtils.parseDate(this.getStartDate());
+            endDate = DateUtils.parseDate(this.getEndDate());
+            date = DateUtils.parseDate(this.getCurrentDate());
 
             if (date.getFullYear() == startDate.getFullYear() && date.getMonth() == startDate.getMonth()) {
                 dayFilter = startDate.getDate();
@@ -112,8 +126,8 @@ var IngestSummaryStore = assign(new RestStore(NetflowConstants.API_INGEST_SUMMAR
 
             // Parse dates and numbers.
             data.data.forEach(function (d) {
-                d.date = parse(d.date);
-                d.flows = +d.flows;
+                d.date = parse(`${d.date}:00-0000`);
+                d.total = +d.total;
             });
 
             // Sort the data by date ASC
@@ -122,13 +136,14 @@ var IngestSummaryStore = assign(new RestStore(NetflowConstants.API_INGEST_SUMMAR
             });
 
             if (!this._data.data) this._data.data = [];
+
             this._data.data.push(data.data);
         }
 
         this._data.loading = requestQueue.length > 0;
 
         if (!this._data.loading) {
-            if (this._data.data.length==0) {
+            if (this._data.data && this._data.data.length==0) {
                 // Broadcast first found error
                 this._data = requestErrors[0];
             }
@@ -144,15 +159,15 @@ SpotDispatcher.register(function (action) {
     switch (action.actionType) {
         case SpotConstants.UPDATE_DATE:
             switch (action.name) {
-                case NetflowConstants.START_DATE:
+                case SpotConstants.START_DATE:
                     IngestSummaryStore.setStartDate(action.date);
                     break;
-                case NetflowConstants.END_DATE:
+                case SpotConstants.END_DATE:
                     IngestSummaryStore.setEndDate(action.date);
                     break;
             }
             break;
-        case NetflowConstants.RELOAD_INGEST_SUMMARY:
+        case SpotConstants.RELOAD_INGEST_SUMMARY:
             IngestSummaryStore.requestSummary();
             break;
     }

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/46470640/spot-oa/ui/package.json
----------------------------------------------------------------------
diff --git a/spot-oa/ui/package.json b/spot-oa/ui/package.json
index 724544d..3863c64 100644
--- a/spot-oa/ui/package.json
+++ b/spot-oa/ui/package.json
@@ -36,7 +36,7 @@
   "scripts": {
     "test": "jest",
     "postinstall": "npm run build-all",
-    "watch-ingest-summary": "watchify js/ingest-summary.js -o js/ingest-summary.bundle.min.js -v -d",
+    "watch-ingest-summary": "NODE_ENV=development watchify js/ingest-summary.js -o js/ingest-summary.bundle.min.js -v -d",
     "build-all": "npm run build-flow && npm run build-dns && npm run build-proxy && npm run build-ingest-summary",
     "build-flow": "cd flow/ && npm run build-all && cd ../",
     "build-dns": "cd dns/ && npm run build-all && cd ../",


[49/49] incubator-spot git commit: removing unnecessary space.

Posted by ev...@apache.org.
removing unnecessary space.


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/d30337d4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/d30337d4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/d30337d4

Branch: refs/heads/master
Commit: d30337d42c6ac8862c251827d12c4ede8d6d532d
Parents: a958cb4
Author: Everardo Lopez Sandoval (Intel) <el...@elopezsa-mac02.local>
Authored: Mon Jan 23 13:15:29 2017 -0600
Committer: Everardo Lopez Sandoval (Intel) <el...@elopezsa-mac02.local>
Committed: Mon Jan 23 13:15:29 2017 -0600

----------------------------------------------------------------------
 .../test/scala/org/apache/spot/netflow/FlowWordCreatorTest.scala   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/d30337d4/spot-ml/src/test/scala/org/apache/spot/netflow/FlowWordCreatorTest.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/scala/org/apache/spot/netflow/FlowWordCreatorTest.scala b/spot-ml/src/test/scala/org/apache/spot/netflow/FlowWordCreatorTest.scala
index 3208395..00d14bc 100644
--- a/spot-ml/src/test/scala/org/apache/spot/netflow/FlowWordCreatorTest.scala
+++ b/spot-ml/src/test/scala/org/apache/spot/netflow/FlowWordCreatorTest.scala
@@ -37,7 +37,7 @@ class FlowWordCreatorTest extends FlatSpec with Matchers {
 
 
     dstWord shouldBe "-1_23_5_2_0"
-    srcWord shouldBe  "23_5_2_0"
+    srcWord shouldBe "23_5_2_0"
 
   }
 


[14/49] incubator-spot git commit: Merge branch 'spot' into test_dns_topdomain

Posted by ev...@apache.org.
Merge branch 'spot' into test_dns_topdomain

# Conflicts:
#	spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/b1b5d74c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/b1b5d74c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/b1b5d74c

Branch: refs/heads/master
Commit: b1b5d74c5dd6bfa023124c63925145d0ac452a7d
Parents: 0f1a6c5 18a6967
Author: nlsegerl <na...@intel.com>
Authored: Mon Dec 12 17:02:35 2016 -0800
Committer: nlsegerl <na...@intel.com>
Committed: Mon Dec 12 17:02:35 2016 -0800

----------------------------------------------------------------------
 ISSUES.md                                       | 53 --------------------
 README.md                                       | 51 ++++---------------
 spot-ingest/README.md                           |  4 --
 spot-ml/INSTALL.md                              |  1 +
 spot-ml/README.md                               | 24 ++-------
 spot-ml/ml_ops.sh                               |  3 +-
 .../spot/SuspiciousConnectsArgumentParser.scala |  5 ++
 .../dns/DNSSuspiciousConnectsAnalysis.scala     |  6 ++-
 .../org/apache/spot/dns/DNSWordCreation.scala   |  6 ++-
 .../spot/dns/model/DNSScoreFunction.scala       | 30 +++++++----
 .../dns/model/DNSSuspiciousConnectsModel.scala  | 25 ++++++---
 .../apache/spot/utilities/DomainProcessor.scala | 17 ++++---
 .../spot/utilities/DomainProcessorTest.scala    | 28 +++++++++--
 .../js/components/DetailsTablePanel.react.js    |  4 +-
 .../dns/js/components/NetworkViewPanel.react.js |  2 +-
 .../dns/js/components/SuspiciousPanel.react.js  |  2 +-
 .../js/components/DetailsTablePanel.react.js    |  2 +-
 .../js/components/NetworkViewPanel.react.js     |  2 +-
 .../flow/js/components/SuspiciousPanel.react.js |  2 +-
 .../ui/js/components/GridPanelMixin.react.js    | 20 ++++----
 .../components/PolloNetworkViewMixin.react.js   |  5 ++
 .../js/components/NetworkViewPanel.react.js     |  6 +--
 .../js/components/SuspiciousPanel.react.js      |  2 +-
 spot-setup/spot.conf                            |  4 ++
 24 files changed, 131 insertions(+), 173 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/b1b5d74c/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
----------------------------------------------------------------------
diff --cc spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
index f2ce7a4,4ef4718..244b941
--- a/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
@@@ -54,46 -54,28 +54,48 @@@ object DNSSuspiciousConnectsAnalysis 
  
      logger.info("Loading data")
  
 -    val userDomain = config.userDomain
+ 
      val rawDataDF = sqlContext.read.parquet(config.inputPath)
        .filter(Timestamp + " is not null and " + UnixTimestamp + " is not null")
        .select(inColumns:_*)
  
 -    logger.info("Training the model")
  
 -    val model =
 -      DNSSuspiciousConnectsModel.trainNewModel(sparkContext, sqlContext, logger, config, rawDataDF, config.topicCount)
  
 -    logger.info("Scoring")
 -    val scoredDF = model.score(sparkContext, sqlContext, rawDataDF, userDomain)
 +    val scoredDF = detectDNSAnomalies(rawDataDF, config, sparkContext, sqlContext, logger)
 +
  
+ 
      val filteredDF = scoredDF.filter(Score + " <= " + config.threshold)
      val mostSusipiciousDF: DataFrame = filteredDF.orderBy(Score).limit(config.maxResults)
  
 -    val outputDF = mostSusipiciousDF.select(OutColumns:_*).sort(Score)
 -
 -    logger.info("DNS  suspcicious connects analysis completed.")
 +    mostSusipiciousDF.select(OutColumns:_*).sort(Score)
      logger.info("Saving results to : " + config.hdfsScoredConnect)
 -    outputDF.map(_.mkString(config.outputDelimiter)).saveAsTextFile(config.hdfsScoredConnect)
 +
 +
 +    mostSusipiciousDF.map(_.mkString(config.outputDelimiter)).saveAsTextFile(config.hdfsScoredConnect)
 +  }
 +
 +  /**
 +    * Identify anomalous DNS log entries in the provided data frame.
 +    *
 +    * @param data Data frame of DNS log entries
 +    * @param config Suspicious connects analysis configuration
 +    * @param sparkContext Spark context
 +    * @param sqlContext Spark SQL context
 +    * @param logger Logger for progress and error reporting
 +    * @return Data frame with an added Score column containing the estimated probabilities
 +    */
 +  def detectDNSAnomalies(data: DataFrame, config: SuspiciousConnectsConfig,
 +                         sparkContext: SparkContext,
 +                         sqlContext: SQLContext,
 +                         logger: Logger) : DataFrame = {
 +
- 
++    val userDomain = config.userDomain
 +    logger.info("Fitting probabilistic model to data")
 +    val model =
 +      DNSSuspiciousConnectsModel.trainNewModel(sparkContext, sqlContext, logger, config, data, config.topicCount)
 +
 +    logger.info("Identifying outliers")
-     model.score(sparkContext, sqlContext, data)
++    model.score(sparkContext, sqlContext, data, userDomain)
    }
  }


[40/49] incubator-spot git commit: OA quick fix to DNS and Proxy to skip the reputation check; the TLD check now won't break on unknown DNS names

Posted by ev...@apache.org.
OA quick fix to DNS and Proxy to skip the reputation check; the TLD check now won't break on unknown DNS names.


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/c9e27ba0
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/c9e27ba0
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/c9e27ba0

Branch: refs/heads/master
Commit: c9e27ba0fb59c8ab53e9ecaf7b8001fba88ee3d7
Parents: 7a7d91d
Author: LedaLima <ga...@intel.com>
Authored: Fri Jan 13 18:21:37 2017 -0600
Committer: Everardo Lopez Sandoval (Intel) <el...@elopezsa-mac02.ra.intel.com>
Committed: Fri Jan 20 17:01:02 2017 -0800

----------------------------------------------------------------------
 spot-oa/oa/dns/dns_oa.py     | 39 +++++++++++++++++++++++++++++----------
 spot-oa/oa/proxy/proxy_oa.py | 17 +++++++++++------
 spot-oa/oa/utils.py          |  8 +++++---
 3 files changed, 45 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/c9e27ba0/spot-oa/oa/dns/dns_oa.py
----------------------------------------------------------------------
diff --git a/spot-oa/oa/dns/dns_oa.py b/spot-oa/oa/dns/dns_oa.py
index 2f8687f..ffcc839 100644
--- a/spot-oa/oa/dns/dns_oa.py
+++ b/spot-oa/oa/dns/dns_oa.py
@@ -143,25 +143,39 @@ class OA(object):
         dns_scores_csv = "{0}/dns_scores.csv".format(self._data_path)
         dns_scores_final =  self._move_time_stamp(self._dns_scores)
         dns_scores_final.insert(0,self._dns_scores_headers)
-        Util.create_csv_file(dns_scores_csv,dns_scores_final)   
+        Util.create_csv_file(dns_scores_csv,dns_scores_final,',',0)   
 
         # create bk file
         dns_scores_bu_csv = "{0}/dns_scores_bu.csv".format(self._data_path)
-        Util.create_csv_file(dns_scores_bu_csv,dns_scores_final)  
+        Util.create_csv_file(dns_scores_bu_csv,dns_scores_final,',',0)     
 
 
     def _add_tld_column(self):
         qry_name_col = self._conf['dns_results_fields']['dns_qry_name']
-        self._dns_scores = [conn + [ get_tld("http://" + str(conn[qry_name_col]), fail_silently=True) if "http://" not in str(conn[qry_name_col]) else get_tld(str(conn[qry_name_col]), fail_silently=True)] for conn in self._dns_scores ]
+        self._dns_scores = [conn + [ self._get_valid_tld(str(conn[qry_name_col])) ] for conn in self._dns_scores ]
          
   
+    def _get_valid_tld(self, qry_name):
+        tld = ""
+        try:
+            if "http://" not in qry_name: 
+                tld = get_tld("http://" + qry_name)
+            else:
+                tld = get_tld(qry_name)
+        except ValueError:
+            self._logger.error("Unable to get top level domain from query: {0}".format(qry_name))
+            tld = "UNKNOWN"
+        return tld
+    
+
     def _add_reputation(self):
 
         # read configuration.
         reputation_conf_file = "{0}/components/reputation/reputation_config.json".format(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
         self._logger.info("Reading reputation configuration file: {0}".format(reputation_conf_file))
         rep_conf = json.loads(open(reputation_conf_file).read())
-       
+        
+        
         # initialize reputation services.
         self._rep_services = []
         self._logger.info("Initializing reputation services.")
@@ -183,13 +197,18 @@ class OA(object):
         # get reputation per column.
         self._logger.info("Getting reputation for each service in config")        
         rep_services_results = []
-        for key,value in rep_cols.items():
-            rep_services_results = [ rep_service.check(None,value) for rep_service in self._rep_services]
-            rep_results = {}            
-            for result in rep_services_results:            
-                rep_results = {k: "{0}::{1}".format(rep_results.get(k, ""), result.get(k, "")).strip('::') for k in set(rep_results) | set(result)}
 
-            self._dns_scores = [ conn + [ rep_results[conn[key]] ]   for conn in self._dns_scores  ]
+ 
+        if self._rep_services :
+            for key,value in rep_cols.items():
+                rep_services_results = [ rep_service.check(None,value) for rep_service in self._rep_services]
+                rep_results = {}            
+                for result in rep_services_results:            
+                    rep_results = {k: "{0}::{1}".format(rep_results.get(k, ""), result.get(k, "")).strip('::') for k in set(rep_results) | set(result)}
+
+                self._dns_scores = [ conn + [ rep_results[conn[key]] ]   for conn in self._dns_scores  ]
+        else:
+            self._dns_scores = [ conn + [""]   for conn in self._dns_scores  ]
 
 
 

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/c9e27ba0/spot-oa/oa/proxy/proxy_oa.py
----------------------------------------------------------------------
diff --git a/spot-oa/oa/proxy/proxy_oa.py b/spot-oa/oa/proxy/proxy_oa.py
index 7359ba1..cb13ed3 100644
--- a/spot-oa/oa/proxy/proxy_oa.py
+++ b/spot-oa/oa/proxy/proxy_oa.py
@@ -171,14 +171,19 @@ class OA(object):
         # get reputation per column.
         self._logger.info("Getting reputation for each service in config")
         rep_services_results = []
-        for key,value in rep_cols.items():
-            rep_services_results = [ rep_service.check(None,value,True) for rep_service in self._rep_services]
-            rep_results = {}
+        if self._rep_services :
+            for key,value in rep_cols.items():
+                rep_services_results = [ rep_service.check(None,value,True) for rep_service in self._rep_services]
+                rep_results = {}
+
+                for result in rep_services_results:
+                    rep_results = {k: "{0}::{1}".format(rep_results.get(k, ""), result.get(k, "")).strip('::') for k in set(rep_results) | set(result)}
+
+                self._proxy_scores = [ conn + [ rep_results[conn[key]] ]   for conn in self._proxy_scores  ]
+        else:
+            self._proxy_scores = [ conn + [""] for conn in self._proxy_scores  ]
 
-            for result in rep_services_results:
-                rep_results = {k: "{0}::{1}".format(rep_results.get(k, ""), result.get(k, "")).strip('::') for k in set(rep_results) | set(result)}
 
-            self._proxy_scores = [ conn + [ rep_results[conn[key]] ]   for conn in self._proxy_scores  ]
 
     def _add_severity(self):
         # Add severity column

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/c9e27ba0/spot-oa/oa/utils.py
----------------------------------------------------------------------
diff --git a/spot-oa/oa/utils.py b/spot-oa/oa/utils.py
index 553315a..52b251a 100644
--- a/spot-oa/oa/utils.py
+++ b/spot-oa/oa/utils.py
@@ -96,13 +96,15 @@ class Util(object):
 		except ValueError:
 			return None
 	
+	
 	@classmethod
-	def create_csv_file(cls,full_path_file,content,delimiter=','): 
-
+	def create_csv_file(cls,full_path_file,content,delimiter=',',set_quoting=3):  
+		#set_quoting: 0 - MINIMAL, 1 - ALL, 2 - NONNUMERIC, 3 - NONE
 		with open(full_path_file, 'w+') as u_file:
-			writer = csv.writer(u_file, quoting=csv.QUOTE_NONE, delimiter=delimiter)
+			writer = csv.writer(u_file, quoting=set_quoting, quotechar='"', delimiter=delimiter)
 			writer.writerows(content)
 
+
 class SecHead(object):
     def __init__(self, fp):
         self.fp = fp


[03/49] incubator-spot git commit: More modifications to pass the user domain info through to the places where it is needed.

Posted by ev...@apache.org.
More modifications to pass the user domain info through to the places where it is needed.


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/d7d6ae07
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/d7d6ae07
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/d7d6ae07

Branch: refs/heads/master
Commit: d7d6ae07344a9a4308067963cb9b03e493b995d0
Parents: 41ffbc3
Author: Brandon Edwards <br...@intel.com>
Authored: Tue Dec 6 14:45:38 2016 -0800
Committer: Brandon Edwards <br...@intel.com>
Committed: Tue Dec 6 22:03:38 2016 -0800

----------------------------------------------------------------------
 .../spot/SuspiciousConnectsArgumentParser.scala |  2 +-
 .../dns/DNSSuspiciousConnectsAnalysis.scala     |  4 ++-
 .../spot/dns/model/DNSScoreFunction.scala       | 30 +++++++++++++-------
 .../dns/model/DNSSuspiciousConnectsModel.scala  | 16 +++++++----
 .../apache/spot/utilities/DomainProcessor.scala | 14 ++++-----
 .../spot/utilities/DomainProcessorTest.scala    | 18 ++++++++----
 6 files changed, 54 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/d7d6ae07/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
index 632e0d8..be0db30 100644
--- a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsArgumentParser.scala
@@ -62,7 +62,7 @@ object SuspiciousConnectsArgumentParser {
       action((x, c) => c.copy(localUser = x)).
       text("Local user path")
 
-    opt[String]("userDomain").required().valueName("<user domain>").
+    opt[String]("userdomain").required().valueName("<user domain>").
       action((x, c) => c.copy(userDomain = x)).
       text("Domain of spot user (example: intel)")
 

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/d7d6ae07/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala b/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
index d0e6da1..4ef4718 100644
--- a/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
@@ -54,6 +54,8 @@ object DNSSuspiciousConnectsAnalysis {
 
     logger.info("Loading data")
 
+    val userDomain = config.userDomain
+
     val rawDataDF = sqlContext.read.parquet(config.inputPath)
       .filter(Timestamp + " is not null and " + UnixTimestamp + " is not null")
       .select(inColumns:_*)
@@ -64,7 +66,7 @@ object DNSSuspiciousConnectsAnalysis {
       DNSSuspiciousConnectsModel.trainNewModel(sparkContext, sqlContext, logger, config, rawDataDF, config.topicCount)
 
     logger.info("Scoring")
-    val scoredDF = model.score(sparkContext, sqlContext, rawDataDF)
+    val scoredDF = model.score(sparkContext, sqlContext, rawDataDF, userDomain)
 
 
     val filteredDF = scoredDF.filter(Score + " <= " + config.threshold)

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/d7d6ae07/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSScoreFunction.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSScoreFunction.scala b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSScoreFunction.scala
index 09656f2..728f269 100644
--- a/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSScoreFunction.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSScoreFunction.scala
@@ -8,15 +8,16 @@ import org.apache.spot.dns.DNSWordCreation
 /**
   * Estimate the probabilities of network events using a [[DNSSuspiciousConnectsModel]]
   *
-  * @param frameLengthCuts
-  * @param timeCuts
-  * @param subdomainLengthCuts
-  * @param entropyCuts
-  * @param numberPeriodsCuts
-  * @param topicCount
-  * @param ipToTopicMixBC
-  * @param wordToPerTopicProbBC
-  * @param topDomainsBC
+  * @param frameLengthCuts Delimiters used to define binning for the frame length field
+  * @param timeCuts Delimiters used to define binning for the time field
+  * @param subdomainLengthCuts Delimiters used to define binning for the subdomain length field
+  * @param entropyCuts Delimiters used to define binning for the entropy field
+  * @param numberPeriodsCuts Delimiters used to define binning for the number of periods in the subdomain field
+  * @param topicCount Number of topics used for the LDA model
+  * @param ipToTopicMixBC Topic mixes learned by the LDA model for each IP in the data
+  * @param wordToPerTopicProbBC Word mixes for each of the topics learned by the LDA model
+  * @param topDomainsBC Alexa top one million list of domains.
+  * @param userDomain Domain associated with the network data (example: 'intel')
   */
 class DNSScoreFunction(frameLengthCuts: Array[Double],
                        timeCuts: Array[Double],
@@ -26,13 +27,20 @@ class DNSScoreFunction(frameLengthCuts: Array[Double],
                        topicCount: Int,
                        ipToTopicMixBC: Broadcast[Map[String, Array[Double]]],
                        wordToPerTopicProbBC: Broadcast[Map[String, Array[Double]]],
-                       topDomainsBC: Broadcast[Set[String]]) extends Serializable {
+                       topDomainsBC: Broadcast[Set[String]],
+                       userDomain: String) extends Serializable {
 
 
   val suspiciousConnectsScoreFunction =
     new SuspiciousConnectsScoreFunction(topicCount, ipToTopicMixBC, wordToPerTopicProbBC)
 
-  val dnsWordCreator = new DNSWordCreation(frameLengthCuts, timeCuts, subdomainLengthCuts, entropyCuts, numberPeriodsCuts, topDomainsBC)
+  val dnsWordCreator = new DNSWordCreation(frameLengthCuts,
+                                           timeCuts,
+                                           subdomainLengthCuts,
+                                           entropyCuts,
+                                           numberPeriodsCuts,
+                                           topDomainsBC,
+                                           userDomain)
 
   def score(timeStamp: String,
             unixTimeStamp: Long,

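For orientation, a minimal sketch of constructing the word creator with its new final argument. The cut-point arrays below are illustrative placeholders only; in the real pipeline they come from Quantiles.computeDeciles/computeQuintiles, as in DNSSuspiciousConnectsModel:

    // Illustrative cut points; the real values are learned from the data.
    val topDomainsBC = sparkContext.broadcast(TopDomains.TopDomains)

    val wordCreator = new DNSWordCreation(
      Array(10.0, 100.0, 500.0),  // frameLengthCuts
      Array(6.0, 12.0, 18.0),     // timeCuts
      Array(5.0, 10.0, 20.0),     // subdomainLengthCuts
      Array(1.0, 2.0, 3.0),       // entropyCuts
      Array(2.0, 4.0, 6.0),       // numberPeriodsCuts
      topDomainsBC,
      "intel")                    // userDomain
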
http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/d7d6ae07/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala
index 953e1ec..047e262 100644
--- a/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala
@@ -64,10 +64,11 @@ class DNSSuspiciousConnectsModel(inTopicCount: Int,
     * @param sc         Spark Context
     * @param sqlContext Spark SQL context
     * @param inDF       Dataframe of DNS log events, containing at least the columns of [[DNSSuspiciousConnectsModel.ModelSchema]]
+    * @param userDomain Domain associated with the network data (ex: 'intel')
     * @return Dataframe with a column named [[org.apache.spot.dns.DNSSchema.Score]] that contains the
     *         probability estimated for the network event at that row
     */
-  def score(sc: SparkContext, sqlContext: SQLContext, inDF: DataFrame): DataFrame = {
+  def score(sc: SparkContext, sqlContext: SQLContext, inDF: DataFrame, userDomain: String): DataFrame = {
 
     val countryCodesBC = sc.broadcast(CountryCodes.CountryCodes)
     val topDomainsBC = sc.broadcast(TopDomains.TopDomains)
@@ -84,7 +85,8 @@ class DNSSuspiciousConnectsModel(inTopicCount: Int,
         topicCount,
         ipToTopicMixBC,
         wordToPerTopicProbBC,
-        topDomainsBC)
+        topDomainsBC,
+        userDomain)
 
 
     val scoringUDF = udf((timeStamp: String,
@@ -168,7 +170,7 @@ object DNSSuspiciousConnectsModel {
     val frameLengthCuts = Quantiles.computeDeciles(totalDataDF.select(FrameLength).rdd
       .map({ case Row(frameLen: Int) => frameLen.toDouble }))
 
-    val domainStatsDF = createDomainStatsDF(sparkContext, sqlContext, countryCodesBC, topDomainsBC, totalDataDF)
+    val domainStatsDF = createDomainStatsDF(sparkContext, sqlContext, countryCodesBC, topDomainsBC, userDomain, totalDataDF)
 
     val subdomainLengthCuts = Quantiles.computeQuintiles(domainStatsDF.filter(SubdomainLength + " > 0")
       .select(SubdomainLength).rdd.map({ case Row(subdomainLength: Int) => subdomainLength.toDouble }))
@@ -238,6 +240,7 @@ object DNSSuspiciousConnectsModel {
     * @param sqlContext     Spark SQL context.
     * @param countryCodesBC Broadcast of the country codes set.
     * @param topDomainsBC   Broadcast of the most-popular domains set.
+    * @param userDomain     Domain associated with the network data (ex: 'intel')
     * @param inDF           Incoming dataframe. Schema is expected to provide the field [[QueryName]]
     * @return A new dataframe with the new columns added. The new columns have the schema [[DomainStatsSchema]]
     */
@@ -246,11 +249,12 @@ object DNSSuspiciousConnectsModel {
                           sqlContext: SQLContext,
                           countryCodesBC: Broadcast[Set[String]],
                           topDomainsBC: Broadcast[Set[String]],
+                          userDomain: String,
                           inDF: DataFrame): DataFrame = {
     val queryNameIndex = inDF.schema.fieldNames.indexOf(QueryName)
 
     val domainStatsRDD: RDD[Row] = inDF.rdd.map(row =>
-      Row.fromTuple(createTempFields(countryCodesBC, topDomainsBC, row.getString(queryNameIndex))))
+      Row.fromTuple(createTempFields(countryCodesBC, topDomainsBC, userDomain, row.getString(queryNameIndex))))
 
     sqlContext.createDataFrame(domainStatsRDD, DomainStatsSchema)
   }
@@ -262,15 +266,17 @@ object DNSSuspiciousConnectsModel {
     *
     * @param countryCodesBC Broadcast of the country codes set.
     * @param topDomainsBC   Broadcast of the most-popular domains set.
+    * @param userDomain     Domain associated with the network data (ex: 'intel')
    * @param url            URL string to analyze for domain and subdomain information.
     * @return [[TempFields]]
     */
   def createTempFields(countryCodesBC: Broadcast[Set[String]],
                        topDomainsBC: Broadcast[Set[String]],
+                       userDomain: String,
                        url: String): TempFields = {
 
     val DomainInfo(_, topDomainClass, subdomain, subdomainLength, subdomainEntropy, numPeriods) =
-      DomainProcessor.extractDomainInfo(url, topDomainsBC)
+      DomainProcessor.extractDomainInfo(url, topDomainsBC, userDomain)
 
 
     TempFields(topDomainClass = topDomainClass,

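Putting the new plumbing together: a condensed sketch of the train-then-score sequence as the call sites above now read (config, sparkContext, sqlContext and logger are assumed to be in scope, as in DNSSuspiciousConnectsAnalysis.run):

    val rawDataDF = sqlContext.read.parquet(config.inputPath)

    // Fit the LDA-based model, then score each record; the user domain is now
    // passed at scoring time so the user's own domain can be classed separately
    // from the Alexa top domains.
    val model = DNSSuspiciousConnectsModel.trainNewModel(
      sparkContext, sqlContext, logger, config, rawDataDF, config.topicCount)

    val scoredDF = model.score(sparkContext, sqlContext, rawDataDF, config.userDomain)
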
http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/d7d6ae07/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala b/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala
index 334ae87..c5f0d73 100644
--- a/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/utilities/DomainProcessor.scala
@@ -28,10 +28,10 @@ object DomainProcessor extends Serializable {
 
   /**
     * Commonly extracted domain features.
-    * @param domain Domain (if any) of a url.
-    * @param topDomain Numerical class of domain: 2 for Intel, 1 for Alexa top domains, 0 for others.
-    * @param subdomain Subdomain (if any) in the url.
-    * @param subdomainLength Length of the subdomain. 0 if there is none.
+    * @param domain           Domain (if any) of a url.
+    * @param topDomain        Numerical class of domain: 2 for Intel, 1 for Alexa top domains, 0 for others.
+    * @param subdomain        Subdomain (if any) in the url.
+    * @param subdomainLength  Length of the subdomain. 0 if there is none.
     * @param subdomainEntropy Entropy of the subdomain viewed as a distribution on its character set.
     *                         0 if there is no subdomain.
     * @param numPeriods Number of periods + 1 in the url. (Number of sub-strings where url is split by periods.)
@@ -46,9 +46,9 @@ object DomainProcessor extends Serializable {
 
   /**
     * Extract domain info from a url.
-    * @param url Incoming url.
-    * @param topDomainsBC Broadcast variable containing the top domains set.
-    * @param userDomain Domain of the spot user (example:'intel').
+    * @param url           Incoming url.
+    * @param topDomainsBC  Broadcast variable containing the top domains set.
+    * @param userDomain    Domain of the spot user (example:'intel').
     * @return New [[DomainInfo]] object containing extracted domain information.
     */
   def extractDomainInfo(url: String, topDomainsBC: Broadcast[Set[String]], userDomain: String): DomainInfo = {

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/d7d6ae07/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala b/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala
index c1d93f0..7e2cae6 100644
--- a/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala
+++ b/spot-ml/src/test/scala/org/apache/spot/utilities/DomainProcessorTest.scala
@@ -57,8 +57,10 @@ class DomainProcessorTest extends TestingSparkContextFlatSpec with Matchers {
 
     val topDomains = sparkContext.broadcast(TopDomains.TopDomains)
 
+    val userDomain = "intel"
+
     // case class DerivedFields(topDomain: String, subdomainLength: Double, subdomainEntropy: Double, numPeriods: Double)
-    val result = extractDomainInfo(url, topDomains)
+    val result = extractDomainInfo(url, topDomains, userDomain)
 
     result shouldBe DomainInfo(domain = "None", topDomain = 0, subdomain = "None", subdomainLength = 0, subdomainEntropy = 0, numPeriods = 6)
   }
@@ -69,7 +71,9 @@ class DomainProcessorTest extends TestingSparkContextFlatSpec with Matchers {
 
     val topDomains = sparkContext.broadcast(TopDomains.TopDomains)
 
-    val result = extractDomainInfo(url, topDomains)
+    val userDomain = "intel"
+
+    val result = extractDomainInfo(url, topDomains, userDomain)
 
     result shouldBe DomainInfo(domain = "amazon", topDomain = 1, subdomain = "services",
       subdomainLength = 8, subdomainEntropy = 2.5, numPeriods = 4)
@@ -80,8 +84,9 @@ class DomainProcessorTest extends TestingSparkContextFlatSpec with Matchers {
     val url = "amazon.com.mx"
     val countryCodes = sparkContext.broadcast(countryCodesSet)
     val topDomains = sparkContext.broadcast(TopDomains.TopDomains)
+    val userDomain = "intel"
 
-    val result = extractDomainInfo(url, topDomains)
+    val result = extractDomainInfo(url, topDomains, userDomain)
 
     result shouldBe DomainInfo(domain = "amazon", subdomain = "None", topDomain = 1, subdomainLength = 0, subdomainEntropy = 0, numPeriods = 3)
   }
@@ -91,8 +96,9 @@ class DomainProcessorTest extends TestingSparkContextFlatSpec with Matchers {
     val url = "services.amazon.com"
     val countryCodes = sparkContext.broadcast(countryCodesSet)
     val topDomains = sparkContext.broadcast(TopDomains.TopDomains)
+    val userDomain = "intel"
 
-    val result = extractDomainInfo(url, topDomains)
+    val result = extractDomainInfo(url, topDomains, userDomain)
 
     result shouldBe DomainInfo(domain = "amazon", subdomain = "services", topDomain = 1, subdomainLength = 8, subdomainEntropy = 2.5, numPeriods = 3)
   }
@@ -103,7 +109,9 @@ class DomainProcessorTest extends TestingSparkContextFlatSpec with Matchers {
     val url = "amazon.com"
     val countryCodes = sparkContext.broadcast(countryCodesSet)
     val topDomains = sparkContext.broadcast(TopDomains.TopDomains)
-    val result = extractDomainInfo(url, topDomains)
+    val userDomain = "intel"
+
+    val result = extractDomainInfo(url, topDomains, userDomain)
 
     result shouldBe DomainInfo(domain = "amazon", subdomain = "None", topDomain = 1, subdomainLength = 0, subdomainEntropy = 0, numPeriods = 2)
   }

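The tests above pin down the numeric domain classes. As a compact summary of what extractDomainInfo returns for topDomain (the 'intel.com' and 'totallyrandom.xyz' lines are inferred from the scaladoc's class descriptions rather than from a test shown here):

    val topDomains = sparkContext.broadcast(TopDomains.TopDomains)
    val userDomain = "intel"

    extractDomainInfo("services.amazon.com", topDomains, userDomain).topDomain // 1: Alexa top domain
    extractDomainInfo("intel.com", topDomains, userDomain).topDomain           // 2: the user's own domain
    extractDomainInfo("totallyrandom.xyz", topDomains, userDomain).topDomain   // 0: everything else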

[16/49] incubator-spot git commit: Modified ml_ops.sh so that it passes the userdomain option only when the spot.conf variable USER_DOMAIN is non-empty.

Posted by ev...@apache.org.
Modified ml_ops.sh so that it passes the userdomain option only when the spot.conf variable USER_DOMAIN is non-empty.


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/b9cc67d1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/b9cc67d1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/b9cc67d1

Branch: refs/heads/master
Commit: b9cc67d1ea4579781a95d86e1d34ea49cf069b68
Parents: 13fc718
Author: Brandon Edwards <br...@intel.com>
Authored: Tue Dec 13 12:37:48 2016 -0800
Committer: Brandon Edwards <br...@intel.com>
Committed: Tue Dec 13 12:37:48 2016 -0800

----------------------------------------------------------------------
 spot-ml/ml_ops.sh | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/b9cc67d1/spot-ml/ml_ops.sh
----------------------------------------------------------------------
diff --git a/spot-ml/ml_ops.sh b/spot-ml/ml_ops.sh
index cf1bc31..a951fe1 100755
--- a/spot-ml/ml_ops.sh
+++ b/spot-ml/ml_ops.sh
@@ -45,6 +45,14 @@ else
     RAWDATA_PATH=${PROXY_PATH}
 fi
 
+# pass the user domain designation if not empty
+
+if [ ! -z $USER_DOMAIN ] ; then
+    USER_DOMAIN_PARSER_CMD="--userdomain $USER_DOMAIN"
+else
+    USER_DOMAIN_PARSER_CMD=''
+fi
+
 FEEDBACK_PATH=${LPATH}/${DSOURCE}_scores.csv
 DUPFACTOR=1000
 
@@ -97,7 +105,7 @@ time spark-submit --class "org.apache.spot.SuspiciousConnects" \
   --dupfactor ${DUPFACTOR} \
   --feedback ${FEEDBACK_PATH} \
   --ldatopiccount ${TOPIC_COUNT} \
-  --userdomain ${USER_DOMAIN}\
+  $USER_DOMAIN_PARSER_CMD \
   --scored ${HDFS_SCORED_CONNECTS} \
   --threshold ${TOL} \
   --maxresults ${MAXRESULTS} \

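One subtlety this guard implies: the Scala-side userdomain option must tolerate being absent, i.e. be declared without .required() so that the empty-string default in the config case class applies when the flag is omitted. A sketch of that declaration in the parser's own scopt style (shown for clarity; the commit relaxing .required() is not part of this hunk):

    opt[String]("userdomain").valueName("<user domain>").
      action((x, c) => c.copy(userDomain = x)).
      text("Domain of spot user (example: intel); empty when the flag is omitted")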

[10/49] incubator-spot git commit: Merge pull request #168 from brandon-edwards/Shouldbe_good_domain_fix

Posted by ev...@apache.org.
Merge pull request #168 from brandon-edwards/Shouldbe_good_domain_fix

User Domain Designation Bug Fix

Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/40a1a382
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/40a1a382
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/40a1a382

Branch: refs/heads/master
Commit: 40a1a3821bac36af36bd0226ed41435d6a010e21
Parents: 8830127 13fc718
Author: NathanSegerlind <na...@intel.com>
Authored: Mon Dec 12 09:47:58 2016 -0800
Committer: GitHub <no...@github.com>
Committed: Mon Dec 12 09:47:58 2016 -0800

----------------------------------------------------------------------
 spot-ml/INSTALL.md                              |  1 +
 spot-ml/ml_ops.sh                               |  3 +-
 .../spot/SuspiciousConnectsArgumentParser.scala |  5 ++++
 .../dns/DNSSuspiciousConnectsAnalysis.scala     |  4 ++-
 .../org/apache/spot/dns/DNSWordCreation.scala   |  6 ++--
 .../spot/dns/model/DNSScoreFunction.scala       | 30 +++++++++++++-------
 .../dns/model/DNSSuspiciousConnectsModel.scala  | 25 ++++++++++++----
 .../apache/spot/utilities/DomainProcessor.scala | 17 +++++------
 .../spot/utilities/DomainProcessorTest.scala    | 28 ++++++++++++++----
 spot-setup/spot.conf                            |  4 +++
 10 files changed, 89 insertions(+), 34 deletions(-)
----------------------------------------------------------------------



[47/49] incubator-spot git commit: Removed test code from dns_oa.py

Posted by ev...@apache.org.
Removed test code from dns_oa.py


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/f0619ae3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/f0619ae3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/f0619ae3

Branch: refs/heads/master
Commit: f0619ae3469d504a3f197c0f0f479c600665812f
Parents: bea0e57
Author: LedaLima <ga...@intel.com>
Authored: Mon Dec 12 11:31:50 2016 -0600
Committer: Everardo Lopez Sandoval (Intel) <el...@elopezsa-mac02.ra.intel.com>
Committed: Fri Jan 20 17:01:02 2017 -0800

----------------------------------------------------------------------
 spot-oa/oa/dns/dns_oa.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/f0619ae3/spot-oa/oa/dns/dns_oa.py
----------------------------------------------------------------------
diff --git a/spot-oa/oa/dns/dns_oa.py b/spot-oa/oa/dns/dns_oa.py
index 4f0dab9..2f8687f 100644
--- a/spot-oa/oa/dns/dns_oa.py
+++ b/spot-oa/oa/dns/dns_oa.py
@@ -152,14 +152,7 @@ class OA(object):
 
     def _add_tld_column(self):
         qry_name_col = self._conf['dns_results_fields']['dns_qry_name']
-        for conn in self._dns_scores: 
-            try:
-               tld =  get_tld("http://" + str(conn[qry_name_col]))
-            except ValueError:
-                print conn[qry_name_col]
-
-
-        # self._dns_scores = [conn + [ get_tld("http://" + str(conn[qry_name_col]), fail_silently=True) if "http://" not in str(conn[qry_name_col]) else get_tld(str(conn[qry_name_col]), fail_silently=True)] for conn in self._dns_scores ]
+        self._dns_scores = [conn + [ get_tld("http://" + str(conn[qry_name_col]), fail_silently=True) if "http://" not in str(conn[qry_name_col]) else get_tld(str(conn[qry_name_col]), fail_silently=True)] for conn in self._dns_scores ]
          
   
     def _add_reputation(self):


[33/49] incubator-spot git commit: unit_test_cleanup

Posted by ev...@apache.org.
unit_test_cleanup

 CODE BASE IS NOT FUNCTIONAL AT THIS COMMIT

Restructuring code for an easier merge of the data validation code with the unit test changes.


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/c638e2fa
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/c638e2fa
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/c638e2fa

Branch: refs/heads/master
Commit: c638e2fac593dcd91d900f5b350959a8bc0d49b2
Parents: 986cebf
Author: nlsegerl <na...@intel.com>
Authored: Wed Jan 4 11:31:19 2017 -0800
Committer: nlsegerl <na...@intel.com>
Committed: Wed Jan 4 11:31:19 2017 -0800

----------------------------------------------------------------------
 .../org/apache/spot/SuspiciousConnects.scala    |  20 ++-
 .../dns/DNSSuspiciousConnectsAnalysis.scala     | 169 +++++++++++++++----
 .../FlowSuspiciousConnectsAnalysis.scala        | 116 ++++++++++---
 .../proxy/ProxySuspiciousConnectsAnalysis.scala | 150 ++++++++++++++--
 4 files changed, 387 insertions(+), 68 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/c638e2fa/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnects.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnects.scala b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnects.scala
index 8751189..dfd5b8f 100644
--- a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnects.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnects.scala
@@ -7,7 +7,7 @@ import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
 import org.apache.spot.dns.DNSSuspiciousConnectsAnalysis
 import org.apache.spot.netflow.FlowSuspiciousConnectsAnalysis
 import org.apache.spot.proxy.ProxySuspiciousConnectsAnalysis
-
+import org.apache.spot.utilities.data.InputOutputDataHandler
 
 /**
   * Top level entrypoint to execute suspicious connections analysis on network data.
@@ -43,13 +43,25 @@ object SuspiciousConnects {
         val sqlContext = new SQLContext(sparkContext)
         implicit val outputDelimiter = config.outputDelimiter
 
+        val inputDataFrame = InputOutputDataHandler.getInputDataFrame(sqlContext, config.inputPath, logger)
+          .getOrElse(sqlContext.emptyDataFrame)
+        if(inputDataFrame.rdd.isEmpty()) {
+          logger.error("Couldn't read data from location " + config.inputPath +", please verify it's a valid location and that " +
+            s"contains parquet files with a given schema and try again.")
+          System.exit(0)
+        }
+
+
+
         analysis match {
-          case "flow" => FlowSuspiciousConnectsAnalysis.run(config, sparkContext, sqlContext, logger)
-          case "dns" => DNSSuspiciousConnectsAnalysis.run(config, sparkContext, sqlContext, logger)
-          case "proxy" => ProxySuspiciousConnectsAnalysis.run(config, sparkContext, sqlContext, logger)
+          case "flow" => FlowSuspiciousConnectsAnalysis.run(config, sparkContext, sqlContext, logger, inputDataFrame)
+          case "dns" => DNSSuspiciousConnectsAnalysis.run(config, sparkContext, sqlContext, logger, inputDataFrame)
+          case "proxy" => ProxySuspiciousConnectsAnalysis.run(config, sparkContext, sqlContext, logger, inputDataFrame)
           case _ => logger.error("Unsupported (or misspelled) analysis: " + analysis)
         }
 
+        InputOutputDataHandler.mergeResultsFiles(sparkContext, config.hdfsScoredConnect, analysis, logger)
+
         sparkContext.stop()
 
       case None => logger.error("Error parsing arguments")

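InputOutputDataHandler itself is not part of this hunk; the call site above only implies its shape. A hedged sketch of a minimal implementation consistent with that call site (an assumption for orientation, not the committed code):

    import org.apache.log4j.Logger
    import org.apache.spark.sql.{DataFrame, SQLContext}
    import scala.util.Try

    object InputOutputDataHandler {

      // Read parquet input, yielding None when the path is missing or unreadable,
      // so the caller can substitute an empty data frame and exit early.
      def getInputDataFrame(sqlContext: SQLContext,
                            inputPath: String,
                            logger: Logger): Option[DataFrame] =
        Try(sqlContext.read.parquet(inputPath)).toOption
    }
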
http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/c638e2fa/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala b/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
index 244b941..5db5c50 100644
--- a/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
@@ -8,7 +8,11 @@ import org.apache.spark.sql.{DataFrame, SQLContext}
 import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
 import org.apache.spot.dns.DNSSchema._
 import org.apache.spot.dns.model.DNSSuspiciousConnectsModel
+import org.apache.log4j.Logger
 import org.apache.spot.dns.model.DNSSuspiciousConnectsModel.ModelSchema
+import org.apache.spot.proxy.ProxySchema.Score
+import org.apache.spot.utilities.data.validation.{InvalidDataHandler => dataValidation}
+
 
 /**
   * The suspicious connections analysis of DNS log data develops a probabilistic model the DNS queries
@@ -17,27 +21,6 @@ import org.apache.spot.dns.model.DNSSuspiciousConnectsModel.ModelSchema
 
 object DNSSuspiciousConnectsAnalysis {
 
-  val inSchema = StructType(List(TimestampField, UnixTimestampField, FrameLengthField, ClientIPField,
-      QueryNameField, QueryClassField, QueryTypeField, QueryResponseCodeField))
-
-  val inColumns = inSchema.fieldNames.map(col)
-
-
-  assert(ModelSchema.fields.forall(inSchema.fields.contains(_)))
-
-  val OutSchema = StructType(
-    List(TimestampField,
-      UnixTimestampField,
-      FrameLengthField,
-      ClientIPField,
-      QueryNameField,
-      QueryClassField,
-      QueryTypeField,
-      QueryResponseCodeField,
-      ScoreField))
-
-  val OutColumns = OutSchema.fieldNames.map(col)
-
 
   /**
     * Run suspicious connections analysis on DNS log data.
@@ -48,31 +31,37 @@ object DNSSuspiciousConnectsAnalysis {
     * @param sqlContext
     * @param logger
     */
-  def run(config: SuspiciousConnectsConfig, sparkContext: SparkContext, sqlContext: SQLContext, logger: Logger) = {
-    logger.info("Starting DNS suspicious connects analysis.")
+  def run(config: SuspiciousConnectsConfig, sparkContext: SparkContext, sqlContext: SQLContext, logger: Logger,
+          inputDNSRecords: DataFrame) = {
 
 
-    logger.info("Loading data")
+    logger.info("Starting DNS suspicious connects analysis.")
 
+    val userDomain = config.userDomain
 
-    val rawDataDF = sqlContext.read.parquet(config.inputPath)
-      .filter(Timestamp + " is not null and " + UnixTimestamp + " is not null")
-      .select(inColumns:_*)
+    val cleanDNSRecords = filterAndSelectCleanDNSRecords(inputDNSRecords)
 
+    logger.info("Training the model")
 
+    val scoredDNSRecords = detectDNSAnomalies(cleanDNSRecords, config, sparkContext, sqlContext, logger)
 
-    val scoredDF = detectDNSAnomalies(rawDataDF, config, sparkContext, sqlContext, logger)
+    val filteredDNSRecords = filterScoredDNSRecords(scoredDNSRecords, config.threshold)
 
+    val orderedDNSRecords = filteredDNSRecords.orderBy(Score)
 
+    val mostSuspiciousDNSRecords = if(config.maxResults > 0)  orderedDNSRecords.limit(config.maxResults) else orderedDNSRecords
 
-    val filteredDF = scoredDF.filter(Score + " <= " + config.threshold)
-    val mostSusipiciousDF: DataFrame = filteredDF.orderBy(Score).limit(config.maxResults)
+    val outputDNSRecords = mostSuspiciousDNSRecords.select(OutSchema:_*).sort(Score)
 
-    mostSusipiciousDF.select(OutColumns:_*).sort(Score)
+    logger.info("DNS  suspicious connects analysis completed.")
     logger.info("Saving results to : " + config.hdfsScoredConnect)
+    outputDNSRecords.map(_.mkString(config.outputDelimiter)).saveAsTextFile(config.hdfsScoredConnect)
 
+    val invalidDNSRecords = filterAndSelectInvalidDNSRecords(inputDNSRecords)
+    dataValidation.showAndSaveInvalidRecords(invalidDNSRecords, config.hdfsScoredConnect, logger)
 
-    mostSusipiciousDF.map(_.mkString(config.outputDelimiter)).saveAsTextFile(config.hdfsScoredConnect)
+    val corruptDNSRecords = filterAndSelectCorruptDNSRecords(scoredDNSRecords)
+    dataValidation.showAndSaveCorruptRecords(corruptDNSRecords, config.hdfsScoredConnect, logger)
   }
 
   /**
@@ -98,4 +87,120 @@ object DNSSuspiciousConnectsAnalysis {
     logger.info("Identifying outliers")
     model.score(sparkContext, sqlContext, data, userDomain)
   }
+
+  /**
+    *
+    * @param inputDNSRecords raw DNS records.
+    * @return
+    */
+  def filterAndSelectCleanDNSRecords(inputDNSRecords: DataFrame): DataFrame ={
+
+    val cleanDNSRecordsFilter = inputDNSRecords(Timestamp).isNotNull &&
+      inputDNSRecords(Timestamp).notEqual("") &&
+      inputDNSRecords(Timestamp).notEqual("-") &&
+      inputDNSRecords(UnixTimestamp).isNotNull &&
+      inputDNSRecords(FrameLength).isNotNull &&
+      inputDNSRecords(QueryName).isNotNull &&
+      inputDNSRecords(QueryName).notEqual("") &&
+      inputDNSRecords(QueryName).notEqual("-") &&
+      inputDNSRecords(QueryName).notEqual("(empty)") &&
+      inputDNSRecords(ClientIP).isNotNull &&
+      inputDNSRecords(ClientIP).notEqual("") &&
+      inputDNSRecords(ClientIP).notEqual("-") &&
+      ((inputDNSRecords(QueryClass).isNotNull &&
+        inputDNSRecords(QueryClass).notEqual("") &&
+        inputDNSRecords(QueryClass).notEqual("-")) ||
+        inputDNSRecords(QueryType).isNotNull ||
+        inputDNSRecords(QueryResponseCode).isNotNull)
+
+    inputDNSRecords
+      .filter(cleanDNSRecordsFilter)
+      .select(InSchema: _*)
+      .na.fill(DefaultQueryClass, Seq(QueryClass))
+      .na.fill(DefaultQueryType, Seq(QueryType))
+      .na.fill(DefaultQueryResponseCode, Seq(QueryResponseCode))
+  }
+
+  /**
+    *
+    * @param inputDNSRecords raw DNS records.
+    * @return
+    */
+  def filterAndSelectInvalidDNSRecords(inputDNSRecords: DataFrame): DataFrame ={
+
+    val invalidDNSRecordsFilter = inputDNSRecords(Timestamp).isNull ||
+      inputDNSRecords(Timestamp).equalTo("") ||
+      inputDNSRecords(Timestamp).equalTo("-") ||
+      inputDNSRecords(UnixTimestamp).isNull ||
+      inputDNSRecords(FrameLength).isNull ||
+      inputDNSRecords(QueryName).isNull ||
+      inputDNSRecords(QueryName).equalTo("") ||
+      inputDNSRecords(QueryName).equalTo("-") ||
+      inputDNSRecords(QueryName).equalTo("(empty)") ||
+      inputDNSRecords(ClientIP).isNull ||
+      inputDNSRecords(ClientIP).equalTo("") ||
+      inputDNSRecords(ClientIP).equalTo("-") ||
+      ((inputDNSRecords(QueryClass).isNull ||
+        inputDNSRecords(QueryClass).equalTo("") ||
+        inputDNSRecords(QueryClass).equalTo("-")) &&
+        inputDNSRecords(QueryType).isNull &&
+        inputDNSRecords(QueryResponseCode).isNull)
+
+    inputDNSRecords
+      .filter(invalidDNSRecordsFilter)
+      .select(InSchema: _*)
+  }
+
+  /**
+    *
+    * @param scoredDNSRecords scored DNS records.
+    * @param threshold score tolerance.
+    * @return
+    */
+  def filterScoredDNSRecords(scoredDNSRecords: DataFrame, threshold: Double): DataFrame ={
+
+    val filteredDNSRecordsFilter = scoredDNSRecords(Score).leq(threshold) &&
+      scoredDNSRecords(Score).gt(dataValidation.ScoreError)
+
+    scoredDNSRecords.filter(filteredDNSRecordsFilter)
+  }
+
+  /**
+    *
+    * @param scoredDNSRecords scored DNS records.
+    * @return
+    */
+  def filterAndSelectCorruptDNSRecords(scoredDNSRecords: DataFrame): DataFrame = {
+
+    val corruptDNSRecordsFilter = scoredDNSRecords(Score).equalTo(dataValidation.ScoreError)
+
+    scoredDNSRecords
+      .filter(corruptDNSRecordsFilter)
+      .select(OutSchema: _*)
+
+  }
+
+
+  val DefaultQueryClass = "unknown"
+  val DefaultQueryType = -1
+  val DefaultQueryResponseCode = -1
+
+  val InStructType = StructType(List(TimestampField, UnixTimestampField, FrameLengthField, ClientIPField,
+    QueryNameField, QueryClassField, QueryTypeField, QueryResponseCodeField))
+
+  val InSchema = InStructType.fieldNames.map(col)
+
+  assert(ModelSchema.fields.forall(InStructType.fields.contains(_)))
+
+  val OutSchema = StructType(
+    List(TimestampField,
+      UnixTimestampField,
+      FrameLengthField,
+      ClientIPField,
+      QueryNameField,
+      QueryClassField,
+      QueryTypeField,
+      QueryResponseCodeField,
+      ScoreField)).fieldNames.map(col)
+
 }
\ No newline at end of file
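
The two predicates above (filterAndSelectCleanDNSRecords / filterAndSelectInvalidDNSRecords)
are built as logical complements, so every input record lands in exactly one of the clean or
invalid frames. A minimal sketch of that invariant on toy data (illustrative column names,
not the full DNS schema):

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.sql.SQLContext

    object CleanInvalidSplitSketch {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setAppName("split-sketch").setMaster("local[2]"))
        val sqlContext = new SQLContext(sc)
        import sqlContext.implicits._

        val records = sc.parallelize(Seq(
          ("2016-05-05 00:11:01", "172.16.0.129"),   // clean
          (null,                  "10.0.2.202"),     // invalid: null timestamp
          ("-",                   "10.0.2.202")      // invalid: placeholder timestamp
        )).toDF("frame_time", "ip_src")

        val cleanFilter = records("frame_time").isNotNull &&
          records("frame_time").notEqual("-") &&
          records("ip_src").isNotNull

        val clean   = records.filter(cleanFilter)
        val invalid = records.filter(!cleanFilter) // the project code spells this complement out with isNull/||

        assert(clean.count() + invalid.count() == records.count())
        sc.stop()
      }
    }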

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/c638e2fa/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala b/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala
index 098a787..127b2a7 100644
--- a/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala
@@ -8,6 +8,7 @@ import org.apache.spark.sql.{DataFrame, SQLContext}
 import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
 import org.apache.spot.netflow.FlowSchema._
 import org.apache.spot.netflow.model.FlowSuspiciousConnectsModel
+import org.apache.spot.utilities.data.validation.{InvalidDataHandler => dataValidation}
 
 
 /**
@@ -17,29 +18,33 @@ import org.apache.spot.netflow.model.FlowSuspiciousConnectsModel
 
 object FlowSuspiciousConnectsAnalysis {
 
-  def run(config: SuspiciousConnectsConfig, sparkContext: SparkContext, sqlContext: SQLContext, logger: Logger) = {
+  def run(config: SuspiciousConnectsConfig, sparkContext: SparkContext, sqlContext: SQLContext, logger: Logger,
+          inputFlowRecords: DataFrame) = {
 
-    logger.info("Loading data")
+    logger.info("Starting flow suspicious connects analysis.")
 
-    val rawDataDF = sqlContext.read.parquet(config.inputPath)
-      .filter(Hour + " BETWEEN 0 AND 23 AND  " + Minute + " BETWEEN 0 AND 59 AND  " + Second + " BETWEEN 0 AND 59")
-      .select(inColumns: _*)
+    val cleanFlowRecords = filterAndSelectCleanFlowRecords(inputFlowRecords)
 
+    val scoredFlowRecords = detectFlowAnomalies(cleanFlowRecords, config, sparkContext, sqlContext, logger)
 
-    logger.info("Training the model")
+    val filteredFlowRecords = filterScoredFlowRecords(scoredFlowRecords, config.threshold)
 
-    val scoredDF = detectFlowAnomalies(rawDataDF, config, sparkContext, sqlContext, logger)
+    val orderedFlowRecords = filteredFlowRecords.orderBy(Score)
 
-    val filteredDF = scoredDF.filter(Score + " <= " + config.threshold)
+    val mostSuspiciousFlowRecords =
+      if(config.maxResults > 0 ) orderedFlowRecords.limit(config.maxResults) else orderedFlowRecords
 
-    val mostSusipiciousDF: DataFrame = filteredDF.orderBy(Score).limit(config.maxResults)
-
-
-    val outputDF = mostSusipiciousDF.select(OutColumns: _*)
+    val outputFlowRecords = mostSuspiciousFlowRecords.select(OutSchema: _*)
 
     logger.info("Netflow  suspicious connects analysis completed.")
     logger.info("Saving results to : " + config.hdfsScoredConnect)
-    outputDF.map(_.mkString(config.outputDelimiter)).saveAsTextFile(config.hdfsScoredConnect)
+    outputFlowRecords.map(_.mkString(config.outputDelimiter)).saveAsTextFile(config.hdfsScoredConnect)
+
+    val invalidFlowRecords = filterAndSelectInvalidFlowRecords(inputFlowRecords)
+    dataValidation.showAndSaveInvalidRecords(invalidFlowRecords, config.hdfsScoredConnect, logger)
+
+    val corruptFlowRecords = filterAndSelectCorruptFlowRecords(scoredFlowRecords)
+    dataValidation.showAndSaveCorruptRecords(corruptFlowRecords, config.hdfsScoredConnect, logger)
   }
 
   /**
@@ -67,7 +72,83 @@ object FlowSuspiciousConnectsAnalysis {
     model.score(sparkContext, sqlContext, data)
   }
 
-  val inSchema = StructType(List(TimeReceivedField,
+  /**
+    *
+    * @param inputFlowRecords raw flow records
+    * @return
+    */
+  def filterAndSelectCleanFlowRecords(inputFlowRecords: DataFrame): DataFrame ={
+
+    val cleanFlowRecordsFilter = inputFlowRecords(Hour).between(0, 23) &&
+      inputFlowRecords(Minute).between(0, 59) &&
+      inputFlowRecords(Second).between(0, 59) &&
+      inputFlowRecords(TimeReceived).isNotNull &&
+      inputFlowRecords(SourceIP).isNotNull &&
+      inputFlowRecords(DestinationIP).isNotNull &&
+      inputFlowRecords(SourcePort).isNotNull &&
+      inputFlowRecords(DestinationPort).isNotNull &&
+      inputFlowRecords(Ibyt).isNotNull &&
+      inputFlowRecords(Ipkt).isNotNull
+
+    inputFlowRecords
+      .filter(cleanFlowRecordsFilter)
+      .select(InSchema: _*)
+
+  }
+
+  /**
+    *
+    * @param inputFlowRecords raw flow records.
+    * @return
+    */
+  def filterAndSelectInvalidFlowRecords(inputFlowRecords: DataFrame): DataFrame = {
+
+    val invalidFlowRecordsFilter = !inputFlowRecords(Hour).between(0, 23) ||
+      !inputFlowRecords(Minute).between(0, 59) ||
+      !inputFlowRecords(Second).between(0, 59) ||
+      inputFlowRecords(TimeReceived).isNull ||
+      inputFlowRecords(SourceIP).isNull ||
+      inputFlowRecords(DestinationIP).isNull ||
+      inputFlowRecords(SourcePort).isNull ||
+      inputFlowRecords(DestinationPort).isNull ||
+      inputFlowRecords(Ibyt).isNull ||
+      inputFlowRecords(Ipkt).isNull
+
+    inputFlowRecords
+      .filter(invalidFlowRecordsFilter)
+      .select(InSchema: _*)
+  }
+
+  /**
+    *
+    * @param scoredFlowRecords scored flow records.
+    * @param threshold score tolerance.
+    * @return
+    */
+  def filterScoredFlowRecords(scoredFlowRecords: DataFrame, threshold: Double): DataFrame = {
+
+    val filteredFlowRecordsFilter = scoredFlowRecords(Score).leq(threshold) &&
+      scoredFlowRecords(Score).gt(dataValidation.ScoreError)
+
+    scoredFlowRecords.filter(filteredFlowRecordsFilter)
+  }
+
+  /**
+    *
+    * @param scoredFlowRecords scored flow records.
+    * @return
+    */
+  def filterAndSelectCorruptFlowRecords(scoredFlowRecords: DataFrame): DataFrame = {
+
+    val corruptFlowRecordsFilter = scoredFlowRecords(Score).equalTo(dataValidation.ScoreError)
+
+    scoredFlowRecords
+      .filter(corruptFlowRecordsFilter)
+      .select(OutSchema: _*)
+
+  }
+
+  val InSchema = StructType(List(TimeReceivedField,
     YearField,
     MonthField,
     DayField,
@@ -83,9 +164,7 @@ object FlowSuspiciousConnectsAnalysis {
     IpktField,
     IbytField,
     OpktField,
-    ObytField))
-
-  val inColumns = inSchema.fieldNames.map(col)
+    ObytField)).fieldNames.map(col)
 
   val OutSchema = StructType(
     List(TimeReceivedField,
@@ -105,7 +184,6 @@ object FlowSuspiciousConnectsAnalysis {
       IbytField,
       OpktField,
       ObytField,
-      ScoreField))
+      ScoreField)).fieldNames.map(col)
 
-  val OutColumns = OutSchema.fieldNames.map(col)
 }
\ No newline at end of file
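
One subtlety behind the negated terms in invalidFlowRecordsFilter: in Scala, && binds
tighter than ||, for Spark Columns just as for plain Booleans, so a mixed chain groups
differently than it may read unless each term is negated or parenthesized. A REPL-sized
illustration on Booleans:

    val a = false; val b = true; val c = true
    assert((a && b || c) == ((a && b) || c))  // && binds tighter: this is (a && b) || c == true
    assert((a && (b || c)) != (a && b || c))  // explicit parens flip the result: false vs true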

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/c638e2fa/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala b/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala
index 38150ca..290f101 100644
--- a/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala
@@ -2,10 +2,12 @@ package org.apache.spot.proxy
 
 import org.apache.log4j.Logger
 import org.apache.spark.SparkContext
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.{DataFrame, SQLContext}
 import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
 import org.apache.spot.proxy.ProxySchema._
-
+import org.apache.spot.utilities.data.validation.{InvalidDataHandler => dataValidation}
 /**
   * Run suspicious connections analysis on proxy data.
   */
@@ -19,27 +21,35 @@ object ProxySuspiciousConnectsAnalysis {
     * @param sqlContext   Spark SQL context.
     * @param logger       Logs execution progress, information and errors for user.
     */
-  def run(config: SuspiciousConnectsConfig, sparkContext: SparkContext, sqlContext: SQLContext, logger: Logger) = {
+  def run(config: SuspiciousConnectsConfig, sparkContext: SparkContext, sqlContext: SQLContext, logger: Logger,
+          inputProxyRecords: DataFrame) = {
 
     logger.info("Starting proxy suspicious connects analysis.")
 
-    logger.info("Loading data from: " + config.inputPath)
+    val cleanProxyRecords = filterAndSelectCleanProxyRecords(inputProxyRecords)
+
+
+    val scoredProxyRecords = detectProxyAnomalies(cleanProxyRecords, config, sparkContext, sqlContext, logger)
+
+    // take the maxResults least probable events of probability below the threshold and sort
+
+    val filteredProxyRecords = filterScoredProxyRecords(scoredProxyRecords, config.threshold)
 
-    val rawDataDF = sqlContext.read.parquet(config.inputPath).
-      filter(Date + " is not null and " + Time + " is not null and " + ClientIP + " is not null").
-      select(Date, Time, ClientIP, Host, ReqMethod, UserAgent, ResponseContentType, Duration, UserName,
-        WebCat, Referer, RespCode, URIPort, URIPath, URIQuery, ServerIP, SCBytes, CSBytes, FullURI)
+    val orderedProxyRecords = filteredProxyRecords.orderBy(Score)
 
-    val scoredDF = detectProxyAnomalies(rawDataDF, config, sparkContext, sqlContext, logger)
+    val mostSuspiciousProxyRecords = if(config.maxResults > 0)  orderedProxyRecords.limit(config.maxResults) else orderedProxyRecords
 
+    val outputProxyRecords = mostSuspiciousProxyRecords.select(OutSchema:_*)
 
-    val filteredDF = scoredDF.filter(Score +  " <= " + config.threshold)
-    val mostSusipiciousDF: DataFrame = filteredDF.orderBy(Score).limit(config.maxResults)
+    logger.info("Proxy suspicious connects analysis completed")
+    logger.info("Saving results to: " + config.hdfsScoredConnect)
+    outputProxyRecords.map(_.mkString(config.outputDelimiter)).saveAsTextFile(config.hdfsScoredConnect)
 
-    logger.info("Persisting data to hdfs: " + config.hdfsScoredConnect)
-    mostSusipiciousDF.map(_.mkString(config.outputDelimiter)).saveAsTextFile(config.hdfsScoredConnect)
+    val invalidProxyRecords = filterAndSelectInvalidProxyRecords(inputProxyRecords)
+    dataValidation.showAndSaveInvalidRecords(invalidProxyRecords, config.hdfsScoredConnect, logger)
 
-    logger.info("Proxy suspcicious connects completed")
+    val corruptProxyRecords = filterAndSelectCorruptProxyRecords(scoredProxyRecords)
+    dataValidation.showAndSaveCorruptRecords(corruptProxyRecords, config.hdfsScoredConnect, logger)
   }
 
 
@@ -66,4 +76,118 @@ object ProxySuspiciousConnectsAnalysis {
 
     model.score(sparkContext, data)
   }
+
+  /**
+    *
+    * @param inputProxyRecords raw proxy records.
+    * @return
+    */
+  def filterAndSelectCleanProxyRecords(inputProxyRecords: DataFrame): DataFrame ={
+
+    val cleanProxyRecordsFilter =  inputProxyRecords(Date).isNotNull &&
+      inputProxyRecords(Time).isNotNull &&
+      inputProxyRecords(ClientIP).isNotNull &&
+      inputProxyRecords(Host).isNotNull &&
+      inputProxyRecords(FullURI).isNotNull
+
+    inputProxyRecords
+      .filter(cleanProxyRecordsFilter)
+      .select(InSchema:_*)
+      .na.fill(DefaultUserAgent, Seq(UserAgent))
+      .na.fill(DefaultResponseContentType, Seq(ResponseContentType))
+  }
+
+  /**
+    *
+    * @param inputProxyRecords raw proxy records.
+    * @return
+    */
+  def filterAndSelectInvalidProxyRecords(inputProxyRecords: DataFrame): DataFrame ={
+
+    val invalidProxyRecordsFilter = inputProxyRecords(Date).isNull ||
+      inputProxyRecords(Time).isNull ||
+      inputProxyRecords(ClientIP).isNull ||
+      inputProxyRecords(Host).isNull ||
+      inputProxyRecords(FullURI).isNull
+
+    inputProxyRecords
+      .filter(invalidProxyRecordsFilter)
+      .select(InSchema: _*)
+  }
+
+  /**
+    *
+    * @param scoredProxyRecords scored proxy records.
+    * @param threshold score tolerance.
+    * @return
+    */
+  def filterScoredProxyRecords(scoredProxyRecords: DataFrame, threshold: Double): DataFrame ={
+
+    val filteredProxyRecordsFilter = scoredProxyRecords(Score).leq(threshold) &&
+      scoredProxyRecords(Score).gt(dataValidation.ScoreError)
+
+    scoredProxyRecords.filter(filteredProxyRecordsFilter)
+  }
+
+  /**
+    *
+    * @param scoredProxyRecords scored proxy records.
+    * @return
+    */
+  def filterAndSelectCorruptProxyRecords(scoredProxyRecords: DataFrame): DataFrame ={
+
+    val corruptProxyRecordsFilter = scoredProxyRecords(Score).equalTo(dataValidation.ScoreError)
+
+    scoredProxyRecords
+      .filter(corruptProxyRecordsFilter)
+      .select(OutSchema: _*)
+  }
+
+  val DefaultUserAgent = "-"
+  val DefaultResponseContentType = "-"
+
+  val InSchema = StructType(
+    List(DateField,
+      TimeField,
+      ClientIPField,
+      HostField,
+      ReqMethodField,
+      UserAgentField,
+      ResponseContentTypeField,
+      DurationField,
+      UserNameField,
+      WebCatField,
+      RefererField,
+      RespCodeField,
+      URIPortField,
+      URIPathField,
+      URIQueryField,
+      ServerIPField,
+      SCBytesField,
+      CSBytesField,
+      FullURIField)).fieldNames.map(col)
+
+  val OutSchema = StructType(
+    List(DateField,
+      TimeField,
+      ClientIPField,
+      HostField,
+      ReqMethodField,
+      UserAgentField,
+      ResponseContentTypeField,
+      DurationField,
+      UserNameField,
+      WebCatField,
+      RefererField,
+      RespCodeField,
+      URIPortField,
+      URIPathField,
+      URIQueryField,
+      ServerIPField,
+      SCBytesField,
+      CSBytesField,
+      FullURIField,
+      WordField,
+      ScoreField)).fieldNames.map(col)
+
 }
\ No newline at end of file
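
The filterScored* helpers above all apply the same selection: keep a score at or below the
threshold but strictly above the error sentinel, so corrupt records never reach the top-N
output. The same logic on plain numbers (the -1.0 sentinel is an assumption here, standing
in for InvalidDataHandler.ScoreError):

    val ScoreError = -1.0                      // assumed value, for illustration only
    val threshold  = 0.5
    val scores     = Seq(0.01, 0.49, 0.51, ScoreError)

    val kept = scores.filter(s => s <= threshold && s > ScoreError)
    assert(kept == Seq(0.01, 0.49))            // under threshold, sentinel excluded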


[26/49] incubator-spot git commit: unit_test_cleanup

Posted by ev...@apache.org.
unit_test_cleanup

added flow time-of-day anomaly test


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/5a75bc5a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/5a75bc5a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/5a75bc5a

Branch: refs/heads/master
Commit: 5a75bc5a925ff5b59753b06ba4a1628001e8a08b
Parents: 8d13d3f
Author: nlsegerl <na...@intel.com>
Authored: Tue Dec 20 16:27:42 2016 -0800
Committer: nlsegerl <na...@intel.com>
Committed: Tue Dec 20 16:27:42 2016 -0800

----------------------------------------------------------------------
 .../spot/netflow/FlowSuspiciousConnectsAnalysisTest.scala   | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/5a75bc5a/spot-ml/src/test/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysisTest.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysisTest.scala b/spot-ml/src/test/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysisTest.scala
index 07bd8b6..d59635d 100644
--- a/spot-ml/src/test/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysisTest.scala
+++ b/spot-ml/src/test/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysisTest.scala
@@ -47,20 +47,19 @@ class FlowSuspiciousConnectsAnalysisTest extends TestingSparkContextFlatSpec wit
 
     val logger = LogManager.getLogger("SuspiciousConnectsAnalysis")
     logger.setLevel(Level.OFF)
-    val testSqlContext = new org.apache.spark.sql.SQLContext(sparkContext)
 
     val anomalousRecord = FlowRecord("2016-05-05 00:11:01", 2016, 5, 5, 0, 0, 1, 0.972f, "172.16.0.129", "10.0.2.202", 1024, 80, "TCP", 39, 12522, 0, 0)
     val typicalRecord = FlowRecord("2016-05-05 13:54:58", 2016, 5, 5, 13, 54, 58, 0.972f, "172.16.0.129", "10.0.2.202", 1024, 80, "TCP", 39, 12522, 0, 0)
 
-    import testSqlContext.implicits._
 
-    val data = sparkContext.parallelize(Seq(anomalousRecord, typicalRecord, typicalRecord, typicalRecord, typicalRecord, typicalRecord,
-      typicalRecord, typicalRecord, typicalRecord, typicalRecord)).toDF()
+    val data = sqlContext.createDataFrame(Seq(anomalousRecord, typicalRecord, typicalRecord, typicalRecord, typicalRecord, typicalRecord,
+      typicalRecord, typicalRecord, typicalRecord, typicalRecord))
+
 
     val scoredData : DataFrame = FlowSuspiciousConnectsAnalysis.detectFlowAnomalies(data,
       testConfig,
       sparkContext,
-      testSqlContext,
+      sqlContext,
       logger)
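
The cleanup above drops the throwaway SQLContext and its implicits import in favor of a
direct sqlContext.createDataFrame call against the shared test context. Both routes build
the same frame from a Seq of case-class rows; a minimal sketch (toy Rec type, not the
FlowRecord schema):

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.sql.SQLContext

    case class Rec(ip: String, port: Int)

    object DataFrameConstructionSketch {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setAppName("df-sketch").setMaster("local[2]"))
        val sqlContext = new SQLContext(sc)
        import sqlContext.implicits._

        val rows = Seq(Rec("172.16.0.129", 1024), Rec("10.0.2.202", 80))

        val viaToDF   = sc.parallelize(rows).toDF()      // needs the implicits import
        val viaCreate = sqlContext.createDataFrame(rows) // works without implicits

        assert(viaToDF.schema == viaCreate.schema)
        sc.stop()
      }
    }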
 
 


[43/49] incubator-spot git commit: Added Ingest summary method to dns and proxy, added validations to flows ingest summary method

Posted by ev...@apache.org.
Added Ingest summary method to dns and proxy, added validations to flows ingest summary method


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/bea0e571
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/bea0e571
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/bea0e571

Branch: refs/heads/master
Commit: bea0e5712779b3f19dc86949d4d564c7718346ca
Parents: 7902857
Author: LedaLima <ga...@intel.com>
Authored: Mon Dec 12 10:56:25 2016 -0600
Committer: Everardo Lopez Sandoval (Intel) <el...@elopezsa-mac02.ra.intel.com>
Committed: Fri Jan 20 17:01:02 2017 -0800

----------------------------------------------------------------------
 spot-oa/oa/dns/dns_oa.py     | 74 +++++++++++++++++++++++++++++++++++++--
 spot-oa/oa/flow/flow_oa.py   | 73 +++++++++++++++++++++-----------------
 spot-oa/oa/proxy/proxy_oa.py | 58 +++++++++++++++++++++++++++++-
 3 files changed, 169 insertions(+), 36 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/bea0e571/spot-oa/oa/dns/dns_oa.py
----------------------------------------------------------------------
diff --git a/spot-oa/oa/dns/dns_oa.py b/spot-oa/oa/dns/dns_oa.py
index 884d46c..4f0dab9 100644
--- a/spot-oa/oa/dns/dns_oa.py
+++ b/spot-oa/oa/dns/dns_oa.py
@@ -5,7 +5,7 @@ import json
 import shutil
 import sys
 import datetime
-import csv
+import csv, math
 from tld import get_tld
 
 from collections import OrderedDict
@@ -14,7 +14,7 @@ from components.data.data import Data
 from components.iana.iana_transform import IanaTransform
 from components.nc.network_context import NetworkContext 
 from multiprocessing import Process
-
+import pandas as pd 
 import time
 
 class OA(object):
@@ -70,6 +70,7 @@ class OA(object):
         self._add_network_context()
         self._create_dns_scores_csv()
         self._get_oa_details()
+        self._ingest_summary()
 
         ##################
         end = time.time()
@@ -151,7 +152,15 @@ class OA(object):
 
     def _add_tld_column(self):
         qry_name_col = self._conf['dns_results_fields']['dns_qry_name']
-        self._dns_scores = [conn + [ get_tld("http://" + str(conn[qry_name_col]), fail_silently=True) if "http://" not in str(conn[qry_name_col]) else get_tld(str(conn[qry_name_col]), fail_silently=True)] for conn in self._dns_scores ] 
+        for conn in self._dns_scores:
+            query_name = str(conn[qry_name_col])
+            if "http://" not in query_name:
+                query_name = "http://" + query_name
+            try:
+                tld = get_tld(query_name)
+            except ValueError:
+                tld = ""
+            conn.append(tld)
   
     def _add_reputation(self):
 
@@ -326,3 +335,62 @@ class OA(object):
             dndro_qry = ("SELECT dns_a, dns_qry_name, ip_dst FROM (SELECT susp.ip_dst, susp.dns_qry_name, susp.dns_a FROM {0}.{1} as susp WHERE susp.y={2} AND susp.m={3} AND susp.d={4} AND susp.ip_dst='{5}' LIMIT {6}) AS tmp GROUP BY dns_a, dns_qry_name, ip_dst").format(db,table,year,month,day,ip_dst,limit)
             # execute query
             self._engine.query(dndro_qry,dendro_file)
+
+        
+    def _ingest_summary(self):
+        # get date parameters.
+        yr = self._date[:4]
+        mn = self._date[4:6]
+        dy = self._date[6:]
+
+        self._logger.info("Getting ingest summary data for the day")
+        
+        ingest_summary_cols = ["date","total"]		
+        result_rows = []        
+        df_filtered =  pd.DataFrame()
+
+        ingest_summary_file = "{0}/is_{1}{2}.csv".format(self._ingest_summary_path,yr,mn)			
+        ingest_summary_tmp = "{0}.tmp".format(ingest_summary_file)
+
+        if os.path.isfile(ingest_summary_file):
+            df = pd.read_csv(ingest_summary_file, delimiter=',')
+            # discard previous rows from the same date
+            df_filtered = df[df['date'].str.contains("{0}-{1}-{2}".format(yr, mn, dy)) == False]
+        else:
+            df = pd.DataFrame()
+            
+        # get ingest summary.
+        ingest_summary_qry = ("SELECT frame_time, COUNT(*) as total "
+                                    " FROM {0}.{1}"
+                                    " WHERE y={2} AND m={3} AND d={4} "
+                                    " AND unix_tstamp IS NOT NULL AND frame_time IS NOT NULL"
+                                    " AND frame_len IS NOT NULL AND dns_qry_name IS NOT NULL"
+                                    " AND ip_src IS NOT NULL " 
+                                    " AND (dns_qry_class IS NOT NULL AND dns_qry_type IS NOT NULL AND dns_qry_rcode IS NOT NULL ) "
+                                    " GROUP BY frame_time;") 
+
+        ingest_summary_qry = ingest_summary_qry.format(self._db,self._table_name, yr, mn, dy)
+        
+        results_file = "{0}/results_{1}.csv".format(self._ingest_summary_path,self._date)
+        self._engine.query(ingest_summary_qry,output_file=results_file,delimiter=",")
+
+
+        if os.path.isfile(results_file):        
+            df_results = pd.read_csv(results_file, delimiter=',') 
+
+            # Form a new dataframe, splitting the minutes from the time column
+            df_new = pd.DataFrame(
+                [["{0}-{1}-{2} {3}:{4}".format(yr, mn, dy,
+                                               val['frame_time'].split(" ")[3].split(":")[0].zfill(2),
+                                               val['frame_time'].split(" ")[3].split(":")[1].zfill(2)),
+                  int(val['total']) if not math.isnan(val['total']) else 0]
+                 for key, val in df_results.iterrows()],
+                columns=ingest_summary_cols)
+    
+            #Groups the data by minute 
+            sf = df_new.groupby(by=['date'])['total'].sum()
+        
+            df_per_min = pd.DataFrame({'date':sf.index, 'total':sf.values})
+            
+            df_final = df_filtered.append(df_per_min, ignore_index=True)
+            df_final.to_csv(ingest_summary_tmp,sep=',', index=False)
+
+            os.remove(results_file)
+            os.rename(ingest_summary_tmp,ingest_summary_file)
+        else:
+            self._logger.info("No data found for the ingest summary")
+        
\ No newline at end of file
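
The pandas block above buckets raw counts into per-minute keys and sums them. The same
roll-up, reduced to plain Scala collections for illustration (toy timestamps):

    val rows = Seq(
      ("2016-05-05 00:11", 3),
      ("2016-05-05 00:11", 2),
      ("2016-05-05 00:12", 5))

    val perMinute = rows.groupBy { case (minute, _) => minute }
      .mapValues(_.map { case (_, total) => total }.sum)
    // Map("2016-05-05 00:11" -> 5, "2016-05-05 00:12" -> 5)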

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/bea0e571/spot-oa/oa/flow/flow_oa.py
----------------------------------------------------------------------
diff --git a/spot-oa/oa/flow/flow_oa.py b/spot-oa/oa/flow/flow_oa.py
index 870251b..7c1013a 100644
--- a/spot-oa/oa/flow/flow_oa.py
+++ b/spot-oa/oa/flow/flow_oa.py
@@ -7,6 +7,7 @@ import json
 import numpy as np
 import linecache, bisect
 import csv
+import pandas as pd
 
 from collections import OrderedDict
 from multiprocessing import Process
@@ -51,7 +52,7 @@ class OA(object):
         # initialize data engine
         self._db = self._spot_conf.get('conf', 'DBNAME').replace("'", "").replace('"', '')
         self._engine = Data(self._db, self._table_name,self._logger)
-              
+                      
     def start(self):       
         
         ####################
@@ -386,47 +387,55 @@ class OA(object):
                     self._engine.query(ch_query.format(self._db,self._table_name,yr,mn,dy,ip,ips_filter),chord_file,delimiter="\\t")
 
      
-    def _ingest_summary(self):
-        
+    def _ingest_summary(self): 
         # get date parameters.
         yr = self._date[:4]
         mn = self._date[4:6]
         dy = self._date[6:]
 
+        self._logger.info("Getting ingest summary data for the day")
+        
+        ingest_summary_cols = ["date","total"]		
+        result_rows = []       
+        df_filtered =  pd.DataFrame() 
+
+        ingest_summary_file = "{0}/is_{1}{2}.csv".format(self._ingest_summary_path,yr,mn)			
+        ingest_summary_tmp = "{0}.tmp".format(ingest_summary_file)
+        if os.path.isfile(ingest_summary_file):
+            df = pd.read_csv(ingest_summary_file, delimiter=',',names=ingest_summary_cols, skiprows=1)
+            df_filtered = df[df['date'].str.contains("{0}-{1}-{2}".format(yr, mn, dy)) == False] 
+        else:
+            df = pd.DataFrame()
+        
         # get ingest summary.           
-        ingest_summary_qry = ("SELECT tryear, trmonth, trday, trhour, trminute, COUNT(*) flows"
-                              " FROM {0}.flow "
-                              " WHERE "
-                              " y={1} "
-                              " AND m={2} "
-                              " AND d={3} "
-                              " AND unix_tstamp IS NOT NULL "
-                              " GROUP BY tryear, trmonth, trday, trhour, trminute;")
-
-        ingest_summary_qry = ingest_summary_qry.format(self._db, yr, mn, dy)
-
+        ingest_summary_qry = ("SELECT tryear, trmonth, trday, trhour, trminute, COUNT(*) total"
+                            " FROM {0}.{1} "
+                            " WHERE "
+                            " y={2} "
+                            " AND m={3} "
+                            " AND d={4} "
+                            " AND unix_tstamp IS NOT NULL AND sip IS NOT NULL "
+                            " AND sport IS NOT NULL AND dip IS NOT NULL "
+                            " AND dport IS NOT NULL AND ibyt IS NOT NULL "
+                            " AND ipkt IS NOT NULL "
+                            " GROUP BY tryear, trmonth, trday, trhour, trminute;")
+
+
+        ingest_summary_qry = ingest_summary_qry.format(self._db,self._table_name, yr, mn, dy)
         results_file = "{0}/results_{1}.csv".format(self._ingest_summary_path,self._date)
         self._engine.query(ingest_summary_qry,output_file=results_file,delimiter=",")
-        
-        result_rows = []        
-        with open(results_file, 'rb') as rf:
-            csv_reader = csv.reader(rf, delimiter = ",")
-            result_rows = list(csv_reader)
-        
-        result_rows = iter(result_rows)
-        next(result_rows)
 
-        ingest_summary_results = [ ["date","flows"] ]
-        ingest_summary_results.extend([ ["{0}-{1}-{2} {3}:{4}".format(yr, mn, dy, row[3].zfill(2) ,row[4].zfill(2)), row[5]] for row in result_rows ])
-        ingest_summay_file = "{0}/is_{1}{2}.csv".format(self._ingest_summary_path,yr,mn)
+        if os.path.isfile(results_file):
+            result_rows = pd.read_csv(results_file, delimiter=',') 
 
+            df_new = pd.DataFrame(
+                [["{0}-{1}-{2} {3}:{4}".format(yr, mn, dy,
+                                               str(val['trhour']).zfill(2),
+                                               str(val['trminute']).zfill(2)),
+                  int(val['total'])]
+                 for key, val in result_rows.iterrows()],
+                columns=ingest_summary_cols)
 
-        write_format =  'a' if os.path.isfile(ingest_summay_file) else 'w+'
-        with open(ingest_summay_file, write_format) as u_file:
-            writer = csv.writer(u_file, quoting=csv.QUOTE_NONE, delimiter=",")
-            writer.writerows(ingest_summary_results)
+            df_filtered = df_filtered.append(df_new, ignore_index=True)
+            df_filtered.to_csv(ingest_summary_tmp,sep=',', index=False)
 
-        rm_big_file = "rm {0}".format(results_file)
-        os.remove(results_file)
-       
+            os.remove(results_file)
+            os.rename(ingest_summary_tmp,ingest_summary_file)
+        else:
+            self._logger.info("No data found for the ingest summary")
 
+        
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/bea0e571/spot-oa/oa/proxy/proxy_oa.py
----------------------------------------------------------------------
diff --git a/spot-oa/oa/proxy/proxy_oa.py b/spot-oa/oa/proxy/proxy_oa.py
index d9b0362..7359ba1 100644
--- a/spot-oa/oa/proxy/proxy_oa.py
+++ b/spot-oa/oa/proxy/proxy_oa.py
@@ -5,13 +5,14 @@ import json
 import shutil
 import sys
 import datetime
-import csv
+import csv, math
 from collections import OrderedDict
 from utils import Util
 from components.data.data import Data
 from components.iana.iana_transform import IanaTransform
 from components.nc.network_context import NetworkContext
 from multiprocessing import Process
+import pandas as pd 
 
 import time
 import md5
@@ -70,6 +71,7 @@ class OA(object):
         self._add_hash()
         self._create_proxy_scores_csv()
         self._get_oa_details()
+        self._ingest_summary()
 
 
         ##################
@@ -292,3 +294,57 @@ class OA(object):
             except OSError:
                 pass
 
+
+    def _ingest_summary(self): 
+        # get date parameters.
+        yr = self._date[:4]
+        mn = self._date[4:6]
+        dy = self._date[6:]
+
+        self._logger.info("Getting ingest summary data for the day")
+        
+        ingest_summary_cols = ["date","total"]		
+        result_rows = []        
+        df_filtered =  pd.DataFrame()
+
+        ingest_summary_file = "{0}/is_{1}{2}.csv".format(self._ingest_summary_path,yr,mn)			
+        ingest_summary_tmp = "{0}.tmp".format(ingest_summary_file)
+
+        if os.path.isfile(ingest_summary_file):
+            df = pd.read_csv(ingest_summary_file, delimiter=',')
+            # discard previous rows from the same date
+            df_filtered = df[df['date'].str.contains("{0}-{1}-{2}".format(yr, mn, dy)) == False]
+        else:
+            df = pd.DataFrame()
+            
+        # get ingest summary.
+        ingest_summary_qry = ("SELECT p_date, p_time, COUNT(*) as total "
+                                    " FROM {0}.{1}"
+                                    " WHERE y='{2}' AND m='{3}' AND d='{4}' "
+                                    " AND p_date IS NOT NULL AND p_time IS NOT NULL " 
+                                    " AND clientip IS NOT NULL AND p_time != '' "
+                                    " AND host IS NOT NULL AND fulluri IS NOT NULL "
+                                    " GROUP BY p_date, p_time;") 
+
+        ingest_summary_qry = ingest_summary_qry.format(self._db,self._table_name, yr, mn, dy)
+        results_file = "{0}/results_{1}.csv".format(self._ingest_summary_path,self._date)        
+        self._engine.query(ingest_summary_qry,output_file=results_file,delimiter=",")
+        
+        if os.path.isfile(results_file):
+            df_results = pd.read_csv(results_file, delimiter=',')  
+            
+            # Form a new dataframe, splitting the minutes from the time column
+            df_new = pd.DataFrame(
+                [["{0} {1}:{2}".format(val['p_date'],
+                                       val['p_time'].split(":")[0].zfill(2),
+                                       val['p_time'].split(":")[1].zfill(2)),
+                  int(val['total']) if not math.isnan(val['total']) else 0]
+                 for key, val in df_results.iterrows()],
+                columns=ingest_summary_cols)
+            
+            #Groups the data by minute 
+            sf = df_new.groupby(by=['date'])['total'].sum()
+            df_per_min = pd.DataFrame({'date':sf.index, 'total':sf.values})
+            
+            df_final = df_filtered.append(df_per_min, ignore_index=True)
+            df_final.to_csv(ingest_summary_tmp,sep=',', index=False)
+
+            os.remove(results_file)
+            os.rename(ingest_summary_tmp,ingest_summary_file)
+        else:
+            self._logger.info("No data found for the ingest summary")
+        
\ No newline at end of file


[22/49] incubator-spot git commit: test_flow

Posted by ev...@apache.org.
test_flow

added tod anomaly test


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/ac866a00
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/ac866a00
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/ac866a00

Branch: refs/heads/master
Commit: ac866a00d19a42a5b707272ae83146cbb85f5043
Parents: a83208e
Author: nlsegerl <na...@intel.com>
Authored: Mon Dec 19 15:41:05 2016 -0800
Committer: nlsegerl <na...@intel.com>
Committed: Mon Dec 19 15:41:05 2016 -0800

----------------------------------------------------------------------
 .../FlowSuspiciousConnectsAnalysisTest.scala    | 85 ++++++++++++++++++--
 1 file changed, 79 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/ac866a00/spot-ml/src/test/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysisTest.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysisTest.scala b/spot-ml/src/test/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysisTest.scala
index 73a7913..07bd8b6 100644
--- a/spot-ml/src/test/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysisTest.scala
+++ b/spot-ml/src/test/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysisTest.scala
@@ -1,13 +1,86 @@
 package org.apache.spot.netflow
 
-import org.scalatest.FunSuite
+import org.apache.log4j.{Level, LogManager, Logger}
+import org.apache.spark.SparkContext
+import org.apache.spark.sql.{DataFrame, SQLContext}
+import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
+import org.apache.spot.netflow.FlowSchema._
+import org.apache.spot.testutils.TestingSparkContextFlatSpec
+import org.scalatest.Matchers
+
+case class FlowRecord(treceived: String,
+                      tryear: Int,
+                      trmonth: Int,
+                      trday: Int,
+                      trhour: Int,
+                      trminute: Int,
+                      trsec: Int,
+                      tdur: Float,
+                      sip: String,
+                      dip: String,
+                      sport: Int,
+                      dport: Int,
+                      proto: String,
+                      ipkt: Int,
+                      ibyt: Int,
+                      opkt: Int,
+                      obyt: Int)
+
+class FlowSuspiciousConnectsAnalysisTest extends TestingSparkContextFlatSpec with Matchers {
+
+
+  val testConfig = SuspiciousConnectsConfig(analysis = "flow",
+    inputPath = "",
+    feedbackFile = "",
+    duplicationFactor = 1,
+    topicCount = 20,
+    hdfsScoredConnect = "",
+    threshold = 1.0d,
+    maxResults = 1000,
+    outputDelimiter = "\t",
+    ldaPRGSeed = None,
+    ldaMaxiterations = 20,
+    ldaAlpha = 1.02,
+    ldaBeta = 1.001)
+
+  "netflow suspicious connects" should "correctly identify time-of-day anomalies" in {
+
+    val logger = LogManager.getLogger("SuspiciousConnectsAnalysis")
+    logger.setLevel(Level.OFF)
+    val testSqlContext = new org.apache.spark.sql.SQLContext(sparkContext)
+
+    val anomalousRecord = FlowRecord("2016-05-05 00:11:01", 2016, 5, 5, 0, 0, 1, 0.972f, "172.16.0.129", "10.0.2.202", 1024, 80, "TCP", 39, 12522, 0, 0)
+    val typicalRecord = FlowRecord("2016-05-05 13:54:58", 2016, 5, 5, 13, 54, 58, 0.972f, "172.16.0.129", "10.0.2.202", 1024, 80, "TCP", 39, 12522, 0, 0)
+
+    import testSqlContext.implicits._
+
+    val data = sparkContext.parallelize(Seq(anomalousRecord, typicalRecord, typicalRecord, typicalRecord, typicalRecord, typicalRecord,
+      typicalRecord, typicalRecord, typicalRecord, typicalRecord)).toDF()
+
+    val scoredData : DataFrame = FlowSuspiciousConnectsAnalysis.detectFlowAnomalies(data,
+      testConfig,
+      sparkContext,
+      testSqlContext,
+      logger)
+
+
+
+    val anomalyScore = scoredData.filter(scoredData(Hour) === 0).first().getAs[Double](Score)
+    val typicalScores = scoredData.filter(scoredData(Hour) === 13).collect().map(_.getAs[Double](Score))
+
+    Math.abs(anomalyScore - 0.1d) should be < 0.01
+    typicalScores.length shouldBe 9
+    Math.abs(typicalScores(0) - 0.9d) should be < 0.01
+    Math.abs(typicalScores(1) - 0.9d) should be < 0.01
+    Math.abs(typicalScores(2) - 0.9d) should be < 0.01
+    Math.abs(typicalScores(3) - 0.9d) should be < 0.01
+    Math.abs(typicalScores(4) - 0.9d) should be < 0.01
+    Math.abs(typicalScores(5) - 0.9d) should be < 0.01
+    Math.abs(typicalScores(6) - 0.9d) should be < 0.01
+    Math.abs(typicalScores(7) - 0.9d) should be < 0.01
+    Math.abs(typicalScores(8) - 0.9d) should be < 0.01
 
-/**
-  * Created by nlsegerl on 12/13/16.
-  */
-class FlowSuspiciousConnectsAnalysisTest extends FunSuite {
 
-  test("testDetectFlowAnomalies") {
 
   }
 


[44/49] incubator-spot git commit: fixup! Ingest summary supporting 3 use cases, Netflow, DNS and Proxy

Posted by ev...@apache.org.
fixup! Ingest summary supporting 3 use cases, Netflow, DNS and Proxy


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/6321c7cc
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/6321c7cc
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/6321c7cc

Branch: refs/heads/master
Commit: 6321c7cca215272bd8810f1ecda2d7746fd2f936
Parents: 5ecde79
Author: Diego Ortiz Huerta <di...@intel.com>
Authored: Mon Dec 12 10:34:04 2016 -0800
Committer: Everardo Lopez Sandoval (Intel) <el...@elopezsa-mac02.ra.intel.com>
Committed: Fri Jan 20 17:01:02 2017 -0800

----------------------------------------------------------------------
 spot-oa/ui/ingest-summary.html | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/6321c7cc/spot-oa/ui/ingest-summary.html
----------------------------------------------------------------------
diff --git a/spot-oa/ui/ingest-summary.html b/spot-oa/ui/ingest-summary.html
index 1c4a100..0bfc189 100755
--- a/spot-oa/ui/ingest-summary.html
+++ b/spot-oa/ui/ingest-summary.html
@@ -1,7 +1,7 @@
 <!DOCTYPE html>
 <html>
 <head>
-    <title>Netflow :: Ingest Summary</title>
+    <title>Ingest Summary</title>
 
     <meta charset="UTF-8">
 
@@ -84,7 +84,7 @@
                     <span class="icon-bar"></span>
                     <span class="icon-bar"></span>
                 </button>
-                <span class="navbar-brand">Apache Spot :: Netflow :: Ingest Summary</span>
+                <span class="navbar-brand">Apache Spot :: Ingest Summary</span>
             </div>
             <!-- Collect the nav links, forms, and other content for toggling -->
             <div class="collapse navbar-collapse" id="main-menu">


[29/49] incubator-spot git commit: Spot-ml various validations and cleaning (#171)

Posted by ev...@apache.org.
Spot-ml various validations and cleaning (#171)

* Flow and Proxy data validation added.

* Added data validation for the three different data types. Added InSchema, OutSchema select style to Proxy

* Added 'unknown' and '-1' keys to iana catalogs

* Update dns-rcode.csv

Removed extra added column

* Adding one more validation for DNS.QueryName

* Adding validations

* Changing uploadResultsFile function name to mergeResultsFiles

* Fixed userDomain issue introduced after merge

* Changes based on code review. Biggest change: queries and filters now use Column objects instead of SQL expression strings (see the sketch after this list)

* Deleting install_ml.sh as it's not required anymore

* Changes after code review

* Changes after code review

* Last changes after code review


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/760dbf34
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/760dbf34
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/760dbf34

Branch: refs/heads/master
Commit: 760dbf349fda2886fdfec7f6a261ee6c5d537ead
Parents: ab5ba53
Author: Ricardo Barona <ri...@intel.com>
Authored: Thu Dec 22 13:22:29 2016 -0600
Committer: GitHub <no...@github.com>
Committed: Thu Dec 22 13:22:29 2016 -0600

----------------------------------------------------------------------
 spot-ml/install_ml.sh                           |  12 --
 spot-ml/ml_ops.sh                               |  33 +--
 spot-ml/ml_test.sh                              |  30 +--
 .../org/apache/spot/SuspiciousConnects.scala    |  19 +-
 .../spot/SuspiciousConnectsScoreFunction.scala  |  19 +-
 .../scala/org/apache/spot/dns/DNSSchema.scala   |   2 -
 .../dns/DNSSuspiciousConnectsAnalysis.scala     | 172 +++++++++++----
 .../org/apache/spot/dns/DNSWordCreation.scala   |  28 ++-
 .../dns/model/DNSSuspiciousConnectsModel.scala  |  99 +++++++--
 .../FlowSuspiciousConnectsAnalysis.scala        | 119 ++++++++--
 .../apache/spot/netflow/FlowWordCreator.scala   |  65 +++---
 .../spot/netflow/model/FlowScoreFunction.scala  |   9 +-
 .../model/FlowSuspiciousConnectsModel.scala     |  73 ++++---
 .../org/apache/spot/proxy/ProxySchema.scala     |  49 +++++
 .../proxy/ProxySuspiciousConnectsAnalysis.scala | 155 +++++++++++--
 .../proxy/ProxySuspiciousConnectsModel.scala    |  65 ++++--
 .../apache/spot/proxy/ProxyWordCreation.scala   |  27 ++-
 .../utilities/data/InputOutputDataHandler.scala |  63 ++++++
 .../data/validation/InvalidDataHandler.scala    |  56 +++++
 .../org/apache/spot/DNSWordCreationTest.scala   |  21 --
 .../org/apache/spot/FlowWordCreatorTest.scala   | 216 -------------------
 .../dns/DNSSuspiciousConnectsAnalysisTest.scala | 114 ++++++++++
 .../apache/spot/dns/DNSWordCreationTest.scala   |  17 ++
 .../FlowSuspiciousCoonectsAnalysis.scala        | 125 +++++++++++
 .../spot/netflow/FlowWordCreatorTest.scala      | 214 ++++++++++++++++++
 .../ProxySuspiciousConnectsAnalysisTest.scala   | 135 ++++++++++++
 spot-oa/oa/components/iana/dns-qclass.csv       |   1 +
 spot-oa/oa/components/iana/dns-qtype.csv        |   1 +
 spot-oa/oa/components/iana/dns-rcode.csv        |   1 +
 spot-setup/spot.conf                            |  13 +-
 30 files changed, 1431 insertions(+), 522 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/install_ml.sh
----------------------------------------------------------------------
diff --git a/spot-ml/install_ml.sh b/spot-ml/install_ml.sh
deleted file mode 100755
index 2f66816..0000000
--- a/spot-ml/install_ml.sh
+++ /dev/null
@@ -1,12 +0,0 @@
- #!/bin/bash
-
-source /etc/spot.conf
-
-#  copy solution files to all nodes
-for d in "${NODES[@]}" 
-do
-    rsync -v -a --include='target' --include='target/scala-2.10' --include='target/scala-2.10/spot-ml-assembly-1.1.jar' \
-      --include 'top-1m.csv' --include='*.sh' \
-      --exclude='*' .  $d:${LUSER}/ml
-done
-

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/ml_ops.sh
----------------------------------------------------------------------
diff --git a/spot-ml/ml_ops.sh b/spot-ml/ml_ops.sh
index cf1bc31..a3406e8 100755
--- a/spot-ml/ml_ops.sh
+++ b/spot-ml/ml_ops.sh
@@ -46,33 +46,11 @@ else
 fi
 
 FEEDBACK_PATH=${LPATH}/${DSOURCE}_scores.csv
-DUPFACTOR=1000
-
-PREPROCESS_STEP=${DSOURCE}_pre_lda
-POSTPROCESS_STEP=${DSOURCE}_post_lda
-
-HDFS_WORDCOUNTS=${HPATH}/word_counts
-
-# paths for intermediate files
-HDFS_DOCRESULTS=${HPATH}/doc_results.csv
-LOCAL_DOCRESULTS=${LPATH}/doc_results.csv
-
-HDFS_WORDRESULTS=${HPATH}/word_results.csv
-LOCAL_WORDRESULTS=${LPATH}/word_results.csv
 
 HDFS_SCORED_CONNECTS=${HPATH}/scores
-HDFS_MODEL=${HPATH}/model
 
 LDA_OUTPUT_DIR=${DSOURCE}/${FDATE}
 
-TOPIC_COUNT=20
-
-nodes=${NODES[0]}
-for n in "${NODES[@]:1}" ; do nodes+=",${n}"; done
-
-hdfs dfs -rm -R -f ${HDFS_WORDCOUNTS}
-wait
-
 mkdir -p ${LPATH}
 rm -f ${LPATH}/*.{dat,beta,gamma,other,pkl} # protect the flow_scores.csv file
 
@@ -91,7 +69,7 @@ time spark-submit --class "org.apache.spot.SuspiciousConnects" \
   --conf spark.kryoserializer.buffer.max=512m \
   --conf spark.yarn.am.waitTime=100s \
   --conf spark.yarn.am.memoryOverhead=${SPK_DRIVER_MEM_OVERHEAD} \
-  --conf spark.yarn.executor.memoryOverhead=${SPAK_EXEC_MEM_OVERHEAD} target/scala-2.10/spot-ml-assembly-1.1.jar \
+  --conf spark.yarn.executor.memoryOverhead=${SPK_EXEC_MEM_OVERHEAD} target/scala-2.10/spot-ml-assembly-1.1.jar \
   --analysis ${DSOURCE} \
   --input ${RAWDATA_PATH}  \
   --dupfactor ${DUPFACTOR} \
@@ -101,11 +79,4 @@ time spark-submit --class "org.apache.spot.SuspiciousConnects" \
   --scored ${HDFS_SCORED_CONNECTS} \
   --threshold ${TOL} \
   --maxresults ${MAXRESULTS} \
-  --ldamaxiterations 20
-
-wait
-
-# move results to hdfs.
-cd ${LPATH}
-hadoop fs -getmerge ${HDFS_SCORED_CONNECTS}/part-* ${DSOURCE}_results.csv && hadoop fs -moveFromLocal \
-    ${DSOURCE}_results.csv  ${HDFS_SCORED_CONNECTS}/${DSOURCE}_results.csv
+  --ldamaxiterations 20
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/ml_test.sh
----------------------------------------------------------------------
diff --git a/spot-ml/ml_test.sh b/spot-ml/ml_test.sh
index 653d276..dbf7f8c 100755
--- a/spot-ml/ml_test.sh
+++ b/spot-ml/ml_test.sh
@@ -18,31 +18,9 @@ LPATH=${LUSER}/ml/${DSOURCE}/test
 HPATH=${HUSER}/${DSOURCE}/test/scored_results
 # prepare parameters pipeline stages
 
-
 FEEDBACK_PATH=${LPATH}/${DSOURCE}_scores.csv
-DUPFACTOR=1000
-
-HDFS_WORDCOUNTS=${HPATH}/word_counts
-
-# paths for intermediate files
-HDFS_DOCRESULTS=${HPATH}/doc_results.csv
-LOCAL_DOCRESULTS=${LPATH}/doc_results.csv
-
-HDFS_WORDRESULTS=${HPATH}/word_results.csv
-LOCAL_WORDRESULTS=${LPATH}/word_results.csv
 
 HDFS_SCORED_CONNECTS=${HPATH}/scores
-HDFS_MODEL=${HPATH}/model
-
-LDA_OUTPUT_DIR=test/${DSOURCE}
-
-TOPIC_COUNT=20
-
-nodes=${NODES[0]}
-for n in "${NODES[@]:1}" ; do nodes+=",${n}"; done
-
-hdfs dfs -rm -R -f ${HDFS_WORDCOUNTS}
-wait
 
 mkdir -p ${LPATH}
 rm -f ${LPATH}/*.{dat,beta,gamma,other,pkl} # protect the flow_scores.csv file
@@ -66,7 +44,7 @@ time spark-submit --class "org.apache.spot.SuspiciousConnects" \
   --conf spark.shuffle.service.enabled=true \
   --conf spark.yarn.am.waitTime=1000000 \
   --conf spark.yarn.driver.memoryOverhead=${SPK_DRIVER_MEM_OVERHEAD} \
-  --conf spark.yarn.executor.memoryOverhead=${SPAK_EXEC_MEM_OVERHEAD} target/scala-2.10/spot-ml-assembly-1.1.jar \
+  --conf spark.yarn.executor.memoryOverhead=${SPK_EXEC_MEM_OVERHEAD} target/scala-2.10/spot-ml-assembly-1.1.jar \
   --analysis ${DSOURCE} \
   --input ${RAWDATA_PATH}  \
   --dupfactor ${DUPFACTOR} \
@@ -75,8 +53,4 @@ time spark-submit --class "org.apache.spot.SuspiciousConnects" \
   --scored ${HDFS_SCORED_CONNECTS} \
   --threshold ${TOL} \
   --maxresults ${MAXRESULTS} \
-  --ldamaxiterations 11
-
-cd ${LPATH}
-hadoop fs -getmerge ${HDFS_SCORED_CONNECTS}/part-* ${DSOURCE}_results.csv && hadoop fs -moveFromLocal \
-    ${DSOURCE}_results.csv  ${HDFS_SCORED_CONNECTS}/${DSOURCE}_results.csv
\ No newline at end of file
+  --ldamaxiterations 11
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnects.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnects.scala b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnects.scala
index 8751189..fc7606e 100644
--- a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnects.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnects.scala
@@ -7,6 +7,7 @@ import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
 import org.apache.spot.dns.DNSSuspiciousConnectsAnalysis
 import org.apache.spot.netflow.FlowSuspiciousConnectsAnalysis
 import org.apache.spot.proxy.ProxySuspiciousConnectsAnalysis
+import org.apache.spot.utilities.data.InputOutputDataHandler
 
 
 /**
@@ -43,16 +44,26 @@ object SuspiciousConnects {
         val sqlContext = new SQLContext(sparkContext)
         implicit val outputDelimiter = config.outputDelimiter
 
+        val inputDataFrame = InputOutputDataHandler.getInputDataFrame(sqlContext, config.inputPath, logger)
+          .getOrElse(sqlContext.emptyDataFrame)
+        if(inputDataFrame.rdd.isEmpty()) {
+          logger.error("Couldn't read data from location " + config.inputPath +", please verify it's a valid location and that " +
+            s"contains parquet files with a given schema and try again.")
+          System.exit(0)
+        }
+
         analysis match {
-          case "flow" => FlowSuspiciousConnectsAnalysis.run(config, sparkContext, sqlContext, logger)
-          case "dns" => DNSSuspiciousConnectsAnalysis.run(config, sparkContext, sqlContext, logger)
-          case "proxy" => ProxySuspiciousConnectsAnalysis.run(config, sparkContext, sqlContext, logger)
+          case "flow" => FlowSuspiciousConnectsAnalysis.run(config, sparkContext, sqlContext, logger, inputDataFrame)
+          case "dns" => DNSSuspiciousConnectsAnalysis.run(config, sparkContext, sqlContext, logger, inputDataFrame)
+          case "proxy" => ProxySuspiciousConnectsAnalysis.run(config, sparkContext, sqlContext, logger, inputDataFrame)
           case _ => logger.error("Unsupported (or misspelled) analysis: " + analysis)
         }
 
+        InputOutputDataHandler.mergeResultsFiles(sparkContext, config.hdfsScoredConnect, analysis, logger)
+
         sparkContext.stop()
 
-      case None => logger.error("Error parsing arguments")
+      case None => logger.error("Error parsing arguments.")
     }
 
     System.exit(0)

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsScoreFunction.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsScoreFunction.scala b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsScoreFunction.scala
index e7f901b..04db60e 100644
--- a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsScoreFunction.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnectsScoreFunction.scala
@@ -1,6 +1,7 @@
 package org.apache.spot
 
 import org.apache.spark.broadcast.Broadcast
+import org.apache.spot.utilities.data.validation.InvalidDataHandler
 
 
 
@@ -12,13 +13,17 @@ class SuspiciousConnectsScoreFunction(topicCount: Int,
 
     val zeroProb = Array.fill(topicCount) { 0d }
 
-    // If either the ip or the word key value cannot be found it means that it was not seen in training.
-    val topicGivenDocProbs = ipToTopicMixBC.value.getOrElse(ip, zeroProb)
-    val wordGivenTopicProbs = wordToPerTopicProbBC.value.getOrElse(word, zeroProb)
-
-    topicGivenDocProbs.zip(wordGivenTopicProbs)
-      .map({ case (pWordGivenTopic, pTopicGivenDoc) => pWordGivenTopic * pTopicGivenDoc })
-      .sum
+    if(word == InvalidDataHandler.WordError){
+      InvalidDataHandler.ScoreError
+    } else {
+      // If either the ip or the word key value cannot be found it means that it was not seen in training.
+      val topicGivenDocProbs = ipToTopicMixBC.value.getOrElse(ip, zeroProb)
+      val wordGivenTopicProbs = wordToPerTopicProbBC.value.getOrElse(word, zeroProb)
+
+      topicGivenDocProbs.zip(wordGivenTopicProbs)
+        .map({ case (pTopicGivenDoc, pWordGivenTopic) => pTopicGivenDoc * pWordGivenTopic })
+        .sum
+    }
   }
 
 }

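The happy path above is the usual topic-mixture likelihood, P(word | ip) = sum over topics t of P(t | ip) * P(word | t), with all-zero vectors standing in for IPs or words unseen in training. A self-contained sketch of just that arithmetic; the sentinel value here is an assumed stand-in, the real one lives in InvalidDataHandler.ScoreError:

    // Sketch of the mixture score; -1.0 is an assumed stand-in for
    // InvalidDataHandler.ScoreError, used to mark unscorable records.
    val AssumedScoreError = -1.0d

    def mixtureScore(topicGivenDoc: Array[Double], wordGivenTopic: Array[Double]): Double =
      topicGivenDoc.zip(wordGivenTopic)
        .map { case (pTopicGivenDoc, pWordGivenTopic) => pTopicGivenDoc * pWordGivenTopic }
        .sum

    // mixtureScore(Array(0.9, 0.1), Array(0.8, 0.05)) == 0.9*0.8 + 0.1*0.05 == 0.725
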
http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/main/scala/org/apache/spot/dns/DNSSchema.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/DNSSchema.scala b/spot-ml/src/main/scala/org/apache/spot/dns/DNSSchema.scala
index fd8f33e..020fc46 100644
--- a/spot-ml/src/main/scala/org/apache/spot/dns/DNSSchema.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/dns/DNSSchema.scala
@@ -1,8 +1,6 @@
 package org.apache.spot.dns
 
-import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types._
-import org.apache.spot.dns.model.DNSSuspiciousConnectsModel.ModelSchema
 
 /**
   * Data frame schemas and column names used in the DNS suspicious connects analysis.

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala b/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
index 4ef4718..f444dfe 100644
--- a/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/dns/DNSSuspiciousConnectsAnalysis.scala
@@ -8,8 +8,9 @@ import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
 import org.apache.spot.dns.DNSSchema._
 import org.apache.spot.dns.model.DNSSuspiciousConnectsModel
 import org.apache.log4j.Logger
-
 import org.apache.spot.dns.model.DNSSuspiciousConnectsModel.ModelSchema
+import org.apache.spot.proxy.ProxySchema.Score
+import org.apache.spot.utilities.data.validation.{InvalidDataHandler => dataValidation}
 
 /**
   * The suspicious connections analysis of DNS log data develops a probabilistic model of the DNS queries
@@ -18,28 +19,6 @@ import org.apache.spot.dns.model.DNSSuspiciousConnectsModel.ModelSchema
 
 object DNSSuspiciousConnectsAnalysis {
 
-  val inSchema = StructType(List(TimestampField, UnixTimestampField, FrameLengthField, ClientIPField,
-      QueryNameField, QueryClassField, QueryTypeField, QueryResponseCodeField))
-
-  val inColumns = inSchema.fieldNames.map(col)
-
-
-  assert(ModelSchema.fields.forall(inSchema.fields.contains(_)))
-
-  val OutSchema = StructType(
-    List(TimestampField,
-      UnixTimestampField,
-      FrameLengthField,
-      ClientIPField,
-      QueryNameField,
-      QueryClassField,
-      QueryTypeField,
-      QueryResponseCodeField,
-      ScoreField))
-
-  val OutColumns = OutSchema.fieldNames.map(col)
-
-
   /**
     * Run suspicious connections analysis on DNS log data.
     *
@@ -48,34 +27,153 @@ object DNSSuspiciousConnectsAnalysis {
     * @param sqlContext
     * @param logger
     */
-  def run(config: SuspiciousConnectsConfig, sparkContext: SparkContext, sqlContext: SQLContext, logger: Logger) = {
-    logger.info("Starting DNS suspicious connects analysis.")
+  def run(config: SuspiciousConnectsConfig, sparkContext: SparkContext, sqlContext: SQLContext, logger: Logger,
+          inputDNSRecords: DataFrame) = {
 
-
-    logger.info("Loading data")
+    logger.info("Starting DNS suspicious connects analysis.")
 
     val userDomain = config.userDomain
 
-    val rawDataDF = sqlContext.read.parquet(config.inputPath)
-      .filter(Timestamp + " is not null and " + UnixTimestamp + " is not null")
-      .select(inColumns:_*)
+    val cleanDNSRecords = filterAndSelectCleanDNSRecords(inputDNSRecords)
 
     logger.info("Training the model")
 
     val model =
-      DNSSuspiciousConnectsModel.trainNewModel(sparkContext, sqlContext, logger, config, rawDataDF, config.topicCount)
+      DNSSuspiciousConnectsModel.trainNewModel(sparkContext, sqlContext, logger, config, cleanDNSRecords, config.topicCount)
 
     logger.info("Scoring")
-    val scoredDF = model.score(sparkContext, sqlContext, rawDataDF, userDomain)
+    val scoredDNSRecords = model.score(sparkContext, sqlContext, cleanDNSRecords, userDomain)
 
+    val filteredDNSRecords = filterScoredDNSRecords(scoredDNSRecords, config.threshold)
 
-    val filteredDF = scoredDF.filter(Score + " <= " + config.threshold)
-    val mostSusipiciousDF: DataFrame = filteredDF.orderBy(Score).limit(config.maxResults)
+    val orderedDNSRecords = filteredDNSRecords.orderBy(Score)
 
-    val outputDF = mostSusipiciousDF.select(OutColumns:_*).sort(Score)
+    val mostSuspiciousDNSRecords = if(config.maxResults > 0)  orderedDNSRecords.limit(config.maxResults) else orderedDNSRecords
 
-    logger.info("DNS  suspcicious connects analysis completed.")
+    val outputDNSRecords = mostSuspiciousDNSRecords.select(OutSchema:_*).sort(Score)
+
+    logger.info("DNS  suspicious connects analysis completed.")
     logger.info("Saving results to : " + config.hdfsScoredConnect)
-    outputDF.map(_.mkString(config.outputDelimiter)).saveAsTextFile(config.hdfsScoredConnect)
+    outputDNSRecords.map(_.mkString(config.outputDelimiter)).saveAsTextFile(config.hdfsScoredConnect)
+
+    val invalidDNSRecords = filterAndSelectInvalidDNSRecords(inputDNSRecords)
+    dataValidation.showAndSaveInvalidRecords(invalidDNSRecords, config.hdfsScoredConnect, logger)
+
+    val corruptDNSRecords = filterAndSelectCorruptDNSRecords(scoredDNSRecords)
+    dataValidation.showAndSaveCorruptRecords(corruptDNSRecords, config.hdfsScoredConnect, logger)
+  }
+
+  /**
+    *
+    * @param inputDNSRecords raw DNS records.
+    * @return
+    */
+  def filterAndSelectCleanDNSRecords(inputDNSRecords: DataFrame): DataFrame ={
+
+    val cleanDNSRecordsFilter = inputDNSRecords(Timestamp).isNotNull &&
+      inputDNSRecords(Timestamp).notEqual("") &&
+      inputDNSRecords(Timestamp).notEqual("-") &&
+      inputDNSRecords(UnixTimestamp).isNotNull &&
+      inputDNSRecords(FrameLength).isNotNull &&
+      inputDNSRecords(QueryName).isNotNull &&
+      inputDNSRecords(QueryName).notEqual("") &&
+      inputDNSRecords(QueryName).notEqual("-") &&
+      inputDNSRecords(QueryName).notEqual("(empty)") &&
+      inputDNSRecords(ClientIP).isNotNull &&
+      inputDNSRecords(ClientIP).notEqual("") &&
+      inputDNSRecords(ClientIP).notEqual("-") &&
+      ((inputDNSRecords(QueryClass).isNotNull &&
+        inputDNSRecords(QueryClass).notEqual("") &&
+        inputDNSRecords(QueryClass).notEqual("-")) ||
+        inputDNSRecords(QueryType).isNotNull ||
+        inputDNSRecords(QueryResponseCode).isNotNull)
+
+    inputDNSRecords
+      .filter(cleanDNSRecordsFilter)
+      .select(InSchema: _*)
+      .na.fill(DefaultQueryClass, Seq(QueryClass))
+      .na.fill(DefaultQueryType, Seq(QueryType))
+      .na.fill(DefaultQueryResponseCode, Seq(QueryResponseCode))
+  }
+
+  /**
+    *
+    * @param inputDNSRecords raw DNS records.
+    * @return
+    */
+  def filterAndSelectInvalidDNSRecords(inputDNSRecords: DataFrame): DataFrame ={
+
+    val invalidDNSRecordsFilter = inputDNSRecords(Timestamp).isNull ||
+      inputDNSRecords(Timestamp).equalTo("") ||
+      inputDNSRecords(Timestamp).equalTo("-") ||
+      inputDNSRecords(UnixTimestamp).isNull ||
+      inputDNSRecords(FrameLength).isNull ||
+      inputDNSRecords(QueryName).isNull ||
+      inputDNSRecords(QueryName).equalTo("") ||
+      inputDNSRecords(QueryName).equalTo("-") ||
+      inputDNSRecords(QueryName).equalTo("(empty)") ||
+      inputDNSRecords(ClientIP).isNull ||
+      inputDNSRecords(ClientIP).equalTo("") ||
+      inputDNSRecords(ClientIP).equalTo("-") ||
+      ((inputDNSRecords(QueryClass).isNull ||
+        inputDNSRecords(QueryClass).equalTo("") ||
+        inputDNSRecords(QueryClass).equalTo("-")) &&
+        inputDNSRecords(QueryType).isNull &&
+        inputDNSRecords(QueryResponseCode).isNull)
+
+    inputDNSRecords
+      .filter(invalidDNSRecordsFilter)
+      .select(InSchema: _*)
+  }
+
+  /**
+    *
+    * @param scoredDNSRecords scored DNS records.
+    * @param threshold score tolerance.
+    * @return
+    */
+  def filterScoredDNSRecords(scoredDNSRecords: DataFrame, threshold: Double): DataFrame ={
+
+    val filteredDNSRecordsFilter = scoredDNSRecords(Score).leq(threshold) &&
+      scoredDNSRecords(Score).gt(dataValidation.ScoreError)
+
+    scoredDNSRecords.filter(filteredDNSRecordsFilter)
   }
+
+  /**
+    *
+    * @param scoredDNSRecords scored DNS records.
+    * @return
+    */
+  def filterAndSelectCorruptDNSRecords(scoredDNSRecords: DataFrame): DataFrame = {
+
+    val corruptDNSRecordsFilter = scoredDNSRecords(Score).equalTo(dataValidation.ScoreError)
+
+    scoredDNSRecords
+      .filter(corruptDNSRecordsFilter)
+      .select(OutSchema: _*)
+
+  }
+
+  val DefaultQueryClass = "unknown"
+  val DefaultQueryType = -1
+  val DefaultQueryResponseCode = -1
+
+  val InStructType = StructType(List(TimestampField, UnixTimestampField, FrameLengthField, ClientIPField,
+    QueryNameField, QueryClassField, QueryTypeField, QueryResponseCodeField))
+
+  val InSchema = InStructType.fieldNames.map(col)
+
+  assert(ModelSchema.fields.forall(InStructType.fields.contains(_)))
+
+  val OutSchema = StructType(
+    List(TimestampField,
+      UnixTimestampField,
+      FrameLengthField,
+      ClientIPField,
+      QueryNameField,
+      QueryClassField,
+      QueryTypeField,
+      QueryResponseCodeField,
+      ScoreField)).fieldNames.map(col)
 }
\ No newline at end of file

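One detail worth noting in filterAndSelectCleanDNSRecords above: the OR-group lets a record through when only some of query class/type/response code are present, and the na.fill calls then back-fill the gaps with sentinels so word creation always sees a value. In isolation, with column names as illustrative stand-ins for the DNSSchema constants:

    import org.apache.spark.sql.DataFrame

    // Sketch: back-fill sentinel defaults for optional DNS fields after the
    // null-tolerant filter lets partially populated records through.
    def fillDnsDefaults(records: DataFrame): DataFrame =
      records
        .na.fill("unknown", Seq("dns_qry_class"))
        .na.fill(-1, Seq("dns_qry_type", "dns_qry_rcode"))
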
http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala b/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala
index e4595e1..383eb2f 100644
--- a/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/dns/DNSWordCreation.scala
@@ -4,6 +4,9 @@ import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.sql.functions._
 import org.apache.spot.utilities.DomainProcessor.{DomainInfo, extractDomainInfo}
 import org.apache.spot.utilities.Quantiles
+import org.apache.spot.utilities.data.validation.InvalidDataHandler
+
+import scala.util.{Failure, Success, Try}
 
 
 /**
@@ -79,18 +82,23 @@ class DNSWordCreation(frameLengthCuts: Array[Double],
               dnsQueryType: Int,
               dnsQueryRcode: Int): String = {
 
+    Try {
+      val DomainInfo(domain, topDomain, subdomain, subdomainLength, subdomainEntropy, numPeriods) =
+        extractDomainInfo(queryName, topDomainsBC, userDomain)
 
-    val DomainInfo(domain, topDomain, subdomain, subdomainLength, subdomainEntropy, numPeriods) =
-      extractDomainInfo(queryName, topDomainsBC, userDomain)
+      Seq(topDomain,
+        Quantiles.bin(frameLength.toDouble, frameLengthCuts),
+        Quantiles.bin(unixTimeStamp.toDouble, timeCuts),
+        Quantiles.bin(subdomainLength.toDouble, subdomainLengthCuts),
+        Quantiles.bin(subdomainEntropy, entropyCuts),
+        Quantiles.bin(numPeriods.toDouble, numberPeriodsCuts),
+        dnsQueryType,
+        dnsQueryRcode).mkString("_")
+    } match {
+      case Success(word) => word
+      case _ => InvalidDataHandler.WordError
+    }
 
-    Seq(topDomain,
-      Quantiles.bin(frameLength.toDouble, frameLengthCuts),
-      Quantiles.bin(unixTimeStamp.toDouble, timeCuts),
-      Quantiles.bin(subdomainLength.toDouble, subdomainLengthCuts),
-      Quantiles.bin(subdomainEntropy, entropyCuts),
-      Quantiles.bin(numPeriods.toDouble, numberPeriodsCuts),
-      dnsQueryType,
-      dnsQueryRcode).mkString("_")
   }
 }
 

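Each DNS word above is an underscore-joined tuple of a top-domain flag, five binned features, and the query type/rcode. Quantiles.bin itself is not shown in this commit; a hypothetical stand-in with the behavior the call sites imply (index of the first cut-off the value does not exceed) would be:

    // Hypothetical stand-in for Quantiles.bin, for illustration only: with
    // sorted cut points, return the index of the first cut the value does
    // not exceed; values above every cut land in the final bin.
    def bin(value: Double, cuts: Array[Double]): Int = {
      val idx = cuts.indexWhere(value <= _)
      if (idx >= 0) idx else cuts.length
    }

    // bin(5.0, Array(2.0, 4.0, 8.0)) == 2;  bin(9.0, Array(2.0, 4.0, 8.0)) == 3
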
http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala
index 047e262..47c32a7 100644
--- a/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/dns/model/DNSSuspiciousConnectsModel.scala
@@ -13,8 +13,11 @@ import org.apache.spot.dns.DNSWordCreation
 import org.apache.spot.lda.SpotLDAWrapper
 import org.apache.spot.lda.SpotLDAWrapper.{SpotLDAInput, SpotLDAOutput}
 import org.apache.spot.utilities.DomainProcessor.DomainInfo
+import org.apache.spot.utilities.data.validation.InvalidDataHandler
 import org.apache.spot.utilities.{CountryCodes, DomainProcessor, Quantiles, TopDomains}
 
+import scala.util.{Failure, Success, Try}
+
 
 /**
   * A probabilistic model of the DNS queries issued by each client IP.
@@ -138,7 +141,7 @@ object DNSSuspiciousConnectsModel {
     * @param logger
     * @param config     Analysis configuration object containing CLI parameters.
     *                   Contains the path to the feedback file in config.scoresFile
-    * @param inDF       Data used to train the model.
+    * @param inputRecords Data used to train the model.
     * @param topicCount Number of topics (traffic profiles) used to build the model.
     * @return A new [[DNSSuspiciousConnectsModel]] instance trained on the dataframe and feedback file.
     */
@@ -146,14 +149,14 @@ object DNSSuspiciousConnectsModel {
                     sqlContext: SQLContext,
                     logger: Logger,
                     config: SuspiciousConnectsConfig,
-                    inDF: DataFrame,
+                    inputRecords: DataFrame,
                     topicCount: Int): DNSSuspiciousConnectsModel = {
 
     logger.info("Training DNS suspicious connects model from " + config.inputPath)
 
-    val selectedDF = inDF.select(modelColumns: _*)
+    val selectedRecords = inputRecords.select(modelColumns: _*)
 
-    val totalDataDF = selectedDF.unionAll(DNSFeedback.loadFeedbackDF(sparkContext,
+    val totalRecords = selectedRecords.unionAll(DNSFeedback.loadFeedbackDF(sparkContext,
       sqlContext,
       config.feedbackFile,
       config.duplicationFactor))
@@ -164,22 +167,70 @@ object DNSSuspiciousConnectsModel {
 
     // create quantile cut-offs
 
-    val timeCuts = Quantiles.computeDeciles(totalDataDF.select(UnixTimestamp).rdd.
-      map({ case Row(unixTimeStamp: Long) => unixTimeStamp.toDouble }))
-
-    val frameLengthCuts = Quantiles.computeDeciles(totalDataDF.select(FrameLength).rdd
-      .map({ case Row(frameLen: Int) => frameLen.toDouble }))
-
-    val domainStatsDF = createDomainStatsDF(sparkContext, sqlContext, countryCodesBC, topDomainsBC, userDomain, totalDataDF)
-
-    val subdomainLengthCuts = Quantiles.computeQuintiles(domainStatsDF.filter(SubdomainLength + " > 0")
-      .select(SubdomainLength).rdd.map({ case Row(subdomainLength: Int) => subdomainLength.toDouble }))
-
-    val entropyCuts = Quantiles.computeQuintiles(domainStatsDF.filter(SubdomainEntropy + " > 0")
-      .select(SubdomainEntropy).rdd.map({ case Row(subdomainEntropy: Double) => subdomainEntropy }))
-
-    val numberPeriodsCuts = Quantiles.computeQuintiles(domainStatsDF.filter(NumPeriods + " > 0")
-      .select(NumPeriods).rdd.map({ case Row(numberPeriods: Int) => numberPeriods.toDouble }))
+    val timeCuts =
+      Quantiles.computeDeciles(totalRecords
+        .select(UnixTimestamp)
+        .rdd
+        .flatMap({ case Row(unixTimeStamp: Long) => {
+          Try {unixTimeStamp.toDouble} match {
+              case Failure(_) => Seq()
+              case Success(timestamp) => Seq(timestamp)
+            }
+          }
+        }))
+
+    val frameLengthCuts =
+      Quantiles.computeDeciles(totalRecords
+        .select(FrameLength)
+        .rdd
+        .flatMap({case Row(frameLen: Int) => {
+            Try{frameLen.toDouble} match{
+              case Failure(_) => Seq()
+              case Success(frameLen) => Seq(frameLen)
+            }
+          }
+        }))
+
+    val domainStatsRecords = createDomainStatsDF(sparkContext, sqlContext, countryCodesBC, topDomainsBC, userDomain, totalRecords)
+
+    val subdomainLengthCuts =
+      Quantiles.computeQuintiles(domainStatsRecords
+        .filter(domainStatsRecords(SubdomainLength).gt(0))
+        .select(SubdomainLength)
+        .rdd
+        .flatMap({ case Row(subdomainLength: Int) => {
+            Try{subdomainLength.toDouble} match {
+              case Failure(_) => Seq()
+              case Success(subdomainLength) => Seq(subdomainLength)
+            }
+          }
+        }))
+
+    val entropyCuts =
+      Quantiles.computeQuintiles(domainStatsRecords
+        .filter(domainStatsRecords(SubdomainEntropy).gt(0))
+        .select(SubdomainEntropy)
+        .rdd
+        .flatMap({ case Row(subdomainEntropy: Double) => {
+          Try{subdomainEntropy.toDouble} match {
+            case Failure(_) => Seq()
+            case Success(subdomainEntropy) => Seq(subdomainEntropy)
+            }
+          }
+        }))
+
+    val numberPeriodsCuts =
+      Quantiles.computeQuintiles(domainStatsRecords
+        .filter(domainStatsRecords(NumPeriods).gt(0))
+        .select(NumPeriods)
+        .rdd
+        .flatMap({ case Row(numberPeriods: Int) => {
+          Try {numberPeriods.toDouble} match {
+            case Failure(_) => Seq()
+            case Success(numberPeriods) => Seq(numberPeriods)
+            }
+          }
+        }))
 
     // simplify DNS log entries into "words"
 
@@ -191,11 +242,14 @@ object DNSSuspiciousConnectsModel {
                                              topDomainsBC,
                                              userDomain)
 
-    val dataWithWordDF = totalDataDF.withColumn(Word, dnsWordCreator.wordCreationUDF(modelColumns: _*))
+    val dataWithWord = totalRecords.withColumn(Word, dnsWordCreator.wordCreationUDF(modelColumns: _*))
 
     // aggregate per-word counts at each IP
     val ipDstWordCounts =
-      dataWithWordDF.select(ClientIP, Word).map({ case Row(destIP: String, word: String) => (destIP, word) -> 1 })
+      dataWithWord
+        .select(ClientIP, Word)
+        .filter(dataWithWord(Word).notEqual(InvalidDataHandler.WordError))
+        .map({ case Row(destIP: String, word: String) => (destIP, word) -> 1 })
         .reduceByKey(_ + _)
         .map({ case ((ipDst, word), count) => SpotLDAInput(ipDst, word, count) })
 
@@ -251,6 +305,7 @@ object DNSSuspiciousConnectsModel {
                           topDomainsBC: Broadcast[Set[String]],
                           userDomain: String,
                           inDF: DataFrame): DataFrame = {
+
     val queryNameIndex = inDF.schema.fieldNames.indexOf(QueryName)
 
     val domainStatsRDD: RDD[Row] = inDF.rdd.map(row =>

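All of the Try/flatMap blocks introduced above instantiate one pattern: a value becomes a singleton Seq on success and an empty Seq on failure, so malformed rows silently drop out before the quantile computation instead of killing the job. Distilled to its core, with names illustrative:

    import org.apache.spark.rdd.RDD
    import scala.util.{Failure, Success, Try}

    // Sketch: keep only values that convert cleanly to Double; bad rows
    // vanish rather than throwing on the executors.
    def doublesOrDropped(rdd: RDD[String]): RDD[Double] =
      rdd.flatMap { raw =>
        Try(raw.toDouble) match {
          case Success(d) => Seq(d)
          case Failure(_) => Seq()
        }
      }
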
http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala b/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala
index 32c0f6e..2ff1383 100644
--- a/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/netflow/FlowSuspiciousConnectsAnalysis.scala
@@ -8,7 +8,7 @@ import org.apache.spark.sql.{DataFrame, SQLContext}
 import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
 import org.apache.spot.netflow.FlowSchema._
 import org.apache.spot.netflow.model.FlowSuspiciousConnectsModel
-
+import org.apache.spot.utilities.data.validation.{InvalidDataHandler => dataValidation}
 
 /**
   * The suspicious connections analysis of netflow records develops a probabilistic model of the traffic about each
@@ -17,36 +17,119 @@ import org.apache.spot.netflow.model.FlowSuspiciousConnectsModel
 
 object FlowSuspiciousConnectsAnalysis {
 
-  def run(config: SuspiciousConnectsConfig, sparkContext: SparkContext, sqlContext: SQLContext, logger: Logger)(implicit outputDelimiter: String) = {
-
-    logger.info("Loading data")
+  def run(config: SuspiciousConnectsConfig, sparkContext: SparkContext, sqlContext: SQLContext,
+          logger: Logger, inputFlowRecords: DataFrame) = {
 
-    val rawDataDF = sqlContext.read.parquet(config.inputPath)
-      .filter(Hour + " BETWEEN 0 AND 23 AND  " + Minute + " BETWEEN 0 AND 59 AND  " + Second + " BETWEEN 0 AND 59")
-      .select(inColumns: _*)
+    logger.info("Starting flow suspicious connects analysis.")
 
+    val cleanFlowRecords = filterAndSelectCleanFlowRecords(inputFlowRecords)
 
     logger.info("Training the model")
 
     val model =
-      FlowSuspiciousConnectsModel.trainNewModel(sparkContext, sqlContext, logger, config, rawDataDF, config.topicCount)
+      FlowSuspiciousConnectsModel.trainNewModel(sparkContext, sqlContext, logger, config, cleanFlowRecords, config.topicCount)
 
     logger.info("Scoring")
-    val scoredDF = model.score(sparkContext, sqlContext, rawDataDF)
+    val scoredFlowRecords = model.score(sparkContext, sqlContext, cleanFlowRecords)
 
-    val filteredDF = scoredDF.filter(Score + " <= " + config.threshold)
+    val filteredFlowRecords = filterScoredFlowRecords(scoredFlowRecords, config.threshold)
 
-    val mostSusipiciousDF: DataFrame = filteredDF.orderBy(Score).limit(config.maxResults)
+    val orderedFlowRecords = filteredFlowRecords.orderBy(Score)
 
+    val mostSuspiciousFlowRecords =
+      if(config.maxResults > 0 ) orderedFlowRecords.limit(config.maxResults) else orderedFlowRecords
 
-    val outputDF = mostSusipiciousDF.select(OutColumns: _*)
+    val outputFlowRecords = mostSuspiciousFlowRecords.select(OutSchema: _*)
 
     logger.info("Netflow  suspicious connects analysis completed.")
     logger.info("Saving results to : " + config.hdfsScoredConnect)
-    outputDF.map(_.mkString(config.outputDelimiter)).saveAsTextFile(config.hdfsScoredConnect)
+    outputFlowRecords.map(_.mkString(config.outputDelimiter)).saveAsTextFile(config.hdfsScoredConnect)
+
+    val invalidFlowRecords = filterAndSelectInvalidFlowRecords(inputFlowRecords)
+    dataValidation.showAndSaveInvalidRecords(invalidFlowRecords, config.hdfsScoredConnect, logger)
+
+    val corruptFlowRecords = filterAndSelectCorruptFlowRecords(scoredFlowRecords)
+    dataValidation.showAndSaveCorruptRecords(corruptFlowRecords, config.hdfsScoredConnect, logger)
+
   }
 
-  val inSchema = StructType(List(TimeReceivedField,
+  /**
+    *
+    * @param inputFlowRecords raw flow records
+    * @return
+    */
+  def filterAndSelectCleanFlowRecords(inputFlowRecords: DataFrame): DataFrame ={
+
+    val cleanFlowRecordsFilter = inputFlowRecords(Hour).between(0, 23) &&
+      inputFlowRecords(Minute).between(0, 59) &&
+      inputFlowRecords(Second).between(0, 59) &&
+      inputFlowRecords(TimeReceived).isNotNull &&
+      inputFlowRecords(SourceIP).isNotNull &&
+      inputFlowRecords(DestinationIP).isNotNull &&
+      inputFlowRecords(SourcePort).isNotNull &&
+      inputFlowRecords(DestinationPort).isNotNull &&
+      inputFlowRecords(Ibyt).isNotNull &&
+      inputFlowRecords(Ipkt).isNotNull
+
+    inputFlowRecords
+      .filter(cleanFlowRecordsFilter)
+      .select(InSchema: _*)
+
+  }
+
+  /**
+    *
+    * @param inputFlowRecords raw flow records.
+    * @return
+    */
+  def filterAndSelectInvalidFlowRecords(inputFlowRecords: DataFrame): DataFrame = {
+
+    val invalidFlowRecordsFilter = !inputFlowRecords(Hour).between(0, 23) ||
+      !inputFlowRecords(Minute).between(0, 59) ||
+      !inputFlowRecords(Second).between(0, 59) ||
+      inputFlowRecords(TimeReceived).isNull ||
+      inputFlowRecords(SourceIP).isNull ||
+      inputFlowRecords(DestinationIP).isNull ||
+      inputFlowRecords(SourcePort).isNull ||
+      inputFlowRecords(DestinationPort).isNull ||
+      inputFlowRecords(Ibyt).isNull ||
+      inputFlowRecords(Ipkt).isNull
+
+    inputFlowRecords
+      .filter(invalidFlowRecordsFilter)
+      .select(InSchema: _*)
+  }
+
+  /**
+    *
+    * @param scoredFlowRecords scored flow records.
+    * @param threshold score tolerance.
+    * @return
+    */
+  def filterScoredFlowRecords(scoredFlowRecords: DataFrame, threshold: Double): DataFrame = {
+
+    val filteredFlowRecordsFilter = scoredFlowRecords(Score).leq(threshold) &&
+    scoredFlowRecords(Score).gt(dataValidation.ScoreError)
+
+    scoredFlowRecords.filter(filteredFlowRecordsFilter)
+  }
+
+  /**
+    *
+    * @param scoredFlowRecords scored flow records.
+    * @return
+    */
+  def filterAndSelectCorruptFlowRecords(scoredFlowRecords: DataFrame): DataFrame = {
+
+    val corruptFlowRecordsFilter = scoredFlowRecords(Score).equalTo(dataValidation.ScoreError)
+
+    scoredFlowRecords
+      .filter(corruptFlowRecordsFilter)
+      .select(OutSchema: _*)
+
+  }
+
+  val InSchema = StructType(List(TimeReceivedField,
     YearField,
     MonthField,
     DayField,
@@ -62,9 +145,7 @@ object FlowSuspiciousConnectsAnalysis {
     IpktField,
     IbytField,
     OpktField,
-    ObytField))
-
-  val inColumns = inSchema.fieldNames.map(col)
+    ObytField)).fieldNames.map(col)
 
   val OutSchema = StructType(
     List(TimeReceivedField,
@@ -84,7 +165,5 @@ object FlowSuspiciousConnectsAnalysis {
       IbytField,
       OpktField,
       ObytField,
-      ScoreField))
-
-  val OutColumns = OutSchema.fieldNames.map(col)
+      ScoreField)).fieldNames.map(col)
 }
\ No newline at end of file

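A record is invalid exactly when it fails some clause of the clean filter, so a safer construction than restating and negating every clause by hand is to derive one predicate from the other; Column supports unary ! for this. A hedged sketch, with helper names and string column names invented for illustration:

    import org.apache.spark.sql.{Column, DataFrame}

    // Sketch: define the clean predicate once; its complement is the
    // invalid predicate, so the two can never drift apart.
    def cleanFlowPredicate(df: DataFrame): Column =
      df("hour").between(0, 23) && df("minute").between(0, 59) &&
        df("second").between(0, 59) && df("treceived").isNotNull
        // ...plus the remaining null checks from the diff above

    def invalidFlowPredicate(df: DataFrame): Column = !cleanFlowPredicate(df)
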
http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/main/scala/org/apache/spot/netflow/FlowWordCreator.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/netflow/FlowWordCreator.scala b/spot-ml/src/main/scala/org/apache/spot/netflow/FlowWordCreator.scala
index f82d270..50e4f71 100644
--- a/spot-ml/src/main/scala/org/apache/spot/netflow/FlowWordCreator.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/netflow/FlowWordCreator.scala
@@ -2,6 +2,9 @@ package org.apache.spot.netflow
 
 import org.apache.spark.sql.functions._
 import org.apache.spot.utilities.Quantiles
+import org.apache.spot.utilities.data.validation.InvalidDataHandler
+
+import scala.util.{Failure, Success, Try}
 
 
 /**
@@ -71,55 +74,59 @@ class FlowWordCreator(timeCuts: Array[Double],
     */
   def flowWords(hour: Int, minute: Int, second: Int, srcPort: Int, dstPort: Int, ipkt: Long, ibyt: Long): FlowWords = {
 
+    Try {
+      val timeOfDay: Double = hour.toDouble + minute.toDouble / 60 + second.toDouble / 3600
 
-    val timeOfDay: Double = hour.toDouble + minute.toDouble / 60 + second.toDouble / 3600
+      val timeBin = Quantiles.bin(timeOfDay, timeCuts)
+      val ibytBin = Quantiles.bin(ibyt, ibytCuts)
+      val ipktBin = Quantiles.bin(ipkt, ipktCuts)
 
-    val timeBin = Quantiles.bin(timeOfDay, timeCuts).toString()
-    val ibytBin = Quantiles.bin(ibyt, ibytCuts).toString()
-    val ipktBin = Quantiles.bin(ipkt, ipktCuts).toString()
 
+      val LowToLowPortEncoding = 111111
+      val HighToHighPortEncoding = 333333
 
-    val LowToLowPortEncoding = 111111
-    val HighToHighPortEncoding = 333333
+      if (dstPort == 0 && srcPort == 0) {
 
-    if (dstPort == 0 && srcPort == 0) {
+        val baseWord = Array("0", timeBin, ibytBin, ipktBin).mkString("_")
+        FlowWords(srcWord = baseWord, dstWord = baseWord)
 
-      val baseWord = Array("0", timeBin, ibytBin, ipktBin).mkString("_")
-      FlowWords(srcWord = baseWord, dstWord = baseWord)
+      } else if (dstPort == 0 && srcPort > 0) {
 
-    } else if (dstPort == 0 && srcPort > 0) {
+        val baseWord = Array(srcPort.toString(), timeBin, ibytBin, ipktBin).mkString("_")
+        FlowWords(srcWord = "-1_" + baseWord, dstWord = baseWord)
 
-      val baseWord = Array(srcPort.toString(), timeBin, ibytBin, ipktBin).mkString("_")
-      FlowWords(srcWord = "-1_" + baseWord, dstWord = baseWord)
+      } else if (srcPort == 0 && dstPort > 0) {
 
-    } else if (srcPort == 0 && dstPort > 0) {
+        val baseWord = Array(dstPort.toString(), timeBin, ibytBin, ipktBin).mkString("_")
+        FlowWords(srcWord = baseWord, dstWord = "-1_" + baseWord)
 
-      val baseWord = Array(dstPort.toString(), timeBin, ibytBin, ipktBin).mkString("_")
-      FlowWords(srcWord = baseWord, dstWord = "-1_" + baseWord)
+      } else if (srcPort <= 1024 && dstPort <= 1024) {
 
-    } else if (srcPort <= 1024 && dstPort <= 1024) {
+        val baseWord = Array(LowToLowPortEncoding, timeBin, ibytBin, ipktBin).mkString("_")
+        FlowWords(srcWord = baseWord, dstWord = baseWord)
 
-      val baseWord = Array(LowToLowPortEncoding, timeBin, ibytBin, ipktBin).mkString("_")
-      FlowWords(srcWord = baseWord, dstWord = baseWord)
+      } else if (srcPort <= 1024 && dstPort > 1024) {
 
-    } else if (srcPort <= 1024 && dstPort > 1024) {
+        val baseWord = Array(srcPort.toString(), timeBin, ibytBin, ipktBin).mkString("_")
+        FlowWords(srcWord = "-1_" + baseWord, dstWord = baseWord)
 
-      val baseWord = Array(srcPort.toString(), timeBin, ibytBin, ipktBin).mkString("_")
-      FlowWords(srcWord = "-1_" + baseWord, dstWord = baseWord)
+      } else if (srcPort > 1024 && dstPort <= 1024) {
 
-    } else if (srcPort > 1024 && dstPort <= 1024) {
+        val baseWord = Array(dstPort.toString(), timeBin, ibytBin, ipktBin).mkString("_")
+        FlowWords(srcWord = baseWord, dstWord = "-1_" + baseWord)
 
-      val baseWord = Array(dstPort.toString(), timeBin, ibytBin, ipktBin).mkString("_")
-      FlowWords(srcWord = baseWord, dstWord = "-1_" + baseWord)
+      } else {
 
-    } else {
+        // this is the srcPort > 1024 && dstPort > 1024 case
 
-      // this is the srcPort > 1024 && dstPort > 1024 case
+        val baseWord = Array(HighToHighPortEncoding, timeBin, ibytBin, ipktBin).mkString("_")
+        FlowWords(srcWord = baseWord, dstWord = baseWord)
+      }
 
-      val baseWord = Array(HighToHighPortEncoding, timeBin, ibytBin, ipktBin).mkString("_")
-      FlowWords(srcWord = baseWord, dstWord = baseWord)
+    } match {
+      case Success(flowWords) => flowWords
+      case _ => FlowWords(InvalidDataHandler.WordError, InvalidDataHandler.WordError)
     }
-
   }
 
 }

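The branch ladder above canonicalizes each flow into one word per endpoint: the distinguishing port (or a 111111/333333 code for low-to-low/high-to-high traffic) joined with the three bins, where the endpoint that contributed the distinguishing port gets a -1_ prefix on its copy of the word. A compressed restatement of three representative branches; the zero-port special cases from the diff are omitted here:

    // Sketch of the endpoint-word rule; bins is the precomputed
    // "timeBin_ibytBin_ipktBin" suffix. Returns (srcWord, dstWord).
    def endpointWords(srcPort: Int, dstPort: Int, bins: String): (String, String) =
      if (srcPort <= 1024 && dstPort <= 1024) (s"111111_$bins", s"111111_$bins")
      else if (srcPort > 1024 && dstPort <= 1024) (s"${dstPort}_$bins", s"-1_${dstPort}_$bins")
      else if (srcPort <= 1024 && dstPort > 1024) (s"-1_${srcPort}_$bins", s"${srcPort}_$bins")
      else (s"333333_$bins", s"333333_$bins")

    // endpointWords(54321, 80, "3_2_1") == ("80_3_2_1", "-1_80_3_2_1")
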
http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/main/scala/org/apache/spot/netflow/model/FlowScoreFunction.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/netflow/model/FlowScoreFunction.scala b/spot-ml/src/main/scala/org/apache/spot/netflow/model/FlowScoreFunction.scala
index 766d594..b82cd3f 100644
--- a/spot-ml/src/main/scala/org/apache/spot/netflow/model/FlowScoreFunction.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/netflow/model/FlowScoreFunction.scala
@@ -3,6 +3,7 @@ package org.apache.spot.netflow.model
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spot.SuspiciousConnectsScoreFunction
 import org.apache.spot.netflow.{FlowWordCreator, FlowWords}
+import org.apache.spot.utilities.data.validation.InvalidDataHandler
 
 
 /**
@@ -61,10 +62,14 @@ class FlowScoreFunction(timeCuts: Array[Double],
 
     val zeroProb = Array.fill(topicCount) { 0.0 }
 
-    /** A null value for srcTopicMix or dstTopicMix indicated the ip (source or dest respectively)
+    /** WordError indicates there was a problem creating a word; such records should not be scored.
+      *
+      * A null value for srcTopicMix or dstTopicMix indicates the ip (source or dest respectively)
       * was not seen in training.
       */
-    if (srcTopicMix == null || dstTopicMix == null) {
+    if(srcWord == InvalidDataHandler.WordError || dstWord == InvalidDataHandler.WordError){
+      InvalidDataHandler.ScoreError
+    } else if (srcTopicMix == null || dstTopicMix == null) {
        0.0
     } else {
 

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/main/scala/org/apache/spot/netflow/model/FlowSuspiciousConnectsModel.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/netflow/model/FlowSuspiciousConnectsModel.scala b/spot-ml/src/main/scala/org/apache/spot/netflow/model/FlowSuspiciousConnectsModel.scala
index 3590d9a..4f07fba 100644
--- a/spot-ml/src/main/scala/org/apache/spot/netflow/model/FlowSuspiciousConnectsModel.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/netflow/model/FlowSuspiciousConnectsModel.scala
@@ -13,6 +13,9 @@ import org.apache.spot.lda.SpotLDAWrapperSchema._
 import org.apache.spot.netflow.FlowSchema._
 import org.apache.spot.netflow.FlowWordCreator
 import org.apache.spot.utilities.Quantiles
+import org.apache.spot.utilities.data.validation.InvalidDataHandler
+
+import scala.util.{Failure, Success, Try}
 
 /**
   * A probabilistic model of the netflow traffic observed in a network.
@@ -30,7 +33,7 @@ import org.apache.spot.utilities.Quantiles
   * Create these models using the  factory in the companion object.
   *
   * @param topicCount Number of topics (profiles of common traffic patterns) used in the topic modelling routine.
-  * @param ipToTopicMixDF DataFrame assigning a distribution on topics to each document or IP.
+  * @param ipToTopicMix DataFrame assigning a distribution on topics to each document or IP.
   * @param wordToPerTopicProb Map assigning to each word its per-topic probabilities,
   *                           i.e. Prob[word | t] for t = 0 to topicCount - 1.
   * @param timeCuts Quantile cut-offs for binning time-of-day values when forming words from netflow records.
@@ -39,13 +42,12 @@ import org.apache.spot.utilities.Quantiles
   */
 
 class FlowSuspiciousConnectsModel(topicCount: Int,
-                                  ipToTopicMixDF: DataFrame,
+                                  ipToTopicMix: DataFrame,
                                   wordToPerTopicProb: Map[String, Array[Double]],
                                   timeCuts: Array[Double],
                                   ibytCuts: Array[Double],
                                   ipktCuts: Array[Double]) {
 
-
   def score(sc: SparkContext, sqlContext: SQLContext, flowRecords: DataFrame): DataFrame = {
 
     val wordToPerTopicProbBC = sc.broadcast(wordToPerTopicProb)
@@ -56,14 +58,14 @@ class FlowSuspiciousConnectsModel(topicCount: Int,
       */
     val dataWithSrcTopicMix = {
 
-      val recordsWithSrcIPTopicMixes = flowRecords.join(ipToTopicMixDF,
-        flowRecords(SourceIP) === ipToTopicMixDF(DocumentName), "left_outer")
+      val recordsWithSrcIPTopicMixes = flowRecords.join(ipToTopicMix,
+        flowRecords(SourceIP) === ipToTopicMix(DocumentName), "left_outer")
       val schemaWithSrcTopicMix = flowRecords.schema.fieldNames :+ TopicProbabilityMix
       val dataWithSrcIpProb: DataFrame = recordsWithSrcIPTopicMixes.selectExpr(schemaWithSrcTopicMix: _*)
         .withColumnRenamed(TopicProbabilityMix, SrcIpTopicMix)
 
-      val recordsWithIPTopicMixes = dataWithSrcIpProb.join(ipToTopicMixDF,
-        dataWithSrcIpProb(DestinationIP) === ipToTopicMixDF(DocumentName), "left_outer")
+      val recordsWithIPTopicMixes = dataWithSrcIpProb.join(ipToTopicMix,
+        dataWithSrcIpProb(DestinationIP) === ipToTopicMix(DocumentName), "left_outer")
       val schema = dataWithSrcIpProb.schema.fieldNames :+  TopicProbabilityMix
         recordsWithIPTopicMixes.selectExpr(schema: _*).withColumnRenamed(TopicProbabilityMix, DstIpTopicMix)
     }
@@ -129,46 +131,61 @@ object FlowSuspiciousConnectsModel {
                     sqlContext: SQLContext,
                     logger: Logger,
                     config: SuspiciousConnectsConfig,
-                    inDF: DataFrame,
+                    inputRecords: DataFrame,
                     topicCount: Int): FlowSuspiciousConnectsModel = {
 
     logger.info("Training netflow suspicious connects model from " + config.inputPath)
 
-    val selectedDF = inDF.select(ModelColumns: _*)
+    val selectedRecords = inputRecords.select(ModelColumns: _*)
 
 
-    val totalDataDF = selectedDF.unionAll(FlowFeedback.loadFeedbackDF(sparkContext,
+    val totalRecords = selectedRecords.unionAll(FlowFeedback.loadFeedbackDF(sparkContext,
       sqlContext,
       config.feedbackFile,
       config.duplicationFactor))
 
-
-
     // create quantile cut-offs
 
-    val timeCuts = Quantiles.computeDeciles(totalDataDF
+    val timeCuts = Quantiles.computeDeciles(totalRecords
       .select(Hour, Minute, Second)
       .rdd
-      .map({ case Row(hours: Int, minutes: Int, seconds: Int) => 3600 * hours + 60 * minutes + seconds }))
+      .flatMap({ case Row(hours: Int, minutes: Int, seconds: Int) => {
+          Try {  (3600 * hours + 60 * minutes + seconds).toDouble } match{
+            case Failure(_) => Seq()
+            case Success(time) => Seq(time)
+          }
+        }
+      }))
 
     logger.info(timeCuts.mkString(","))
 
     logger.info("calculating byte cuts ...")
 
-    val ibytCuts = Quantiles.computeDeciles(totalDataDF
+    val ibytCuts = Quantiles.computeDeciles(totalRecords
       .select(Ibyt)
       .rdd
-      .map({ case Row(ibyt: Long) => ibyt.toDouble }))
+      .flatMap({ case Row(ibyt: Long) => {
+          Try {  ibyt.toDouble } match{
+            case Failure(_) => Seq()
+            case Success(ibyt) => Seq(ibyt)
+          }
+        }
+      }))
 
     logger.info(ibytCuts.mkString(","))
 
     logger.info("calculating pkt cuts")
 
-    val ipktCuts = Quantiles.computeQuintiles(totalDataDF
+    val ipktCuts = Quantiles.computeQuintiles(totalRecords
       .select(Ipkt)
       .rdd
-      .map({ case Row(ipkt: Long) => ipkt.toDouble }))
-
+      .flatMap({ case Row(ipkt: Long) => {
+          Try { ipkt.toDouble } match {
+            case Failure(_) => Seq()
+            case Success(ipkt) => Seq(ipkt)
+          }
+        }
+      }))
 
     logger.info(ipktCuts.mkString(","))
 
@@ -176,18 +193,19 @@ object FlowSuspiciousConnectsModel {
 
     val flowWordCreator = new FlowWordCreator(timeCuts, ibytCuts, ipktCuts)
 
-    val srcWordUDF = flowWordCreator.srcWordUDF
-    val dstWordUDF = flowWordCreator.dstWordUDF
-
-    val dataWithWordsDF = totalDataDF.withColumn(SourceWord, flowWordCreator.srcWordUDF(ModelColumns: _*))
+    val dataWithWords = totalRecords.withColumn(SourceWord, flowWordCreator.srcWordUDF(ModelColumns: _*))
       .withColumn(DestinationWord, flowWordCreator.dstWordUDF(ModelColumns: _*))
 
     // Aggregate per-word counts at each IP
-    val srcWordCounts = dataWithWordsDF.select(SourceIP, SourceWord)
+    val srcWordCounts = dataWithWords
+      .filter(dataWithWords(SourceWord).notEqual(InvalidDataHandler.WordError))
+      .select(SourceIP, SourceWord)
       .map({ case Row(sourceIp: String, sourceWord: String) => (sourceIp, sourceWord) -> 1 })
       .reduceByKey(_ + _)
 
-    val dstWordCounts = dataWithWordsDF.select(DestinationIP, DestinationWord)
+    val dstWordCounts = dataWithWords
+      .filter(dataWithWords(DestinationWord).notEqual(InvalidDataHandler.WordError))
+      .select(DestinationIP, DestinationWord)
       .map({ case Row(destinationIp: String, destinationWord: String) => (destinationIp, destinationWord) -> 1 })
       .reduceByKey(_ + _)
 
@@ -197,7 +215,7 @@ object FlowSuspiciousConnectsModel {
         .map({ case ((ip, word), count) => SpotLDAInput(ip, word, count) })
 
 
-    val SpotLDAOutput(ipToTopicMixDF, wordToPerTopicProb) = SpotLDAWrapper.runLDA(sparkContext,
+    val SpotLDAOutput(ipToTopicMix, wordToPerTopicProb) = SpotLDAWrapper.runLDA(sparkContext,
       sqlContext,
       ipWordCounts,
       config.topicCount,
@@ -208,12 +226,11 @@ object FlowSuspiciousConnectsModel {
       config.ldaMaxiterations)
 
     new FlowSuspiciousConnectsModel(topicCount,
-      ipToTopicMixDF,
+      ipToTopicMix,
       wordToPerTopicProb,
       timeCuts,
       ibytCuts,
       ipktCuts)
-
   }
 
 }
\ No newline at end of file

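Both per-endpoint aggregations above follow the classic word-count shape before being handed to the LDA wrapper. Stripped to its core; SpotLDAInput's shape is inferred from its usage in the hunks above, so treat the case class as a stand-in:

    import org.apache.spark.rdd.RDD

    // Stand-in for SpotLDAWrapper.SpotLDAInput, shaped after its usage
    // in the diff (document, word, count).
    case class SpotLDAInput(doc: String, word: String, count: Int)

    // Sketch: classic word count over (ip, word) pairs, emitted as LDA input.
    def toLdaInput(ipWords: RDD[(String, String)]): RDD[SpotLDAInput] =
      ipWords
        .map { case (ip, word) => (ip, word) -> 1 }
        .reduceByKey(_ + _)
        .map { case ((ip, word), count) => SpotLDAInput(ip, word, count) }
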
http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySchema.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySchema.scala b/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySchema.scala
index 28d1dac..4c63208 100644
--- a/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySchema.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySchema.scala
@@ -1,5 +1,7 @@
 package org.apache.spot.proxy
 
+import org.apache.spark.sql.types._
+
 /**
   * Data frame column names used in the proxy suspicious connects analysis.
   */
@@ -8,36 +10,83 @@ object ProxySchema {
   // fields from the input
 
   val Date = "p_date"
+  val DateField = StructField(Date, StringType, nullable = true)
+
   val Time = "p_time"
+  val TimeField = StructField(Time, StringType, nullable = true)
+
   val ClientIP = "clientip"
+  val ClientIPField = StructField(ClientIP, StringType, nullable = true)
+
   val Host = "host"
+  val HostField = StructField(Host, StringType, nullable = true)
+
   val ReqMethod = "reqmethod"
+  val ReqMethodField = StructField(ReqMethod, StringType, nullable = true)
+
   val UserAgent = "useragent"
+  val UserAgentField = StructField(UserAgent, StringType, nullable = true)
+
   val ResponseContentType = "resconttype"
+  val ResponseContentTypeField = StructField(ResponseContentType, StringType, nullable = true)
+
   val Duration = "duration"
+  val DurationField = StructField(Duration, IntegerType, nullable = true)
+
   val UserName = "username"
+  val UserNameField = StructField(UserName, StringType, nullable = true)
+
   val AuthGroup = "authgroup"
+
   val ExceptionId = "exceptionid"
+
   val FilterResult = "filterresult"
+
   val WebCat = "webcat"
+  val WebCatField = StructField(WebCat, StringType, nullable = true)
+
   val Referer = "referer"
+  val RefererField = StructField(Referer, StringType, nullable = true)
+
   val RespCode = "respcode"
+  val RespCodeField = StructField(RespCode, StringType, nullable = true)
+
   val Action = "action"
+
   val URIScheme = "urischeme"
+
   val URIPort = "uriport"
+  val URIPortField = StructField(URIPort, StringType, nullable = true)
+
   val URIPath = "uripath"
+  val URIPathField = StructField(URIPath, StringType, nullable = true)
+
   val URIQuery = "uriquery"
+  val URIQueryField = StructField(URIQuery, StringType, nullable = true)
+
   val URIExtension = "uriextension"
+
   val ServerIP = "serverip"
+  val ServerIPField = StructField(ServerIP, StringType, nullable = true)
+
   val SCBytes = "scbytes"
+  val SCBytesField = StructField(SCBytes, IntegerType, nullable = true)
+
   val CSBytes = "csbytes"
+  val CSBytesField = StructField(CSBytes, IntegerType, nullable = true)
+
   val VirusID = "virusid"
   val BcappName = "bcappname"
   val BcappOper = "bcappoper"
+
   val FullURI = "fulluri"
+  val FullURIField = StructField(FullURI, StringType, nullable = true)
 
   // output fields
 
   val Word = "word"
+  val WordField = StructField(Word, StringType, nullable = true)
+
   val Score = "score"
+  val ScoreField = StructField(Score, DoubleType, nullable = true)
 }

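Pairing every column-name constant with a StructField lets downstream code assemble a schema and the matching projection from one source of truth, as the new InSchema/OutSchema values elsewhere in this commit do. The intended usage, reduced to two fields:

    import org.apache.spark.sql.functions.col
    import org.apache.spark.sql.types.{StringType, StructField, StructType}

    val Date = "p_date"
    val DateField = StructField(Date, StringType, nullable = true)
    val Time = "p_time"
    val TimeField = StructField(Time, StringType, nullable = true)

    // One StructType, two derived artifacts: field names and a projection.
    val InStructType = StructType(List(DateField, TimeField))
    val InColumns = InStructType.fieldNames.map(col)  // df.select(InColumns: _*)
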
http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala b/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala
index 1131406..cc2319f 100644
--- a/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysis.scala
@@ -2,10 +2,12 @@ package org.apache.spot.proxy
 
 import org.apache.log4j.Logger
 import org.apache.spark.SparkContext
-import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.{DataFrame, SQLContext}
 import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
 import org.apache.spot.proxy.ProxySchema._
-import org.apache.spot.utilities.DataFrameUtils
+import org.apache.spot.utilities.data.validation.{InvalidDataHandler => dataValidation}
 
 /**
   * Run suspicious connections analysis on proxy data.
@@ -20,34 +22,151 @@ object ProxySuspiciousConnectsAnalysis {
     * @param sqlContext   Spark SQL context.
     * @param logger       Logs execution progress, information and errors for user.
     */
-  def run(config: SuspiciousConnectsConfig, sparkContext: SparkContext, sqlContext: SQLContext, logger: Logger) = {
+  def run(config: SuspiciousConnectsConfig, sparkContext: SparkContext, sqlContext: SQLContext, logger: Logger,
+          inputProxyRecords: DataFrame) = {
 
     logger.info("Starting proxy suspicious connects analysis.")
 
-    logger.info("Loading data from: " + config.inputPath)
-
-    val rawDataDF = sqlContext.read.parquet(config.inputPath).
-      filter(Date + " is not null and " + Time + " is not null and " + ClientIP + " is not null").
-      select(Date, Time, ClientIP, Host, ReqMethod, UserAgent, ResponseContentType, Duration, UserName,
-        WebCat, Referer, RespCode, URIPort, URIPath, URIQuery, ServerIP, SCBytes, CSBytes, FullURI)
+    val cleanProxyRecords = filterAndSelectCleanProxyRecords(inputProxyRecords)
 
     logger.info("Training the model")
     val model =
-      ProxySuspiciousConnectsModel.trainNewModel(sparkContext, sqlContext, logger, config, rawDataDF)
+      ProxySuspiciousConnectsModel.trainNewModel(sparkContext, sqlContext, logger, config, cleanProxyRecords)
 
     logger.info("Scoring")
-    val scoredDF = model.score(sparkContext, rawDataDF)
+    val scoredProxyRecords = model.score(sparkContext, cleanProxyRecords)
 
     // take the maxResults least probable events of probability below the threshold and sort
 
-    val filteredDF = scoredDF.filter(Score +  " <= " + config.threshold)
-    val topRows = DataFrameUtils.dfTakeOrdered(filteredDF, "score", config.maxResults)
-    val scoreIndex = scoredDF.schema.fieldNames.indexOf("score")
-    val outputRDD = sparkContext.parallelize(topRows).sortBy(row => row.getDouble(scoreIndex))
+    val filteredProxyRecords = filterScoredProxyRecords(scoredProxyRecords, config.threshold)
+
+    val orderedProxyRecords = filteredProxyRecords.orderBy(Score)
+
+    val mostSuspiciousProxyRecords = if(config.maxResults > 0)  orderedProxyRecords.limit(config.maxResults) else orderedProxyRecords
+
+    val outputProxyRecords = mostSuspiciousProxyRecords.select(OutSchema:_*)
+
+    logger.info("Proxy suspicious connects analysis completed")
+    logger.info("Saving results to: " + config.hdfsScoredConnect)
+    outputProxyRecords.map(_.mkString(config.outputDelimiter)).saveAsTextFile(config.hdfsScoredConnect)
+
+    val invalidProxyRecords = filterAndSelectInvalidProxyRecords(inputProxyRecords)
+    dataValidation.showAndSaveInvalidRecords(invalidProxyRecords, config.hdfsScoredConnect, logger)
+
+    val corruptProxyRecords = filterAndSelectCorruptProxyRecords(scoredProxyRecords)
+    dataValidation.showAndSaveCorruptRecords(corruptProxyRecords, config.hdfsScoredConnect, logger)
+  }
+
+  /**
+    *
+    * @param inputProxyRecords raw proxy records.
+    * @return
+    */
+  def filterAndSelectCleanProxyRecords(inputProxyRecords: DataFrame): DataFrame ={
+
+    val cleanProxyRecordsFilter =  inputProxyRecords(Date).isNotNull &&
+      inputProxyRecords(Time).isNotNull &&
+      inputProxyRecords(ClientIP).isNotNull &&
+      inputProxyRecords(Host).isNotNull &&
+      inputProxyRecords(FullURI).isNotNull
+
+    inputProxyRecords
+      .filter(cleanProxyRecordsFilter)
+      .select(InSchema:_*)
+      .na.fill(DefaultUserAgent, Seq(UserAgent))
+      .na.fill(DefaultResponseContentType, Seq(ResponseContentType))
+  }
+
+  /**
+    *
+    * @param inputProxyRecords raw proxy records.
+    * @return
+    */
+  def filterAndSelectInvalidProxyRecords(inputProxyRecords: DataFrame): DataFrame ={
+
+    val invalidProxyRecordsFilter = inputProxyRecords(Date).isNull ||
+      inputProxyRecords(Time).isNull ||
+      inputProxyRecords(ClientIP).isNull ||
+      inputProxyRecords(Host).isNull ||
+      inputProxyRecords(FullURI).isNull
+
+    inputProxyRecords
+      .filter(invalidProxyRecordsFilter)
+      .select(InSchema: _*)
+  }
+
+  /**
+    *
+    * @param scoredProxyRecords scored proxy records.
+    * @param threshold score tolerance.
+    * @return
+    */
+  def filterScoredProxyRecords(scoredProxyRecords: DataFrame, threshold: Double): DataFrame ={
+
+    val filteredProxyRecordsFilter = scoredProxyRecords(Score).leq(threshold) &&
+      scoredProxyRecords(Score).gt(dataValidation.ScoreError)
 
-    logger.info("Persisting data")
-    outputRDD.map(_.mkString(config.outputDelimiter)).saveAsTextFile(config.hdfsScoredConnect)
+    scoredProxyRecords.filter(filteredProxyRecordsFilter)
+  }
+
+  /**
+    *
+    * @param scoredProxyRecords scored proxy records.
+    * @return
+    */
+  def filterAndSelectCorruptProxyRecords(scoredProxyRecords: DataFrame): DataFrame ={
 
-    logger.info("Proxy suspcicious connects completed")
+    val corruptProxyRecordsFilter = scoredProxyRecords(Score).equalTo(dataValidation.ScoreError)
+
+    scoredProxyRecords
+      .filter(corruptProxyRecordsFilter)
+      .select(OutSchema: _*)
   }
+
+  val DefaultUserAgent = "-"
+  val DefaultResponseContentType = "-"
+
+  val InSchema = StructType(
+    List(DateField,
+      TimeField,
+      ClientIPField,
+      HostField,
+      ReqMethodField,
+      UserAgentField,
+      ResponseContentTypeField,
+      DurationField,
+      UserNameField,
+      WebCatField,
+      RefererField,
+      RespCodeField,
+      URIPortField,
+      URIPathField,
+      URIQueryField,
+      ServerIPField,
+      SCBytesField,
+      CSBytesField,
+      FullURIField)).fieldNames.map(col)
+
+  val OutSchema = StructType(
+    List(DateField,
+      TimeField,
+      ClientIPField,
+      HostField,
+      ReqMethodField,
+      UserAgentField,
+      ResponseContentTypeField,
+      DurationField,
+      UserNameField,
+      WebCatField,
+      RefererField,
+      RespCodeField,
+      URIPortField,
+      URIPathField,
+      URIQueryField,
+      ServerIPField,
+      SCBytesField,
+      CSBytesField,
+      FullURIField,
+      WordField,
+      ScoreField)).fieldNames.map(col)
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsModel.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsModel.scala b/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsModel.scala
index 582821b..c38ed93 100644
--- a/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsModel.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/proxy/ProxySuspiciousConnectsModel.scala
@@ -12,6 +12,10 @@ import org.apache.spot.utilities._
 import org.apache.spot.SuspiciousConnectsScoreFunction
 import org.apache.spot.lda.SpotLDAWrapper
 import org.apache.spot.lda.SpotLDAWrapper.{SpotLDAInput, SpotLDAOutput}
+import org.apache.spot.utilities.data.validation.InvalidDataHandler
+
+import scala.util.{Failure, Success, Try}
+
 /**
   * Encapsulation of a proxy suspicious connections model.
   *
@@ -84,7 +88,7 @@ object ProxySuspiciousConnectsModel {
     * @param sqlContext   SQL context.
     * @param logger       Logger object.
     * @param config       SuspiciousConnectsArgumentParser.Config object containing CLI arguments.
-    * @param inDF         Dataframe for training data, with columns Host, Time, ReqMethod, FullURI, ResponseContentType,
+    * @param inputRecords Dataframe for training data, with columns Host, Time, ReqMethod, FullURI, ResponseContentType,
     *                     UserAgent, RespCode (as defined in ProxySchema object).
     * @return ProxySuspiciousConnectsModel
     */
@@ -92,30 +96,54 @@ object ProxySuspiciousConnectsModel {
                     sqlContext: SQLContext,
                     logger: Logger,
                     config: SuspiciousConnectsConfig,
-                    inDF: DataFrame): ProxySuspiciousConnectsModel = {
+                    inputRecords: DataFrame): ProxySuspiciousConnectsModel = {
 
     logger.info("training new proxy suspcious connects model")
 
 
-    val df = inDF.select(Date, Time, ClientIP, Host, ReqMethod, UserAgent, ResponseContentType, RespCode, FullURI)
+    val selectedRecords = inputRecords.select(Date, Time, ClientIP, Host, ReqMethod, UserAgent, ResponseContentType, RespCode, FullURI)
       .unionAll(ProxyFeedback.loadFeedbackDF(sparkContext, sqlContext, config.feedbackFile, config.duplicationFactor))
 
     val timeCuts =
-      Quantiles.computeDeciles(df.select(Time).rdd.map({ case Row(t: String) => TimeUtilities.getTimeAsDouble(t) }))
-
-    val entropyCuts = Quantiles.computeQuintiles(df.select(FullURI).
-      rdd.map({ case Row(uri: String) => Entropy.stringEntropy(uri) }))
+      Quantiles.computeDeciles(selectedRecords
+        .select(Time)
+        .rdd
+        .flatMap({ case Row(t: String) => {
+            Try {TimeUtilities.getTimeAsDouble(t)} match {
+              case Failure(_) => Seq()
+              case Success(time) =>  Seq(time)
+            }
+          }
+        }))
+
+    val entropyCuts = Quantiles.computeQuintiles(selectedRecords
+      .select(FullURI)
+      .rdd
+      .flatMap({ case Row(uri: String) => {
+          Try {Entropy.stringEntropy(uri)} match {
+            case Failure(_) => Seq()
+            case Success(entropy) => Seq(entropy)
+          }
+        }
+      }))
 
     val agentToCount: Map[String, Long] =
-      df.select(UserAgent).rdd.map({ case Row(agent: String) => (agent, 1L) }).reduceByKey(_ + _).collect().toMap
+      selectedRecords.select(UserAgent)
+        .rdd
+        .map({ case Row(agent: String) => (agent, 1L) })
+        .reduceByKey(_ + _).collect()
+        .toMap
 
     val agentToCountBC = sparkContext.broadcast(agentToCount)
 
     val agentCuts =
-      Quantiles.computeQuintiles(df.select(UserAgent).rdd.map({ case Row(agent: String) => agentToCountBC.value(agent) }))
+      Quantiles.computeQuintiles(selectedRecords
+        .select(UserAgent)
+        .rdd
+        .map({ case Row(agent: String) => agentToCountBC.value(agent) }))
 
     val docWordCount: RDD[SpotLDAInput] =
-      getIPWordCounts(sparkContext, sqlContext, logger, df, config.feedbackFile, config.duplicationFactor, agentToCount, timeCuts, entropyCuts, agentCuts)
+      getIPWordCounts(sparkContext, sqlContext, logger, selectedRecords, config.feedbackFile, config.duplicationFactor, agentToCount, timeCuts, entropyCuts, agentCuts)
 
 
     val SpotLDAOutput(ipToTopicMixDF, wordResults) = SpotLDAWrapper.runLDA(sparkContext,
@@ -154,7 +182,7 @@ object ProxySuspiciousConnectsModel {
   def getIPWordCounts(sc: SparkContext,
                       sqlContext: SQLContext,
                       logger: Logger,
-                      inDF: DataFrame,
+                      inputRecords: DataFrame,
                       feedbackFile: String,
                       duplicationFactor: Int,
                       agentToCount: Map[String, Long],
@@ -164,9 +192,9 @@ object ProxySuspiciousConnectsModel {
 
 
     logger.info("Read source data")
-    val df = inDF.select(Date, Time, ClientIP, Host, ReqMethod, UserAgent, ResponseContentType, RespCode, FullURI)
+    val selectedRecords = inputRecords.select(Date, Time, ClientIP, Host, ReqMethod, UserAgent, ResponseContentType, RespCode, FullURI)
 
-    val wc = ipWordCountFromDF(sc, df, agentToCount, timeCuts, entropyCuts, agentCuts)
+    val wc = ipWordCountFromDF(sc, selectedRecords, agentToCount, timeCuts, entropyCuts, agentCuts)
     logger.info("proxy pre LDA completed")
 
     wc
@@ -184,17 +212,20 @@ object ProxySuspiciousConnectsModel {
     val agentToCountBC = sc.broadcast(agentToCount)
     val udfWordCreation = ProxyWordCreation.udfWordCreation(topDomains, agentToCountBC, timeCuts, entropyCuts, agentCuts)
 
-    val ipWordDF = dataFrame.withColumn(Word,
+    val ipWord = dataFrame.withColumn(Word,
       udfWordCreation(dataFrame(Host),
         dataFrame(Time),
         dataFrame(ReqMethod),
         dataFrame(FullURI),
         dataFrame(ResponseContentType),
         dataFrame(UserAgent),
-        dataFrame(RespCode))).
-      select(ClientIP, Word)
+        dataFrame(RespCode)))
+      .select(ClientIP, Word)
 
-    ipWordDF.rdd.map({ case Row(ip, word) => ((ip.asInstanceOf[String], word.asInstanceOf[String]), 1) })
+    ipWord
+      .filter(ipWord(Word).notEqual(InvalidDataHandler.WordError))
+      .rdd
+      .map({ case Row(ip, word) => ((ip.asInstanceOf[String], word.asInstanceOf[String]), 1) })
       .reduceByKey(_ + _).map({ case ((ip, word), count) => SpotLDAInput(ip, word, count) })
   }
 }
\ No newline at end of file
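
The flatMap-plus-Try pattern above keeps a single malformed Time or FullURI value from aborting the quantile computation: a failed parse maps to an empty Seq and silently drops out of the RDD. A minimal self-contained sketch of the same idea (the time format and the sample values below are illustrative only, not the pipeline's actual inputs):

    import scala.util.{Failure, Success, Try}

    // Parse "HH:MM:SS" strings into seconds; unparseable records are dropped.
    val rawTimes = Seq("04:57:36", "not-a-time", "23:59:59")

    val parsed: Seq[Double] = rawTimes.flatMap { t =>
      Try {
        val Array(h, m, s) = t.split(":").map(_.toDouble)
        h * 3600 + m * 60 + s
      } match {
        case Success(seconds) => Seq(seconds)
        case Failure(_)       => Seq() // drop the record instead of throwing
      }
    }
    // parsed == Seq(17856.0, 86399.0); the malformed record is gone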

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/main/scala/org/apache/spot/proxy/ProxyWordCreation.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/proxy/ProxyWordCreation.scala b/spot-ml/src/main/scala/org/apache/spot/proxy/ProxyWordCreation.scala
index bfc4f99..445a371 100644
--- a/spot-ml/src/main/scala/org/apache/spot/proxy/ProxyWordCreation.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/proxy/ProxyWordCreation.scala
@@ -2,7 +2,10 @@ package org.apache.spot.proxy
 
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.sql.functions._
-import org.apache.spot.utilities.{Entropy, Quantiles, DomainProcessor, TimeUtilities}
+import org.apache.spot.utilities.data.validation.InvalidDataHandler
+import org.apache.spot.utilities.{DomainProcessor, Entropy, Quantiles, TimeUtilities}
+
+import scala.util.{Success, Try}
 
 
 object ProxyWordCreation {
@@ -39,15 +42,19 @@ object ProxyWordCreation {
                 timeCuts: Array[Double],
                 entropyCuts: Array[Double],
                 agentCuts: Array[Double]): String = {
-
-    List(topDomain(proxyHost, topDomains.value).toString,
-      Quantiles.bin(TimeUtilities.getTimeAsDouble(time), timeCuts).toString,
-      reqMethod,
-      Quantiles.bin(Entropy.stringEntropy(uri), entropyCuts),
-      if (contentType.split('/').length > 0) contentType.split('/')(0) else "unknown_content_type",
-          // just the top level content type for now
-      Quantiles.bin(agentCounts.value(userAgent), agentCuts),
-      responseCode(0)).mkString("_")
+    Try{
+      List(topDomain(proxyHost, topDomains.value).toString,
+        Quantiles.bin(TimeUtilities.getTimeAsDouble(time), timeCuts).toString,
+        reqMethod,
+        Quantiles.bin(Entropy.stringEntropy(uri), entropyCuts),
+        if (contentType.split('/').length > 0) contentType.split('/')(0) else "unknown_content_type",
+        // just the top level content type for now
+        Quantiles.bin(agentCounts.value(userAgent), agentCuts),
+        if (responseCode != null) responseCode(0) else "unknown_response_code").mkString("_")
+    } match {
+      case Success(proxyWord) => proxyWord
+      case _ => InvalidDataHandler.WordError
+    }
   }
 
 

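With the Try wrapper, proxyWord becomes a total function: any exception (a null response code, a user agent missing from the broadcast count map, an unparseable time) yields InvalidDataHandler.WordError instead of killing the job, and ipWordCountFromDF above filters that sentinel out before building the LDA input. A reduced sketch of the pattern, with the sentinel inlined and only two of the word's components kept:

    import scala.util.{Success, Try}

    val WordError = "word_error" // stands in for InvalidDataHandler.WordError

    def proxyWordSketch(contentType: String, responseCode: String): String =
      Try {
        val topLevelType =
          if (contentType.split('/').length > 0) contentType.split('/')(0)
          else "unknown_content_type"
        List(topLevelType, responseCode(0)).mkString("_")
      } match {
        case Success(word) => word
        case _             => WordError // any exception maps to the sentinel
      }

    proxyWordSketch("text/plain", "202") // "text_2" (first character of the code)
    proxyWordSketch("text/plain", null)  // "word_error"
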
http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/main/scala/org/apache/spot/utilities/data/InputOutputDataHandler.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/utilities/data/InputOutputDataHandler.scala b/spot-ml/src/main/scala/org/apache/spot/utilities/data/InputOutputDataHandler.scala
new file mode 100644
index 0000000..e7934f4
--- /dev/null
+++ b/spot-ml/src/main/scala/org/apache/spot/utilities/data/InputOutputDataHandler.scala
@@ -0,0 +1,63 @@
+package org.apache.spot.utilities.data
+
+import org.apache.log4j.Logger
+import org.apache.hadoop.fs.{LocatedFileStatus, Path, RemoteIterator, FileUtil => fileUtil}
+import org.apache.spark.SparkContext
+import org.apache.spark.sql.{DataFrame, SQLContext}
+
+
+/**
+  * Handles input and output data for every data set or pipeline implementation.
+  * One method to read input records and one method to merge results in HDFS.
+  */
+object InputOutputDataHandler {
+
+  /**
+    *
+    * @param sqlContext Application SqlContext.
+    * @param inputPath HDFS input folder for every execution; flow, dns or proxy.
+    * @param logger Application logger.
+    * @return raw data frame.
+    */
+  def getInputDataFrame(sqlContext: SQLContext, inputPath: String, logger: Logger): Option[DataFrame] ={
+    try {
+      logger.info("Loading data from: " + inputPath)
+      Some(sqlContext.read.parquet(inputPath))
+    } catch {
+      case _ : Throwable => {
+        None
+      }
+    }
+  }
+
+  /**
+    *
+    * @param sparkContext Application SparkContext.
+    * @param hdfsScoredConnect HDFS output folder. The location where results were saved; flow, dns or proxy.
+    * @param analysis Data type to analyze.
+    * @param logger Application Logger.
+    */
+  def mergeResultsFiles(sparkContext: SparkContext, hdfsScoredConnect: String, analysis: String, logger: Logger) {
+    val hadoopConfiguration = sparkContext.hadoopConfiguration
+    val fileSystem = org.apache.hadoop.fs.FileSystem.get(hadoopConfiguration)
+
+    val exists = fileSystem.exists(new org.apache.hadoop.fs.Path(hdfsScoredConnect))
+
+    if(exists){
+      val srcDir = new Path(hdfsScoredConnect)
+      val dstFile = new Path(hdfsScoredConnect+"/"+analysis+"_results.csv")
+      fileUtil.copyMerge(fileSystem, srcDir, fileSystem, dstFile, false, hadoopConfiguration, "")
+
+      val files: RemoteIterator[LocatedFileStatus] = fileSystem.listFiles(srcDir, false)
+      while (files.hasNext){
+        val filePath = files.next().getPath()
+        if(filePath.toString.contains("part-")){
+          fileSystem.delete(filePath, false)
+        }
+      }
+    }
+    else logger.info(s"Couldn't find results in $hdfsScoredConnect. " +
+        s"Please check previous logs to see if there were errors.")
+   }
+
+}
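
A usage sketch for the new handler; the paths below are hypothetical placeholders, and sqlContext, sparkContext and logger are assumed to be in scope (in the pipeline they come from the Spark driver and SuspiciousConnectsConfig):

    // Read one day of input, falling back to an empty frame when the read fails.
    val inputDataFrame = InputOutputDataHandler
      .getInputDataFrame(sqlContext, "/user/spot/flow/hive/y=2016/m=12/d=06", logger)
      .getOrElse(sqlContext.emptyDataFrame)

    // ... run the analysis, writing scored part-* files under the results folder ...

    // Merge Spark's part-* output into a single flow_results.csv and delete the parts.
    InputOutputDataHandler.mergeResultsFiles(sparkContext,
      "/user/spot/flow/scored_results/20161206/scores", "flow", logger)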

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/main/scala/org/apache/spot/utilities/data/validation/InvalidDataHandler.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/utilities/data/validation/InvalidDataHandler.scala b/spot-ml/src/main/scala/org/apache/spot/utilities/data/validation/InvalidDataHandler.scala
new file mode 100644
index 0000000..7cfaa34
--- /dev/null
+++ b/spot-ml/src/main/scala/org/apache/spot/utilities/data/validation/InvalidDataHandler.scala
@@ -0,0 +1,56 @@
+package org.apache.spot.utilities.data.validation
+
+import org.apache.log4j.Logger
+import org.apache.spark.sql.DataFrame
+
+/**
+  * Handles invalid and corrupt records.
+  * One method for each kind of invalid data, this object prints the total errors and saves the invalid/corrupt records.
+  */
+object InvalidDataHandler {
+
+  val WordError = "word_error"
+  val ScoreError = -1d
+
+  /**
+    *
+    * @param invalidRecords Records with null or invalid values in key columns.
+    * @param outputPath HDFS output folder for invalid records; scored_results/date/scores/invalid
+    * @param logger Application logger.
+    */
+  def showAndSaveInvalidRecords(invalidRecords: DataFrame, outputPath: String, logger: Logger) {
+
+    if (invalidRecords.count > 0) {
+
+      val invalidRecordsFile = outputPath + "/invalid_records"
+      logger.warn("Saving invalid records to " + invalidRecordsFile)
+
+      invalidRecords.write.mode("overwrite").parquet(invalidRecordsFile)
+
+      logger.warn("Total records discarded due to NULL values in key fields: " + invalidRecords.count +
+        " . Please go to " + invalidRecordsFile + " for more details.")
+    }
+  }
+
+  /**
+    *
+    * @param corruptRecords Records with Score = -1. These records threw an exception during word creation,
+    *                       were assigned the word word_error, and hence received a score of -1 during scoring.
+    * @param outputPath HDFS output folder for corrupt records; scored_results/date/scores/corrupt
+    * @param logger Application logger.
+    */
+  def showAndSaveCorruptRecords(corruptRecords: DataFrame, outputPath: String, logger: Logger) {
+    if(corruptRecords.count > 0){
+
+      val corruptRecordsFile = outputPath + "/corrupt_records"
+
+      logger.warn("Saving corrupt records to " + corruptRecordsFile)
+
+      corruptRecords.write.mode("overwrite").parquet(corruptRecordsFile)
+
+      logger.warn("Total records discarded due to invalid values in key fields: " + corruptRecords.count +
+        "Please go to " + corruptRecordsFile + " for more details.")
+    }
+  }
+
+}
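
A sketch of how the two handlers are meant to be called; the column names, the validity test, and the scoredRecords frame are illustrative assumptions, since each pipeline defines its own key fields:

    // Records with null key fields are set aside before word creation.
    val invalidRecords = inputRecords.filter(inputRecords("fulluri").isNull)
    InvalidDataHandler.showAndSaveInvalidRecords(invalidRecords, outputPath, logger)

    // Records whose word creation failed were scored with ScoreError (-1).
    val corruptRecords =
      scoredRecords.filter(scoredRecords("score") === InvalidDataHandler.ScoreError)
    InvalidDataHandler.showAndSaveCorruptRecords(corruptRecords, outputPath, logger)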

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/test/scala/org/apache/spot/DNSWordCreationTest.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/scala/org/apache/spot/DNSWordCreationTest.scala b/spot-ml/src/test/scala/org/apache/spot/DNSWordCreationTest.scala
deleted file mode 100644
index 1b02333..0000000
--- a/spot-ml/src/test/scala/org/apache/spot/DNSWordCreationTest.scala
+++ /dev/null
@@ -1,21 +0,0 @@
-package org.apache.spot
-
-
-import javax.swing.text.Utilities
-
-import org.apache.spot.dns.{DNSSuspiciousConnectsAnalysis, DNSWordCreation}
-import org.apache.spot.testutils.TestingSparkContextFlatSpec
-import org.apache.spot.utilities.{CountryCodes, Entropy, TopDomains}
-import org.scalatest.Matchers
-
-class DNSWordCreationTest extends TestingSparkContextFlatSpec with Matchers {
-
-    "entropy" should "return 2.807354922057603 with value abcdefg" in {
-    val value = "abcdefg"
-
-    val result = Entropy.stringEntropy(value)
-
-    result shouldBe 2.807354922057604
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/760dbf34/spot-ml/src/test/scala/org/apache/spot/FlowWordCreatorTest.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/scala/org/apache/spot/FlowWordCreatorTest.scala b/spot-ml/src/test/scala/org/apache/spot/FlowWordCreatorTest.scala
deleted file mode 100644
index f3cf715..0000000
--- a/spot-ml/src/test/scala/org/apache/spot/FlowWordCreatorTest.scala
+++ /dev/null
@@ -1,216 +0,0 @@
-package org.apache.spot
-
-
-import org.apache.spot.netflow.{FlowWordCreator, FlowWords}
-import org.scalatest.{FlatSpec, Matchers}
-
-
-class FlowWordCreatorTest extends FlatSpec with Matchers {
-
-  // Replace ports in index 10 and 11
-  val srcIP = "10.0.2.115"
-  val dstIP = "172.16.0.107"
-  val hour = 12
-  val minute = 59
-  val second = 32
-
-  val ibyts = 222L
-  val ipkts = 3L
-
-  val timeCuts = Array(2.4, 4.8, 7.2, 9.6, 12.0, 14.4, 16.8, 19.2, 21.6, 24.0)
-  val ipktCuts = Array(10d, 20d, 30d, 40d, 50d, 60d, 70d, 80d, 90d, 100d)
-  val ibytCuts = Array(100d, 200d, 300d, 400d, 500d)
-
-  val expectedIpktBin = 0
-  val expectedIbytBin = 2
-  val expectedTimeBin = 5
-
-
-  val flowWordCreator = new FlowWordCreator(timeCuts, ibytCuts, ipktCuts)
-
-
-  // 1. Test when sip is less than dip and sip is not 0 and dport is <= 1024 & sport > 1024 and min(dport, sport) !=0 +
-  "flowWords" should "create word with ip_pair as sourceIp-destIp, port is dport and dest_word direction is -1" in {
-    val srcPort = 2132
-    val dstPort = 23
-
-    val FlowWords(srcWord, dstWord) =
-      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
-
-
-    dstWord shouldBe "-1_23_5_2_0"
-    srcWord shouldBe "23_5_2_0"
-
-  }
-
-  // 2. Test when sip is less than dip and sip is not 0 and sport is <= 1024 & dport > 1024 and min(dport, sport) !=0 +
-  it should "create word with ip_pair as sourceIp-destIp, port is sport and src_word direction is -1" in {
-
-    val srcPort = 23
-    val dstPort = 2132
-
-    val FlowWords(srcWord, dstWord) =
-      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
-
-    dstWord shouldBe "23_5_2_0"
-    srcWord shouldBe "-1_23_5_2_0"
-  }
-
-  // 3. Test when sip is less than dip and sip is not 0 and dport and sport are > 1024 +
-  it should "create word with ip_pair as sourceIp-destIp, port is 333333 and both words direction is 1 (not showing)" in {
-    val srcPort = 8392
-    val dstPort = 9874
-
-    val FlowWords(srcWord, dstWord) =
-      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
-
-    dstWord shouldBe "333333_5_2_0"
-    srcWord shouldBe "333333_5_2_0"
-  }
-
-  // 4. Test when sip is less than dip and sip is not 0 and dport is 0 but sport is not +
-  it should "create word with ip_pair as sourceIp-destIp, port is sport and source_word direction is -1" in {
-    val srcPort = 80
-    val dstPort = 0
-
-    val FlowWords(srcWord, dstWord) =
-      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
-
-
-    dstWord shouldBe "80_5_2_0"
-    srcWord shouldBe "-1_80_5_2_0"
-  }
-
-  // 5. Test when sip is less than dip and sip is not 0 and sport is 0 but dport is not +
-  it should "create word with ip_pair as sourceIp-destIp, port is dport and dest_word direction is -1 II" in {
-
-    val srcPort = 0
-    val dstPort = 43
-
-
-    val FlowWords(srcWord, dstWord) =
-      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
-
-
-    dstWord shouldBe "-1_43_5_2_0"
-    srcWord shouldBe "43_5_2_0"
-  }
-
-  // 6. Test when sip is less than dip and sip is not 0 and sport and dport are less or equal than 1024 +
-  it should "create word with ip_pair as sourceIp-destIp, port is 111111 and both words direction is 1 (not showing)" in {
-    val srcPort = 1024
-    val dstPort = 80
-
-
-    val FlowWords(srcWord, dstWord) =
-      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
-
-    dstWord shouldBe "111111_5_2_0"
-    srcWord shouldBe "111111_5_2_0"
-  }
-
-  // 7. Test when sip is less than dip and sip is not 0 and sport and dport are 0+
-  it should "create word with ip_pair as sourceIp-destIp, port is max(0,0) and both words direction is 1 (not showing)" in {
-    val srcPort = 0
-    val dstPort = 0
-
-
-    val FlowWords(srcWord, dstWord) =
-      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
-
-    dstWord shouldBe "0_5_2_0"
-    srcWord shouldBe "0_5_2_0"
-  }
-
-  // 8. Test when sip is not less than dip and dport is <= 1024 & sport > 1024 and min(dport, sport) !=0+
-  it should "create word with ip_pair as destIp-sourceIp, port is dport and dest_word direction is -1" in {
-    val srcPort = 3245
-    val dstPort = 43
-
-
-    val FlowWords(srcWord, dstWord) =
-      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
-
-    dstWord shouldBe "-1_43_5_2_0"
-    srcWord shouldBe "43_5_2_0"
-
-  }
-
-  // 9. Test when sip is not less than dip and sport is <= 1024 & dport > 1024 and min(dport, sport) !=0 +
-  it should "create word with ip_pair as destIp-sourceIp, port is sport and src_word direction is -1" in {
-    val srcPort = 80
-    val dstPort = 2435
-
-    val FlowWords(srcWord, dstWord) =
-      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
-
-    dstWord shouldBe "80_5_2_0"
-    srcWord shouldBe "-1_80_5_2_0"
-
-  }
-
-  // 10. Test when sip is not less than dip and dport and sport are > 1024 +
-  it should "create word with ip_pair as destIp-sourceIp, port is 333333 and both words direction is 1 (not showing)" in {
-    val srcPort = 2354
-    val dstPort = 2435
-
-
-    val FlowWords(srcWord, dstWord) =
-      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
-
-    dstWord shouldBe "333333_5_2_0"
-    srcWord shouldBe "333333_5_2_0"
-  }
-
-  // 11. Test when sip is not less than dip and dport is 0 but sport is not +
-  it should "create word with ip_pair as destIp-sourceIp, port is sport and src_word direction is -1 II" in {
-    val srcPort = 80
-    val dstPort = 0
-
-
-    val FlowWords(srcWord, dstWord) =
-      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
-
-    dstWord shouldBe "80_5_2_0"
-    srcWord shouldBe "-1_80_5_2_0"
-  }
-
-  // 12. Test when sip is not less than dip and sport is 0 but dport is not +
-  it should "create word with ip_pair as destIp-sourceIp, port is dport and dest_word direction is -1 II" in {
-    val srcPort = 0
-    val dstPort = 2435
-
-
-    val FlowWords(srcWord, dstWord) =
-      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
-
-    dstWord shouldBe "-1_2435_5_2_0"
-    srcWord shouldBe "2435_5_2_0"
-  }
-
-  // 13. Test when sip is not less than dip and sport and dport are less or equal than 1024
-  it should "create word with ip_pair as destIp-sourceIp, port 111111 and both words direction is 1 (not showing)" in {
-    val srcPort = 80
-    val dstPort = 1024
-
-
-    val FlowWords(srcWord, dstWord) =
-      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
-
-    dstWord shouldBe "111111_5_2_0"
-    srcWord shouldBe "111111_5_2_0"
-  }
-
-  // 14. Test when sip is not less than dip and sport and dport are 0
-  it should "create word with ip_pair as destIp-sourceIp, port is max(0,0) and both words direction is 1 (not showing)" in {
-    val srcPort = 0
-    val dstPort = 0
-
-
-    val FlowWords(srcWord, dstWord) =
-      flowWordCreator.flowWords(hour, minute, second, srcPort, dstPort, ipkts, ibyts)
-
-    dstWord shouldBe "0_5_2_0"
-    srcWord shouldBe "0_5_2_0"
-  }
-}


[06/49] incubator-spot git commit: Updating spot branch from upstream (spot repo) Merge branch 'spot' of https://github.com/Open-Network-Insight/open-network-insight into spot

Posted by ev...@apache.org.
Updating spot branch from upstream (spot repo)
Merge branch 'spot' of https://github.com/Open-Network-Insight/open-network-insight into spot


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/8f0988ae
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/8f0988ae
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/8f0988ae

Branch: refs/heads/master
Commit: 8f0988ae205a54e3d3d67c56aa7dab7f8b5e5584
Parents: d7d6ae0 8830127
Author: Brandon Edwards <br...@intel.com>
Authored: Thu Dec 8 14:52:59 2016 -0800
Committer: Brandon Edwards <br...@intel.com>
Committed: Thu Dec 8 14:52:59 2016 -0800

----------------------------------------------------------------------
 ISSUES.md                                       | 53 --------------------
 README.md                                       | 51 ++++---------------
 spot-ingest/README.md                           |  4 --
 .../js/components/DetailsTablePanel.react.js    |  4 +-
 .../dns/js/components/NetworkViewPanel.react.js |  2 +-
 .../dns/js/components/SuspiciousPanel.react.js  |  2 +-
 .../js/components/DetailsTablePanel.react.js    |  2 +-
 .../js/components/NetworkViewPanel.react.js     |  2 +-
 .../flow/js/components/SuspiciousPanel.react.js |  2 +-
 .../ui/js/components/GridPanelMixin.react.js    | 20 ++++----
 .../components/PolloNetworkViewMixin.react.js   |  5 ++
 .../js/components/NetworkViewPanel.react.js     |  6 +--
 .../js/components/SuspiciousPanel.react.js      |  2 +-
 13 files changed, 36 insertions(+), 119 deletions(-)
----------------------------------------------------------------------



[36/49] incubator-spot git commit: unit_test_cleanup

Posted by ev...@apache.org.
unit_test_cleanup

removed unnecessary implicit variable


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/a70dbf0f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/a70dbf0f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/a70dbf0f

Branch: refs/heads/master
Commit: a70dbf0f04a690fa477e2d3f1445052aae11a8d5
Parents: 24b3a37
Author: nlsegerl <na...@intel.com>
Authored: Wed Jan 4 15:42:46 2017 -0800
Committer: nlsegerl <na...@intel.com>
Committed: Wed Jan 4 15:42:46 2017 -0800

----------------------------------------------------------------------
 spot-ml/src/main/scala/org/apache/spot/SuspiciousConnects.scala | 1 -
 1 file changed, 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/a70dbf0f/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnects.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnects.scala b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnects.scala
index fc7606e..352650e 100644
--- a/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnects.scala
+++ b/spot-ml/src/main/scala/org/apache/spot/SuspiciousConnects.scala
@@ -42,7 +42,6 @@ object SuspiciousConnects {
         val sparkConfig = new SparkConf().setAppName("Spot ML:  " + analysis + " suspicious connects analysis")
         val sparkContext = new SparkContext(sparkConfig)
         val sqlContext = new SQLContext(sparkContext)
-        implicit val outputDelimiter = config.outputDelimiter
 
         val inputDataFrame = InputOutputDataHandler.getInputDataFrame(sqlContext, config.inputPath, logger)
           .getOrElse(sqlContext.emptyDataFrame)


[17/49] incubator-spot git commit: test_proxy

Posted by ev...@apache.org.
test_proxy

added the topDomainAnomaly test case for the proxy


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/deeed03e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/deeed03e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/deeed03e

Branch: refs/heads/master
Commit: deeed03e2721c2236d299cb900dc978a54164738
Parents: b1b5d74
Author: nlsegerl <na...@intel.com>
Authored: Tue Dec 13 13:15:34 2016 -0800
Committer: nlsegerl <na...@intel.com>
Committed: Tue Dec 13 13:15:34 2016 -0800

----------------------------------------------------------------------
 .../ProxySuspiciousConnectsAnalysisTest.scala   | 97 ++++++++++++++++++++
 1 file changed, 97 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/deeed03e/spot-ml/src/test/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysisTest.scala
----------------------------------------------------------------------
diff --git a/spot-ml/src/test/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysisTest.scala b/spot-ml/src/test/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysisTest.scala
new file mode 100644
index 0000000..9e06d6c
--- /dev/null
+++ b/spot-ml/src/test/scala/org/apache/spot/proxy/ProxySuspiciousConnectsAnalysisTest.scala
@@ -0,0 +1,97 @@
+package org.apache.spot.proxy
+
+import org.apache.log4j.{Level, LogManager}
+import org.apache.spot.SuspiciousConnectsArgumentParser.SuspiciousConnectsConfig
+import org.apache.spot.proxy.ProxySchema._
+import org.apache.spot.testutils.TestingSparkContextFlatSpec
+import org.scalatest.Matchers
+
+case class ProxyInput(p_date:String,
+                      p_time:String,
+                      clientip:String,
+                      host:String,
+                      reqmethod:String,
+                      useragent:String,
+                      resconttype:String,
+                      duration:Int,
+                      username:String,
+                      webcat:String,
+                      referer:String,
+                      respcode:String,
+                      uriport:Int,
+                      uripath:String,
+                      uriquery:String,
+                      serverip:String,
+                      scbytes:Int,
+                      csbytes:Int,
+                      fulluri:String)
+
+class ProxySuspiciousConnectsAnalysisTest extends TestingSparkContextFlatSpec with Matchers {
+
+
+
+  val testConfig = SuspiciousConnectsConfig(analysis = "proxy",
+    inputPath = "",
+    feedbackFile = "",
+    duplicationFactor = 1,
+    topicCount = 20,
+    hdfsScoredConnect = "",
+    threshold = 1.0d,
+    maxResults = 1000,
+    outputDelimiter = "\t",
+    ldaPRGSeed = None,
+    ldaMaxiterations = 20,
+    ldaAlpha = 1.02,
+    ldaBeta = 1.001)
+
+
+  "proxy supicious connects analysis" should "estimate correct probabilities in toy data with top domain anomaly" in {
+
+    val logger = LogManager.getLogger("SuspiciousConnectsAnalysis")
+    logger.setLevel(Level.INFO)
+    val testSqlContext = new org.apache.spark.sql.SQLContext(sparkContext)
+
+    val anomalousRecord = ProxyInput("2016-10-03",	"04:57:36", "127.0.0.1",	"intel.com",	"PUT",
+      "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36",
+      "text/plain", 230,	"-", 	"Technology/Internet",	"http://www.spoonflower.com/tags/color",	"202",	80,
+      "/sites/c37i4q22szvir8ga3m8mtxaft7gwnm5fio8hfxo35mu81absi1/carts/4b3a313d-50f6-4117-8ffd-4e804fd354ef/fiddle",
+      "-",	"127.0.0.1",	338,	647,
+      "maw.bronto.com/sites/c37i4q22szvir8ga3m8mtxaft7gwnm5fio8hfxo35mu81absi1/carts/4b3a313d-50f6-4117-8ffd-4e804fd354ef/fiddle")
+
+    val typicalRecord   = ProxyInput("2016-10-03",	"04:57:36", "127.0.0.1",	"maw.bronto.com",	"PUT",
+      "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36",
+      "text/plain", 230,	"-", 	"Technology/Internet",	"http://www.spoonflower.com/tags/color",	"202",	80,
+      "/sites/c37i4q22szvir8ga3m8mtxaft7gwnm5fio8hfxo35mu81absi1/carts/4b3a313d-50f6-4117-8ffd-4e804fd354ef/fiddle",
+      "-",	"127.0.0.1",	338,	647,
+      "maw.bronto.com/sites/c37i4q22szvir8ga3m8mtxaft7gwnm5fio8hfxo35mu81absi1/carts/4b3a313d-50f6-4117-8ffd-4e804fd354ef/fiddle")
+
+    import testSqlContext.implicits._
+
+    val data = sparkContext.parallelize(Seq(anomalousRecord, typicalRecord, typicalRecord, typicalRecord, typicalRecord,
+      typicalRecord, typicalRecord, typicalRecord, typicalRecord, typicalRecord)).toDF
+
+    val scoredData = ProxySuspiciousConnectsAnalysis.detectProxyAnomalies(data, testConfig,
+      sparkContext,
+      sqlContext,
+      logger)
+
+
+
+    val anomalyScore = scoredData.filter(scoredData(Host) ===  "intel.com").first().getAs[Double](Score)
+    val typicalScores = scoredData.filter(scoredData(Host) === "maw.bronto.com").collect().map(_.getAs[Double](Score))
+
+    Math.abs(anomalyScore - 0.1d)  should be <= 0.01d
+    typicalScores.length shouldBe 9
+    Math.abs(typicalScores(0) - 0.9d)  should be <= 0.01d
+    Math.abs(typicalScores(1) - 0.9d)  should be <= 0.01d
+    Math.abs(typicalScores(2) - 0.9d)  should be <= 0.01d
+    Math.abs(typicalScores(3) - 0.9d)  should be <= 0.01d
+    Math.abs(typicalScores(4) - 0.9d)  should be <= 0.01d
+    Math.abs(typicalScores(5) - 0.9d)  should be <= 0.01d
+    Math.abs(typicalScores(6) - 0.9d)  should be <= 0.01d
+    Math.abs(typicalScores(7) - 0.9d)  should be <= 0.01d
+    Math.abs(typicalScores(8) - 0.9d)  should be <= 0.01d
+  }
+
+
+}


[19/49] incubator-spot git commit: Omitted an extra space after one of the jar parameter entry lines that was causing an error.

Posted by ev...@apache.org.
Omitted an extra space after one of the jar parameter entry lines that was causing an error.


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/452fca35
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/452fca35
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/452fca35

Branch: refs/heads/master
Commit: 452fca35bd33c291b88cd4a664f531db1ba7d7e2
Parents: 3bf290d
Author: Brandon Edwards <br...@intel.com>
Authored: Wed Dec 14 08:25:30 2016 -0800
Committer: Brandon Edwards <br...@intel.com>
Committed: Wed Dec 14 08:25:30 2016 -0800

----------------------------------------------------------------------
 spot-ml/ml_ops.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/452fca35/spot-ml/ml_ops.sh
----------------------------------------------------------------------
diff --git a/spot-ml/ml_ops.sh b/spot-ml/ml_ops.sh
index c5d6e43..ece60cd 100755
--- a/spot-ml/ml_ops.sh
+++ b/spot-ml/ml_ops.sh
@@ -108,7 +108,7 @@ time spark-submit --class "org.apache.spot.SuspiciousConnects" \
   --scored ${HDFS_SCORED_CONNECTS} \
   --threshold ${TOL} \
   --maxresults ${MAXRESULTS} \
-  --ldamaxiterations 20 \ 
+  --ldamaxiterations 20 \
   $USER_DOMAIN_PARSER_CMD
 
 wait


[12/49] incubator-spot git commit: spot

Posted by ev...@apache.org.
spot

Updated README file to reflect removal of MPI based LDA.
Cleaned up the discussion of Parquet and Hive.


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/18a69675
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/18a69675
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/18a69675

Branch: refs/heads/master
Commit: 18a696750e6993a9e4b77f8573ec844068defa39
Parents: 40a1a38
Author: nlsegerl <na...@intel.com>
Authored: Mon Dec 12 14:34:13 2016 -0800
Committer: nlsegerl <na...@intel.com>
Committed: Mon Dec 12 14:34:13 2016 -0800

----------------------------------------------------------------------
 spot-ml/README.md | 24 +++++-------------------
 1 file changed, 5 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/18a69675/spot-ml/README.md
----------------------------------------------------------------------
diff --git a/spot-ml/README.md b/spot-ml/README.md
index a4eab66..3200c06 100644
--- a/spot-ml/README.md
+++ b/spot-ml/README.md
@@ -3,7 +3,7 @@
 Machine learning routines for Apache Spot (incubating).
 
 At present, spot-ml contains routines for performing *suspicious connections* analyses on netflow, DNS or proxy data gathered from a network. These
-analyses consume a (possibly very lage) collection of network events and produces a list of the events that considered to be the least probable (or most suspicious).
+analyses consume a (possibly very large) collection of network events and produce a list of the events considered to be the least probable (most suspicious).
 
 spot-ml is designed to be run as a component of Spot. It relies on the ingest component of Spot to collect and load
 netflow and DNS records, and spot-ml will try to load data to the operational analytics component of Spot.  It is suggested that when experimenting with spot-ml, you do so as a part of the unified Spot system: Please see [the Spot wiki]
@@ -19,9 +19,8 @@ The data format and location where the data is stored differs for netflow and DN
 
 **Netflow Data**
 
-Netflow data for the year YEAR, month  MONTH, and day DAY is stored in HDFS at `HUSER/flow/csv/y=YEAR/m=MONTH/d=DAY/*`
-
-Data for spot-ml netflow analyses is currently stored in text csv files using the following schema:
+Netflow data for the year YEAR, month MONTH, and day DAY is stored in a Parquet table at `HUSER/flow/csv/y=YEAR/m=MONTH/d=DAY/*` according to
+ the following schema:
 
 - time: String
 - year: Double
@@ -53,9 +52,7 @@ Data for spot-ml netflow analyses is currently stored in text csv files using th
 
 **DNS Data**
 
-DNS data for the year YEAR, month MONTH and day DAY is stored in Hive at `HUSER/dns/hive/y=YEAR/m=MONTH/d=DAY/`
-
-The Hive tables containing DNS data for spot-ml analyses have the following schema:
+DNS data for the year YEAR, month MONTH and day DAY is stored in Parquet at `HUSER/dns/hive/y=YEAR/m=MONTH/d=DAY/` using the following schema:
 
 - frame_time: STRING
 - unix_tstamp: BIGINT
@@ -68,6 +65,7 @@ The Hive tables containing DNS data for spot-ml analyses have the following sche
 - dns_a: STRING
 
 **PROXY DATA**
+Proxy data for the year YEAR, month MONTH and day DAY is stored in Parquet at `HUSER/proxy/hive/y=YEAR/m=MONTH/d=DAY/` using the following schema:
 
 - p_date: STRING
 - p_time: STRING  
@@ -138,18 +136,6 @@ spot-ml output will be found under the ``HPATH`` at one of
 
 It is a csv file in which network events are annotated with estimated probabilities and sorted in ascending order.
 
-A successful run of spot-ml will also create and populate a directory at `LPATH/<source>/YYYYMMDD` where `<source>` is one of flow, dns or proxy, and
-`YYYYMMDD` is the date argument provided to `ml_ops.sh` 
-This directory will contain the following files generated during the LDA procedure used for topic-modelling:
-
-- model.dat An intermediate file in which each line corresponds to a "document" (the flow traffic about an IP, or the DNS queries of a client IP), and contains the size of the document and the list of "words" (simplified network events) occurring in the document with their frequencies. Words are encoded as integers per the file words.dat. 
-- final.beta  A space-separated text file that contains the logs of the probabilities of each word given each topic. Each line corresponds to a topic and the words are columns. 
-- final.gamma A space-separated text file that contains the unnormalized probabilities of each topic given each document. Each line corresponds to a document and the topics are the columns.
-- final.other  Auxilliary information from the LDA run: Number of topics, number of terms, alpha.
-- likelihood.dat Convergence information for the LDA run.
-
-In addition, on each worker node identified in NODES, in the `LPATH/<source>/YYYYMMDD` directory files of the form `<worker index>.beta` and `<workder index>.gamma`, these are local temporary files that are combined to form `final.beta` and `final.gamma`, respectively.
-
 ## Licensing
 
 spot-ml is licensed under Apache Version 2.0
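
Given the HUSER/<source>/.../y=YEAR/m=MONTH/d=DAY layout the README describes, loading one day of records is a single Parquet read. A sketch with placeholder values for HUSER and the date:

    // Sketch: HUSER stands for the configured HDFS user directory.
    val huser = "/user/spot"
    val flowDF = sqlContext.read.parquet(s"$huser/flow/csv/y=2016/m=12/d=06")
    flowDF.printSchema() // should match the netflow schema listed above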


[41/49] incubator-spot git commit: Ingest summary is now a top level section

Posted by ev...@apache.org.
Ingest summary is now a top level section


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/8a695097
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/8a695097
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/8a695097

Branch: refs/heads/master
Commit: 8a69509780620491de1a2801796b9d764061356c
Parents: 5901e06
Author: Diego Ortiz Huerta <di...@intel.com>
Authored: Tue Dec 6 08:53:11 2016 -0800
Committer: Everardo Lopez Sandoval (Intel) <el...@elopezsa-mac02.ra.intel.com>
Committed: Fri Jan 20 17:01:02 2017 -0800

----------------------------------------------------------------------
 spot-oa/ui/flow/ingest-summary.html             | 158 -------------
 spot-oa/ui/flow/js/actions/InSumActions.js      |  12 -
 .../js/components/IngestSummaryPanel.react.js   | 235 ------------------
 .../ui/flow/js/constants/NetflowConstants.js    |   9 +-
 spot-oa/ui/flow/js/ingest-summary.js            |  94 --------
 spot-oa/ui/flow/js/stores/IngestSummaryStore.js |  34 +--
 spot-oa/ui/flow/package.json                    |   6 +-
 spot-oa/ui/ingest-summary.html                  | 155 ++++++++++++
 spot-oa/ui/js/actions/InSumActions.js           |  12 +
 .../js/components/IngestSummaryPanel.react.js   | 236 +++++++++++++++++++
 spot-oa/ui/js/constants/SpotConstants.js        |   5 +
 spot-oa/ui/js/ingest-summary.js                 |  94 ++++++++
 spot-oa/ui/js/stores/IngestSummaryStore.js      | 161 +++++++++++++
 spot-oa/ui/package.json                         |   6 +-
 14 files changed, 688 insertions(+), 529 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/8a695097/spot-oa/ui/flow/ingest-summary.html
----------------------------------------------------------------------
diff --git a/spot-oa/ui/flow/ingest-summary.html b/spot-oa/ui/flow/ingest-summary.html
deleted file mode 100755
index c13ab9f..0000000
--- a/spot-oa/ui/flow/ingest-summary.html
+++ /dev/null
@@ -1,158 +0,0 @@
-<!DOCTYPE html>
-<html>
-<head>
-    <title>Netflow :: Ingest Summary</title>
-
-    <meta charset="UTF-8">
-
-    <!--Bootstrap styles-->
-    <link rel="stylesheet" type="text/css" href="../css/bootstrap-spot.min.css" />
-    <!--Bootstrap Date picker styles-->
-    <link rel="stylesheet" type="text/css" href="../node_modules/bootstrap-datepicker/dist/css/bootstrap-datepicker3.min.css" />
-    <!-- Spot styles -->
-    <link rel="stylesheet" type="text/css" href="../css/main.css" />
-    <!-- Favicon -->
-    <link rel="apple-touch-icon" sizes="57x57" href="../images/favicon/apple-icon-57x57.png">
-    <link rel="apple-touch-icon" sizes="60x60" href="../images/favicon/apple-icon-60x60.png"
-    <link rel="apple-touch-icon" sizes="72x72" href="../images/favicon/apple-icon-72x72.png">
-    <link rel="apple-touch-icon" sizes="76x76" href="../images/favicon/apple-icon-76x76.png">
-    <link rel="apple-touch-icon" sizes="114x114" href="../images/favicon/apple-icon-114x114.png">
-    <link rel="apple-touch-icon" sizes="120x120" href="../images/favicon/apple-icon-120x120.png">
-    <link rel="apple-touch-icon" sizes="144x144" href="../images/favicon/apple-icon-144x144.png">
-    <link rel="apple-touch-icon" sizes="152x152" href="../images/favicon/apple-icon-152x152.png">
-    <link rel="apple-touch-icon" sizes="180x180" href="../images/favicon/apple-icon-180x180.png">
-    <link rel="icon" type="image/png" sizes="192x192"  href="../images/favicon/android-icon-192x192.png">
-    <link rel="icon" type="image/png" sizes="32x32" href="../images/favicon/favicon-32x32.png">
-    <link rel="icon" type="image/png" sizes="96x96" href="../images/favicon/favicon-96x96.png">
-    <link rel="icon" type="image/png" sizes="16x16" href="../images/favicon/favicon-16x16.png">
-    <link rel="manifest" href="../images/favicon/manifest.json">
-    <meta name="msapplication-TileColor" content="#ffffff">
-    <meta name="msapplication-TileImage" content="../images/favicon/ms-icon-144x144.png">
-    <meta name="theme-color" content="#ffffff">
-    <style type="text/css">
-        .spot-row {
-            height: 100%;
-        }
-
-        #spot-is-header {
-            width: 100%;
-            position: absolute;
-            top: 0;
-            left: 0;
-            z-index: 2;
-            height: auto;
-        }
-
-        #spot-is, #spot-is-summary {
-            height: 100%;
-        }
-
-        .axis {
-            shape-rendering: crispEdges;
-        }
-
-        .axis path, .axis line {
-            fill: none;
-        }
-
-        rect.pane {
-            cursor: e-resize;
-            fill: none;
-            pointer-events: all;
-        }
-    </style>
-</head>
-<body>
-    <nav id="spot-nav" class="navbar navbar-default">
-        <div class="container-fluid">
-            <!-- App name and toggle get grouped for better mobile display -->
-            <div class="navbar-header">
-                <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#main-menu">
-                    <span class="sr-only">Toggle navigation</span>
-                    <span class="icon-bar"></span>
-                    <span class="icon-bar"></span>
-                    <span class="icon-bar"></span>
-                </button>
-                <span class="navbar-brand">Apache Spot :: Netflow :: Ingest Summary</span>
-            </div>
-            <!-- Collect the nav links, forms, and other content for toggling -->
-            <div class="collapse navbar-collapse" id="main-menu">
-                <ul class="nav navbar-nav navbar-right">
-                    <li class="dropdown">
-                        <a class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">
-                            Flows
-                            <span class="caret"></span>
-                        </a>
-                        <ul class="dropdown-menu" aria-labelledby="flowsMenu">
-                            <li>
-                                <a data-href="suspicious.html#date=${end-date}">Suspicious</a>
-                            </li>
-                            <li>
-                                <a data-href="threat-investigation.html#date=${end-date}">Threat Investigation</a>
-                            </li>
-                            <li>
-                                <a data-href="storyboard.html#date=${end-date}">Storyboard</a>
-                            </li>
-                            <li>
-                                <a data-href="ingest-summary.html#end-date=${end-date}">Ingest Summary</a>
-                            </li>
-                        </ul>
-                    </li>
-                    <li class="dropdown">
-                        <a class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">
-                            DNS
-                            <span class="caret"></span>
-                        </a>
-                        <ul class="dropdown-menu" aria-labelledby="dnsMenu">
-                            <li>
-                                <a data-href="../dns/suspicious.html#date=${end-date}">Suspicious</a>
-                            </li>
-                            <li>
-                                <a data-href="../dns/threat-investigation.html#date=${end-date}">Threat Investigation</a>
-                            </li>
-                            <li>
-                                <a data-href="../dns/storyboard.html#date=${end-date}">Storyboard</a>
-                            </li>
-                        </ul>
-                    </li>
-                    <li class="dropdown">
-                        <a class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">
-                            Proxy
-                            <span class="caret"></span>
-                        </a>
-                        <ul class="dropdown-menu" aria-labelledby="proxyMenu">
-                            <li>
-                                <a data-href="../proxy/suspicious.html#date=${end-date}">Suspicious</a>
-                            </li>
-                            <li>
-                                <a data-href="../proxy/threat-investigation.html#date=${end-date}">Threat Investigation</a>
-                            </li>
-                            <li>
-                                <a data-href="../proxy/storyboard.html#date=${end-date}">Storyboard</a>
-                            </li>
-                        </ul>
-                    </li>
-                </ul>
-            </div>
-            <div id="search-box" class="row text-right">
-                <!--Tools Buttons-->
-                <div id="nav_form" class="col-md-12">
-                  <!-- Search form placeholder -->
-                </div>
-            </div> <!-- /Tools Buttons-->
-        </div>
-    </nav>
-    <div id="spot-content-wrapper" class="container-fluid">
-      <!-- Main Content Placeholder -->
-    </div>
-
-    <!-- SCRIPTS -->
-    <script type="application/javascript" src="../node_modules/jquery/dist/jquery.min.js"></script>
-    <script type="application/javascript" src="../node_modules/d3/d3.min.js"></script>
-    <script type="application/javascript" src="../node_modules/bootstrap/dist/js/bootstrap.min.js"></script>
-    <script type="application/javascript" src="../node_modules/bootstrap-datepicker/dist/js/bootstrap-datepicker.min.js"></script>
-    <script type="application/javascript" src="../node_modules/react/dist/react.min.js"></script>
-    <script type="application/javascript" src="../node_modules/react-dom/dist/react-dom.min.js"></script>
-    <script type="application/javascript" src="js/ingest-summary.bundle.min.js"></script>
-</body>
-</html>

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/8a695097/spot-oa/ui/flow/js/actions/InSumActions.js
----------------------------------------------------------------------
diff --git a/spot-oa/ui/flow/js/actions/InSumActions.js b/spot-oa/ui/flow/js/actions/InSumActions.js
deleted file mode 100755
index 7b8f9df..0000000
--- a/spot-oa/ui/flow/js/actions/InSumActions.js
+++ /dev/null
@@ -1,12 +0,0 @@
-var SpotDispatcher = require('../../../js/dispatchers/SpotDispatcher');
-var NetflowConstants = require('../constants/NetflowConstants');
-
-var InSumActions = {
-  reloadSummary: function () {
-    SpotDispatcher.dispatch({
-      actionType: NetflowConstants.RELOAD_INGEST_SUMMARY
-    });
-  }
-};
-
-module.exports = InSumActions;

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/8a695097/spot-oa/ui/flow/js/components/IngestSummaryPanel.react.js
----------------------------------------------------------------------
diff --git a/spot-oa/ui/flow/js/components/IngestSummaryPanel.react.js b/spot-oa/ui/flow/js/components/IngestSummaryPanel.react.js
deleted file mode 100755
index 77af5dd..0000000
--- a/spot-oa/ui/flow/js/components/IngestSummaryPanel.react.js
+++ /dev/null
@@ -1,235 +0,0 @@
-const $ = require('jquery');
-const d3 = require('d3');
-const React = require('react');
-const ReactDOM = require('react-dom');
-
-var DateUtils = require('../../../js/utils/DateUtils');
-var InSumActions = require('../actions/InSumActions');
-var IngestSummaryStore = require('../stores/IngestSummaryStore');
-
-function initialDraw() {
-  var rootNode, format, x, y, xAxis, yAxis, area, svg, rect, total, minDate, maxDate, maxFlows, numberFormat;
-
-  rootNode = d3.select(ReactDOM.findDOMNode(this));
-
-  // graph dimensions
-  var m = [100, 50, 50, 80], // Margin
-      w = $(rootNode.node()).width() - m[1] - m[3], // Width
-      h = $(rootNode.node()).height() - m[0] - m[2]; // Height
-
-  format = d3.time.format("%Y-%m-%d %H:%M");
-
-  // Scales.
-  x = d3.time.scale().range([0, w]); // get X function
-  y = d3.scale.linear().range([h, 0]); // get Y function
-  xAxis = d3.svg.axis().scale(x).orient("bottom"); // Get the X axis (Time)
-  yAxis = d3.svg.axis().scale(y).orient("left"); // Get Y Axis (Netflows)
-
-  // An area generator.
-  area = d3.svg.area()
-        .x(function (d) {
-            return x(d.date);
-        })
-        .y0(h)
-        .y1(function (d) {
-            if (!isNaN(d.flows))
-                return y(d.flows);
-            else
-                return y(0);
-        });
-
-  rootNode.select('svg').remove();
-
-  // define the Main SVG
-  svg = rootNode.select('#' + this.props.id + '-summary').append("svg")
-    .attr("width", w + m[1] + m[3])
-    .attr("height", h + m[0] + m[2])
-        .append("g")
-        .attr("transform", "translate(" + m[3] + "," + m[0] + ")")
-
-  // Append the clipPath to avoid the Area overlapping
-  svg.append("clipPath")
-        .attr("id", "clip")
-        .append("rect")
-          .attr("x", x(0))
-          .attr("y", y(1))
-          .attr("width", x(1) - x(0))
-          .attr("height", y(0) - y(1));
-
-  // Append the Y Axis group
-  svg.append("g")
-    .attr("class", "y axis");
-
-  // Append the X axis group
-  svg.append("g")
-    .attr("class", "x axis")
-    .attr("transform", "translate(0," + h + ")");
-
-  // Append a pane rect, which will help us to add the zoom functionality
-  rect = svg.append("rect")
-        .attr("class", "pane")
-        .attr("width", w)
-        .attr("height", h);
-
-  this.state.data.forEach(function (dataSet)
-  {
-    var a;
-
-    a = [{date: minDate}];
-    a.push.apply(a, dataSet);
-    minDate = d3.min(a, function (d) { return d.date; });
-    a[0] = {date: maxDate, flows: maxFlows};
-    maxDate = d3.max(a, function (d) { return d.date; });
-    maxFlows = d3.max(a, function (d) { return d.flows; })
-  });
-
-  !minDate && (minDate = DateUtils.parseDate(IngestSummaryStore.getStartDate()));
-  !maxDate && (maxDate = DateUtils.parseDate(IngestSummaryStore.getEndDate()));
-
-  // bind the data to the X and Y generators
-  x.domain([minDate, maxDate]);
-  y.domain([0, maxFlows]);
-
-  // Bind the data to our path element.
-  svg.selectAll("path.area").data(this.state.data).enter().insert('path', 'g')
-                                                .attr('class', 'area')
-                                                .attr('clip-path', 'url(#clip)')
-                                                .style('fill', '#0071c5')
-                                                .attr('d', function (d) {
-                                                    return area(d);
-                                                });
-
-  //Add the pane rect the zoom behavior
-  rect.call(d3.behavior.zoom().x(x)
-      .scaleExtent([0.3, 2300]) // these are magic numbers to avoid the grap be zoomable in/out to the infinity
-      .on("zoom", zoom.bind(this)));
-
-  function draw () {
-    var total, minDate, maxDate, numberFormat;
-
-    svg.select("g.x.axis").call(xAxis);
-    svg.select("g.y.axis").call(yAxis);
-    svg.selectAll("path.area").attr("d", function (d) { return area(d); });
-    numberFormat = d3.format(",d"); // number formatter (comma separated number i.e. 100,000,000)
-
-    rootNode.select('#' + this.props.id + '-range').html("Seeing total flows <strong>from:</strong> " + x.domain().map(format).join(" <strong>to:</strong> "));
-
-    //Calculate the total flows between the displayed date range
-
-    total = 0;
-    minDate = x.domain()[0];
-    maxDate = x.domain()[1];
-
-    // Go to the first millisecond on dates
-    minDate.setSeconds(0);minDate.setMilliseconds(0);
-    maxDate.setSeconds(59);maxDate.setMilliseconds(0);
-
-    svg.selectAll("path.area").data().forEach(function (pathData)
-    {
-      pathData.forEach(function (record)
-      {
-        // Discard records outside displayed date range
-        if (record.date >= minDate && record.date <= maxDate) {
-          total += +record.flows;
-        }
-      });
-    });
-
-    rootNode.select('#' + this.props.id + '-total').html("<strong>Total netflows in range:</strong> " + numberFormat(total));
-  }
-
-  /*
-      Zoom event handler
-  */
-  function zoom() {
-    if (d3.event.sourceEvent.type == "wheel") {
-      if (d3.event.sourceEvent.wheelDelta < 0)
-         rect.style("cursor", "zoom-out");
-      else
-         rect.style("cursor", "zoom-in");
-    }
-    else if (d3.event.sourceEvent.type == "mousemove") {
-      rect.style("cursor", "e-resize");
-    }
-
-    draw.call(this);
-  }
-
-  draw.call(this);
-}
-
-var IngestSummaryPanel = React.createClass({
-  propTypes: {
-    id: React.PropTypes.string
-  },
-  getDefaultProperties: function () {
-    return {
-      id: 'spot-is'
-    };
-  },
-  getInitialState: function ()
-  {
-    return {loading: true};
-  },
-  render:function()
-  {
-    var content;
-
-    if (this.state.error)
-    {
-      content = (
-        <div className="text-center text-danger">
-          {this.state.error}
-        </div>
-      );
-    }
-    else if (this.state.loading)
-    {
-      content = (
-        <div className="spot-loader">
-            Loading <span className="spinner"></span>
-        </div>
-      );
-    }
-    else
-    {
-      content = (
-        <div id={this.props.id} className="text-center">
-          <div id={this.props.id + '-header'}>
-            <p id={this.props.id + '-range'}></p>
-            <p id={this.props.id + '-total'}></p>
-            <p id={this.props.id + '-istructions'} className="small">** Zoom in/out using mouse wheel or two fingers in track pad <br /> ** Move across the x-axis by clicking anywhere in the graph and dragging to left or right</p>
-          </div>
-          <div id={this.props.id + '-summary'}></div>
-        </div>
-      );
-    }
-
-    return (
-      <div>{content}</div>
-    )
-  },
-  componentDidMount: function()
-  {
-    IngestSummaryStore.addChangeDataListener(this._onChange);
-    window.addEventListener('resize', this.buildGraph);
-  },
-  componentWillUnmount: function ()
-  {
-    IngestSummaryStore.removeChangeDataListener(this._onChange);
-    window.removeEventListener('resize', this.buildGraph);
-  },
-  componentDidUpdate: function ()
-  {
-    if (!this.state.loading && !this.state.error && this.state.data)
-    {
-      this.buildGraph();
-    }
-  },
-  buildGraph: initialDraw,
-  _onChange: function () {
-    this.replaceState(IngestSummaryStore.getData());
-  }
-});
-
-module.exports = IngestSummaryPanel;

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/8a695097/spot-oa/ui/flow/js/constants/NetflowConstants.js
----------------------------------------------------------------------
diff --git a/spot-oa/ui/flow/js/constants/NetflowConstants.js b/spot-oa/ui/flow/js/constants/NetflowConstants.js
index 92da4b2..c0fc7a8 100755
--- a/spot-oa/ui/flow/js/constants/NetflowConstants.js
+++ b/spot-oa/ui/flow/js/constants/NetflowConstants.js
@@ -1,20 +1,15 @@
 var NetflowConstants = {
-  // Netflow Actions
-  RELOAD_INGEST_SUMMARY: 'RELOAD_INGEST_SUMMARY',
-  // INGEST SUMMARY
-  START_DATE: 'start-date',
-  END_DATE: 'end-date',
   // Data source URLS
   API_SUSPICIOUS: '../../data/flow/${date}/flow_scores.csv',
   API_DETAILS: '../../data/flow/${date}/edge-${src_ip}-${dst_ip}-${time}.tsv',
   API_VISUAL_DETAILS: '../../data/flow/${date}/chord-${ip}.tsv',
   API_COMMENTS: '../../data/flow/${date}/threats.csv',
   API_INCIDENT_PROGRESSION: '../../data/flow/${date}/threat-dendro-${ip}.json',
-  API_INGEST_SUMMARY: '../../data/flow/ingest_summary/is_${year}${month}.csv',
+  API_INGEST_SUMMARY: '../data/flow/ingest_summary/is_${year}${month}.csv',
   API_IMPACT_ANALYSIS: '../../data/flow/${date}/stats-${ip}.json',
   API_GLOBE_VIEW: '../../data/flow/${date}/globe-${ip}.json',
   API_WORLD_110M: '../flow/world-110m.json',
-  API_TIMELINE: '../../data/flow/${date}/sbdet-${ip}.tsv',
+  API_TIMELINE: '../../data/flow/${date}/sbdet-${ip}.tsv'
 };
 
 module.exports = NetflowConstants;

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/8a695097/spot-oa/ui/flow/js/ingest-summary.js
----------------------------------------------------------------------
diff --git a/spot-oa/ui/flow/js/ingest-summary.js b/spot-oa/ui/flow/js/ingest-summary.js
deleted file mode 100755
index 1bf71b7..0000000
--- a/spot-oa/ui/flow/js/ingest-summary.js
+++ /dev/null
@@ -1,94 +0,0 @@
-const React = require('react');
-const ReactDOM = require('react-dom');
-
-const SpotActions = require('../../js/actions/SpotActions');
-const InSumActions = require('./actions/InSumActions');
-const NetflowConstants = require('./constants/NetflowConstants');
-const SpotUtils = require('../../js/utils/SpotUtils');
-const DateUtils = require('../../js/utils/DateUtils');
-
-// Build and Render Toolbar
-const DateInput = require('../../js/components/DateInput.react');
-
-// Find out period
-var startDate, endDate, today;
-
-today = new Date();
-startDate = SpotUtils.getUrlParam(NetflowConstants.START_DATE);
-endDate = SpotUtils.getUrlParam(NetflowConstants.END_DATE);
-
-if (!startDate && endDate)
-{
-  startDate = DateUtils.formatDate(DateUtils.calcDate(DateUtils.parseDate(endDate), -7));
-}
-else if (startDate && !endDate)
-{
-  endDate = DateUtils.formatDate(DateUtils.calcDate(DateUtils.parseDate(startDate), 7));
-}
-else if (!startDate && !endDate)
-{
-  // Default endDate to today and startDate to 7 days before
-  startDate = DateUtils.formatDate(DateUtils.calcDate(today, -7));
-  endDate = DateUtils.formatDate(today);
-}
-
-// We got values for both dates, make use endDate is after startDate
-if (endDate < startDate)
-{
-  // Use today var to switch dates
-  today = startDate;
-  startDate = endDate;
-  endDate = today;
-}
-
-ReactDOM.render(
-  (
-    <form className="form-inline">
-      <div className="form-group">
-        <label htmlFor="startDatePicker">Period:</label>
-        <div className="input-group input-group-xs">
-          <div className="input-group-addon">
-            <span className="glyphicon glyphicon-calendar" aria-hidden="true"></span>
-          </div>
-          <DateInput id="startDatePicker" name={NetflowConstants.START_DATE} value={startDate}/>
-        </div>
-      </div>
-      <div className="form-group">
-        <label htmlFor="endDatePicker"> - </label>
-        <div className="input-group input-group-xs">
-          <DateInput id="endDatePicker" name={NetflowConstants.END_DATE} value={endDate} />
-          <div className="input-group-btn">
-            <button className="btn btn-default" type="button" title="Reload" onClick={InSumActions.reloadSummary}>
-              <span className="glyphicon glyphicon-repeat" aria-hidden="true"></span>
-            </button>
-          </div>
-        </div>
-      </div>
-    </form>
-  ),
-  document.getElementById('nav_form')
-);
-
-// Build and Render Edge Investigation's panels
-const PanelRow = require('../../js/components/PanelRow.react');
-const Panel = require('../../js/components/Panel.react');
-//
-const IngestSummaryPanel = require('./components/IngestSummaryPanel.react');
-
-ReactDOM.render(
-  <div id="spot-content">
-    <PanelRow maximized>
-      <Panel title="Ingest Summary" container header={false} className="col-md-12">
-        <IngestSummaryPanel id="spot-is" />
-      </Panel>
-    </PanelRow>
-  </div>,
-  document.getElementById('spot-content-wrapper')
-);
-
-// Set period
-SpotActions.setDate(startDate, NetflowConstants.START_DATE);
-SpotActions.setDate(endDate, NetflowConstants.END_DATE);
-
-// Load data
-InSumActions.reloadSummary();

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/8a695097/spot-oa/ui/flow/js/stores/IngestSummaryStore.js
----------------------------------------------------------------------
diff --git a/spot-oa/ui/flow/js/stores/IngestSummaryStore.js b/spot-oa/ui/flow/js/stores/IngestSummaryStore.js
index 9e4ccb5..55f2106 100755
--- a/spot-oa/ui/flow/js/stores/IngestSummaryStore.js
+++ b/spot-oa/ui/flow/js/stores/IngestSummaryStore.js
@@ -1,20 +1,20 @@
-var assign = require('object-assign');
-var d3 = require('d3');
+const assign = require('object-assign');
+const d3 = require('d3');
 
-var SpotDispatcher = require('../../../js/dispatchers/SpotDispatcher');
-var SpotConstants = require('../../../js/constants/SpotConstants');
-var NetflowConstants = require('../constants/NetflowConstants');
-var DateUtils = require('../../../js/utils/DateUtils');
-var RestStore = require('../../../js/stores/RestStore');
+const SpotDispatcher = require('../../../js/dispatchers/SpotDispatcher');
+const SpotConstants = require('../../../js/constants/SpotConstants');
+const NetflowConstants = require('../constants/NetflowConstants');
+const DateUtils = require('../../../js/utils/DateUtils');
+const RestStore = require('../../../js/stores/RestStore');
 
-var START_DATE_FILTER = NetflowConstants.START_DATE;
-var END_DATE_FILTER = NetflowConstants.END_DATE;
-var CURRENT_DATE_FILTER = 'current_date';
+const START_DATE_FILTER = SpotConstants.START_DATE;
+const END_DATE_FILTER = SpotConstants.END_DATE;
+const CURRENT_DATE_FILTER = 'current_date';
 
-var requestQueue = [];
-var requestErrors = [];
+const requestQueue = [];
+const requestErrors = [];
 
-var IngestSummaryStore = assign(new RestStore(NetflowConstants.API_INGEST_SUMMARY), {
+const IngestSummaryStore = assign(new RestStore(NetflowConstants.API_INGEST_SUMMARY), {
     errorMessages: {
         404: 'No details available'
     },
@@ -128,7 +128,7 @@ var IngestSummaryStore = assign(new RestStore(NetflowConstants.API_INGEST_SUMMAR
         this._data.loading = requestQueue.length > 0;
 
         if (!this._data.loading) {
-            if (this._data.data.length==0) {
+            if (this._data.data && this._data.data.length==0) {
                 // Broadcast first found error
                 this._data = requestErrors[0];
             }
@@ -144,15 +144,15 @@ SpotDispatcher.register(function (action) {
     switch (action.actionType) {
         case SpotConstants.UPDATE_DATE:
             switch (action.name) {
-                case NetflowConstants.START_DATE:
+                case SpotConstants.START_DATE:
                     IngestSummaryStore.setStartDate(action.date);
                     break;
-                case NetflowConstants.END_DATE:
+                case SpotConstants.END_DATE:
                     IngestSummaryStore.setEndDate(action.date);
                     break;
             }
             break;
-        case NetflowConstants.RELOAD_INGEST_SUMMARY:
+        case SpotConstants.RELOAD_INGEST_SUMMARY:
             IngestSummaryStore.requestSummary();
             break;
     }

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/8a695097/spot-oa/ui/flow/package.json
----------------------------------------------------------------------
diff --git a/spot-oa/ui/flow/package.json b/spot-oa/ui/flow/package.json
index dcf4171..62f47ae 100755
--- a/spot-oa/ui/flow/package.json
+++ b/spot-oa/ui/flow/package.json
@@ -6,12 +6,10 @@
     "watch-suspicious": "watchify js/suspicious.js -o js/suspicious.bundle.min.js -v -d",
     "watch-threat-investigation": "watchify js/threat-investigation.js -o js/threat-investigation.bundle.min.js -v -d",
     "watch-storyboard": "watchify js/storyboard.js -o js/storyboard.bundle.min.js -v -d",
-    "watch-ingest-summary": "watchify js/ingest-summary.js -o js/ingest-summary.bundle.min.js -v -d",
-    "build-all": "npm run build-suspicious && npm run build-threat-investigation && npm run build-ingest-summary && npm run build-storyboard",
+    "build-all": "npm run build-suspicious && npm run build-threat-investigation && npm run build-storyboard",
     "build-suspicious": "browserify js/suspicious.js | uglifyjs -cm > js/suspicious.bundle.min.js",
     "build-threat-investigation": "browserify js/threat-investigation.js | uglifyjs -cm > js/threat-investigation.bundle.min.js",
-    "build-storyboard": "browserify js/storyboard.js | uglifyjs -cm > js/storyboard.bundle.min.js",
-    "build-ingest-summary": "browserify js/ingest-summary.js | uglifyjs -cm > js/ingest-summary.bundle.min.js"
+    "build-storyboard": "browserify js/storyboard.js | uglifyjs -cm > js/storyboard.bundle.min.js"
   },
   "browserify": {
     "transform": [

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/8a695097/spot-oa/ui/ingest-summary.html
----------------------------------------------------------------------
diff --git a/spot-oa/ui/ingest-summary.html b/spot-oa/ui/ingest-summary.html
new file mode 100755
index 0000000..e694609
--- /dev/null
+++ b/spot-oa/ui/ingest-summary.html
@@ -0,0 +1,155 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Netflow :: Ingest Summary</title>
+
+    <meta charset="UTF-8">
+
+    <!--Bootstrap styles-->
+    <link rel="stylesheet" type="text/css" href="css/bootstrap-spot.min.css" />
+    <!--Bootstrap Date picker styles-->
+    <link rel="stylesheet" type="text/css" href="node_modules/bootstrap-datepicker/dist/css/bootstrap-datepicker3.min.css" />
+    <!-- Spot styles -->
+    <link rel="stylesheet" type="text/css" href="css/main.css" />
+    <!-- Favicon -->
+    <link rel="apple-touch-icon" sizes="57x57" href="images/favicon/apple-icon-57x57.png">
+    <link rel="apple-touch-icon" sizes="60x60" href="images/favicon/apple-icon-60x60.png"
+    <link rel="apple-touch-icon" sizes="72x72" href="images/favicon/apple-icon-72x72.png">
+    <link rel="apple-touch-icon" sizes="76x76" href="images/favicon/apple-icon-76x76.png">
+    <link rel="apple-touch-icon" sizes="114x114" href="images/favicon/apple-icon-114x114.png">
+    <link rel="apple-touch-icon" sizes="120x120" href="images/favicon/apple-icon-120x120.png">
+    <link rel="apple-touch-icon" sizes="144x144" href="images/favicon/apple-icon-144x144.png">
+    <link rel="apple-touch-icon" sizes="152x152" href="images/favicon/apple-icon-152x152.png">
+    <link rel="apple-touch-icon" sizes="180x180" href="images/favicon/apple-icon-180x180.png">
+    <link rel="icon" type="image/png" sizes="192x192"  href="images/favicon/android-icon-192x192.png">
+    <link rel="icon" type="image/png" sizes="32x32" href="images/favicon/favicon-32x32.png">
+    <link rel="icon" type="image/png" sizes="96x96" href="images/favicon/favicon-96x96.png">
+    <link rel="icon" type="image/png" sizes="16x16" href="images/favicon/favicon-16x16.png">
+    <link rel="manifest" href="images/favicon/manifest.json">
+    <meta name="msapplication-TileColor" content="#ffffff">
+    <meta name="msapplication-TileImage" content="images/favicon/ms-icon-144x144.png">
+    <meta name="theme-color" content="#ffffff">
+    <style type="text/css">
+        .spot-row {
+            height: 100%;
+        }
+
+        #spot-is-header {
+            width: 100%;
+            position: absolute;
+            top: 0;
+            left: 0;
+            z-index: 2;
+            height: auto;
+        }
+
+        #spot-is, #spot-is-summary {
+            height: 100%;
+        }
+
+        .axis {
+            shape-rendering: crispEdges;
+        }
+
+        .axis path, .axis line {
+            fill: none;
+        }
+
+        rect.pane {
+            cursor: e-resize;
+            fill: none;
+            pointer-events: all;
+        }
+    </style>
+</head>
+<body>
+    <nav id="spot-nav" class="navbar navbar-default">
+        <div class="container-fluid">
+            <!-- App name and toggle get grouped for better mobile display -->
+            <div class="navbar-header">
+                <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#main-menu">
+                    <span class="sr-only">Toggle navigation</span>
+                    <span class="icon-bar"></span>
+                    <span class="icon-bar"></span>
+                    <span class="icon-bar"></span>
+                </button>
+                <span class="navbar-brand">Apache Spot :: Netflow :: Ingest Summary</span>
+            </div>
+            <!-- Collect the nav links, forms, and other content for toggling -->
+            <div class="collapse navbar-collapse" id="main-menu">
+                <ul class="nav navbar-nav navbar-right">
+                    <li class="dropdown">
+                        <a class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">
+                            Flows
+                            <span class="caret"></span>
+                        </a>
+                        <ul class="dropdown-menu" aria-labelledby="flowsMenu">
+                            <li>
+                                <a data-href="flow/suspicious.html#date=${end-date}">Suspicious</a>
+                            </li>
+                            <li>
+                                <a data-href="flow/threat-investigation.html#date=${end-date}">Threat Investigation</a>
+                            </li>
+                            <li>
+                                <a data-href="flow/storyboard.html#date=${end-date}">Storyboard</a>
+                            </li>
+                        </ul>
+                    </li>
+                    <li class="dropdown">
+                        <a class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">
+                            DNS
+                            <span class="caret"></span>
+                        </a>
+                        <ul class="dropdown-menu" aria-labelledby="dnsMenu">
+                            <li>
+                                <a data-href="dns/suspicious.html#date=${end-date}">Suspicious</a>
+                            </li>
+                            <li>
+                                <a data-href="dns/threat-investigation.html#date=${end-date}">Threat Investigation</a>
+                            </li>
+                            <li>
+                                <a data-href="dns/storyboard.html#date=${end-date}">Storyboard</a>
+                            </li>
+                        </ul>
+                    </li>
+                    <li class="dropdown">
+                        <a class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">
+                            Proxy
+                            <span class="caret"></span>
+                        </a>
+                        <ul class="dropdown-menu" aria-labelledby="proxyMenu">
+                            <li>
+                                <a data-href="proxy/suspicious.html#date=${end-date}">Suspicious</a>
+                            </li>
+                            <li>
+                                <a data-href="proxy/threat-investigation.html#date=${end-date}">Threat Investigation</a>
+                            </li>
+                            <li>
+                                <a data-href="proxy/storyboard.html#date=${end-date}">Storyboard</a>
+                            </li>
+                        </ul>
+                    </li>
+                </ul>
+            </div>
+            <div id="search-box" class="row text-right">
+                <!--Tools Buttons-->
+                <div id="nav_form" class="col-md-12">
+                  <!-- Search form placeholder -->
+                </div>
+            </div> <!-- /Tools Buttons-->
+        </div>
+    </nav>
+    <div id="spot-content-wrapper" class="container-fluid">
+      <!-- Main Content Placeholder -->
+    </div>
+
+    <!-- SCRIPTS -->
+    <script type="application/javascript" src="node_modules/jquery/dist/jquery.min.js"></script>
+    <script type="application/javascript" src="node_modules/d3/d3.min.js"></script>
+    <script type="application/javascript" src="node_modules/bootstrap/dist/js/bootstrap.min.js"></script>
+    <script type="application/javascript" src="node_modules/bootstrap-datepicker/dist/js/bootstrap-datepicker.min.js"></script>
+    <script type="application/javascript" src="node_modules/react/dist/react.min.js"></script>
+    <script type="application/javascript" src="node_modules/react-dom/dist/react-dom.min.js"></script>
+    <script type="application/javascript" src="js/ingest-summary.bundle.min.js"></script>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/8a695097/spot-oa/ui/js/actions/InSumActions.js
----------------------------------------------------------------------
diff --git a/spot-oa/ui/js/actions/InSumActions.js b/spot-oa/ui/js/actions/InSumActions.js
new file mode 100755
index 0000000..ea15715
--- /dev/null
+++ b/spot-oa/ui/js/actions/InSumActions.js
@@ -0,0 +1,12 @@
+const SpotDispatcher = require('../dispatchers/SpotDispatcher');
+const SpotConstants = require('../constants/SpotConstants');
+
+const InSumActions = {
+  reloadSummary: function () {
+    SpotDispatcher.dispatch({
+      actionType: SpotConstants.RELOAD_INGEST_SUMMARY
+    });
+  }
+};
+
+module.exports = InSumActions;

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/8a695097/spot-oa/ui/js/components/IngestSummaryPanel.react.js
----------------------------------------------------------------------
diff --git a/spot-oa/ui/js/components/IngestSummaryPanel.react.js b/spot-oa/ui/js/components/IngestSummaryPanel.react.js
new file mode 100755
index 0000000..cc951ad
--- /dev/null
+++ b/spot-oa/ui/js/components/IngestSummaryPanel.react.js
@@ -0,0 +1,236 @@
+const $ = require('jquery');
+const d3 = require('d3');
+const React = require('react');
+const ReactDOM = require('react-dom');
+
+const DateUtils = require('../utils/DateUtils');
+const InSumActions = require('../actions/InSumActions');
+
+const NetflowIngestSummaryStore = require('../../flow/js/stores/IngestSummaryStore');
+
+function initialDraw() {
+  var rootNode, format, x, y, xAxis, yAxis, area, svg, rect, total, minDate, maxDate, maxFlows, numberFormat;
+
+  rootNode = d3.select(ReactDOM.findDOMNode(this));
+
+  // graph dimensions
+  var m = [100, 50, 50, 80], // Margins: [top, right, bottom, left]
+      w = $(rootNode.node()).width() - m[1] - m[3], // Width
+      h = $(rootNode.node()).height() - m[0] - m[2]; // Height
+
+  format = d3.time.format("%Y-%m-%d %H:%M");
+
+  // Scales.
+  x = d3.time.scale().range([0, w]); // get X function
+  y = d3.scale.linear().range([h, 0]); // get Y function
+  xAxis = d3.svg.axis().scale(x).orient("bottom"); // Get the X axis (Time)
+  yAxis = d3.svg.axis().scale(y).orient("left"); // Get Y Axis (Netflows)
+
+  // An area generator.
+  area = d3.svg.area()
+        .x(function (d) {
+            return x(d.date);
+        })
+        .y0(h)
+        .y1(function (d) {
+            if (!isNaN(d.total))
+                return y(d.total);
+            else
+                return y(0);
+        });
+
+  rootNode.select('svg').remove();
+
+  // define the Main SVG
+  svg = rootNode.select('#' + this.props.id + '-summary').append("svg")
+    .attr("width", w + m[1] + m[3])
+    .attr("height", h + m[0] + m[2])
+        .append("g")
+        .attr("transform", "translate(" + m[3] + "," + m[0] + ")")
+
+  // Append a clipPath so the area paths do not spill outside the plot when zoomed
+  svg.append("clipPath")
+        .attr("id", "clip")
+        .append("rect")
+          .attr("x", x(0))
+          .attr("y", y(1))
+          .attr("width", x(1) - x(0))
+          .attr("height", y(0) - y(1));
+
+  // Append the Y Axis group
+  svg.append("g")
+    .attr("class", "y axis");
+
+  // Append the X axis group
+  svg.append("g")
+    .attr("class", "x axis")
+    .attr("transform", "translate(0," + h + ")");
+
+  // Append a pane rect that will capture mouse events for the zoom behavior
+  rect = svg.append("rect")
+        .attr("class", "pane")
+        .attr("width", w)
+        .attr("height", h);
+
+  this.state.data.forEach(function (dataSet)
+  {
+    var a;
+
+    a = [{date: minDate}];
+    a.push.apply(a, dataSet);
+    minDate = d3.min(a, function (d) { return d.date; });
+    a[0] = {date: maxDate, flows: maxFlows};
+    maxDate = d3.max(a, function (d) { return d.date; });
+    maxFlows = d3.max(a, function (d) { return d.total; });
+  });
+
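+  // Fall back to the selected filter dates when no data rows carried dates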
+  !minDate && (minDate = DateUtils.parseDate(NetflowIngestSummaryStore.getStartDate()));
+  !maxDate && (maxDate = DateUtils.parseDate(NetflowIngestSummaryStore.getEndDate()));
+
+  // bind the data to the X and Y generators
+  x.domain([minDate, maxDate]);
+  y.domain([0, maxFlows]);
+
+  // Bind the data to our path element.
+  svg.selectAll("path.area").data(this.state.data).enter().insert('path', 'g')
+                                                .attr('class', 'area')
+                                                .attr('clip-path', 'url(#clip)')
+                                                .style('fill', '#0071c5')
+                                                .attr('d', function (d) {
+                                                    return area(d);
+                                                });
+
+  // Attach the zoom behavior to the pane rect
+  rect.call(d3.behavior.zoom().x(x)
+      .scaleExtent([0.3, 2300]) // empirical limits that keep the graph from zooming in/out indefinitely
+      .on("zoom", zoom.bind(this)));
+
+  function draw () {
+    var total, minDate, maxDate, numberFormat;
+
+    svg.select("g.x.axis").call(xAxis);
+    svg.select("g.y.axis").call(yAxis);
+    svg.selectAll("path.area").attr("d", function (d) { return area(d); });
+    numberFormat = d3.format(",d"); // number formatter (comma separated number i.e. 100,000,000)
+
+    rootNode.select('#' + this.props.id + '-range').html("Showing total flows <strong>from:</strong> " + x.domain().map(format).join(" <strong>to:</strong> "));
+
+    // Calculate the total flows within the displayed date range
+
+    total = 0;
+    minDate = x.domain()[0];
+    maxDate = x.domain()[1];
+
+    // Snap minDate to the start of its minute and maxDate to its last second
+    minDate.setSeconds(0);minDate.setMilliseconds(0);
+    maxDate.setSeconds(59);maxDate.setMilliseconds(0);
+
+    svg.selectAll("path.area").data().forEach(function (pathData)
+    {
+      pathData.forEach(function (record)
+      {
+        // Discard records outside displayed date range
+        if (record.date >= minDate && record.date <= maxDate) {
+          total += +record.total;
+        }
+      });
+    });
+
+    rootNode.select('#' + this.props.id + '-total').html("<strong>Total netflows in range:</strong> " + numberFormat(total));
+  }
+
+  /*
+      Zoom event handler
+  */
+  function zoom() {
+    if (d3.event.sourceEvent.type == "wheel") {
+      if (d3.event.sourceEvent.wheelDelta < 0)
+         rect.style("cursor", "zoom-out");
+      else
+         rect.style("cursor", "zoom-in");
+    }
+    else if (d3.event.sourceEvent.type == "mousemove") {
+      rect.style("cursor", "e-resize");
+    }
+
+    draw.call(this);
+  }
+
+  draw.call(this);
+}
+
+var IngestSummaryPanel = React.createClass({
+  propTypes: {
+    id: React.PropTypes.string
+  },
+  getDefaultProps: function () {
+    return {
+      id: 'spot-is'
+    };
+  },
+  getInitialState: function ()
+  {
+    return {loading: true};
+  },
+  render:function()
+  {
+    var content;
+
+    if (this.state.error)
+    {
+      content = (
+        <div className="text-center text-danger">
+          {this.state.error}
+        </div>
+      );
+    }
+    else if (this.state.loading)
+    {
+      content = (
+        <div className="spot-loader">
+            Loading <span className="spinner"></span>
+        </div>
+      );
+    }
+    else
+    {
+      content = (
+        <div id={this.props.id} className="text-center">
+          <div id={this.props.id + '-header'}>
+            <p id={this.props.id + '-range'}></p>
+            <p id={this.props.id + '-total'}></p>
+            <p id={this.props.id + '-instructions'} className="small">** Zoom in/out using the mouse wheel or two fingers on a trackpad <br /> ** Move along the x-axis by clicking anywhere in the graph and dragging left or right</p>
+          </div>
+          <div id={this.props.id + '-summary'}></div>
+        </div>
+      );
+    }
+
+    return (
+      <div>{content}</div>
+    )
+  },
+  componentDidMount: function()
+  {
+    NetflowIngestSummaryStore.addChangeDataListener(this._onChange);
+    window.addEventListener('resize', this.buildGraph);
+  },
+  componentWillUnmount: function ()
+  {
+    NetflowIngestSummaryStore.removeChangeDataListener(this._onChange);
+    window.removeEventListener('resize', this.buildGraph);
+  },
+  componentDidUpdate: function ()
+  {
+    if (!this.state.loading && !this.state.error && this.state.data)
+    {
+      this.buildGraph();
+    }
+  },
+  buildGraph: initialDraw,
+  _onChange: function () {
+    this.replaceState(NetflowIngestSummaryStore.getData());
+  }
+});
+
+module.exports = IngestSummaryPanel;
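
Side note: the pan/zoom above works because d3.behavior.zoom().x(x) rewrites the domain of the shared time scale on every gesture, so draw() only re-renders the axes and areas against the updated x. A minimal standalone d3 v3 sketch of that pattern (the selector and dates are hypothetical, not part of this commit):

  var x = d3.time.scale()
      .range([0, 600])
      .domain([new Date(2016, 0, 1), new Date(2017, 0, 1)]);

  var zoom = d3.behavior.zoom()
      .x(x)                      // bind the scale; gestures mutate x.domain()
      .scaleExtent([0.5, 10])    // clamp how far in/out the user can zoom
      .on('zoom', function () {
        console.log(x.domain()); // a real chart would redraw axes/areas here
      });

  d3.select('rect.pane').call(zoom); // attach to an event-capturing rect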

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/8a695097/spot-oa/ui/js/constants/SpotConstants.js
----------------------------------------------------------------------
diff --git a/spot-oa/ui/js/constants/SpotConstants.js b/spot-oa/ui/js/constants/SpotConstants.js
index 03471c4..35e23c4 100755
--- a/spot-oa/ui/js/constants/SpotConstants.js
+++ b/spot-oa/ui/js/constants/SpotConstants.js
@@ -30,6 +30,11 @@ var SpotConstants = {
   // Storyboard
   RELOAD_COMMENTS: 'RELOAD_COMMENTS',
   SELECT_COMMENT: 'SELECT_COMMENT',
+  // INGEST SUMMARY
+  START_DATE: 'start-date',
+  END_DATE: 'end-date',
+  // Ingest summary Actions
+  RELOAD_INGEST_SUMMARY: 'RELOAD_INGEST_SUMMARY',
   // Server Paths
   NOTEBOOKS_PATH: '/notebooks/ipynb'
 };

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/8a695097/spot-oa/ui/js/ingest-summary.js
----------------------------------------------------------------------
diff --git a/spot-oa/ui/js/ingest-summary.js b/spot-oa/ui/js/ingest-summary.js
new file mode 100755
index 0000000..c42ed41
--- /dev/null
+++ b/spot-oa/ui/js/ingest-summary.js
@@ -0,0 +1,94 @@
+const React = require('react');
+const ReactDOM = require('react-dom');
+
+const SpotActions = require('./actions/SpotActions');
+const InSumActions = require('./actions/InSumActions');
+const SpotConstants = require('./constants/SpotConstants');
+const SpotUtils = require('./utils/SpotUtils');
+const DateUtils = require('./utils/DateUtils');
+
+// Build and Render Toolbar
+const DateInput = require('./components/DateInput.react');
+
+// Find out period
+var startDate, endDate, today;
+
+today = new Date();
+startDate = SpotUtils.getUrlParam(SpotConstants.START_DATE);
+endDate = SpotUtils.getUrlParam(SpotConstants.END_DATE);
+
+if (!startDate && endDate)
+{
+  startDate = DateUtils.formatDate(DateUtils.calcDate(DateUtils.parseDate(endDate), -7));
+}
+else if (startDate && !endDate)
+{
+  endDate = DateUtils.formatDate(DateUtils.calcDate(DateUtils.parseDate(startDate), 7));
+}
+else if (!startDate && !endDate)
+{
+  // Default endDate to today and startDate to 7 days before
+  startDate = DateUtils.formatDate(DateUtils.calcDate(today, -7));
+  endDate = DateUtils.formatDate(today);
+}
+
+// We got values for both dates; make sure endDate is after startDate
+if (endDate < startDate)
+{
+  // Reuse the today var to swap the dates
+  today = startDate;
+  startDate = endDate;
+  endDate = today;
+}
+
+ReactDOM.render(
+  (
+    <form className="form-inline">
+      <div className="form-group">
+        <label htmlFor="startDatePicker">Period:</label>
+        <div className="input-group input-group-xs">
+          <div className="input-group-addon">
+            <span className="glyphicon glyphicon-calendar" aria-hidden="true"></span>
+          </div>
+          <DateInput id="startDatePicker" name={SpotConstants.START_DATE} value={startDate}/>
+        </div>
+      </div>
+      <div className="form-group">
+        <label htmlFor="endDatePicker"> - </label>
+        <div className="input-group input-group-xs">
+          <DateInput id="endDatePicker" name={SpotConstants.END_DATE} value={endDate} />
+          <div className="input-group-btn">
+            <button className="btn btn-default" type="button" title="Reload" onClick={InSumActions.reloadSummary}>
+              <span className="glyphicon glyphicon-repeat" aria-hidden="true"></span>
+            </button>
+          </div>
+        </div>
+      </div>
+    </form>
+  ),
+  document.getElementById('nav_form')
+);
+
+// Build and render the Ingest Summary panel
+const PanelRow = require('./components/PanelRow.react');
+const Panel = require('./components/Panel.react');
+//
+const IngestSummaryPanel = require('./components/IngestSummaryPanel.react');
+
+ReactDOM.render(
+  <div id="spot-content">
+    <PanelRow maximized>
+      <Panel title="Ingest Summary" container header={false} className="col-md-12">
+        <IngestSummaryPanel id="spot-is" />
+      </Panel>
+    </PanelRow>
+  </div>,
+  document.getElementById('spot-content-wrapper')
+);
+
+// Set period
+SpotActions.setDate(startDate, SpotConstants.START_DATE);
+SpotActions.setDate(endDate, SpotConstants.END_DATE);
+
+// Load data
+InSumActions.reloadSummary();

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/8a695097/spot-oa/ui/js/stores/IngestSummaryStore.js
----------------------------------------------------------------------
diff --git a/spot-oa/ui/js/stores/IngestSummaryStore.js b/spot-oa/ui/js/stores/IngestSummaryStore.js
new file mode 100755
index 0000000..9e4ccb5
--- /dev/null
+++ b/spot-oa/ui/js/stores/IngestSummaryStore.js
@@ -0,0 +1,161 @@
+var assign = require('object-assign');
+var d3 = require('d3');
+
+var SpotDispatcher = require('../../../js/dispatchers/SpotDispatcher');
+var SpotConstants = require('../../../js/constants/SpotConstants');
+var NetflowConstants = require('../constants/NetflowConstants');
+var DateUtils = require('../../../js/utils/DateUtils');
+var RestStore = require('../../../js/stores/RestStore');
+
+var START_DATE_FILTER = NetflowConstants.START_DATE;
+var END_DATE_FILTER = NetflowConstants.END_DATE;
+var CURRENT_DATE_FILTER = 'current_date';
+
+var requestQueue = [];
+var requestErrors = [];
+
+var IngestSummaryStore = assign(new RestStore(NetflowConstants.API_INGEST_SUMMARY), {
+    errorMessages: {
+        404: 'No details available'
+    },
+    setStartDate: function (date) {
+        this.setRestFilter(START_DATE_FILTER, date);
+    },
+    getStartDate: function () {
+        return this.getRestFilter(START_DATE_FILTER);
+    },
+    setEndDate: function (date) {
+        this.setRestFilter(END_DATE_FILTER, date);
+    },
+    getEndDate: function () {
+        return this.getRestFilter(END_DATE_FILTER);
+    },
+    /**
+     *  Start asking the server for CSV data to create the chart
+     **/
+    requestSummary: function () {
+        var startDate, endDate, date, delta, startRequests, i, month;
+
+        startDate = DateUtils.parseDate(this.getRestFilter(START_DATE_FILTER));
+        endDate = DateUtils.parseDate(this.getRestFilter(END_DATE_FILTER));
+
+        // Find out how many requests need to be made
+        delta = (endDate.getFullYear() - startDate.getFullYear()) * 12 + (endDate.getMonth() - startDate.getMonth());
+
+        startRequests = requestQueue.length == 0;
+
+        // Go to first day in month
+        date = new Date(startDate);
+        date.setDate(1);
+
+        // Queue date requests
+        requestQueue.push(date);
+        for (i = 1; i <= delta; i++) {
+            requestQueue.push(DateUtils.calcDate(date, i, 'month'));
+        }
+
+        // Dequeue only if no request is currently running
+        startRequests && this.dequeue();
+    },
+    dequeue: function () {
+        var date, year, month;
+
+        if (requestQueue.length == 0) return;
+
+        date = requestQueue.shift();
+        this.setRestFilter(CURRENT_DATE_FILTER, date);
+        year = date.getFullYear();
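+        // Zero-pad the month so it matches the is_YYYYMM.csv file naming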
+        month = date.getMonth() + 1 + "";
+        month = month.length == 1 ? "0" + month : month;
+
+        this.setEndpoint(NetflowConstants.API_INGEST_SUMMARY.replace('${year}', year).replace('${month}', month));
+
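+        // reload() fetches the month's CSV; setData() below consumes the response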
+        this.reload();
+    },
+    setData: function (data) {
+        var startDate, endDate, date, dayFilter, parse;
+
+        // Does the loading indicator need to be displayed?
+        if (data.loading) {
+            if (!this._data.loading) {
+                this._data = data;
+                this.emitChangeData();
+            }
+
+            // Do nothing when loading is in progress
+            return;
+        }
+
+        // Store errors for later usage
+        if (data.error) {
+            requestErrors.push(data);
+        }
+        else if (data.data) {
+            parse = d3.time.format("%Y-%m-%d %H:%M").parse; // Date formatting parser
+            startDate = DateUtils.parseDate(this.getRestFilter(START_DATE_FILTER));
+            endDate = DateUtils.parseDate(this.getRestFilter(END_DATE_FILTER));
+            date = DateUtils.parseDate(this.getRestFilter(CURRENT_DATE_FILTER));
+
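+            // Trim rows outside the requested day range in the first and last months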
+            if (date.getFullYear() == startDate.getFullYear() && date.getMonth() == startDate.getMonth()) {
+                dayFilter = startDate.getDate();
+                data.data = data.data.filter(function (row) {
+                    return DateUtils.parseDate(row.date, true).getDate() >= dayFilter
+                });
+            }
+
+            if (date.getFullYear() == endDate.getFullYear() && date.getMonth() == endDate.getMonth()) {
+                dayFilter = endDate.getDate();
+                data.data = data.data.filter(function (row) {
+                    return DateUtils.parseDate(row.date, true).getDate() <= dayFilter
+                });
+            }
+
+            // Parse dates and numbers.
+            data.data.forEach(function (d) {
+                d.date = parse(d.date);
+                d.flows = +d.flows;
+            });
+
+            // Sort the data by date ASC
+            data.data.sort(function (a, b) {
+                return a.date - b.date;
+            });
+
+            if (!this._data.data) this._data.data = [];
+            this._data.data.push(data.data);
+        }
+
+        this._data.loading = requestQueue.length > 0;
+
+        if (!this._data.loading) {
+            if (this._data.data.length==0) {
+                // Broadcast first found error
+                this._data = requestErrors[0];
+            }
+            this.emitChangeData();
+        }
+        else {
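+            // More months are queued: fetch the next one asynchronously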
+            setTimeout(this.dequeue.bind(this), 1);
+        }
+    }
+});
+
+SpotDispatcher.register(function (action) {
+    switch (action.actionType) {
+        case SpotConstants.UPDATE_DATE:
+            switch (action.name) {
+                case NetflowConstants.START_DATE:
+                    IngestSummaryStore.setStartDate(action.date);
+                    break;
+                case NetflowConstants.END_DATE:
+                    IngestSummaryStore.setEndDate(action.date);
+                    break;
+            }
+            break;
+        case NetflowConstants.RELOAD_INGEST_SUMMARY:
+            IngestSummaryStore.requestSummary();
+            break;
+    }
+});
+
+module.exports = IngestSummaryStore;
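
Side note on the month arithmetic in requestSummary above: the initial push plus the delta loop issue one request per month the range touches. A standalone sketch of the same count (plain JavaScript; the sample dates are hypothetical):

  // Number of monthly is_YYYYMM.csv requests needed to cover [start, end]
  function monthSpan(start, end) {
    return (end.getFullYear() - start.getFullYear()) * 12 +
           (end.getMonth() - start.getMonth()) + 1;
  }

  // 2016-11-20 .. 2017-01-05 touches Nov, Dec and Jan => 3 requests
  console.log(monthSpan(new Date(2016, 10, 20), new Date(2017, 0, 5))); // 3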

http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/8a695097/spot-oa/ui/package.json
----------------------------------------------------------------------
diff --git a/spot-oa/ui/package.json b/spot-oa/ui/package.json
index 2f40acb..724544d 100644
--- a/spot-oa/ui/package.json
+++ b/spot-oa/ui/package.json
@@ -36,10 +36,12 @@
   "scripts": {
     "test": "jest",
     "postinstall": "npm run build-all",
-    "build-all": "npm run build-flow && npm run build-dns && npm run build-proxy",
+    "watch-ingest-summary": "watchify js/ingest-summary.js -o js/ingest-summary.bundle.min.js -v -d",
+    "build-all": "npm run build-flow && npm run build-dns && npm run build-proxy && npm run build-ingest-summary",
     "build-flow": "cd flow/ && npm run build-all && cd ../",
     "build-dns": "cd dns/ && npm run build-all && cd ../",
-    "build-proxy": "cd proxy/ && npm run build-all && cd ../"
+    "build-proxy": "cd proxy/ && npm run build-all && cd ../",
+    "build-ingest-summary": "browserify js/ingest-summary.js | uglifyjs -cm > js/ingest-summary.bundle.min.js"
   },
   "browserify": {
     "transform": [


[48/49] incubator-spot git commit: Replacing old wiki with apache spot doc

Posted by ev...@apache.org.
Replacing old wiki with apache spot doc


Project: http://git-wip-us.apache.org/repos/asf/incubator-spot/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-spot/commit/6ae14f50
Tree: http://git-wip-us.apache.org/repos/asf/incubator-spot/tree/6ae14f50
Diff: http://git-wip-us.apache.org/repos/asf/incubator-spot/diff/6ae14f50

Branch: refs/heads/master
Commit: 6ae14f501d64ac3359727b7e0e8b982f75b1686a
Parents: 6321c7c
Author: Everardo Lopez Sandoval (Intel) <el...@elopezsa-mac02.local>
Authored: Thu Jan 19 09:19:04 2017 -0800
Committer: Everardo Lopez Sandoval (Intel) <el...@elopezsa-mac02.ra.intel.com>
Committed: Fri Jan 20 17:01:02 2017 -0800

----------------------------------------------------------------------
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-spot/blob/6ae14f50/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index f089cb7..541bba9 100644
--- a/README.md
+++ b/README.md
@@ -56,7 +56,7 @@ For the full instructions visit the [spot](https://hub.docker.com/r/opennetworki
 
 ## **Getting Started**
 
-Apache Spot can be installed by following our installation manual. To get started, [check out the installation instructions in the documentation](https://github.com/Open-Network-Insight/open-network-insight/wiki).
+Apache Spot can be installed by following our installation manual. To get started, [check out the installation instructions in the documentation](http://spot.incubator.apache.org/doc/).
 
 ## **Documentation (Developer Guide)**