You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@esme.apache.org by vd...@apache.org on 2009/12/09 22:02:28 UTC

svn commit: r888970 - /incubator/esme/trunk/server/src/main/scala/org/apache/esme/model/Message.scala

Author: vdichev
Date: Wed Dec  9 21:02:28 2009
New Revision: 888970

URL: http://svn.apache.org/viewvc?rev=888970&view=rev
Log:
Move stemmer and methods to Message's companion object.

Modified:
    incubator/esme/trunk/server/src/main/scala/org/apache/esme/model/Message.scala

Modified: incubator/esme/trunk/server/src/main/scala/org/apache/esme/model/Message.scala
URL: http://svn.apache.org/viewvc/incubator/esme/trunk/server/src/main/scala/org/apache/esme/model/Message.scala?rev=888970&r1=888969&r2=888970&view=diff
==============================================================================
--- incubator/esme/trunk/server/src/main/scala/org/apache/esme/model/Message.scala (original)
+++ incubator/esme/trunk/server/src/main/scala/org/apache/esme/model/Message.scala Wed Dec  9 21:02:28 2009
@@ -60,6 +60,7 @@
 
   private val idCache = new LRU[Long, Message](cacheSize)
 
+  private val stemmer:PorterStemmer = new PorterStemmer()
 
   /**
    * A list of messages is requested from a simple cache
@@ -214,10 +215,73 @@
     logger.fine("Modified query: " + modifiedQueryParams)
     super.findMapDb(dbId, modifiedQueryParams:_*)(f)
   }
+
+  // Compounds a bunch of (String, Int) elements so that [(String1, Int1), (String2, Int2)] becomes [(StringN, Int1+Int2)]
+  //  if String1 and String2 have the same stem (according to the Porter stemming algorithm). StringN is the shorter of
+  //  String1 and String2
+  private[model] def compoundStem(llsi: List[(String,Int)]): List[(String,Int)] = {
+    val stemCache = llsi.foldLeft[Map[String, String]](Map.empty){
+      case (map, (str, _)) => if (map.contains(str)) map
+        else map + (str -> stemWord(str))
+    }
+    def shortWord(a: String, b: String): String =
+    if (a.length < b.length) a else b
+
+    val stemToWord: Map[String, String] = Map(
+      // create a map from stem to all the words that
+      // stem down to that word
+      stemCache.toList.
+      foldLeft[Map[String, List[String]]](Map.empty){
+        case (map, (word, stem)) =>
+          map + (stem -> (word :: map.getOrElse(stem, Nil)))
+      }.toList.
+      // convert the list of stemmed words to the shortest word
+      // matching the stem
+      map{
+        case (key, value) => (key, value.reduceLeft(shortWord))
+      } :_*)
+
+    llsi.foldLeft[Map[String, Int]](Map.empty){
+      case (map, (str, cnt)) =>
+        val sw = stemCache(str)
+        map + (sw -> (map.getOrElse(sw, 0) + cnt))
+    }.toList.map{ case (stem, cnt) => (stemToWord(stem), cnt)}
+  }
+  
+  def centreWeightedTopNWordFreqs(messages: List[Message], n:Int):List[(String, Float)] = {
+    val weights = compoundStem(messages.flatMap(_.wordFrequencies))
+
+    // Start with the top N tags
+    val sortedWeights = weights.sort(_._2 > _._2).take(n)
+
+    // And create a normalized centre-weighted list, e.g. smallest, small, Larger, BIG, *HUGE*, BIG, Larger, small, smallest
+    TagUtils.normalize(TagUtils.everyEven(sortedWeights).reverse ::: TagUtils.everyOdd(sortedWeights))
+  }
+
+  /**
+   * Stem an incoming string
+   */
+  def stemWord(in: String): String = stemmer.synchronized {
+    stemmer.setCurrent(in)
+    stemmer.stem()
+    stemmer.getCurrent()
+  }
+  
+  def transformBody(ns: NodeSeq) = {
+    import scala.xml.transform.{RuleTransformer, RewriteRule}
+    toXml.map(new RuleTransformer(new RewriteRule{
+      override def transform(n: Node) = n match {
+        case e: Elem if "body" == e.label => <body>{ns}</body>
+        case _ => n
+      }
+    })).first
+  }
 }
 
 @Searchable
 class Message extends LongKeyedMapper[Message] {
+  import Message._
+  
   def getSingleton = Message // what's the "meta" server
   def primaryKeyField = id
 
@@ -344,16 +408,6 @@
 
   private lazy val originalXml = XML.loadString(text.is)
   
-  def transformBody(ns: NodeSeq) = {
-    import scala.xml.transform.{RuleTransformer, RewriteRule}
-    toXml.map(new RuleTransformer(new RewriteRule{
-      override def transform(n: Node) = n match {
-        case e: Elem if "body" == e.label => <body>{ns}</body>
-        case _ => n
-      }
-    })).first
-  }
-  
   lazy val toXHTML = transformBody(digestedXHTML)
     
   lazy val toPlainTextBody = transformBody(Text(body))
@@ -516,55 +570,4 @@
     tags.map((_, 1)).toList
   }
 
-  def centreWeightedTopNWordFreqs(messages: List[Message], n:Int):List[(String, Float)] = {
-    val weights = compoundStem(messages.flatMap(_.wordFrequencies))
-
-    // Start with the top N tags
-    val sortedWeights = weights.sort(_._2 > _._2).take(n)
-
-    // And create a normalized centre-weighted list, e.g. smallest, small, Larger, BIG, *HUGE*, BIG, Larger, small, smallest
-    TagUtils.normalize(TagUtils.everyEven(sortedWeights).reverse ::: TagUtils.everyOdd(sortedWeights))
-  }
-
-  /**
-   * Stem an incoming string
-   */
-  private val stemmer:PorterStemmer = new PorterStemmer()
-  def stemWord(in: String): String = stemmer.synchronized {
-    stemmer.setCurrent(in)
-    stemmer.stem()
-    stemmer.getCurrent()
-  }
-
-  // Compounds a bunch of (String, Int) elements so that [(String1, Int1), (String2, Int2)] becomes [(StringN, Int1+Int2)]
-  //  if String1 and String2 have the same stem (according to the Porter stemming algorithm). StringN is the shorter of
-  //  String1 and String2
-  private[model] def compoundStem(llsi: List[(String,Int)]): List[(String,Int)] = {
-    val stemCache = llsi.foldLeft[Map[String, String]](Map.empty){
-      case (map, (str, _)) => if (map.contains(str)) map
-        else map + (str -> stemWord(str))
-    }
-    def shortWord(a: String, b: String): String =
-    if (a.length < b.length) a else b
-
-    val stemToWord: Map[String, String] = Map(
-      // create a map from stem to all the words that
-      // stem down to that word
-      stemCache.toList.
-      foldLeft[Map[String, List[String]]](Map.empty){
-        case (map, (word, stem)) =>
-          map + (stem -> (word :: map.getOrElse(stem, Nil)))
-      }.toList.
-      // convert the list of stemmed words to the shortest word
-      // matching the stem
-      map{
-        case (key, value) => (key, value.reduceLeft(shortWord))
-      } :_*)
-
-    llsi.foldLeft[Map[String, Int]](Map.empty){
-      case (map, (str, cnt)) =>
-        val sw = stemCache(str)
-        map + (sw -> (map.getOrElse(sw, 0) + cnt))
-    }.toList.map{ case (stem, cnt) => (stemToWord(stem), cnt)}
-  }
 }