You are viewing a plain text version of this content. The canonical link for it is here.
Posted to pylucene-commits@lucene.apache.org by va...@apache.org on 2010/11/08 01:13:02 UTC

svn commit: r1032428 - /lucene/pylucene/branches/branch_3x/test/test_ThaiAnalyzer.py

Author: vajda
Date: Mon Nov  8 00:13:02 2010
New Revision: 1032428

URL: http://svn.apache.org/viewvc?rev=1032428&view=rev
Log:
refreshed test_ThaiAnalyzer.py

Modified:
    lucene/pylucene/branches/branch_3x/test/test_ThaiAnalyzer.py

Modified: lucene/pylucene/branches/branch_3x/test/test_ThaiAnalyzer.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/branches/branch_3x/test/test_ThaiAnalyzer.py?rev=1032428&r1=1032427&r2=1032428&view=diff
==============================================================================
--- lucene/pylucene/branches/branch_3x/test/test_ThaiAnalyzer.py (original)
+++ lucene/pylucene/branches/branch_3x/test/test_ThaiAnalyzer.py Mon Nov  8 00:13:02 2010
@@ -14,15 +14,64 @@
 # ====================================================================
 
 from unittest import TestCase, main
-from lucene import ThaiAnalyzer, StringReader, Version
+from lucene import ThaiAnalyzer, ThaiWordFilter, StringReader, Version
 from BaseTokenStreamTestCase import BaseTokenStreamTestCase
 
 
 class ThaiAnalyzerTestCase(BaseTokenStreamTestCase):
 
-    def testAnalyzer(self):
+    def testOffsets(self):
+        self.assert_(ThaiWordFilter.DBBI_AVAILABLE,
+                     "JRE does not support Thai dictionary-based BreakIterator")
+
+        self._assertAnalyzesTo(ThaiAnalyzer(Version.LUCENE_CURRENT),
+                               u"การที่ได้ต้องแสดงว่างานดี", 
+                               [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
+                                 u"ว่า", u"งาน", u"ดี" ],
+                               [ 0, 3, 6, 9, 13, 17, 20, 23 ],
+                               [ 3, 6, 9, 13, 17, 20, 23, 25 ])
+
+    def testTokenType(self):
+        self.assert_(ThaiWordFilter.DBBI_AVAILABLE,
+                     "JRE does not support Thai dictionary-based BreakIterator")
+
+        self._assertAnalyzesTo(ThaiAnalyzer(Version.LUCENE_CURRENT),
+                               u"การที่ได้ต้องแสดงว่างานดี ๑๒๓", 
+                               [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
+                                 u"ว่า", u"งาน", u"ดี", u"๑๒๓" ],
+                               None, None,
+                               [ "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>", 
+                                 "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>", 
+                                 "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
+                                 "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
+                                 "<NUM>" ])
+
+    def testPositionIncrements(self):
+        self.assert_(ThaiWordFilter.DBBI_AVAILABLE,
+                     "JRE does not support Thai dictionary-based BreakIterator")
 
         analyzer = ThaiAnalyzer(Version.LUCENE_CURRENT)
+
+        self._assertAnalyzesTo(analyzer, u"การที่ได้ต้อง the แสดงว่างานดี", 
+                               [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
+                                 u"ว่า", u"งาน", u"ดี" ],
+                               [ 0, 3, 6, 9, 18, 22, 25, 28 ],
+                               [ 3, 6, 9, 13, 22, 25, 28, 30 ],
+                               None,
+                               [ 1, 1, 1, 1, 2, 1, 1, 1 ])
+	 
+        # case that a stopword is adjacent to thai text, with no whitespace
+        self._assertAnalyzesTo(analyzer, u"การที่ได้ต้องthe แสดงว่างานดี", 
+                               [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
+                                 u"ว่า", u"งาน", u"ดี" ],
+                               [ 0, 3, 6, 9, 17, 21, 24, 27 ],
+                               [ 3, 6, 9, 13, 21, 24, 27, 29 ],
+                               None,
+                               [ 1, 1, 1, 1, 2, 1, 1, 1 ])
+
+    def testAnalyzer30(self):
+
+        analyzer = ThaiAnalyzer(Version.LUCENE_30)
     
         self._assertAnalyzesTo(analyzer, u"", [])