You are viewing a plain text version of this content. The canonical link for it is here.
Posted to pylucene-commits@lucene.apache.org by va...@apache.org on 2010/11/08 01:13:02 UTC
svn commit: r1032428 -
/lucene/pylucene/branches/branch_3x/test/test_ThaiAnalyzer.py
Author: vajda
Date: Mon Nov 8 00:13:02 2010
New Revision: 1032428
URL: http://svn.apache.org/viewvc?rev=1032428&view=rev
Log:
refreshed test_ThaiAnalyzer.py
Modified:
lucene/pylucene/branches/branch_3x/test/test_ThaiAnalyzer.py
Modified: lucene/pylucene/branches/branch_3x/test/test_ThaiAnalyzer.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/branches/branch_3x/test/test_ThaiAnalyzer.py?rev=1032428&r1=1032427&r2=1032428&view=diff
==============================================================================
--- lucene/pylucene/branches/branch_3x/test/test_ThaiAnalyzer.py (original)
+++ lucene/pylucene/branches/branch_3x/test/test_ThaiAnalyzer.py Mon Nov 8 00:13:02 2010
@@ -14,15 +14,64 @@
# ====================================================================
from unittest import TestCase, main
-from lucene import ThaiAnalyzer, StringReader, Version
+from lucene import ThaiAnalyzer, ThaiWordFilter, StringReader, Version
from BaseTokenStreamTestCase import BaseTokenStreamTestCase
class ThaiAnalyzerTestCase(BaseTokenStreamTestCase):
- def testAnalyzer(self):
+ def testOffsets(self):
+ self.assert_(ThaiWordFilter.DBBI_AVAILABLE,
+ "JRE does not support Thai dictionary-based BreakIterator")
+
+ self._assertAnalyzesTo(ThaiAnalyzer(Version.LUCENE_CURRENT),
+ u"การที่ได้ต้องแสดงว่างานดี",
+ [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
+ u"ว่า", u"งาน", u"ดี" ],
+ [ 0, 3, 6, 9, 13, 17, 20, 23 ],
+ [ 3, 6, 9, 13, 17, 20, 23, 25 ])
+
+ def testTokenType(self):
+ self.assert_(ThaiWordFilter.DBBI_AVAILABLE,
+ "JRE does not support Thai dictionary-based BreakIterator")
+
+ self._assertAnalyzesTo(ThaiAnalyzer(Version.LUCENE_CURRENT),
+ u"การที่ได้ต้องแสดงว่างานดี ๑๒๓",
+ [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
+ u"ว่า", u"งาน", u"ดี", u"๑๒๓" ],
+ None, None,
+ [ "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
+ "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
+ "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
+ "<SOUTHEAST_ASIAN>", "<SOUTHEAST_ASIAN>",
+ "<NUM>" ])
+
+ def testPositionIncrements(self):
+ self.assert_(ThaiWordFilter.DBBI_AVAILABLE,
+ "JRE does not support Thai dictionary-based BreakIterator")
analyzer = ThaiAnalyzer(Version.LUCENE_CURRENT)
+
+ self._assertAnalyzesTo(analyzer, u"การที่ได้ต้อง the แสดงว่างานดี",
+ [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
+ u"ว่า", u"งาน", u"ดี" ],
+ [ 0, 3, 6, 9, 18, 22, 25, 28 ],
+ [ 3, 6, 9, 13, 22, 25, 28, 30 ],
+ None,
+ [ 1, 1, 1, 1, 2, 1, 1, 1 ])
+
+ # case that a stopword is adjacent to thai text, with no whitespace
+ self._assertAnalyzesTo(analyzer, u"การที่ได้ต้องthe แสดงว่างานดี",
+ [ u"การ", u"ที่", u"ได้", u"ต้อง", u"แสดง",
+ u"ว่า", u"งาน", u"ดี" ],
+ [ 0, 3, 6, 9, 17, 21, 24, 27 ],
+ [ 3, 6, 9, 13, 21, 24, 27, 29 ],
+ None,
+ [ 1, 1, 1, 1, 2, 1, 1, 1 ])
+
+ def testAnalyzer30(self):
+
+ analyzer = ThaiAnalyzer(Version.LUCENE_30)
self._assertAnalyzesTo(analyzer, u"", [])