You are viewing a plain text version of this content. The canonical link for it is here.
Posted to pylucene-commits@lucene.apache.org by va...@apache.org on 2010/04/27 22:27:07 UTC
svn commit: r938638 - in /lucene/pylucene/trunk: CHANGES Makefile
python/ICUTransformFilter.py test/test_ICUTransformFilter.py
Author: vajda
Date: Tue Apr 27 20:27:07 2010
New Revision: 938638
URL: http://svn.apache.org/viewvc?rev=938638&view=rev
Log:
- added port of ICUTransformFilter using C++ ICU's Transliterator via PyICU
Added:
lucene/pylucene/trunk/python/ICUTransformFilter.py (with props)
lucene/pylucene/trunk/test/test_ICUTransformFilter.py (with props)
Modified:
lucene/pylucene/trunk/CHANGES
lucene/pylucene/trunk/Makefile
Modified: lucene/pylucene/trunk/CHANGES
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/CHANGES?rev=938638&r1=938637&r2=938638&view=diff
==============================================================================
--- lucene/pylucene/trunk/CHANGES (original)
+++ lucene/pylucene/trunk/CHANGES Tue Apr 27 20:27:07 2010
@@ -5,6 +5,7 @@ Version 3.0.0 ->
- added wininst target to Makefile
- added port of ICUNormalizer2Filter using C++ ICU's Normalizer2 via PyICU
- added port of ICUFoldingFilter using C++ ICU's Normalizer2 via PyICU
+ - added port of ICUTransformFilter using C++ ICU's Transliterator via PyICU
- PyLucene built with JCC 2.6
-
Modified: lucene/pylucene/trunk/Makefile
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/Makefile?rev=938638&r1=938637&r2=938638&view=diff
==============================================================================
--- lucene/pylucene/trunk/Makefile (original)
+++ lucene/pylucene/trunk/Makefile Tue Apr 27 20:27:07 2010
@@ -202,7 +202,7 @@ resources: $(LUCENE)/contrib/icu/src/res
$(LUCENE)/contrib/icu/src/resources/org/apache/lucene/analysis/icu/utr30.dat: $(LUCENE)/contrib/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
rm -f $@
- cd $(dir $<); icupkg --type $(ENDIANNESS) --add $(notdir $<) new $(notdir $@)
+ cd $(dir $<); $(ICUPKG) --type $(ENDIANNESS) --add $(notdir $<) new $(notdir $@)
else
@@ -239,6 +239,7 @@ GENERATE=$(JCC) $(foreach jar,$(JARS),--
--module python/collections.py \
--module python/ICUNormalizer2Filter.py \
--module python/ICUFoldingFilter.py \
+ --module python/ICUTransformFilter.py \
$(RESOURCES) \
--files $(NUM_FILES)
Added: lucene/pylucene/trunk/python/ICUTransformFilter.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/python/ICUTransformFilter.py?rev=938638&view=auto
==============================================================================
--- lucene/pylucene/trunk/python/ICUTransformFilter.py (added)
+++ lucene/pylucene/trunk/python/ICUTransformFilter.py Tue Apr 27 20:27:07 2010
@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+# ====================================================================
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ====================================================================
+#
+# Port of java/org/apache/lucene/analysis/icu/ICUTransformFilter.java
+# using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
+#
+# A TokenFilter that transforms text with ICU.
+#
+# ICU provides text-transformation functionality via its Transliteration API.
+# Although script conversion is its most common use, a Transliterator can
+# actually perform a more general class of tasks. In fact, Transliterator
+# defines a very general API which specifies only that a segment of the input
+# text is replaced by new text. The particulars of this conversion are
+# determined entirely by subclasses of Transliterator.
+#
+# Some useful transformations for search are built-in:
+# - Conversion from Traditional to Simplified Chinese characters
+# - Conversion from Hiragana to Katakana
+# - Conversion from Fullwidth to Halfwidth forms.
+# - Script conversions, for example Serbian Cyrillic to Latin
+#
+# Example usage: stream = ICUTransformFilter(stream,
+# Transliterator.createInstance("Traditional-Simplified", UTransDirection.FORWARD))
+#
+# For more details, see the ICU User Guide at:
+# http://userguide.icu-project.org/transforms/general
+#
+# ====================================================================
+
+from lucene import PythonTokenFilter, CharTermAttribute
+from icu import Transliterator, UTransPosition
+
+
+class ICUTransformFilter(PythonTokenFilter):
+ # Token filter that replaces each token's term text with the output of
+ # an ICU Transliterator (PyICU).
+ #
+ # Constructor arguments:
+ # input: the TokenStream to filter.
+ # transform: the Transliterator applied to each term.
+
+ def __init__(self, input, transform):
+
+ super(ICUTransformFilter, self).__init__(input)
+
+ # Position object allocated once here and reset for every token in incrementToken().
+ self.position = UTransPosition()
+
+ # Term attribute; incrementToken() overwrites it with the transformed text.
+ self.termAtt = self.addAttribute(CharTermAttribute.class_)
+
+ self.input = input
+ self.transform = transform
+
+ def incrementToken(self):
+ # Advance the wrapped stream; if it yields a token, transliterate its term text and return True, else return False.
+ if self.input.incrementToken():
+ text = self.termAtt.toString()
+ length = len(text)
+ # Make both the transform range and its context cover the whole term.
+ self.position.start = 0
+ self.position.limit = length
+ self.position.contextStart = 0
+ self.position.contextLimit = length
+ # NOTE(review): False is presumably ICU's 'incremental' flag; also confirm len(text) is in UTF-16 units for non-BMP text.
+ text = self.transform.filteredTransliterate(text, self.position,
+ False)
+ self.termAtt.setEmpty()
+ self.termAtt.append(text)
+
+ return True
+
+ return False
Propchange: lucene/pylucene/trunk/python/ICUTransformFilter.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/python/ICUTransformFilter.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/test/test_ICUTransformFilter.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_ICUTransformFilter.py?rev=938638&view=auto
==============================================================================
--- lucene/pylucene/trunk/test/test_ICUTransformFilter.py (added)
+++ lucene/pylucene/trunk/test/test_ICUTransformFilter.py Tue Apr 27 20:27:07 2010
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+# ====================================================================
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ====================================================================
+#
+# Tests for the port of java/org/apache/lucene/analysis/icu/ICUTransformFilter.java
+# using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
+
+try:
+ from icu import Transliterator, UTransDirection
+except ImportError, e:
+ pass
+
+from unittest import main
+from BaseTokenStreamTestCase import BaseTokenStreamTestCase
+
+from lucene import *
+from lucene.ICUTransformFilter import ICUTransformFilter
+
+
+class TestICUTransformFilter(BaseTokenStreamTestCase):
+
+ def _checkToken(self, transform, input, expected):
+
+ ts = ICUTransformFilter(KeywordTokenizer(StringReader(input)),
+ transform)
+ self._assertTokenStreamContents(ts, [ expected ])
+
+ def _getTransliterator(self, name):
+
+ return Transliterator.createInstance(name, UTransDirection.FORWARD)
+
+ def testBasicFunctionality(self):
+
+ self._checkToken(self._getTransliterator("Traditional-Simplified"),
+ u"ç°¡åå", u"ç®åå")
+ self._checkToken(self._getTransliterator("Katakana-Hiragana"),
+ u"ãã©ã¬ã", u"ã²ãããª")
+ self._checkToken(self._getTransliterator("Fullwidth-Halfwidth"),
+ u"ã¢ã«ã¢ããªã¦", u"ï½±ï¾ï½±ï¾ï¾ï½³")
+ self._checkToken(self._getTransliterator("Any-Latin"),
+ u"ÎλÏαβηÏικÏÏ ÎαÏάλογοÏ", u"AlphabÄtikós Katálogos")
+ self._checkToken(self._getTransliterator("NFD; [:Nonspacing Mark:] Remove"),
+ u"AlphabÄtikós Katálogos", u"Alphabetikos Katalogos")
+ self._checkToken(self._getTransliterator("Han-Latin"),
+ u"ä¸å½", u"zhÅng guó")
+
+ def testCustomFunctionality(self):
+
+ # convert a's to b's and b's to c's
+ rules = "a > b; b > c;"
+ self._checkToken(Transliterator.createFromRules("test", rules, UTransDirection.FORWARD), "abacadaba", "bcbcbdbcb")
+
+ def testCustomFunctionality2(self):
+
+ # convert a's to b's and b's to c's
+ rules = "c { a > b; a > d;"
+ self._checkToken(Transliterator.createFromRules("test", rules, UTransDirection.FORWARD), "caa", "cbd")
+
+ def testOptimizer2(self):
+
+ self._checkToken(self._getTransliterator("Traditional-Simplified; Lower"),
+ "ABCDE", "abcde")
+
+
+if __name__ == "__main__":
+ import sys, lucene
+ try:
+ import icu
+ except ImportError:
+ pass  # icu (PyICU) is not importable: exit quietly without running any test
+ else:
+ lucene.initVM()  # presumably starts the JVM that PyLucene needs -- confirm
+ if '-loop' in sys.argv:
+ sys.argv.remove('-loop')  # strip the flag before main() parses sys.argv
+ while True:  # rerun the suite forever
+ try:
+ main()
+ except:
+ pass  # deliberately swallow everything, including main()'s SystemExit, to keep looping
+ else:
+ main()
Propchange: lucene/pylucene/trunk/test/test_ICUTransformFilter.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/test/test_ICUTransformFilter.py
------------------------------------------------------------------------------
svn:mime-type = text/plain