You are viewing a plain text version of this content. The canonical link for it is here.
Posted to pylucene-commits@lucene.apache.org by va...@apache.org on 2010/04/27 22:27:07 UTC

svn commit: r938638 - in /lucene/pylucene/trunk: CHANGES Makefile python/ICUTransformFilter.py test/test_ICUTransformFilter.py

Author: vajda
Date: Tue Apr 27 20:27:07 2010
New Revision: 938638

URL: http://svn.apache.org/viewvc?rev=938638&view=rev
Log:
 - added port of ICUTransformFilter using C++ ICU's Transliterator via PyICU

Added:
    lucene/pylucene/trunk/python/ICUTransformFilter.py   (with props)
    lucene/pylucene/trunk/test/test_ICUTransformFilter.py   (with props)
Modified:
    lucene/pylucene/trunk/CHANGES
    lucene/pylucene/trunk/Makefile

Modified: lucene/pylucene/trunk/CHANGES
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/CHANGES?rev=938638&r1=938637&r2=938638&view=diff
==============================================================================
--- lucene/pylucene/trunk/CHANGES (original)
+++ lucene/pylucene/trunk/CHANGES Tue Apr 27 20:27:07 2010
@@ -5,6 +5,7 @@ Version 3.0.0 ->
  - added wininst target to Makefile
  - added port of ICUNormalizer2Filter using C++ ICU's Normalizer2 via PyICU
  - added port of ICUFoldingFilter using C++ ICU's Normalizer2 via PyICU
+ - added port of ICUTransformFilter using C++ ICU's Transliterator via PyICU
  - PyLucene built with JCC 2.6
  - 
 

Modified: lucene/pylucene/trunk/Makefile
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/Makefile?rev=938638&r1=938637&r2=938638&view=diff
==============================================================================
--- lucene/pylucene/trunk/Makefile (original)
+++ lucene/pylucene/trunk/Makefile Tue Apr 27 20:27:07 2010
@@ -202,7 +202,7 @@ resources: $(LUCENE)/contrib/icu/src/res
 
 $(LUCENE)/contrib/icu/src/resources/org/apache/lucene/analysis/icu/utr30.dat: $(LUCENE)/contrib/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
 	rm -f $@
-	cd $(dir $<); icupkg --type $(ENDIANNESS) --add $(notdir $<) new $(notdir $@)
+	cd $(dir $<); $(ICUPKG) --type $(ENDIANNESS) --add $(notdir $<) new $(notdir $@)
 
 else
 
@@ -239,6 +239,7 @@ GENERATE=$(JCC) $(foreach jar,$(JARS),--
            --module python/collections.py \
            --module python/ICUNormalizer2Filter.py \
            --module python/ICUFoldingFilter.py \
+           --module python/ICUTransformFilter.py \
            $(RESOURCES) \
            --files $(NUM_FILES)
 

Added: lucene/pylucene/trunk/python/ICUTransformFilter.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/python/ICUTransformFilter.py?rev=938638&view=auto
==============================================================================
--- lucene/pylucene/trunk/python/ICUTransformFilter.py (added)
+++ lucene/pylucene/trunk/python/ICUTransformFilter.py Tue Apr 27 20:27:07 2010
@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+# ====================================================================
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+# ====================================================================
+#
+#  Port of java/org/apache/lucene/analysis/icu/ICUTransformFilter.java
+#  using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
+#
+#  A TokenFilter that transforms text with ICU.
+#
+#  ICU provides text-transformation functionality via its Transliteration API.
+#  Although script conversion is its most common use, a Transliterator can
+#  actually perform a more general class of tasks. In fact, Transliterator
+#  defines a very general API which specifies only that a segment of the input
+#  text is replaced by new text. The particulars of this conversion are
+#  determined entirely by subclasses of Transliterator.
+#
+#  Some useful transformations for search are built-in:
+#   - Conversion from Traditional to Simplified Chinese characters
+#   - Conversion from Hiragana to Katakana
+#   - Conversion from Fullwidth to Halfwidth forms.
+#   - Script conversions, for example Serbian Cyrillic to Latin
+#
+#  Example usage: stream = ICUTransformFilter(stream,
+#      Transliterator.createInstance("Traditional-Simplified"))
+#
+#  For more details, see the ICU User Guide at:
+#  http://userguide.icu-project.org/transforms/general
+#
+# ====================================================================
+
+from lucene import PythonTokenFilter, CharTermAttribute
+from icu import Transliterator, UTransPosition
+
+
+class ICUTransformFilter(PythonTokenFilter):
+
+    # Create a new ICUTransformFilter that transforms text on the given
+    # stream.
+    #  
+    #  @param input {@link TokenStream} to filter.
+    #  @param transform Transliterator to transform the text.
+
+    def __init__(self, input, transform):
+
+        super(ICUTransformFilter, self).__init__(input)
+
+        # Reusable position object
+        self.position = UTransPosition()
+
+        # term attribute, will be updated with transformed text.
+        self.termAtt = self.addAttribute(CharTermAttribute.class_)
+
+        self.input = input
+        self.transform = transform
+
+    def incrementToken(self):
+
+        if self.input.incrementToken():
+            text = self.termAtt.toString()
+            length = len(text)
+
+            self.position.start = 0
+            self.position.limit = length
+            self.position.contextStart = 0
+            self.position.contextLimit = length
+
+            text = self.transform.filteredTransliterate(text, self.position,
+                                                        False)
+            self.termAtt.setEmpty()
+            self.termAtt.append(text)
+            
+            return True
+
+        return False

Propchange: lucene/pylucene/trunk/python/ICUTransformFilter.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/python/ICUTransformFilter.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/test/test_ICUTransformFilter.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_ICUTransformFilter.py?rev=938638&view=auto
==============================================================================
--- lucene/pylucene/trunk/test/test_ICUTransformFilter.py (added)
+++ lucene/pylucene/trunk/test/test_ICUTransformFilter.py Tue Apr 27 20:27:07 2010
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+# ====================================================================
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+# ====================================================================
+#
+#  Tests for the port of java/org/apache/lucene/analysis/icu/ICUTransformFilter.java
+#  using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
+
+try:
+    from icu import Transliterator, UTransDirection
+except ImportError, e:
+    pass
+
+from unittest import main
+from BaseTokenStreamTestCase import BaseTokenStreamTestCase
+
+from lucene import *
+from lucene.ICUTransformFilter import ICUTransformFilter
+
+
+class TestICUTransformFilter(BaseTokenStreamTestCase):
+  
+    def _checkToken(self, transform, input, expected):
+
+        ts = ICUTransformFilter(KeywordTokenizer(StringReader(input)),
+                                transform)
+        self._assertTokenStreamContents(ts, [ expected ])
+
+    def _getTransliterator(self, name):
+
+        return Transliterator.createInstance(name, UTransDirection.FORWARD)
+
+    def testBasicFunctionality(self):
+
+        self._checkToken(self._getTransliterator("Traditional-Simplified"), 
+                         u"簡化字", u"简化字")
+        self._checkToken(self._getTransliterator("Katakana-Hiragana"),
+                         u"ヒラガナ", u"ひらがな")
+        self._checkToken(self._getTransliterator("Fullwidth-Halfwidth"), 
+                         u"アルアノリウ", u"アルアノリウ")
+        self._checkToken(self._getTransliterator("Any-Latin"), 
+                         u"Αλφαβητικός Κατάλογος", u"Alphabētikós Katálogos")
+        self._checkToken(self._getTransliterator("NFD; [:Nonspacing Mark:] Remove"), 
+                         u"Alphabētikós Katálogos", u"Alphabetikos Katalogos")
+        self._checkToken(self._getTransliterator("Han-Latin"),
+                         u"中国", u"zhōng guó")
+  
+    def testCustomFunctionality(self):
+
+        # convert a's to b's and b's to c's        
+        rules = "a > b; b > c;"
+        self._checkToken(Transliterator.createFromRules("test", rules, UTransDirection.FORWARD), "abacadaba", "bcbcbdbcb")
+  
+    def testCustomFunctionality2(self):
+        
+        # convert an 'a' preceded by 'c' to 'b'; any other 'a' to 'd'
+        rules = "c { a > b; a > d;"
+        self._checkToken(Transliterator.createFromRules("test", rules, UTransDirection.FORWARD), "caa", "cbd")
+  
+    def testOptimizer2(self):
+
+        self._checkToken(self._getTransliterator("Traditional-Simplified; Lower"),
+                         "ABCDE", "abcde")
+
+
+if __name__ == "__main__":
+    import sys, lucene
+    try:
+        import icu
+    except ImportError:
+        pass
+    else:
+        lucene.initVM()
+        if '-loop' in sys.argv:
+            sys.argv.remove('-loop')
+            while True:
+                try:
+                    main()
+                except:
+                    pass
+        else:
+             main()

Propchange: lucene/pylucene/trunk/test/test_ICUTransformFilter.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/test/test_ICUTransformFilter.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain