You are viewing a plain text version of this content. The canonical link for it is here.
Posted to pylucene-commits@lucene.apache.org by va...@apache.org on 2010/04/23 01:43:50 UTC

svn commit: r937108 - in /lucene/pylucene/trunk: CHANGES Makefile python/ICUFoldingFilter.py test/test_ICUFoldingFilter.py test/test_PositionIncrement.py

Author: vajda
Date: Thu Apr 22 23:43:49 2010
New Revision: 937108

URL: http://svn.apache.org/viewvc?rev=937108&view=rev
Log:
 - added port of ICUFoldingFilter using C++ ICU's Normalizer2 via PyICU

Added:
    lucene/pylucene/trunk/python/ICUFoldingFilter.py   (with props)
    lucene/pylucene/trunk/test/test_ICUFoldingFilter.py   (with props)
Modified:
    lucene/pylucene/trunk/CHANGES
    lucene/pylucene/trunk/Makefile
    lucene/pylucene/trunk/test/test_PositionIncrement.py

Modified: lucene/pylucene/trunk/CHANGES
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/CHANGES?rev=937108&r1=937107&r2=937108&view=diff
==============================================================================
--- lucene/pylucene/trunk/CHANGES (original)
+++ lucene/pylucene/trunk/CHANGES Thu Apr 22 23:43:49 2010
@@ -4,6 +4,8 @@ Version 3.0.0 ->
  - improved support for building on Windows with mingw32
  - added wininst target to Makefile
  - added port of ICUNormalizer2Filter using C++ ICU's Normalizer2 via PyICU
+ - added port of ICUFoldingFilter using C++ ICU's Normalizer2 via PyICU
+ - PyLucene built with JCC 2.6
  - 
 
 Version 2.9.0 -> 3.0.0

Modified: lucene/pylucene/trunk/Makefile
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/Makefile?rev=937108&r1=937107&r2=937108&view=diff
==============================================================================
--- lucene/pylucene/trunk/Makefile (original)
+++ lucene/pylucene/trunk/Makefile Thu Apr 22 23:43:49 2010
@@ -143,6 +143,7 @@ MEMORY_JAR=$(LUCENE)/build/contrib/memor
 QUERIES_JAR=$(LUCENE)/build/contrib/queries/lucene-queries-$(LUCENE_VER).jar
 EXTENSIONS_JAR=build/jar/extensions.jar
 
+ICUPKG:=$(shell which icupkg)
 
 .PHONY: generate compile install default all clean realclean \
 	sources test jars distrib
@@ -191,6 +192,27 @@ JARS=$(LUCENE_JAR) $(ANALYZERS_JAR) \
 
 jars: $(JARS)
 
+.PHONY: resources
+ifneq ($(ICUPKG),)
+# icupkg was located above: ship contrib/icu's resources via jcc --resources.
+RESOURCES=--resources $(LUCENE)/contrib/icu/src/resources
+ENDIANNESS:=$(shell $(PYTHON) -c "import struct; print struct.pack('h', 1) == '\000\001' and 'b' or 'l'")
+
+resources: $(LUCENE)/contrib/icu/src/resources/org/apache/lucene/analysis/icu/utr30.dat
+# Package utr30.nrm into utr30.dat; '&&' stops if cd fails, $(ICUPKG) is the tool detected above.
+$(LUCENE)/contrib/icu/src/resources/org/apache/lucene/analysis/icu/utr30.dat: $(LUCENE)/contrib/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
+	rm -f $@
+	cd $(dir $<) && $(ICUPKG) --type $(ENDIANNESS) --add $(notdir $<) new $(notdir $@)
+
+else
+# icupkg was not found: pass no --resources flag and only print a notice.
+RESOURCES=
+
+resources:
+	@echo ICU not installed
+
+endif
+
 GENERATE=$(JCC) $(foreach jar,$(JARS),--jar $(jar)) \
            --package java.lang java.lang.System \
                                java.lang.Runtime \
@@ -216,6 +238,8 @@ GENERATE=$(JCC) $(foreach jar,$(JARS),--
            --version $(LUCENE_VER) \
            --module python/collections.py \
            --module python/ICUNormalizer2Filter.py \
+           --module python/ICUFoldingFilter.py \
+           $(RESOURCES) \
            --files $(NUM_FILES)
 
 generate: jars
@@ -233,7 +257,7 @@ bdist: jars
 wininst: jars
 	$(GENERATE) --wininst
 
-all: sources jars compile
+all: sources jars resources compile
 	@echo build of $(PYLUCENE_LIB) complete
 
 clean:

Added: lucene/pylucene/trunk/python/ICUFoldingFilter.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/python/ICUFoldingFilter.py?rev=937108&view=auto
==============================================================================
--- lucene/pylucene/trunk/python/ICUFoldingFilter.py (added)
+++ lucene/pylucene/trunk/python/ICUFoldingFilter.py Thu Apr 22 23:43:49 2010
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+# ====================================================================
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+# ====================================================================
+#
+#  Port of java/org/apache/lucene/analysis/icu/ICUFoldingFilter.java
+#  using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
+#
+#  A TokenFilter that applies search term folding to Unicode text,
+#  applying foldings from UTR#30 Character Foldings.
+#
+#  This filter applies the following foldings from the report to unicode text:
+#
+#  Accent removal
+#  Case folding
+#  Canonical duplicates folding
+#  Dashes folding
+#  Diacritic removal (including stroke, hook, descender)
+#  Greek letterforms folding
+#  Han Radical folding
+#  Hebrew Alternates folding
+#  Jamo folding
+#  Letterforms folding
+#  Math symbol folding
+#  Multigraph Expansions: All
+#  Native digit folding
+#  No-break folding
+#  Overline folding
+#  Positional forms folding
+#  Small forms folding
+#  Space folding
+#  Spacing Accents folding
+#  Subscript folding
+#  Superscript folding
+#  Suzhou Numeral folding
+#  Symbol folding
+#  Underline folding
+#  Vertical forms folding
+#  Width folding
+#
+#  Additionally, Default Ignorables are removed, and text is normalized to NFKC.
+#  All foldings, case folding, and normalization mappings are applied
+#  recursively to ensure a fully folded and normalized result.
+#
+# ====================================================================
+
+import os, lucene
+
+from lucene.ICUNormalizer2Filter import ICUNormalizer2Filter
+from icu import ResourceBundle, Normalizer2, UNormalizationMode2
+
+utr30 = os.path.join(lucene.__dir__, 'resources',
+                     'org', 'apache', 'lucene', 'analysis', 'icu',
+                     'utr30.dat')
+ResourceBundle.setAppData("utr30", utr30)
+
+
+class ICUFoldingFilter(ICUNormalizer2Filter):
+    # ICUNormalizer2Filter preconfigured with the "utr30" Normalizer2 instance.
+    def __init__(self, input):
+        # input is handed unchanged to the base filter, together with the normalizer.
+        normalizer = Normalizer2.getInstance("utr30", "utr30",
+                                             UNormalizationMode2.COMPOSE)
+        super(ICUFoldingFilter, self).__init__(input, normalizer)

Propchange: lucene/pylucene/trunk/python/ICUFoldingFilter.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/python/ICUFoldingFilter.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: lucene/pylucene/trunk/test/test_ICUFoldingFilter.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_ICUFoldingFilter.py?rev=937108&view=auto
==============================================================================
--- lucene/pylucene/trunk/test/test_ICUFoldingFilter.py (added)
+++ lucene/pylucene/trunk/test/test_ICUFoldingFilter.py Thu Apr 22 23:43:49 2010
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+# ====================================================================
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+# ====================================================================
+#
+#  Port of java/org/apache/lucene/analysis/icu/ICUFoldingFilter.java
+#  using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
+
+try:
+    from icu import Normalizer2, UNormalizationMode2
+except ImportError, e:
+    pass
+
+from unittest import main
+from BaseTokenStreamTestCase import BaseTokenStreamTestCase
+
+from lucene import *
+from lucene.ICUFoldingFilter import ICUFoldingFilter
+
+
+class TestICUFoldingFilter(BaseTokenStreamTestCase):
+
+    def testDefaults(self):
+
+        class _analyzer(PythonAnalyzer):
+            def tokenStream(_self, fieldName, reader):
+                return ICUFoldingFilter(WhitespaceTokenizer(Version.LUCENE_CURRENT, reader))
+
+        a = _analyzer()
+
+        # case folding
+        self._assertAnalyzesTo(a, "This is a test",
+                               [ "this", "is", "a", "test" ])
+
+        # case folding
+        self._assertAnalyzesTo(a, u"Ruß", [ "russ" ])
+    
+        # case folding with accent removal
+        self._assertAnalyzesTo(a, u"ΜΆΪΟΣ", [ u"μαιοσ" ])
+        self._assertAnalyzesTo(a, u"Μάϊος", [ u"μαιοσ" ])
+
+        # supplementary case folding
+        self._assertAnalyzesTo(a, u"𐐖", [ u"𐐾" ])
+    
+        # normalization
+        self._assertAnalyzesTo(a, u"ﴳﴺﰧ", [ u"طمطمطم" ])
+
+        # removal of default ignorables
+        self._assertAnalyzesTo(a, u"क्‍ष", [ u"कष" ])
+    
+        # removal of latin accents (composed)
+        self._assertAnalyzesTo(a, u"résumé", [ "resume" ])
+    
+        # removal of latin accents (decomposed)
+        self._assertAnalyzesTo(a, u"re\u0301sume\u0301", [ u"resume" ])
+    
+        # fold native digits
+        self._assertAnalyzesTo(a, u"৭০৬", [ "706" ])
+    
+        # ascii-folding-filter type stuff
+        self._assertAnalyzesTo(a, u"đis is cræzy", [ "dis", "is", "craezy" ])
+
+
+if __name__ == "__main__":
+    import sys, lucene
+    try:
+        import icu
+    except ImportError:
+        pass  # PyICU is not importable: run nothing
+    else:
+        lucene.initVM()
+        if '-loop' in sys.argv:
+            sys.argv.remove('-loop')  # hide the flag from unittest's own argv parsing
+            while True:  # rerun the suite indefinitely
+                try:
+                    main()
+                except:
+                    pass  # bare except: ignore whatever main() raises and loop again
+        else:
+             main()

Propchange: lucene/pylucene/trunk/test/test_ICUFoldingFilter.py
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/test/test_ICUFoldingFilter.py
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: lucene/pylucene/trunk/test/test_PositionIncrement.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_PositionIncrement.py?rev=937108&r1=937107&r2=937108&view=diff
==============================================================================
--- lucene/pylucene/trunk/test/test_PositionIncrement.py (original)
+++ lucene/pylucene/trunk/test/test_PositionIncrement.py Thu Apr 22 23:43:49 2010
@@ -42,11 +42,11 @@ class PositionIncrementTestCase(TestCase
                         self_.posIncrAtt.setPositionIncrement(self_.INCREMENTS[self_.i])
                         self_.i += 1
                         return True
-                    def end(self):
+                    def end(self_):
                         pass
-                    def reset(self):
+                    def reset(self_):
                         pass
-                    def close(self):
+                    def close(self_):
                         pass
                 return _tokenStream()