You are viewing a plain text version of this content. The canonical link for it is here.
Posted to pylucene-commits@lucene.apache.org by va...@apache.org on 2010/04/23 01:43:50 UTC
svn commit: r937108 - in /lucene/pylucene/trunk: CHANGES Makefile
python/ICUFoldingFilter.py test/test_ICUFoldingFilter.py
test/test_PositionIncrement.py
Author: vajda
Date: Thu Apr 22 23:43:49 2010
New Revision: 937108
URL: http://svn.apache.org/viewvc?rev=937108&view=rev
Log:
- added port of ICUFoldingFilter using C++ ICU's Normalizer2 via PyICU
Added:
lucene/pylucene/trunk/python/ICUFoldingFilter.py (with props)
lucene/pylucene/trunk/test/test_ICUFoldingFilter.py (with props)
Modified:
lucene/pylucene/trunk/CHANGES
lucene/pylucene/trunk/Makefile
lucene/pylucene/trunk/test/test_PositionIncrement.py
Modified: lucene/pylucene/trunk/CHANGES
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/CHANGES?rev=937108&r1=937107&r2=937108&view=diff
==============================================================================
--- lucene/pylucene/trunk/CHANGES (original)
+++ lucene/pylucene/trunk/CHANGES Thu Apr 22 23:43:49 2010
@@ -4,6 +4,8 @@ Version 3.0.0 ->
- improved support for building on Windows with mingw32
- added wininst target to Makefile
- added port of ICUNormalizer2Filter using C++ ICU's Normalizer2 via PyICU
+ - added port of ICUFoldingFilter using C++ ICU's Normalizer2 via PyICU
+ - PyLucene built with JCC 2.6
-
Version 2.9.0 -> 3.0.0
Modified: lucene/pylucene/trunk/Makefile
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/Makefile?rev=937108&r1=937107&r2=937108&view=diff
==============================================================================
--- lucene/pylucene/trunk/Makefile (original)
+++ lucene/pylucene/trunk/Makefile Thu Apr 22 23:43:49 2010
@@ -143,6 +143,7 @@ MEMORY_JAR=$(LUCENE)/build/contrib/memor
QUERIES_JAR=$(LUCENE)/build/contrib/queries/lucene-queries-$(LUCENE_VER).jar
EXTENSIONS_JAR=build/jar/extensions.jar
+ICUPKG:=$(shell which icupkg)
.PHONY: generate compile install default all clean realclean \
sources test jars distrib
@@ -191,6 +192,27 @@ JARS=$(LUCENE_JAR) $(ANALYZERS_JAR) \
jars: $(JARS)
+
+ifneq ($(ICUPKG),)
+
+RESOURCES=--resources $(LUCENE)/contrib/icu/src/resources
+ENDIANNESS:=$(shell $(PYTHON) -c "import struct; print struct.pack('h', 1) == '\000\001' and 'b' or 'l'")
+
+resources: $(LUCENE)/contrib/icu/src/resources/org/apache/lucene/analysis/icu/utr30.dat
+
+$(LUCENE)/contrib/icu/src/resources/org/apache/lucene/analysis/icu/utr30.dat: $(LUCENE)/contrib/icu/src/resources/org/apache/lucene/analysis/icu/utr30.nrm
+ rm -f $@
+ cd $(dir $<); icupkg --type $(ENDIANNESS) --add $(notdir $<) new $(notdir $@)
+
+else
+
+RESOURCES=
+
+resources:
+ @echo ICU not installed
+
+endif
+
GENERATE=$(JCC) $(foreach jar,$(JARS),--jar $(jar)) \
--package java.lang java.lang.System \
java.lang.Runtime \
@@ -216,6 +238,8 @@ GENERATE=$(JCC) $(foreach jar,$(JARS),--
--version $(LUCENE_VER) \
--module python/collections.py \
--module python/ICUNormalizer2Filter.py \
+ --module python/ICUFoldingFilter.py \
+ $(RESOURCES) \
--files $(NUM_FILES)
generate: jars
@@ -233,7 +257,7 @@ bdist: jars
wininst: jars
$(GENERATE) --wininst
-all: sources jars compile
+all: sources jars resources compile
@echo build of $(PYLUCENE_LIB) complete
clean:
Added: lucene/pylucene/trunk/python/ICUFoldingFilter.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/python/ICUFoldingFilter.py?rev=937108&view=auto
==============================================================================
--- lucene/pylucene/trunk/python/ICUFoldingFilter.py (added)
+++ lucene/pylucene/trunk/python/ICUFoldingFilter.py Thu Apr 22 23:43:49 2010
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+# ====================================================================
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ====================================================================
+#
+# Port of java/org/apache/lucene/analysis/icu/ICUFoldingFilter.java
+# using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
+#
+# A TokenFilter that applies search term folding to Unicode text,
+# applying foldings from UTR#30 Character Foldings.
+#
+# This filter applies the following foldings from the report to unicode text:
+#
+# Accent removal
+# Case folding
+# Canonical duplicates folding
+# Dashes folding
+# Diacritic removal (including stroke, hook, descender)
+# Greek letterforms folding
+# Han Radical folding
+# Hebrew Alternates folding
+# Jamo folding
+# Letterforms folding
+# Math symbol folding
+# Multigraph Expansions: All
+# Native digit folding
+# No-break folding
+# Overline folding
+# Positional forms folding
+# Small forms folding
+# Space folding
+# Spacing Accents folding
+# Subscript folding
+# Superscript folding
+# Suzhou Numeral folding
+# Symbol folding
+# Underline folding
+# Vertical forms folding
+# Width folding
+#
+# Additionally, Default Ignorables are removed, and text is normalized to NFKC.
+# All foldings, case folding, and normalization mappings are applied
+# recursively to ensure a fully folded and normalized result.
+#
+# ====================================================================
+
+import os, lucene
+
+from lucene.ICUNormalizer2Filter import ICUNormalizer2Filter
+from icu import ResourceBundle, Normalizer2, UNormalizationMode2
+
+utr30 = os.path.join(lucene.__dir__, 'resources',
+ 'org', 'apache', 'lucene', 'analysis', 'icu',
+ 'utr30.dat')
+ResourceBundle.setAppData("utr30", utr30)
+
+
+class ICUFoldingFilter(ICUNormalizer2Filter):
+
+ def __init__(self, input):
+
+ normalizer = Normalizer2.getInstance("utr30", "utr30",
+ UNormalizationMode2.COMPOSE)
+ super(ICUFoldingFilter, self).__init__(input, normalizer)
Propchange: lucene/pylucene/trunk/python/ICUFoldingFilter.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/python/ICUFoldingFilter.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: lucene/pylucene/trunk/test/test_ICUFoldingFilter.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_ICUFoldingFilter.py?rev=937108&view=auto
==============================================================================
--- lucene/pylucene/trunk/test/test_ICUFoldingFilter.py (added)
+++ lucene/pylucene/trunk/test/test_ICUFoldingFilter.py Thu Apr 22 23:43:49 2010
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+# ====================================================================
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ====================================================================
+#
+# Port of java/org/apache/lucene/analysis/icu/ICUFoldingFilter.java
+# using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
+
+try:
+ from icu import Normalizer2, UNormalizationMode2
+except ImportError, e:
+ pass
+
+from unittest import main
+from BaseTokenStreamTestCase import BaseTokenStreamTestCase
+
+from lucene import *
+from lucene.ICUFoldingFilter import ICUFoldingFilter
+
+
+class TestICUFoldingFilter(BaseTokenStreamTestCase):
+
+ def testDefaults(self):
+
+ class _analyzer(PythonAnalyzer):
+ def tokenStream(_self, fieldName, reader):
+ return ICUFoldingFilter(WhitespaceTokenizer(Version.LUCENE_CURRENT, reader))
+
+ a = _analyzer()
+
+ # case folding
+ self._assertAnalyzesTo(a, "This is a test",
+ [ "this", "is", "a", "test" ])
+
+ # case folding
+ self._assertAnalyzesTo(a, u"Ruß", [ "russ" ])
+
+ # case folding with accent removal
+ self._assertAnalyzesTo(a, u"ΜΆΪΟΣ", [ u"μαιοσ" ])
+ self._assertAnalyzesTo(a, u"Μάϊος", [ u"μαιοσ" ])
+
+ # supplementary case folding
+ self._assertAnalyzesTo(a, u"𐐖", [ u"𐐾" ])
+
+ # normalization
+ self._assertAnalyzesTo(a, u"ﴳﴺﰧ", [ u"طمطمطم" ])
+
+ # removal of default ignorables
+ self._assertAnalyzesTo(a, u"क्‍ष", [ u"कष" ])
+
+ # removal of latin accents (composed)
+ self._assertAnalyzesTo(a, u"résumé", [ "resume" ])
+
+ # removal of latin accents (decomposed)
+ self._assertAnalyzesTo(a, u"re\u0301sume\u0301", [ u"resume" ])
+
+ # fold native digits
+ self._assertAnalyzesTo(a, u"৭০৬", [ "706" ])
+
+ # ascii-folding-filter type stuff
+ self._assertAnalyzesTo(a, u"đis is cræzy", [ "dis", "is", "craezy" ])
+
+
+if __name__ == "__main__":
+ import sys, lucene
+ try:
+ import icu
+ except ImportError:
+ pass
+ else:
+ lucene.initVM()
+ if '-loop' in sys.argv:
+ sys.argv.remove('-loop')
+ while True:
+ try:
+ main()
+ except:
+ pass
+ else:
+ main()
Propchange: lucene/pylucene/trunk/test/test_ICUFoldingFilter.py
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/pylucene/trunk/test/test_ICUFoldingFilter.py
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified: lucene/pylucene/trunk/test/test_PositionIncrement.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_PositionIncrement.py?rev=937108&r1=937107&r2=937108&view=diff
==============================================================================
--- lucene/pylucene/trunk/test/test_PositionIncrement.py (original)
+++ lucene/pylucene/trunk/test/test_PositionIncrement.py Thu Apr 22 23:43:49 2010
@@ -42,11 +42,11 @@ class PositionIncrementTestCase(TestCase
self_.posIncrAtt.setPositionIncrement(self_.INCREMENTS[self_.i])
self_.i += 1
return True
- def end(self):
+ def end(self_):
pass
- def reset(self):
+ def reset(self_):
pass
- def close(self):
+ def close(self_):
pass
return _tokenStream()