You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@openoffice.apache.org by hd...@apache.org on 2012/01/23 14:14:56 UTC
svn commit: r1234777 - in /incubator/ooo/trunk/main/i18npool/source/search:
textsearch.cxx textsearch.hxx
Author: hdu
Date: Mon Jan 23 13:14:56 2012
New Revision: 1234777
URL: http://svn.apache.org/viewvc?rev=1234777&view=rev
Log:
emulate word boundary matching of old regex engine
The new ICU regex engine has much-improved Unicode capabilities.
The old regex engine had the extensions \< and \> for matching word boundaries.
For the convenience of a smooth upgrade experience, these artifacts now get mapped to \b, which is supported by almost all regex engines.
Modified:
incubator/ooo/trunk/main/i18npool/source/search/textsearch.cxx
incubator/ooo/trunk/main/i18npool/source/search/textsearch.hxx
Modified: incubator/ooo/trunk/main/i18npool/source/search/textsearch.cxx
URL: http://svn.apache.org/viewvc/incubator/ooo/trunk/main/i18npool/source/search/textsearch.cxx?rev=1234777&r1=1234776&r2=1234777&view=diff
==============================================================================
--- incubator/ooo/trunk/main/i18npool/source/search/textsearch.cxx (original)
+++ incubator/ooo/trunk/main/i18npool/source/search/textsearch.cxx Mon Jan 23 13:14:56 2012
@@ -176,27 +176,8 @@ void TextSearch::setOptions( const Searc
case SearchAlgorithms_REGEXP:
fnForward = &TextSearch::RESrchFrwrd;
fnBackward = &TextSearch::RESrchBkwrd;
-
- {
- sal_uInt32 nIcuSearchFlags = 0;
- // map com::sun::star::util::SearchFlags to ICU uregex.h flags
- // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
- // REG_NEWLINE is neither defined properly nor used anywhere => not implemented
- // REG_NOSUB is not used anywhere => not implemented
- // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
- // LEV_RELAXED is only used for SearchAlgorithm==Approximate
- // why is even ALL_IGNORE_CASE deprecated in UNO? because of transliteration taking care of it???
- if( (aSrchPara.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0)
- nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
- UErrorCode nIcuErr = U_ZERO_ERROR;
- // assumption: transliteration doesn't mangle regexp control chars
- OUString& rPatternStr = (aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK) ? sSrchStr
- : ((aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK) ? sSrchStr2 : aSrchPara.searchString);
- const IcuUniString aIcuSearchPatStr( rPatternStr.getStr(), rPatternStr.getLength());
- pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr);
- if( nIcuErr)
- { delete pRegexMatcher; pRegexMatcher = NULL;}
- } break;
+ RESrchPrepare( aSrchPara);
+ break;
case SearchAlgorithms_APPROXIMATE:
fnForward = &TextSearch::ApproxSrchFrwrd;
@@ -720,6 +701,41 @@ SearchResult TextSearch::NSrchBkwrd( con
return aRet;
}
+/** Prepare the ICU regular-expression matcher for a regexp search.
+ *
+ *  Selects the pattern string according to rOptions.transliterateFlags,
+ *  maps the supported search flags to ICU regex flags, rewrites the legacy
+ *  word-boundary escapes "\<" and "\>" of the old regex engine to "\b",
+ *  and stores the compiled matcher in pRegexMatcher.
+ *  pRegexMatcher is set to NULL when ICU reports a non-zero status.
+ *
+ *  @param rOptions  search options; transliterateFlags, searchFlag and
+ *                   searchString are evaluated here
+ */
+void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions& rOptions)
+{
+    // select the transliterated pattern string
+    const OUString& rPatternStr =
+        (rOptions.transliterateFlags & SIMPLE_TRANS_MASK) ? sSrchStr
+        : ((rOptions.transliterateFlags & COMPLEX_TRANS_MASK) ? sSrchStr2 : rOptions.searchString);
+
+    sal_uInt32 nIcuSearchFlags = 0;
+    // map com::sun::star::util::SearchFlags to ICU uregex.h flags
+    // TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
+    // REG_NEWLINE is neither properly defined nor used anywhere => not implemented
+    // REG_NOSUB is not used anywhere => not implemented
+    // NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
+    // LEV_RELAXED is only used for SearchAlgorithm==Approximate
+    // why is even ALL_IGNORE_CASE deprecated in UNO? because of transliteration taking care of it???
+    if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0)
+        nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
+    UErrorCode nIcuErr = U_ZERO_ERROR;
+    // assumption: transliteration didn't mangle regexp control chars
+    IcuUniString aIcuSearchPatStr( rPatternStr.getStr(), rPatternStr.getLength());
+#if 1
+    // for convenience specific syntax elements of the old regex engine are emulated
+    // by using regular word boundary matching \b to replace \< and \>
+    // NOTE: each backslash has to be written four times in the literals below:
+    // the C++ compiler halves them once, and the ICU pattern syntax (resp. the
+    // ICU replacement-text syntax) halves them again. With only two backslashes
+    // the pattern would match every plain '<' or '>' and replace it with a 'b'.
+    static const IcuUniString aChevronPattern( "\\\\<|\\\\>", -1, IcuUniString::kInvariant);
+    static const IcuUniString aChevronReplace( "\\\\b", -1, IcuUniString::kInvariant);
+    static RegexMatcher aChevronMatcher( aChevronPattern, 0, nIcuErr);
+    aChevronMatcher.reset( aIcuSearchPatStr);
+    aIcuSearchPatStr = aChevronMatcher.replaceAll( aChevronReplace, nIcuErr);
+    aChevronMatcher.reset();
+#endif
+    pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr);
+    // any non-zero ICU status discards the matcher again
+    if( nIcuErr)
+    { delete pRegexMatcher; pRegexMatcher = NULL;}
+}
+
//---------------------------------------------------------------------------
SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr,
Modified: incubator/ooo/trunk/main/i18npool/source/search/textsearch.hxx
URL: http://svn.apache.org/viewvc/incubator/ooo/trunk/main/i18npool/source/search/textsearch.hxx?rev=1234777&r1=1234776&r2=1234777&view=diff
==============================================================================
--- incubator/ooo/trunk/main/i18npool/source/search/textsearch.hxx (original)
+++ incubator/ooo/trunk/main/i18npool/source/search/textsearch.hxx Mon Jan 23 13:14:56 2012
@@ -101,6 +101,7 @@ class TextSearch: public cppu::WeakImplH
RESrchBkwrd( const ::rtl::OUString& searchStr,
sal_Int32 startPos, sal_Int32 endPos )
throw(::com::sun::star::uno::RuntimeException);
+ // set up pRegexMatcher from the given search options (implemented in textsearch.cxx)
+ void RESrchPrepare( const ::com::sun::star::util::SearchOptions&);
// Members and methods for the "Weight Levenshtein-Distance" search
int nLimit;