You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@openoffice.apache.org by hd...@apache.org on 2012/01/23 14:14:56 UTC

svn commit: r1234777 - in /incubator/ooo/trunk/main/i18npool/source/search: textsearch.cxx textsearch.hxx

Author: hdu
Date: Mon Jan 23 13:14:56 2012
New Revision: 1234777

URL: http://svn.apache.org/viewvc?rev=1234777&view=rev
Log:
emulate word boundary matching of old regex engine

The new ICU regex engine has much improved Unicode capabilities.
The old regex engine supported the non-standard extensions \< and \> for matching the start and the end of a word.
To keep the upgrade smooth for existing search patterns, these extensions are now mapped to the word boundary \b, which is supported by almost all regex engines.

Modified:
    incubator/ooo/trunk/main/i18npool/source/search/textsearch.cxx
    incubator/ooo/trunk/main/i18npool/source/search/textsearch.hxx

Modified: incubator/ooo/trunk/main/i18npool/source/search/textsearch.cxx
URL: http://svn.apache.org/viewvc/incubator/ooo/trunk/main/i18npool/source/search/textsearch.cxx?rev=1234777&r1=1234776&r2=1234777&view=diff
==============================================================================
--- incubator/ooo/trunk/main/i18npool/source/search/textsearch.cxx (original)
+++ incubator/ooo/trunk/main/i18npool/source/search/textsearch.cxx Mon Jan 23 13:14:56 2012
@@ -176,27 +176,8 @@ void TextSearch::setOptions( const Searc
 		case SearchAlgorithms_REGEXP:
 			fnForward = &TextSearch::RESrchFrwrd;
 			fnBackward = &TextSearch::RESrchBkwrd;
-
-			{
-			sal_uInt32 nIcuSearchFlags = 0;
-			// map com::sun::star::util::SearchFlags to ICU uregex.h flags
-			// TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
-			// REG_NEWLINE is neither defined properly nor used anywhere => not implemented
-			// REG_NOSUB is not used anywhere => not implemented
-			// NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
-			// LEV_RELAXED is only used for SearchAlgorithm==Approximate
-			// why is even ALL_IGNORE_CASE deprecated in UNO? because of transliteration taking care of it???
-			if( (aSrchPara.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0)
-				nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
-			UErrorCode nIcuErr = U_ZERO_ERROR;
-			// assumption: transliteration doesn't mangle regexp control chars
-			OUString& rPatternStr = (aSrchPara.transliterateFlags & SIMPLE_TRANS_MASK) ? sSrchStr
-					: ((aSrchPara.transliterateFlags & COMPLEX_TRANS_MASK) ? sSrchStr2 : aSrchPara.searchString);
-			const IcuUniString aIcuSearchPatStr( rPatternStr.getStr(), rPatternStr.getLength());
-			pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr);
-			if( nIcuErr)
-				{ delete pRegexMatcher; pRegexMatcher = NULL;}
-			} break;
+			RESrchPrepare( aSrchPara);
+			break;
 
 		case SearchAlgorithms_APPROXIMATE:
             fnForward = &TextSearch::ApproxSrchFrwrd;
@@ -720,6 +701,41 @@ SearchResult TextSearch::NSrchBkwrd( con
     return aRet;
 }
 
+void TextSearch::RESrchPrepare( const ::com::sun::star::util::SearchOptions& rOptions)
+{
+	// select the transliterated pattern string
+	const OUString& rPatternStr = 
+		(rOptions.transliterateFlags & SIMPLE_TRANS_MASK) ? sSrchStr
+		: ((rOptions.transliterateFlags & COMPLEX_TRANS_MASK) ? sSrchStr2 : rOptions.searchString);
+
+	sal_uInt32 nIcuSearchFlags = 0;
+	// map com::sun::star::util::SearchFlags to ICU uregex.h flags
+	// TODO: REG_EXTENDED, REG_NOT_BEGINOFLINE, REG_NOT_ENDOFLINE
+	// REG_NEWLINE is neither properly defined nor used anywhere => not implemented
+	// REG_NOSUB is not used anywhere => not implemented
+	// NORM_WORD_ONLY is only used for SearchAlgorithm==Absolute
+	// LEV_RELAXED is only used for SearchAlgorithm==Approximate
+	// why is even ALL_IGNORE_CASE deprecated in UNO? because of transliteration taking care of it???
+	if( (rOptions.searchFlag & com::sun::star::util::SearchFlags::ALL_IGNORE_CASE) != 0)
+		nIcuSearchFlags |= UREGEX_CASE_INSENSITIVE;
+	UErrorCode nIcuErr = U_ZERO_ERROR;
+	// assumption: transliteration didn't mangle regexp control chars
+	IcuUniString aIcuSearchPatStr( rPatternStr.getStr(), rPatternStr.getLength());
+#if 1
+	// for conveniance specific syntax elements of the old regex engine are emulated
+	// by using regular word boundary matching \b to replace \< and \>
+	static const IcuUniString aChevronPattern( "\\<|\\>", -1, IcuUniString::kInvariant);
+	static const IcuUniString aChevronReplace( "\\b", -1, IcuUniString::kInvariant);
+	static RegexMatcher aChevronMatcher( aChevronPattern, 0, nIcuErr);
+	aChevronMatcher.reset( aIcuSearchPatStr);
+	aIcuSearchPatStr = aChevronMatcher.replaceAll( aChevronReplace, nIcuErr);
+	aChevronMatcher.reset();
+#endif
+	pRegexMatcher = new RegexMatcher( aIcuSearchPatStr, nIcuSearchFlags, nIcuErr);
+	if( nIcuErr)
+		{ delete pRegexMatcher; pRegexMatcher = NULL;}
+}
+
 //---------------------------------------------------------------------------
 
 SearchResult TextSearch::RESrchFrwrd( const OUString& searchStr,

Modified: incubator/ooo/trunk/main/i18npool/source/search/textsearch.hxx
URL: http://svn.apache.org/viewvc/incubator/ooo/trunk/main/i18npool/source/search/textsearch.hxx?rev=1234777&r1=1234776&r2=1234777&view=diff
==============================================================================
--- incubator/ooo/trunk/main/i18npool/source/search/textsearch.hxx (original)
+++ incubator/ooo/trunk/main/i18npool/source/search/textsearch.hxx Mon Jan 23 13:14:56 2012
@@ -101,6 +101,7 @@ class TextSearch: public cppu::WeakImplH
 		RESrchBkwrd( const ::rtl::OUString& searchStr,
 								sal_Int32 startPos, sal_Int32 endPos )
 							throw(::com::sun::star::uno::RuntimeException);
+	void RESrchPrepare( const ::com::sun::star::util::SearchOptions&);
 
 	// Members and methods for the "Weight Levenshtein-Distance" search
 	int nLimit;