You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xerces.apache.org by db...@apache.org on 2005/07/27 20:56:36 UTC
svn commit: r225575 - in /xerces/c/trunk/src/xercesc/util:
Transcoders/ICU/ICUTransService.cpp regx/RangeToken.cpp
regx/RegularExpression.cpp regx/RegularExpression.hpp regx/RegxParser.cpp
regx/RegxUtil.cpp regx/RegxUtil.hpp
Author: dbertoni
Date: Wed Jul 27 11:56:33 2005
New Revision: 225575
URL: http://svn.apache.org/viewcvs?rev=225575&view=rev
Log:
Fixes for Jira issue XERCESC-1463.
Modified:
xerces/c/trunk/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp
xerces/c/trunk/src/xercesc/util/regx/RangeToken.cpp
xerces/c/trunk/src/xercesc/util/regx/RegularExpression.cpp
xerces/c/trunk/src/xercesc/util/regx/RegularExpression.hpp
xerces/c/trunk/src/xercesc/util/regx/RegxParser.cpp
xerces/c/trunk/src/xercesc/util/regx/RegxUtil.cpp
xerces/c/trunk/src/xercesc/util/regx/RegxUtil.hpp
Modified: xerces/c/trunk/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp
URL: http://svn.apache.org/viewcvs/xerces/c/trunk/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp?rev=225575&r1=225574&r2=225575&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp (original)
+++ xerces/c/trunk/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp Wed Jul 27 11:56:33 2005
@@ -161,7 +161,7 @@
* if we clean up here, users' code may crash
*
#if (U_ICU_VERSION_MAJOR_NUM >= 2)
- // release all lasily allocated data
+ // release all lazily allocated data
u_cleanup();
#endif
*/
@@ -174,29 +174,37 @@
int ICUTransService::compareIString(const XMLCh* const comp1
, const XMLCh* const comp2)
{
- const XMLCh* psz1 = comp1;
- const XMLCh* psz2 = comp2;
+ size_t i = 0;
+ size_t j = 0;
- unsigned int curCount = 0;
- while (true)
+ for(;;)
{
- //
- // If an inequality, then return the difference. Note that the XMLCh
- // might be bigger physically than UChar, but it won't hold anything
- // larger than 0xFFFF, so our cast here will work for both possible
- // sizes of XMLCh.
- //
- if (u_toupper(UChar(*psz1)) != u_toupper(UChar(*psz2)))
- return int(*psz1) - int(*psz2);
+ UChar32 ch1;
+ UChar32 ch2;
- // If either has ended, then they both ended, so equal
- if (!*psz1 || !*psz2)
- break;
+ U16_NEXT_UNSAFE(comp1, i, ch1);
+ U16_NEXT_UNSAFE(comp2, j, ch2);
+
+ const UChar32 folded1 =
+ u_foldCase(ch1, U_FOLD_CASE_DEFAULT);
- // Move upwards for the next round
- psz1++;
- psz2++;
+ const UChar32 folded2 =
+ u_foldCase(ch2, U_FOLD_CASE_DEFAULT);
+
+ if (folded1 !=
+ folded2)
+ {
+ return folded1 - folded2;
+ }
+ else if (ch1 == 0)
+ {
+ // If ch1 is 0, then ch2 must also be
+ // 0. Otherwise, the previous if
+ // would have returned.
+ break;
+ }
}
+
return 0;
}
@@ -205,38 +213,49 @@
, const XMLCh* const comp2
, const unsigned int maxChars)
{
- const XMLCh* psz1 = comp1;
- const XMLCh* psz2 = comp2;
-
- unsigned int curCount = 0;
- while (true)
+ if (maxChars > 0)
{
- //
- // If an inequality, then return the difference. Note that the XMLCh
- // might be bigger physically than UChar, but it won't hold anything
- // larger than 0xFFFF, so our cast here will work for both possible
- // sizes of XMLCh.
- //
- if (u_toupper(UChar(*psz1)) != u_toupper(UChar(*psz2)))
- return int(*psz1) - int(*psz2);
-
- // If either ended, then both ended, so equal
- if (!*psz1 || !*psz2)
- break;
-
- // Move upwards to next chars
- psz1++;
- psz2++;
-
- //
- // Bump the count of chars done. If it equals the count then we
- // are equal for the requested count, so break out and return
- // equal.
- //
- curCount++;
- if (maxChars == curCount)
- break;
+ // Note that this function has somewhat broken semantics, as it's
+ // possible for two strings of different lengths to compare as equal
+ // in a case-insensitive manner, since one character could be
+ // represented as a surrogate pair.
+ size_t i = 0;
+ size_t j = 0;
+
+ for(;;)
+ {
+ UChar32 ch1;
+ UChar32 ch2;
+
+ U16_NEXT_UNSAFE(comp1, i, ch1);
+ U16_NEXT_UNSAFE(comp2, j, ch2);
+
+ const UChar32 folded1 =
+ u_foldCase(ch1, U_FOLD_CASE_DEFAULT);
+
+ const UChar32 folded2 =
+ u_foldCase(ch2, U_FOLD_CASE_DEFAULT);
+
+ if (folded1 != folded2)
+ {
+ return folded1 - folded2;
+ }
+ else if (i == maxChars)
+ {
+ // If we're at the end of both strings, return 0.
+ // Otherwise, we've run out of characters in the
+ // left string, so return -1.
+ return j == maxChars ? 0 : -1;
+ }
+ else if (j == maxChars)
+ {
+ // We've run out of characters in the right string,
+ // but not the left, so return 1.
+ return 1;
+ }
+ }
}
+
return 0;
}
@@ -289,24 +308,59 @@
}
-void ICUTransService::upperCase(XMLCh* const toUpperCase) const
-{
- XMLCh* outPtr = toUpperCase;
- while (*outPtr)
+template <class FunctionType>
+static void
+doCaseConvert(
+ XMLCh* convertString,
+ FunctionType caseFunction)
+{
+ // Note the semantics of this function are broken, since it's
+ // possible that changing the case of a string could increase
+ // its length, but there's no way to handle such a situation.
+ const unsigned int len =
+ XMLString::stringLen(convertString);
+
+ size_t readPos = 0;
+ size_t writePos = 0;
+
+ while(readPos < len)
{
- *outPtr = XMLCh(u_toupper(UChar(*outPtr)));
- outPtr++;
+ UChar32 original;
+
+ // Get the next Unicode code point.
+ U16_NEXT_UNSAFE(convertString, readPos, original);
+
+ // Convert the code point
+ const UChar32 converted = caseFunction(original);
+
+ // OK, now here's where it gets ugly.
+ if (!U_IS_BMP(converted) && U_IS_BMP(original) &&
+ readPos - writePos == 1)
+ {
+ // We do not have room to convert the
+ // character without overwriting the next
+ // character, so we will just stop.
+ break;
+ }
+ else
+ {
+ U16_APPEND_UNSAFE(convertString, writePos, converted);
+ }
}
+
+ convertString[writePos] = 0;
+}
+
+
+
+void ICUTransService::upperCase(XMLCh* const toUpperCase) const
+{
+ doCaseConvert(toUpperCase, u_toupper);
}
void ICUTransService::lowerCase(XMLCh* const toLowerCase) const
{
- XMLCh* outPtr = toLowerCase;
- while (*outPtr)
- {
- *outPtr = XMLCh(u_tolower(UChar(*outPtr)));
- outPtr++;
- }
+ doCaseConvert(toLowerCase, u_tolower);
}
Modified: xerces/c/trunk/src/xercesc/util/regx/RangeToken.cpp
URL: http://svn.apache.org/viewcvs/xerces/c/trunk/src/xercesc/util/regx/RangeToken.cpp?rev=225575&r1=225574&r2=225575&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/regx/RangeToken.cpp (original)
+++ xerces/c/trunk/src/xercesc/util/regx/RangeToken.cpp Wed Jul 27 11:56:33 2005
@@ -27,6 +27,12 @@
#include <xercesc/util/regx/TokenFactory.hpp>
#include <xercesc/util/IllegalArgumentException.hpp>
+#if defined(XML_USE_ICU_TRANSCODER) || defined (XML_USE_UNICONV390_TRANSCODER)
+#include <unicode/uchar.h>
+#else
+#include <xercesc/util/XMLUniDefs.hpp>
+#endif
+
XERCES_CPP_NAMESPACE_BEGIN
// ---------------------------------------------------------------------------
@@ -66,15 +72,55 @@
// ---------------------------------------------------------------------------
RangeToken* RangeToken::getCaseInsensitiveToken(TokenFactory* const tokFactory) {
- // REVIST
- // We will not build a token with case insenstive ranges
- // For now we will return a copy of ourselves.
if (fCaseIToken == 0 && tokFactory) {
bool isNRange = (getTokenType() == T_NRANGE) ? true : false;
RangeToken* lwrToken = tokFactory->createRange(isNRange);
+ for (unsigned int i = 0; i < fElemCount - 1; i += 2) {
+ for (XMLInt32 ch = fRanges[i]; ch <= fRanges[i + 1]; ++ch) {
+#if defined(XML_USE_ICU_TRANSCODER) || defined (XML_USE_UNICONV390_TRANSCODER)
+ const XMLInt32 upperCh = u_toupper(ch);
+
+ if (upperCh != ch)
+ {
+ lwrToken->addRange(upperCh, upperCh);
+ }
+
+ const XMLInt32 lowerCh = u_tolower(ch);
+
+ if (lowerCh != ch)
+ {
+ lwrToken->addRange(lowerCh, lowerCh);
+ }
+
+ const XMLInt32 titleCh = u_totitle(ch);
+
+ if (titleCh != ch && titleCh != upperCh)
+ {
+ lwrToken->addRange(titleCh, titleCh);
+ }
+#else
+ if (ch >= chLatin_A && ch <= chLatin_Z)
+ {
+ ch += chLatin_a - chLatin_A;
+
+ lwrToken->addRange(ch, ch);
+ }
+ else if (ch >= chLatin_a && ch <= chLatin_z)
+ {
+ ch -= chLatin_a - chLatin_A;
+
+ lwrToken->addRange(ch, ch);
+ }
+#endif
+ }
+ }
+
lwrToken->mergeRanges(this);
+ lwrToken->compactRanges();
+ lwrToken->createMap();
+
fCaseIToken = lwrToken;
}
@@ -259,6 +305,7 @@
}
fElemCount = rangeTok->fElemCount;
+ fSorted = true;
return;
}
Modified: xerces/c/trunk/src/xercesc/util/regx/RegularExpression.cpp
URL: http://svn.apache.org/viewcvs/xerces/c/trunk/src/xercesc/util/regx/RegularExpression.cpp?rev=225575&r1=225574&r2=225575&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/regx/RegularExpression.cpp (original)
+++ xerces/c/trunk/src/xercesc/util/regx/RegularExpression.cpp Wed Jul 27 11:56:33 2005
@@ -23,7 +23,6 @@
// ---------------------------------------------------------------------------
#include <xercesc/util/regx/RegularExpression.hpp>
#include <xercesc/util/PlatformUtils.hpp>
-#include <xercesc/util/regx/RegxUtil.hpp>
#include <xercesc/util/regx/Match.hpp>
#include <xercesc/util/regx/RangeToken.hpp>
#include <xercesc/util/regx/RegxDefs.hpp>
@@ -36,6 +35,7 @@
#include <xercesc/util/OutOfMemoryException.hpp>
#include <xercesc/util/XMLInitializer.hpp>
#include <xercesc/util/XMLRegisterCleanup.hpp>
+#include <xercesc/util/XMLUniDefs.hpp>
XERCES_CPP_NAMESPACE_BEGIN
@@ -69,6 +69,55 @@
+bool RegularExpression::matchIgnoreCase(const XMLInt32 ch1,
+ const XMLInt32 ch2)
+{
+ if (ch1 >= 0x10000)
+ {
+ XMLCh string1[2];
+ XMLCh string2[2];
+
+ RegxUtil::decomposeToSurrogates(ch1, string1[0], string1[1]);
+
+ if (ch2 >= 0x10000)
+ {
+ RegxUtil::decomposeToSurrogates(ch2, string2[0], string2[1]);
+ }
+ else
+ {
+ // XMLString::compareNIString is broken, because it assumes the
+ // two strings must be of the same length. Note that two strings
+ // of different length could compare as equal, because there is no
+ // guarantee that a Unicode code point that is encoded in UTF-16 as
+ // a surrogate pair does not have a case mapping to a code point
+ // that is not in the surrogate range. Just to be safe, we pad the
+ // shorter string with a space, which cannot have a case mapping.
+ string2[0] = (XMLCh)ch2;
+ string2[1] = chSpace;
+ }
+
+ return (0==XMLString::compareNIString(string1, string2, 2));
+ }
+ else if (ch2 >= 0x10000)
+ {
+ const XMLCh string1[2] = { (XMLCh)ch1, chSpace };
+ XMLCh string2[2];
+
+ RegxUtil::decomposeToSurrogates(ch2, string2[0], string2[1]);
+
+ return (0==XMLString::compareNIString(string1, string2, 2));
+ }
+ else
+ {
+ const XMLCh char1 = (XMLCh)ch1;
+ const XMLCh char2 = (XMLCh)ch2;
+
+ return (0==XMLString::compareNIString(&char1, &char2, 1));
+ }
+ }
+
+
+
// ---------------------------------------------------------------------------
// RegularExpression::Context: Constructors and Destructor
// ---------------------------------------------------------------------------
@@ -540,11 +589,6 @@
if (!range->match(ch)) {
- if (!ignoreCase)
- continue;
-
- // Perform case insensitive match
- // REVISIT
continue;
}
@@ -1098,21 +1142,10 @@
bool match = false;
if (ignoreCase) {
-
- //REVISIT we should match ignoring case, but for now
- //we will do a normal match
- //tok = tok->getCaseInsensitiveToken();
- //if (!token->match(strCh)) {
-
- // if (strCh > 0x10000)
- // return -1;
- // Do case insensitive matching - uppercase match
- // or lowercase match
- //}
- match = tok->match(strCh);
+ tok = tok->getCaseInsensitiveToken(fTokenFactory);
}
- else
- match = tok->match(strCh);
+
+ match = tok->match(strCh);
if (!match)
return false;
@@ -1498,7 +1531,12 @@
}
rangeTok->createMap();
- }
+
+ if (isSet(fOptions, IGNORE_CASE))
+ {
+ rangeTok->getCaseInsensitiveToken(fTokenFactory);
+ }
+ }
if (fOperations != 0 && fOperations->getNextOp() == 0 &&
(fOperations->getOpType() == Op::O_STRING ||
Modified: xerces/c/trunk/src/xercesc/util/regx/RegularExpression.hpp
URL: http://svn.apache.org/viewcvs/xerces/c/trunk/src/xercesc/util/regx/RegularExpression.hpp?rev=225575&r1=225574&r2=225575&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/regx/RegularExpression.hpp (original)
+++ xerces/c/trunk/src/xercesc/util/regx/RegularExpression.hpp Wed Jul 27 11:56:33 2005
@@ -33,6 +33,7 @@
#include <xercesc/util/regx/ModifierToken.hpp>
#include <xercesc/util/regx/ConditionToken.hpp>
#include <xercesc/util/regx/OpFactory.hpp>
+#include <xercesc/util/regx/RegxUtil.hpp>
XERCES_CPP_NAMESPACE_BEGIN
@@ -147,6 +148,8 @@
static void
staticCleanup();
+ static bool isSet(const int options, const int flag);
+
private:
// -----------------------------------------------------------------------
// Private data types
@@ -195,7 +198,6 @@
// -----------------------------------------------------------------------
void prepare();
int parseOptions(const XMLCh* const options);
- bool isSet(const int options, const int flag);
unsigned short getWordType(const XMLCh* const target, const int begin,
const int end, const int offset);
unsigned short getCharType(const XMLCh ch);
@@ -604,14 +606,6 @@
return ret;
}
-
- inline bool RegularExpression::matchIgnoreCase(const XMLInt32 ch1,
- const XMLInt32 ch2)
- {
-
- return (0==XMLString::compareNIString((const XMLCh*)&ch1,(const XMLCh*)&ch2, 1));
- }
-
XERCES_CPP_NAMESPACE_END
Modified: xerces/c/trunk/src/xercesc/util/regx/RegxParser.cpp
URL: http://svn.apache.org/viewcvs/xerces/c/trunk/src/xercesc/util/regx/RegxParser.cpp?rev=225575&r1=225574&r2=225575&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/regx/RegxParser.cpp (original)
+++ xerces/c/trunk/src/xercesc/util/regx/RegxParser.cpp Wed Jul 27 11:56:33 2005
@@ -1175,6 +1175,15 @@
tok->sortRanges();
tok->compactRanges();
+
+ // If the case-insensitive option is enabled, we need to
+ // have the new RangeToken instance build its internal
+ // case-insensitive RangeToken.
+ if (RegularExpression::isSet(fOptions, RegularExpression::IGNORE_CASE))
+ {
+ tok->getCaseInsensitiveToken(fTokenFactory);
+ }
+
setParseContext(S_NORMAL);
processNext();
Modified: xerces/c/trunk/src/xercesc/util/regx/RegxUtil.cpp
URL: http://svn.apache.org/viewcvs/xerces/c/trunk/src/xercesc/util/regx/RegxUtil.cpp?rev=225575&r1=225574&r2=225575&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/regx/RegxUtil.cpp (original)
+++ xerces/c/trunk/src/xercesc/util/regx/RegxUtil.cpp Wed Jul 27 11:56:33 2005
@@ -31,10 +31,9 @@
XMLCh* pszStr = (XMLCh*) manager->allocate(3 * sizeof(XMLCh));//new XMLCh[3];
- ch -= 0x10000;
- pszStr[0] = XMLCh((ch >> 10) + 0xD800);
- pszStr[1] = XMLCh((ch & 0x03FF) + 0xDC00);
- pszStr[2] = chNull;
+ decomposeToSurrogates(ch, pszStr[0], pszStr[1]);
+
+ pszStr[2] = chNull;
return pszStr;
}
Modified: xerces/c/trunk/src/xercesc/util/regx/RegxUtil.hpp
URL: http://svn.apache.org/viewcvs/xerces/c/trunk/src/xercesc/util/regx/RegxUtil.hpp?rev=225575&r1=225574&r2=225575&view=diff
==============================================================================
--- xerces/c/trunk/src/xercesc/util/regx/RegxUtil.hpp (original)
+++ xerces/c/trunk/src/xercesc/util/regx/RegxUtil.hpp Wed Jul 27 11:56:33 2005
@@ -44,6 +44,8 @@
static bool isWordChar(const XMLCh);
static bool isLowSurrogate(const XMLCh ch);
static bool isHighSurrogate(const XMLCh ch);
+ static void decomposeToSurrogates(XMLInt32 ch, XMLCh& high, XMLCh& low);
+
static XMLCh* decomposeToSurrogates(XMLInt32 ch,
MemoryManager* const manager);
static XMLCh* stripExtendedComment(const XMLCh* const expression,
@@ -76,6 +78,13 @@
inline bool RegxUtil::isHighSurrogate(const XMLCh ch) {
return (ch & 0xFC00) == 0xD800;
+}
+
+inline void RegxUtil::decomposeToSurrogates(XMLInt32 ch, XMLCh& high, XMLCh& low) {
+
+ ch -= 0x10000;
+ high = XMLCh((ch >> 10) + 0xD800);
+ low = XMLCh((ch & 0x03FF) + 0xDC00);
}
inline bool RegxUtil::isWordChar(const XMLCh ch) {
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@xerces.apache.org
For additional commands, e-mail: commits-help@xerces.apache.org