You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by ea...@apache.org on 2007/02/03 18:19:14 UTC
svn commit: r503265 [2/6] - /incubator/uima/uimacpp/trunk/src/test/src/
Added: incubator/uima/uimacpp/trunk/src/test/src/doc_buffer.cpp
URL: http://svn.apache.org/viewvc/incubator/uima/uimacpp/trunk/src/test/src/doc_buffer.cpp?view=auto&rev=503265
==============================================================================
--- incubator/uima/uimacpp/trunk/src/test/src/doc_buffer.cpp (added)
+++ incubator/uima/uimacpp/trunk/src/test/src/doc_buffer.cpp Sat Feb 3 09:19:12 2007
@@ -0,0 +1,240 @@
+/** @name doc_buffer.cpp
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+
+-------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------- */
+/* Include dependencies */
+/* ----------------------------------------------------------------------- */
+
+#include "uima/doc_buffer.hpp"
+
+#include "uima/ccsid.hpp"
+#include "uima/macros.h"
+#include "uima/macros.h"
+#include "uima/trace.hpp"
+#include "uima/cp2ucnvrt.hpp"
+#include "uima/comp_ids.h"
+#include "uima/err_ids.h"
+#include "uima/msg.h"
+
+#include <algorithm>
+#include <new>
+
+/* ----------------------------------------------------------------------- */
+/* Constants */
+/* ----------------------------------------------------------------------- */
+
+#define UIMA_DOC_BUFFER_RESERVE_SIZE (64 * 1024)
+
+/* ----------------------------------------------------------------------- */
+/* Forward declarations */
+/* ----------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------- */
+/* Types / Classes */
+/* ----------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------- */
+/* Private */
+/* ----------------------------------------------------------------------- */
+
+namespace uima {
+
+ /* Converts uDocPartSizeInBytes bytes of cpacDocPartText with crclConverter and */
+ /* appends the result behind the text collected so far. The buffer is enlarged */
+ /* first when the size estimate does not fit. Throws std::bad_alloc when the    */
+ /* larger block cannot be obtained; buffer and length are then left unchanged.  */
+ void DocBuffer::addDocPartImp(const char * cpacDocPartText,
+                               size_t uDocPartSizeInBytes,
+                               CodePage2UnicodeConverter & crclConverter)
+ /* ----------------------------------------------------------------------- */
+ {
+   assert(EXISTS(cpacDocPartText));
+   assert(uDocPartSizeInBytes > 0);
+   /////assert(crclConverter.isSupported());
+   //ee uEstPartSizeRequired = crclConverter.getMaximumLength(cpacDocPartText, uDocPartSizeInBytes);
+   //ee - use the safest estimate
+   const size_t uEstPartSizeRequired = sizeof(UChar) * uDocPartSizeInBytes;
+   const size_t uCurrentSize = iv_uLength * sizeof(UChar); /* bytes already in use */
+   const size_t uEstNewSize = uCurrentSize + uEstPartSizeRequired;
+   UIMA_TPRINT("input: uDocPartSizeInBytes: " << uDocPartSizeInBytes);
+   UIMA_TPRINT(" uEstPartSizeRequired: " << uEstPartSizeRequired);
+   UIMA_TPRINT(" uEstNewSize: " << uEstNewSize);
+   UIMA_TPRINT("uCurrentSize: " << uCurrentSize);
+   UIMA_TPRINT("iv_uSizeAllocated: " << iv_uSizeAllocated);
+
+   /* resetMemPool() normally provides the initial block; a bigger one is needed */
+   /* when the estimate does not fit (or when no block could be obtained so far) */
+   if (iv_cpw16Document == 0 || uEstNewSize > iv_uSizeAllocated) {
+     const size_t uNewSizeAllocated = uEstNewSize + iv_uMemPoolReserve;
+     UIMA_TPRINT("*** new iv_uSizeAllocated: " << uNewSizeAllocated << "***");
+     UChar * pw16NewDocument = (UChar *) malloc(uNewSizeAllocated);
+     if (pw16NewDocument == 0) {
+       /* no member has been modified yet, so the old content stays usable */
+       throw std::bad_alloc();
+     }
+     /* carry the text collected so far over into the new block */
+     if (uCurrentSize > 0) {
+       memcpy((char *) pw16NewDocument, (const char *) iv_cpw16Document, uCurrentSize);
+     }
+     free((void*)iv_cpw16Document); // Release too-small block
+     iv_cpw16Document = pw16NewDocument;
+     iv_uSizeAllocated = uNewSizeAllocated;
+   }
+
+   UChar * pw16Target = CONST_CAST(UChar *, (iv_cpw16Document + iv_uLength));
+   const size_t uSizeAvailable = iv_uSizeAllocated - uCurrentSize;
+   assert(EXISTS(pw16Target));
+   assert(uSizeAvailable > 0);
+   UIMA_TPRINT("uSizeAvailable: " << uSizeAvailable);
+   /* the converter's return value is used as a byte count: it is divided by */
+   /* sizeof(UChar) before being added to the length                         */
+   const size_t uSizeConverted = crclConverter.convertBytes(pw16Target, uSizeAvailable,
+                                                            cpacDocPartText, uDocPartSizeInBytes);
+   UIMA_TPRINT("uSizeConverted: " << uSizeConverted);
+   iv_uLength += (uSizeConverted / sizeof(UChar));
+   UIMA_TPRINT("new iv_uiLength: " << iv_uLength);
+ }
+
+ /* Sets the text length to 0; allocates the initial block if none is held yet. */
+ void DocBuffer::resetMemPool(void)
+ {
+   if (iv_cpw16Document == 0) {
+     iv_cpw16Document = (const UChar *) malloc(iv_uMemPoolInitialSize);
+     /* record a capacity only for a block that really exists; after a failed */
+     /* malloc the size stays 0 and isValid() reports false                   */
+     iv_uSizeAllocated = (iv_cpw16Document != 0) ? iv_uMemPoolInitialSize : 0;
+   }
+   assert(EXISTS(iv_cpw16Document));
+   iv_uLength = 0;
+ }
+
+ /* ----------------------------------------------------------------------- */
+ /* Public */
+ /* ----------------------------------------------------------------------- */
+
+ /* Default setup: 100000-byte initial block and the standard growth reserve. */
+ DocBuffer::DocBuffer()
+     : iv_uMemPoolInitialSize(100000), iv_uMemPoolReserve(UIMA_DOC_BUFFER_RESERVE_SIZE),
+       iv_cpw16Document(0), iv_uLength(0), iv_uSizeAllocated(0)
+ /* ----------------------------------------------------------------------- */
+ {
+   init();   /* obtains the initial block via resetMemPool() */
+ }
+
+ /* The memory pool was replaced by a malloc'd buffer: only the initial size */
+ /* is taken from the caller. The second (unnamed) argument is ignored and    */
+ /* the growth reserve is always UIMA_DOC_BUFFER_RESERVE_SIZE.                */
+ DocBuffer::DocBuffer(size_t uMemPoolInitialSize, size_t)
+     : iv_uMemPoolInitialSize(uMemPoolInitialSize),
+       iv_uMemPoolReserve(UIMA_DOC_BUFFER_RESERVE_SIZE),
+       iv_cpw16Document(0), iv_uLength(0), iv_uSizeAllocated(0)
+ /* ----------------------------------------------------------------------- */
+ {
+   init();
+ }
+
+ /* Hands the document buffer back to the C heap. */
+ DocBuffer::~DocBuffer()
+ {
+   /* no null test needed: free() does nothing for a null pointer */
+   free((void*)iv_cpw16Document);
+ }
+
+ /* Common tail of both constructors: delegates to resetMemPool(). */
+ void DocBuffer::init()
+ {
+   this->resetMemPool();
+ }
+
+ /* Tells whether a document buffer is currently held (pointer is non-null). */
+ bool DocBuffer::isValid(void) const
+ {
+   return iv_cpw16Document ? true : false;
+ }
+
+ /* Returns a reference to the characters uIndexBegin..uIndexEnd, both inclusive. */
+ /* Throws ExcDocBuffer (recoverable) for an invalid index or a reversed range.   */
+ UnicodeStringRef DocBuffer::getText(TyDocIndex uIndexBegin,
+                                     TyDocIndex uIndexEnd) const UIMA_THROW(ExcDocBuffer)
+ {
+   assert(EXISTS(iv_cpw16Document));
+   assert(uIndexBegin <= uIndexEnd);
+   /* in case the assert is gone in ship mode */
+   if (!isValidIndex(uIndexBegin)) {
+     UIMA_EXC_THROW_NEW(ExcDocBuffer, UIMA_ERR_DOCUMENT_INVALID_INDEX,
+                        UIMA_MSG_ID_EXC_DOCUMENT_INVALID_IDX,
+                        ErrorMessage(UIMA_MSG_ID_EXCON_DOCUMENT_INVALID_IDX, (unsigned long) uIndexBegin),
+                        ErrorInfo::recoverable);
+   }
+   /* a reversed range is refused as well: the length computed below would be wrong */
+   if (!isValidIndex(uIndexEnd) || uIndexBegin > uIndexEnd) {
+     UIMA_EXC_THROW_NEW(ExcDocBuffer, UIMA_ERR_DOCUMENT_INVALID_INDEX,
+                        UIMA_MSG_ID_EXC_DOCUMENT_INVALID_IDX,
+                        ErrorMessage(UIMA_MSG_ID_EXCON_DOCUMENT_INVALID_IDX, (unsigned long) uIndexEnd),
+                        ErrorInfo::recoverable);
+   }
+   return(UnicodeStringRef(iv_cpw16Document + uIndexBegin, (uIndexEnd - uIndexBegin + 1)));
+ }
+
+ /* Appends a document part using the caller's converter; forwards to addDocPartImp(). */
+ void DocBuffer::addDocPart(const char * cpacDocPartText,
+                            size_t uDocPartSizeInBytes,
+                            CodePage2UnicodeConverter & crclConverter)
+ {
+   /////assert(crclConverter.isSupported());
+   this->addDocPartImp(cpacDocPartText, uDocPartSizeInBytes, crclConverter);
+ }
+
+
+ /* Appends a document part through a temporary converter constructed from crclCCSID. */
+ void DocBuffer::addDocPart(const char * cpacDocPartText, size_t uDocPartSize, const char * crclCCSID)
+ {
+   CodePage2UnicodeConverter clTemporaryConverter(crclCCSID);
+   addDocPartImp(cpacDocPartText, uDocPartSize, clTemporaryConverter);
+ }
+
+
+ /* Appends text supplied as UChar units. It still goes through addDocPartImp(), */
+ /* using a converter constructed for "UTF16_PlatformEndian".                    */
+ void DocBuffer::addDocPart(const UChar * cpclDocPartText,
+                            size_t uDocPartLength)
+ {
+   CodePage2UnicodeConverter clUtf16Converter("UTF16_PlatformEndian");
+   //// assert(clUtf16Converter.getTargetCCSID().isUCS2HostEndian());
+   ////assert(clUtf16Converter.getSourceCCSID().isUCS2HostEndian());
+   ////assert(clUtf16Converter.isSupported());
+   ////assert(clUtf16Converter.isBuiltIn());
+   /* the implementation counts bytes, the caller counts UChar units */
+   const size_t uByteCount = sizeof(UChar) * uDocPartLength;
+   addDocPartImp((const char *) cpclDocPartText, uByteCount, clUtf16Converter);
+ }
+
+ /* Empties the buffer by forwarding to resetMemPool(). */
+ void DocBuffer::reset(void)
+ {
+   this->resetMemPool();
+ }
+
+
+}
+
+/* <EOF> */
+
Propchange: incubator/uima/uimacpp/trunk/src/test/src/doc_buffer.cpp
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/uima/uimacpp/trunk/src/test/src/plugin_annotator_dump.vcproj
URL: http://svn.apache.org/viewvc/incubator/uima/uimacpp/trunk/src/test/src/plugin_annotator_dump.vcproj?view=auto&rev=503265
==============================================================================
--- incubator/uima/uimacpp/trunk/src/test/src/plugin_annotator_dump.vcproj (added)
+++ incubator/uima/uimacpp/trunk/src/test/src/plugin_annotator_dump.vcproj Sat Feb 3 09:19:12 2007
@@ -0,0 +1,147 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+ ProjectType="Visual C++"
+ Version="7.10"
+ Name="plugin_annotator_dump"
+ ProjectGUID="{0BC37546-5E6B-4383-9984-EF348B9FB966}"
+ Keyword="Win32Proj">
+ <Platforms>
+ <Platform
+ Name="Win32"/>
+ </Platforms>
+ <Configurations>
+ <Configuration
+ Name="Debug|Win32"
+ OutputDirectory="$(InputDir)\.."
+ IntermediateDirectory="..\Debug\$(ProjectName)"
+ ConfigurationType="2"
+ CharacterSet="2">
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ AdditionalIncludeDirectories=".;"$(APR_HOME)\include";"$(ICU_HOME)\include";"$(XERCES_HOME)\include";"$(JAVA_HOME)\include";"$(JAVA_HOME)\include\win32";"$(UIMACPP_HOME)\include";"$(UIMACPP_HOME)\include\apr""
+ PreprocessorDefinitions="UIMA_SUPPRESS_TIMING;WIN32;_DEBUG;_WINDOWS;_USRDLL"
+ MinimalRebuild="TRUE"
+ BasicRuntimeChecks="3"
+ RuntimeLibrary="3"
+ UsePrecompiledHeader="0"
+ WarningLevel="3"
+ Detect64BitPortabilityProblems="TRUE"
+ DebugInformationFormat="3"/>
+ <Tool
+ Name="VCCustomBuildTool"/>
+ <Tool
+ Name="VCLinkerTool"
+ AdditionalDependencies="icuuc.lib libapr$(APR_VER).lib uimaD.lib"
+ OutputFile="$(OutDir)\libdump.dll"
+ LinkIncremental="1"
+ AdditionalLibraryDirectories=""$(OutDir)";"$(APR_HOME)\Debug";"$(XERCES_HOME)\lib";"$(ICU_HOME)\lib";"$(UIMACPP_HOME)\lib""
+ GenerateDebugInformation="TRUE"
+ ProgramDatabaseFile="$(OutDir)/$(TargetName).pdb"
+ SubSystem="2"
+ ImportLibrary="$(OutDir)/libdump.lib"
+ TargetMachine="1"/>
+ <Tool
+ Name="VCMIDLTool"/>
+ <Tool
+ Name="VCPostBuildEventTool"/>
+ <Tool
+ Name="VCPreBuildEventTool"/>
+ <Tool
+ Name="VCPreLinkEventTool"/>
+ <Tool
+ Name="VCResourceCompilerTool"/>
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"/>
+ <Tool
+ Name="VCXMLDataGeneratorTool"/>
+ <Tool
+ Name="VCWebDeploymentTool"/>
+ <Tool
+ Name="VCManagedWrapperGeneratorTool"/>
+ <Tool
+ Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
+ </Configuration>
+ <Configuration
+ Name="Release|Win32"
+ OutputDirectory="$(InputDir)\.."
+ IntermediateDirectory="..\Release\$(ProjectName)"
+ ConfigurationType="2"
+ CharacterSet="2">
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="3"
+ AdditionalIncludeDirectories=".;"$(APR_HOME)\include";"$(ICU_HOME)\include";"$(XERCES_HOME)\include";"$(JAVA_HOME)\include";"$(JAVA_HOME)\include\win32";"$(UIMACPP_HOME)\include";"$(UIMACPP_HOME)\include\apr""
+ PreprocessorDefinitions="UIMA_SUPPRESS_TIMING;TRACEOFF;NDEBUG;WIN32;_WINDOWS;_USRDLL"
+ MinimalRebuild="FALSE"
+ BasicRuntimeChecks="0"
+ RuntimeLibrary="2"
+ UsePrecompiledHeader="0"
+ WarningLevel="3"
+ Detect64BitPortabilityProblems="TRUE"
+ DebugInformationFormat="3"/>
+ <Tool
+ Name="VCCustomBuildTool"/>
+ <Tool
+ Name="VCLinkerTool"
+ AdditionalDependencies="icuuc.lib libapr$(APR_VER).lib uima.lib"
+ OutputFile="$(OutDir)\libdump.dll"
+ LinkIncremental="1"
+ AdditionalLibraryDirectories=""$(OutDir)";"$(APR_HOME)\Release";"$(XERCES_HOME)\lib";"$(ICU_HOME)\lib";"$(UIMACPP_HOME)\lib""
+ GenerateDebugInformation="FALSE"
+ ProgramDatabaseFile="$(OutDir)/$(TargetName).pdb"
+ SubSystem="2"
+ OptimizeReferences="2"
+ EnableCOMDATFolding="2"
+ ImportLibrary="$(OutDir)/libdump.lib"
+ TargetMachine="1"/>
+ <Tool
+ Name="VCMIDLTool"/>
+ <Tool
+ Name="VCPostBuildEventTool"/>
+ <Tool
+ Name="VCPreBuildEventTool"/>
+ <Tool
+ Name="VCPreLinkEventTool"/>
+ <Tool
+ Name="VCResourceCompilerTool"/>
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"/>
+ <Tool
+ Name="VCXMLDataGeneratorTool"/>
+ <Tool
+ Name="VCWebDeploymentTool"/>
+ <Tool
+ Name="VCManagedWrapperGeneratorTool"/>
+ <Tool
+ Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
+ </Configuration>
+ </Configurations>
+ <References>
+ </References>
+ <Files>
+ <Filter
+ Name="Source Files"
+ Filter="cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx"
+ UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}">
+ <File
+ RelativePath="annotator_dump.cpp">
+ </File>
+ </Filter>
+ <Filter
+ Name="Header Files"
+ Filter="h;hpp;hxx;hm;inl;inc;xsd"
+ UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}">
+ <File
+ RelativePath="include\annotator_dump.hpp">
+ </File>
+ </Filter>
+ <Filter
+ Name="Resource Files"
+ Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx"
+ UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}">
+ </Filter>
+ </Files>
+ <Globals>
+ </Globals>
+</VisualStudioProject>
Propchange: incubator/uima/uimacpp/trunk/src/test/src/plugin_annotator_dump.vcproj
------------------------------------------------------------------------------
svn:eol-style = CRLF
Added: incubator/uima/uimacpp/trunk/src/test/src/plugin_tokenizer.vcproj
URL: http://svn.apache.org/viewvc/incubator/uima/uimacpp/trunk/src/test/src/plugin_tokenizer.vcproj?view=auto&rev=503265
==============================================================================
--- incubator/uima/uimacpp/trunk/src/test/src/plugin_tokenizer.vcproj (added)
+++ incubator/uima/uimacpp/trunk/src/test/src/plugin_tokenizer.vcproj Sat Feb 3 09:19:12 2007
@@ -0,0 +1,156 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+ ProjectType="Visual C++"
+ Version="7.10"
+ Name="plugin_tokenizer"
+ ProjectGUID="{05B599CC-FBF4-4D21-8DAF-4DCAE35A116A}"
+ Keyword="Win32Proj">
+ <Platforms>
+ <Platform
+ Name="Win32"/>
+ </Platforms>
+ <Configurations>
+ <Configuration
+ Name="Debug|Win32"
+ OutputDirectory="$(InputDir)\.."
+ IntermediateDirectory="..\Debug\$(ProjectName)"
+ ConfigurationType="2"
+ CharacterSet="2">
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ AdditionalIncludeDirectories=".;"$(APR_HOME)\include";"$(ICU_HOME)\include";"$(XERCES_HOME)\include";"$(JAVA_HOME)\include";"$(JAVA_HOME)\include\win32";"$(UIMACPP_HOME)\include";"$(UIMACPP_HOME)\include\apr""
+ PreprocessorDefinitions="UIMA_SUPPRESS_TIMING;WIN32;_DEBUG;_WINDOWS;_USRDLL"
+ MinimalRebuild="TRUE"
+ BasicRuntimeChecks="3"
+ RuntimeLibrary="3"
+ UsePrecompiledHeader="0"
+ WarningLevel="3"
+ Detect64BitPortabilityProblems="TRUE"
+ DebugInformationFormat="3"/>
+ <Tool
+ Name="VCCustomBuildTool"/>
+ <Tool
+ Name="VCLinkerTool"
+ AdditionalDependencies="icuuc.lib uimaD.lib libapr$(APR_VER).lib"
+ OutputFile="$(OutDir)\libtoknz.dll"
+ LinkIncremental="1"
+ AdditionalLibraryDirectories=""$(OutDir)";"$(APR_HOME)\Debug";"$(XERCES_HOME)\lib";"$(ICU_HOME)\lib";"$(UIMACPP_HOME)\lib""
+ GenerateDebugInformation="TRUE"
+ ProgramDatabaseFile="$(OutDir)/$(TargetName).pdb"
+ SubSystem="2"
+ ImportLibrary="$(OutDir)/libitutoknz.lib"
+ TargetMachine="1"/>
+ <Tool
+ Name="VCMIDLTool"/>
+ <Tool
+ Name="VCPostBuildEventTool"/>
+ <Tool
+ Name="VCPreBuildEventTool"/>
+ <Tool
+ Name="VCPreLinkEventTool"/>
+ <Tool
+ Name="VCResourceCompilerTool"/>
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"/>
+ <Tool
+ Name="VCXMLDataGeneratorTool"/>
+ <Tool
+ Name="VCWebDeploymentTool"/>
+ <Tool
+ Name="VCManagedWrapperGeneratorTool"/>
+ <Tool
+ Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
+ </Configuration>
+ <Configuration
+ Name="Release|Win32"
+ OutputDirectory="$(InputDir)\.."
+ IntermediateDirectory="..\Release\$(ProjectName)"
+ ConfigurationType="2"
+ CharacterSet="2">
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="3"
+ AdditionalIncludeDirectories=".;"$(APR_HOME)\include";"$(ICU_HOME)\include";"$(XERCES_HOME)\include";"$(JAVA_HOME)\include";"$(JAVA_HOME)\include\win32";"$(UIMACPP_HOME)\include";"$(UIMACPP_HOME)\include\apr""
+ PreprocessorDefinitions="UIMA_SUPPRESS_TIMING;TRACEOFF;NDEBUG;WIN32;_WINDOWS;_USRDLL"
+ MinimalRebuild="FALSE"
+ BasicRuntimeChecks="0"
+ RuntimeLibrary="2"
+ UsePrecompiledHeader="0"
+ WarningLevel="3"
+ Detect64BitPortabilityProblems="TRUE"
+ DebugInformationFormat="3"/>
+ <Tool
+ Name="VCCustomBuildTool"/>
+ <Tool
+ Name="VCLinkerTool"
+ AdditionalDependencies="icuuc.lib uima.lib"
+ OutputFile="$(OutDir)\libtoknz.dll"
+ LinkIncremental="1"
+ AdditionalLibraryDirectories=""$(OutDir)";"$(APR_HOME)\Release";"$(XERCES_HOME)\lib";"$(ICU_HOME)\lib";"$(UIMACPP_HOME)\lib""
+ GenerateDebugInformation="FALSE"
+ ProgramDatabaseFile="$(OutDir)/$(TargetName).pdb"
+ SubSystem="2"
+ OptimizeReferences="2"
+ EnableCOMDATFolding="2"
+ ImportLibrary="$(OutDir)/libitutoknz.lib"
+ TargetMachine="1"/>
+ <Tool
+ Name="VCMIDLTool"/>
+ <Tool
+ Name="VCPostBuildEventTool"/>
+ <Tool
+ Name="VCPreBuildEventTool"/>
+ <Tool
+ Name="VCPreLinkEventTool"/>
+ <Tool
+ Name="VCResourceCompilerTool"/>
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"/>
+ <Tool
+ Name="VCXMLDataGeneratorTool"/>
+ <Tool
+ Name="VCWebDeploymentTool"/>
+ <Tool
+ Name="VCManagedWrapperGeneratorTool"/>
+ <Tool
+ Name="VCAuxiliaryManagedWrapperGeneratorTool"/>
+ </Configuration>
+ </Configurations>
+ <References>
+ </References>
+ <Files>
+ <Filter
+ Name="Source Files"
+ Filter="cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx"
+ UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}">
+ <File
+ RelativePath="annotator_tok.cpp">
+ </File>
+ <File
+ RelativePath="ss_tokenizer.cpp">
+ </File>
+ <File
+ RelativePath="tt_types.cpp">
+ </File>
+ </Filter>
+ <Filter
+ Name="Header Files"
+ Filter="h;hpp;hxx;hm;inl;inc;xsd"
+ UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}">
+ <File
+ RelativePath="include\annotator_tok.hpp">
+ </File>
+ <File
+ RelativePath="include\ss_tokenizer.hpp">
+ </File>
+ </Filter>
+ <Filter
+ Name="Resource Files"
+ Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx"
+ UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}">
+ </Filter>
+ </Files>
+ <Globals>
+ </Globals>
+</VisualStudioProject>
Propchange: incubator/uima/uimacpp/trunk/src/test/src/plugin_tokenizer.vcproj
------------------------------------------------------------------------------
svn:eol-style = CRLF
Added: incubator/uima/uimacpp/trunk/src/test/src/ss_tokenizer.cpp
URL: http://svn.apache.org/viewvc/incubator/uima/uimacpp/trunk/src/test/src/ss_tokenizer.cpp?view=auto&rev=503265
==============================================================================
--- incubator/uima/uimacpp/trunk/src/test/src/ss_tokenizer.cpp (added)
+++ incubator/uima/uimacpp/trunk/src/test/src/ss_tokenizer.cpp Sat Feb 3 09:19:12 2007
@@ -0,0 +1,770 @@
+/**
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+
+-----------------------------------------------------------------------------
+
+ Description: A Unicode Tokenizer
+
+-------------------------------------------------------------------------- */
+
+
+
+/* ----------------------------------------------------------------------- */
+/* Include dependencies */
+/* ----------------------------------------------------------------------- */
+// must be the first include file to suppress silly compiler warnings
+#include "uima/pragmas.hpp"
+#include "uima/assertmsg.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "unicode/uchar.h"
+
+#include "uima/ss_tokenizer.hpp"
+
+#include "uima/language.hpp"
+#include "uima/resmgr.hpp"
+#include "uima/err_ids.h"
+#include "uima/msg.h"
+
+namespace uima {
+
+ static TyCharmap gs_cauiCharMapWard = {
+ /*
+ * character map for unicode character
+ * The table is made up in "ward" tables. A "ward" is the first
+ * byte in a unicode character.
+ * Characters with ward 0 are the same as in codepage 819 (ISO 8859-1)
+ */
+ // WARD 0 (start 0x000)
+ { /* 0x01, 0x02 required for masking, leave part of token! */
+ CH_SPC, CH_USC, CH_USC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, /* 00-07 ' ' */
+ CH_SPC, CH_BLK, CH_NWL, CH_SPC, CH_SPC, CH_BLK, CH_SPC, CH_SPC, /* 08-0F ' ' */
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, /* 10-17 ' ' */
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, /* 18-1F ' ' */
+ CH_BLK, CH_SND, CH_SPC, CH_SPC, CH_CUR, CH_SPC, CH_SPC, CH_APS, /* 20-27 ' !"#$%&'' */
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_NSP, CH_CWS, CH_PRD, CH_CWS, /* 28-2F '()*+,-./' */
+ CH_NUM, CH_NUM, CH_NUM, CH_NUM, CH_NUM, CH_NUM, CH_NUM, CH_NUM, /* 30-37 '01234567' */
+ CH_NUM, CH_NUM, CH_CWS, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SND, /* 38-3F '89:;<=>?' */
+ CH_CWS, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, /* 40-47 '@ABCDEFG' */
+ CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, /* 48-4F 'HIJKLMNO' */
+ CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, /* 50-57 'PQRSTUVW' */
+ CH_UPR, CH_UPR, CH_UPR, CH_SPC, CH_CWS, CH_SPC, CH_SPC, CH_USC, /* 58-5F 'XYZ[\]^_' */
+ CH_APS, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, /* 60-67 '`abcdefg' */
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, /* 68-6F 'hijklmno' */
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, /* 70-77 'pqrstuvw' */
+ CH_LWR, CH_LWR, CH_LWR, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, /* 78-7F 'xyz ' */
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, /* 80-87 ' ' */
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, /* 88-8F ' ' */
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, /* 90-97 ' ' */
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, /* 98-9F ' ' */
+ CH_BLK, CH_SND, CH_SPC, CH_CUR, CH_CUR, CH_CUR, CH_SPC, CH_SPC, /* A0-A7 ' ' */
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, /* A8-AF ' ' */
+ CH_CUR, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, /* B0-B7 '° ' */
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SND, /* B8-BF ' ' */
+ CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, /* C0-C7 ' ' */
+ CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, /* C8-CF ' ' */
+ CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_SPC, /* D0-D7 ' ' */
+ CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_LWR, /* D8-DF ' ' */
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, /* E0-E7 ' ' */
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, /* E8-EF ' ' */
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_SPC, /* F0-F7 ' ' */
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR /* F8-FF ' ' */
+ },
+
+ // WARD 1 (start 0x010)
+ {
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, /* 00-07 */
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, /* 08-0F */
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, /* 10-17 */
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, /* 18-1F */
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, /* 20-27 */
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, /* 28-2F */
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, /* 30-37 */
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, /* 38-3F */
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, /* 40-47 */
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, /* 48-4F */
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, /* 50-57 */
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, /* 58-5F */
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, /* 60-67 */
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, /* 68-6F */
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, /* 70-77 */
+ CH_UPR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_LWR, /* 78-7F */
+ CH_LWR, CH_UPR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_UPR, /* 80-87 */
+ CH_LWR, CH_UPR, CH_UPR, CH_UPR, CH_LWR, CH_LWR, CH_UPR, CH_UPR, /* 88-8F */
+ CH_UPR, CH_UPR, CH_LWR, CH_UPR, CH_UPR, CH_LWR, CH_UPR, CH_UPR, /* 90-97 */
+ CH_UPR, CH_LWR, CH_LWR, CH_LWR, CH_UPR, CH_UPR, CH_LWR, CH_UPR, /* 98-9F */
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_LWR, CH_UPR, /* A0-A7 */
+ CH_LWR, CH_UPR, CH_LWR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_UPR, /* A8-AF */
+ CH_LWR, CH_UPR, CH_UPR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, /* B0-B7 */
+ CH_UPR, CH_LWR, CH_LWR, CH_LWR, CH_UPR, CH_LWR, CH_LWR, CH_LWR, /* B8-BF */
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_UPR, CH_UPR, CH_LWR, CH_UPR, /* C0-C7 */
+ CH_UPR, CH_LWR, CH_UPR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, /* C8-CF */
+ CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, /* D0-D7 */
+ CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_LWR, CH_UPR, CH_LWR, /* D8-DF */
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, /* E0-E7 */
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, /* E8-EF */
+ CH_LWR, CH_UPR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_UPR, /* F0-F7 */
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR /* F8-FF */
+ },
+ // WARD 2 (start 0x020)
+ {
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, /* 00-07 */
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, /* 08-0F */
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, /* 10-17 */
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, /* 18-1F */
+ CH_SPC, CH_SPC, CH_UPR, CH_LWR, CH_SPC, CH_SPC, CH_SPC, CH_SPC, /* 20-27 */
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, /* 28-2F */
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, /* 30-37 */
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, /* 38-3F */
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, /* 40-47 */
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, /* 48-4F */
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, /* 50-57 */
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, /* 58-5F */
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, /* 60-67 */
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, /* 68-6F */
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, /* 70-77 */
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, /* 78-7F */
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, /* 80-87 */
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, /* 88-8F */
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, /* 90-97 */
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, /* 98-9F */
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, /* A0-A7 */
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_SPC, CH_SPC, /* A8-AF */
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, /* B0-B7 */
+ CH_LWR, CH_SPC, CH_SPC, CH_SPC, CH_APS, CH_SPC, CH_SPC, CH_SPC, /* B8-BF */
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, /* C0-C7 */
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, /* C8-CF */
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_CWS, /* D0-D7 */
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, /* D8-DF */
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_SPC, CH_SPC, CH_SPC, CH_SPC, /* E0-E7 */
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_APS, CH_SPC, /* E8-EF */
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, /* F0-F7 */
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC /* F8-FF */
+ },
+ // WARD 3 (start 0x030)
+ {
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 00-07
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 08-0f
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 10-17
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 18-1f
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 20-27
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 28-2f
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 30-37
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 38-3f
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 40-47
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 48-4f
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 50-57
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 58-5f
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 60-67
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 68-6f
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 70-77
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SND, CH_SPC, // 78-7f
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_UPR, CH_SPC, // 80-87
+ CH_UPR, CH_UPR, CH_UPR, CH_SPC, CH_UPR, CH_SPC, CH_UPR, CH_UPR, // 88-8f
+ CH_LWR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, // 90-97
+ CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, // 98-9f
+ CH_UPR, CH_UPR, CH_SPC, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, // a0-a7
+ CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // a8-af
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // b0-b7
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // b8-bf
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // c0-c7
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_SPC, // c8-cf
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // d0-d7
+ CH_SPC, CH_SPC, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // d8-df
+ CH_LWR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, // e0-e7
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, // e8-ef
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // f0-f7
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC // f8-ff
+ },
+ // WARD 4 (0x040)
+ {
+ CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, // 00-07
+ CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, // 08-0f
+ CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, // 10-17
+ CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, // 18-1f
+ CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, // 20-27
+ CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, // 28-2f
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // 30-37
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // 38-3f
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // 40-47
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // 48-4f
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // 50-57
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // 58-5f
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, // 60-67
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, // 68-6f
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, // 70-77
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, // 78-7f
+ CH_UPR, CH_LWR, CH_CUR, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 80-87
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_UPR, CH_LWR, CH_UPR, CH_LWR, // 88-8f
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, // 90-97
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, // 98-9f
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, // a0-a7
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, // a8-af
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, // b0-b7
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, // b8-bf
+ CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_SPC, CH_SPC, CH_UPR, // c0-c7
+ CH_LWR, CH_SPC, CH_SPC, CH_UPR, CH_LWR, CH_SPC, CH_SPC, CH_SPC, // c8-cf
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, // d0-d7
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, // d8-df
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, // e0-e7
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, // e8-ef
+ CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_UPR, CH_LWR, CH_SPC, CH_SPC, // f0-f7
+ CH_UPR, CH_LWR, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC // f8-ff
+ },
+ // WARD 5 (0x050)
+ {
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 00-07
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 08-0f
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 10-17
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 18-1f
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 20-27
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 28-2f
+ CH_SPC, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, // 30-37
+ CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, // 38-3f
+ CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, // 40-47
+ CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, // 48-4f
+ CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_UPR, CH_SPC, // 50-57
+ CH_SPC, CH_LWR, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SND, CH_SPC, // 58-5f
+ CH_SPC, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // 60-67
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // 68-6f
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // 70-77
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // 78-7f
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // 80-87
+ CH_SPC, CH_PRD, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 88-8f
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 90-97
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 98-9f
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // a0-a7
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // a8-af
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // b0-b7
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // b8-bf
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // c0-c7
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // c8-cf
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // d0-d7
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // d8-df
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // e0-e7
+ CH_LWR, CH_LWR, CH_LWR, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // e8-ef
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // f0-f7
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC // f8-ff
+ },
+ // WARD 6 (0x060)
+ {
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 00-07
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 08-0f
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 10-17
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SND, // 18-1f
+ CH_SPC, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // 20-27
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // 28-2f
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // 30-37
+ CH_LWR, CH_LWR, CH_LWR, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 38-3f
+ CH_SPC, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // 40-47
+ CH_LWR, CH_LWR, CH_LWR, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 48-4f
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 50-57
+ CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_SPC, // 58-5f
+ CH_NUM, CH_NUM, CH_NUM, CH_NUM, CH_NUM, CH_NUM, CH_NUM, CH_NUM, // 60-67
+ CH_NUM, CH_NUM, CH_SPC, CH_SPC, CH_NUM, CH_SPC, CH_SPC, CH_SPC, // 68-6f
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // 70-77
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // 78-7f
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // 80-87
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // 88-8f
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // 90-97
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // 98-9f
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // a0-a7
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // a8-af
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // b0-b7
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // b8-bf
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // c0-c7
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // c8-cf
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_PRD, CH_LWR, CH_LWR, CH_LWR, // d0-d7
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_SPC, CH_SPC, CH_LWR, // d8-df
+ CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, CH_LWR, // e0-e7
+ CH_LWR, CH_SPC, CH_SPC, CH_SPC, CH_SPC, CH_LWR, CH_SPC, CH_SPC, // e8-ef
+ CH_NUM, CH_NUM, CH_NUM, CH_NUM, CH_NUM, CH_NUM, CH_NUM, CH_NUM, // f0-f7
+ CH_NUM, CH_NUM, CH_LWR, CH_LWR, CH_LWR, CH_SPC, CH_SPC, CH_SPC // f8-ff
+ }
+ };
+
+
+
+
+ /**********************************************************************/
+ /* */
+ /* A P I Function */
+ /* */
+ /**********************************************************************/
+
+//lint -save -e909 : Implicit conversion from enum/pointer to bool
+
+ // Creates a tokenizer that starts out sharing the built-in character class
+ // table. The global static table is never modified; a writable private copy
+ // is only created once setCharClass() is called.
+ Tokenizer::Tokenizer() :
+     iv_bUseAlternateTerritories(true),
+     iv_pauiCharMapWard(&gs_cauiCharMapWard) {
+   // layout sanity checks: the map type, the global table and the object
+   // behind the member pointer must all have the same size
+   assert(sizeof(TyCharmap) == ((MAXWARD+1)* 256 * sizeof(unsigned short)));
+   assert(sizeof(gs_cauiCharMapWard) == sizeof(TyCharmap));
+   assert(sizeof(*iv_pauiCharMapWard) == sizeof(gs_cauiCharMapWard));
+ }
+
+ // Gives back a privately allocated character class table (if there is one)
+ // and clears the table pointer.
+ Tokenizer::~Tokenizer() {
+   resetCharClasses();   // frees the writable copy unless the default table is in use
+   iv_pauiCharMapWard = NULL;
+ }
+
+ // Drops all character class overrides: a privately allocated table is freed
+ // and the tokenizer points at the built-in default table again.
+ void Tokenizer::resetCharClasses() {
+   // still using the built-in table: nothing was allocated, nothing to do
+   if (iv_pauiCharMapWard == &gs_cauiCharMapWard) {
+     return;
+   }
+   free(iv_pauiCharMapWard);
+   iv_pauiCharMapWard = &gs_cauiCharMapWard;
+ }
+
+ /**
+  * Overrides the character class of one code point for this tokenizer.
+  *
+  * The first override switches the tokenizer from the shared default table
+  * to a malloc'ed private copy, so the global table is never written to.
+  *
+  * @param uiUnicodeCodePoint  code point to reclassify; code points whose ward
+  *                            (high byte) is above MAXWARD have no table entry
+  *                            and are silently ignored
+  * @param enCharClass         class to store for that code point
+  *
+  * Raises ExcOutOfMemory (via UIMA_EXC_THROW_NEW) if the private copy cannot
+  * be allocated; in that case the tokenizer keeps using the default table.
+  */
+ void Tokenizer::setCharClass(WORD16 uiUnicodeCodePoint, EnCharClass enCharClass)
+ /* ----------------------------------------------------------------------- */
+ {
+   assert(EXISTS(iv_pauiCharMapWard));
+   // This function is called rarely, so it is written for clarity rather than speed
+   const size_t uiWard = uiUnicodeCodePoint/256; // ward = high byte of the code point
+   // The table holds the wards 0..MAXWARD (same bound as in getCharClassInl()).
+   // The former test against sizeof(TyCharmap)/256 compared with a byte count
+   // and therefore let wards beyond the end of the table through.
+   if (uiWard > (size_t)MAXWARD) {
+     return;
+   }
+
+   if (iv_pauiCharMapWard == &gs_cauiCharMapWard) {
+     // allocate memory for the writable copy; keep it in a local until it is
+     // fully initialized so that a failure leaves the member pointer valid
+     TyCharmap * pauiWritableMap = (TyCharmap*)malloc(sizeof(TyCharmap));
+     if (pauiWritableMap == NULL) {
+       UIMA_EXC_THROW_NEW(ExcOutOfMemory,
+       UIMA_ERR_USER_ANNOTATOR_OUT_OF_MEMORY,
+       UIMA_MSG_ID_EXC_OUT_OF_MEMORY,
+       uima::ErrorMessage(UIMA_MSG_ID_EXCON_TOK_ALLOCATING_CHARTABLE),
+       uima::ErrorInfo::unrecoverable);
+     }
+     // copy values from default map
+     memcpy(*pauiWritableMap, gs_cauiCharMapWard, sizeof(TyCharmap));
+     iv_pauiCharMapWard = pauiWritableMap;
+   }
+
+   const size_t uiWardOffset = uiUnicodeCodePoint%256; // position inside the ward = low byte
+   (*iv_pauiCharMapWard)[uiWard][uiWardOffset] = (unsigned short)enCharClass;
+ }
+
+ static const EnCharClass gs_aenIcuCharCat2TokCharClass [U_CHAR_CATEGORY_COUNT+1] = { /*
+  * Translation table from an ICU general category to the tokenizer's own
+  * character class. getCharClassInl() indexes it with the value returned by
+  * u_charType() for characters that are not covered by the ward tables.
+  * The mapping is purely positional: the n-th entry belongs to the ICU
+  * category with the numeric value n, as named in the comment in front of it.
+  * NOTE(review): the numbers in the comments below are the original author's.
+  * In the ICU headers I am aware of, U_GENERAL_OTHER_TYPES is an alias for
+  * value 0 and U_CHAR_CATEGORY_COUNT is 30, which would make the last entry
+  * plain padding - please confirm against unicode/uchar.h.
+  */
+ /** Non-category for unassigned and non-character code points.
+ U_UNASSIGNED = 0, */ CH_SPC,
+ /** Lu U_UPPERCASE_LETTER = 1, */ CH_UPR,
+ /** Ll U_LOWERCASE_LETTER = 2, */ CH_LWR,
+ /** Lt U_TITLECASE_LETTER = 3, */ CH_UPR,
+ /** Lm U_MODIFIER_LETTER = 4, */ CH_USC,
+ /** Lo U_OTHER_LETTER = 5, */ CH_USC,
+ /** Mn U_NON_SPACING_MARK = 6, */ CH_USC,
+ /** Me U_ENCLOSING_MARK = 7, */ CH_USC,
+ /** Mc U_COMBINING_SPACING_MARK = 8, */ CH_USC,
+ /** Nd U_DECIMAL_DIGIT_NUMBER = 9, */ CH_NUM,
+ /** Nl U_LETTER_NUMBER = 10,*/ CH_NUM,
+ /** No U_OTHER_NUMBER = 11,*/ CH_NUM,
+ /** Zs U_SPACE_SEPARATOR = 12,*/ CH_BLK,
+ /** Zl U_LINE_SEPARATOR = 13,*/ CH_NWL,
+ /** Zp U_PARAGRAPH_SEPARATOR = 14,*/ CH_NPA,
+ /** Cc U_CONTROL_CHAR = 15,*/ CH_SPC,
+ /** Cf U_FORMAT_CHAR = 16,*/ CH_SPC,
+ /** Co U_PRIVATE_USE_CHAR = 17,*/ CH_USC,
+ /** Cs U_SURROGATE = 18,*/ CH_USC,
+ /** Pd U_DASH_PUNCTUATION = 19,*/ CH_CWS,
+ /** Ps U_START_PUNCTUATION = 20,*/ CH_SPC,
+ /** Pe U_END_PUNCTUATION = 21,*/ CH_SPC,
+ /** Pc U_CONNECTOR_PUNCTUATION = 22,*/ CH_CWS,
+ /** Po U_OTHER_PUNCTUATION = 23,*/ CH_SPC,
+ /** Sm U_MATH_SYMBOL = 24,*/ CH_SPC,
+ /** Sc U_CURRENCY_SYMBOL = 25,*/ CH_CUR,
+ /** Sk U_MODIFIER_SYMBOL = 26,*/ CH_USC,
+ /** So U_OTHER_SYMBOL = 27,*/ CH_SPC,
+ /** Pi U_INITIAL_PUNCTUATION = 28,*/ CH_SPC,
+ /** Pf U_FINAL_PUNCTUATION = 29,*/ CH_SPC,
+ /** Cn U_GENERAL_OTHER_TYPES = 30,*/ CH_SPC
+ /** One higher than the last enum UCharCategory constant.
+ U_CHAR_CATEGORY_COUNT */
+ };
+
+
+ // Inline lookup of a character's class, used inside this file.
+ // Characters in one of the first MAXWARD+1 wards are resolved through the
+ // tokenizer's ward table; all others are classified by ICU and translated
+ // with gs_aenIcuCharCat2TokCharClass.
+ inline EnCharClass
+ Tokenizer::getCharClassInl( UChar c ) {
+   // the high byte of the character designates the ward
+   const unsigned char ucWard = c >> 8;
+
+   if (ucWard > MAXWARD) {
+     // no ward table for this character: ask ICU for the unicode character
+     // type and map it to our character class
+     assert(u_charType(c) >= 0);
+     assert(u_charType(c) < U_CHAR_CATEGORY_COUNT);
+     return ( gs_aenIcuCharCat2TokCharClass[(UCharCategory)u_charType(c)] );
+   }
+
+   // the low byte is the position inside the ward
+   const unsigned char ucSlot = c & 0xFF;
+   return(EnCharClass) (*iv_pauiCharMapWard)[ucWard][ucSlot];
+ }
+
+ /* Out-of-line entry point to the character class lookup, for callers outside
+    this file (annotator_tok.cpp); simply forwards to getCharClassInl(). */
+ EnCharClass Tokenizer::getCharClass( UChar c ) {
+   const EnCharClass enCharClass = getCharClassInl(c);
+   return enCharClass;
+ }
+
+
+
+ /**********************************************************************/
+ /* */
+ /* A P I Function */
+ /* */
+ /**********************************************************************/
+
+ // Reports one finished token through tokenCallback() and afterwards clears
+ // the caller's per-token state (token properties, new paragraph / new
+ // sentence flags, newline counter) for the next token.
+ // pToken is not used in this function. Always returns 0.
+ inline int Tokenizer::tokenEntry(
+     const UChar *pToken, size_t ulLocation, size_t ulLength,
+     TokenProperties &rclTokenProperties,
+     bool &bNewPara, bool &bNewSent, size_t & rulNewlines) {
+
+   // the callback comes first, it receives the flags as they are right now
+   tokenCallback( ulLocation, ulLength, rclTokenProperties, bNewPara, bNewSent );
+
+   // once the token was sent to UIMA, start over with a clean state:
+   // no newlines counted (even if there was only one) ...
+   rulNewlines = 0;
+   // ... no pending sentence or paragraph break ...
+   bNewSent = false;
+   bNewPara = false;
+   // ... and an empty token class for the next token
+   rclTokenProperties.reset();
+
+   return 0;
+ }
+
+
+ void Tokenizer::process(const UChar *text_start, const UChar *text_end) {
+ assert(EXISTS(text_start));
+ assert(EXISTS(text_end));
+ assert(EXISTS(iv_pauiCharMapWard));
+ /* Splits the text [text_start, text_end] into tokens in one left-to-right
+  * pass and reports every token through tokenEntry().
+  * text_end is inclusive: the loop below also reads and classifies the
+  * character it points to.
+  * While scanning, the function keeps a pointer to the start of the token
+  * being collected (NULL when outside of a token), the properties collected
+  * for it, and the pending "new sentence" / "new paragraph" flags, which are
+  * handed over with the next token that is reported.
+  */
+ //? UString str((UniChar *) text_start, (size_t) (text_end - text_start) + 1);
+ //? cout << ">>> '" << str.prv_asSingleByteString(CCSID(819)) << "' <<<" << endl;
+
+ const UChar *pText = text_start; // current pointer in text
+ const UChar *pWordStart = NULL; // start of current word or NULL
+ // if not in a word
+ bool bNewSent = false; // next Word is in new sentence
+ bool bNewPara = false; // next Word is in new paragraph
+ size_t uiNewlines = 0; // number of subsequent newlines
+ // (two or more indicate a new paragraph, see case CH_NWL)
+ TokenProperties clTokenProperties; // class of current word (e.g. all upper)
+
+ //clTokenProperties.reset()
+
+ while ( pText <= text_end ) { // "<=": text_end addresses the last character
+ EnCharClass charClass = getCharClassInl( *pText );
+ const UChar chTextNext = (pText < text_end) ? *(pText + 1) : 0; // lookahead, 0 behind the last character
+
+ // Default case: current character is an upper or lower case character, a digit, or of class CH_CUR / CH_USC:
+ if ( charClass & (CH_LWR | CH_UPR | CH_NUM | CH_CUR | CH_USC ) ) {
+ if ( pWordStart == NULL ) {
+ // the start of a new word
+ pWordStart = pText;
+ }
+ // token class classification (most frequent checked first)
+ if ( charClass & CH_LWR )
+ clTokenProperties.setLower();
+ else if ( charClass & CH_UPR ) {
+ if ( pWordStart == pText )
+ clTokenProperties.setLeadingUpper();
+ else
+ clTokenProperties.setTrailingUpper();
+ } else if ( charClass & CH_NUM )
+ clTokenProperties.setNumeric();
+ else if ( charClass & CH_USC )
+ clTokenProperties.setSpecial();
+ else if ( charClass & CH_CUR ) {
+ if ( pWordStart == pText ) {
+ // accept currency only as a first character, if a digit
+ // is following
+ if ( getCharClassInl( chTextNext) != CH_NUM ) {
+ pWordStart = NULL; // reset word pointer ("not in a word")
+ } else {
+ clTokenProperties.setSpecial();
+ }
+ } else {
+ clTokenProperties.setSpecial();
+ }
+ }
+ // move to next character
+ pText++;
+ continue;
+ }
+
+ switch ( charClass ) {
+ case CH_BLK: // blank
+ // unconditionally terminates the current word as a token;
+ // afterwards the scanner is outside of any word
+ if ( pWordStart ) {
+ // end of current word
+ tokenEntry( pWordStart, pWordStart-text_start, pText-pWordStart, clTokenProperties, bNewPara, bNewSent, uiNewlines );
+ pWordStart = NULL;
+ }
+ break;
+ case CH_SPC: // special character
+ // unconditionally terminates the current word as a token
+ // and starts a new word
+ if ( pWordStart ) { // also true if the previous character was a special character that is still pending
+ // end of current word
+ tokenEntry( pWordStart, pWordStart-text_start, pText-pWordStart, clTokenProperties, bNewPara, bNewSent, uiNewlines );
+ }
+ // the start of a new "word" containing the special char
+ pWordStart = pText;
+ clTokenProperties.setSpecial();
+ // report it right away unless the next char is a special char, too
+ if (pWordStart && !(getCharClassInl( chTextNext ) & (CH_SPC))) {
+ // create the special char token
+ tokenEntry( pWordStart, pWordStart-text_start, pText+1-pWordStart, clTokenProperties, bNewPara, bNewSent, uiNewlines );
+ pWordStart = NULL;
+ }
+ break;
+ case CH_SND: // sentence end ("?" or "!")
+ // terminates the current sentence
+ if ( pWordStart ) { // pWordStart was set in an earlier iteration, so *(pText-1) is inside the text
+ if (!(getCharClassInl( *(pText-1) ) & (CH_SND))) {
+ // create the token immediately to the left of ? e.g. "abc?"
+ tokenEntry( pWordStart, pWordStart-text_start, pText-pWordStart, clTokenProperties, bNewPara, bNewSent, uiNewlines );
+ // the start of a new "word" containing the ? or !
+ pWordStart = pText;
+ }
+ } else {
+ // the start of a new "word" containing the ? or !
+ pWordStart = pText;
+ }
+ clTokenProperties.setSpecial();
+ // check if the next char is end of a ? or ??? sequence
+ if (pWordStart && !(getCharClassInl( chTextNext ) & (CH_SND))) {
+ // create the ? or ??? token
+ tokenEntry( pWordStart, pWordStart-text_start, pText+1-pWordStart, clTokenProperties, bNewPara, bNewSent, uiNewlines );
+ // start a new sentence
+ bNewSent = true;
+ pWordStart = NULL;
+ }
+ break;
+ case CH_NWL: // newline
+ if ( pWordStart ) {
+ // end of current word
+ tokenEntry( pWordStart, pWordStart-text_start, pText-pWordStart, clTokenProperties, bNewPara, bNewSent, uiNewlines );
+ pWordStart = NULL;
+ }
+ // count occurring newlines (tokenEntry() resets the counter with every
+ // token, so only newlines since the last token are counted)
+ ++uiNewlines;
+ // if there was at least one newline before this one
+ // start a new paragraph
+ if ( uiNewlines > 1 ) {
+ // new paragraph (and new sentence)
+ bNewPara = true;
+ bNewSent = true;
+ }
+ break;
+ case CH_NPA: // newpara
+ if ( pWordStart ) {
+ // end of current word
+ tokenEntry( pWordStart, pWordStart-text_start, pText-pWordStart, clTokenProperties, bNewPara, bNewSent, uiNewlines );
+ pWordStart = NULL;
+ }
+ // new paragraph (and new sentence)
+ bNewPara = true;
+ bNewSent = true;
+ break;
+ case CH_PRD: // period
+ // in a word: decide whether the period belongs to it; not in a word: the period starts a new token (else branch)
+ if ( pWordStart ) {
+ if ( pText == text_end ) {
+ // period is the last character in the text:
+ // Since no characters are following, this can only be the
+ // end of the sentence.
+ // end of current word
+ tokenEntry( pWordStart, pWordStart-text_start, pText-pWordStart, clTokenProperties, bNewPara, bNewSent, uiNewlines );
+ // now emit the final period token itself
+ // the start of a new "word" containing the . or ..
+ pWordStart = pText;
+ clTokenProperties.setSpecial();
+ tokenEntry( pWordStart, pWordStart-text_start, 1, clTokenProperties, bNewPara, bNewSent, uiNewlines );
+ // setting bNewSent to true here is not really necessary.
+ // since we are at the end of the text. However, to have
+ // the same code for all "sentence end" conditions, this is
+ // left here.
+ bNewSent = true;
+ pWordStart = NULL;
+ } else {
+ // period is not at the end of the text - action depends on leading and following character
+ // note: since pWordStart is not NULL here, pText points not
+ // to the very beginning of the text.
+ //
+ // part of the word if between numbers or alpha characters (like conditional whitespaces)
+ // This is for tokens like "9.164.220.12"
+ if ( (getCharClassInl( *(pText-1)) & (CH_UPR | CH_LWR | CH_NUM )) &&
+ (getCharClassInl( chTextNext) & (CH_UPR | CH_LWR | CH_NUM ))) {
+ clTokenProperties.setSpecial();
+ break;
+ }
+ unsigned long ulWordLen = pText-pWordStart;
+ const UChar chTextNextNext = (pText < (text_end - 1)) ? *(pText + 2) : 0; // character after the next one, 0 if behind the text
+
+ if ( (ulWordLen ==1 && clTokenProperties.hasUpper()) // a single upper case character followed by a period
+ // OR the character after the next one is lower: must be an abbreviation
+ || (getCharClassInl( chTextNextNext) & (CH_LWR))
+ // OR found in abbreviation list
+ // || isAbreviation( pWordStart, ulWordLen ) ) {
+ ) {
+ clTokenProperties.setSpecial();
+ // is an abbreviation, ignore this word, do not end the sentence
+ // pass token WITH period.
+ tokenEntry( pWordStart, pWordStart-text_start, ulWordLen+1, clTokenProperties, bNewPara, bNewSent, uiNewlines );
+ pWordStart = NULL;
+ } else if (!(getCharClassInl( *(pText-1) ) & (CH_PRD))) { // only for the first period after the word
+ // must be the end of a sentence
+ // end of current word (without period)
+ tokenEntry( pWordStart, pWordStart-text_start, ulWordLen, clTokenProperties, bNewPara, bNewSent, uiNewlines );
+ pWordStart = pText;
+ }
+ }
+ } else {
+ // the start of a new "word" containing the ...
+ pWordStart = pText;
+ }
+ if (pWordStart) {
+ clTokenProperties.setSpecial();
+ }
+ // check if the next char is end of a . or ... sequence
+ // Note: we allow for ".12" or ".Net" or ..12 to be one token
+ if (pWordStart && !(getCharClassInl( chTextNext ) & (CH_PRD | CH_NUM | CH_LWR | CH_UPR))) {
+ clTokenProperties.setSpecial();
+ // create the . or ... token
+ tokenEntry( pWordStart, pWordStart-text_start, pText+1-pWordStart, clTokenProperties, bNewPara, bNewSent, uiNewlines );
+ if (getCharClassInl( chTextNext ) & (CH_BLK | CH_NWL)) { // period(s) followed by blank or newline: sentence end
+ bNewSent = true;
+ }
+ pWordStart = NULL;
+ }
+ break;
+ case CH_NSP: // number separator or ','
+ if ( pWordStart ) {
+ // part of a number if between digits
+ if ( getCharClassInl( *(pText-1)) == CH_NUM && getCharClassInl(chTextNext) == CH_NUM ) {
+ clTokenProperties.setSpecial();
+ break;
+ } else if (!(getCharClassInl( *(pText-1) ) & (CH_NSP))) {
+ // create the token immediately to the left of , e.g. "abc,"
+ tokenEntry( pWordStart, pWordStart-text_start, pText-pWordStart, clTokenProperties, bNewPara, bNewSent, uiNewlines );
+ // the start of a new "word" containing the ,
+ pWordStart = pText;
+ }
+ } else {
+ // the start of a new "word" containing the ,
+ pWordStart = pText;
+ }
+ clTokenProperties.setSpecial();
+ // check if the next char is end of a , or ,,, sequence
+ if (pWordStart && !(getCharClassInl( chTextNext ) & (CH_NSP))) {
+ // create the , or ,,, token
+ tokenEntry( pWordStart, pWordStart-text_start, pText+1-pWordStart, clTokenProperties, bNewPara, bNewSent, uiNewlines );
+ pWordStart = NULL;
+ }
+ break;
+ case CH_CWS: // conditional whitespace
+ if ( pWordStart ) {
+ // part of the word if between alphanumeric characters
+ if ( (getCharClassInl( *(pText-1)) & (CH_UPR | CH_LWR | CH_NUM )) &&
+ (getCharClassInl( chTextNext) & (CH_UPR | CH_LWR | CH_NUM ))) {
+ clTokenProperties.setSpecial();
+ break;
+ } else if (!(getCharClassInl( *(pText-1) ) & (CH_CWS))) {
+ // create the token immediately to the left of - e.g. "abc-"
+ tokenEntry( pWordStart, pWordStart-text_start, pText-pWordStart, clTokenProperties, bNewPara, bNewSent, uiNewlines );
+ // the start of a new "word" containing the -
+ pWordStart = pText;
+ }
+ } else {
+ // the start of a new "word" containing the -
+ pWordStart = pText;
+ }
+ clTokenProperties.setSpecial();
+ // check if the next char is end of a - or --- sequence
+ if (pWordStart && !(getCharClassInl( chTextNext ) & (CH_CWS))) {
+ // create the - or --- token
+ tokenEntry( pWordStart, pWordStart-text_start, pText+1-pWordStart, clTokenProperties, bNewPara, bNewSent, uiNewlines );
+ pWordStart = NULL;
+ }
+ break;
+ case CH_APS: // apostrophe is part of the word within words (l'oreal, Tom's, don't)
+ if ( pWordStart ) {
+ // part of the word if between alphanumeric characters
+ if ( (getCharClassInl( *(pText-1)) & (CH_UPR | CH_LWR | CH_NUM )) &&
+ (getCharClassInl( chTextNext) & (CH_UPR | CH_LWR | CH_NUM ))) {
+ clTokenProperties.setSpecial();
+ break;
+ } else if (!(getCharClassInl( *(pText-1) ) & (CH_APS))) {
+ // create the token immediately to the left of ' e.g. "abc'"
+ tokenEntry( pWordStart, pWordStart-text_start, pText-pWordStart, clTokenProperties, bNewPara, bNewSent, uiNewlines );
+ // the start of a new "word" containing the '''
+ pWordStart = pText;
+ }
+ } else {
+ // the start of a new "word" containing the '''
+ pWordStart = pText;
+ }
+ clTokenProperties.setSpecial();
+ // check if the next char is end of a ' or ''' sequence
+ if (pWordStart && !(getCharClassInl( chTextNext ) & (CH_APS))) {
+ // create the ' or ''' token
+ tokenEntry( pWordStart, pWordStart-text_start, pText+1-pWordStart, clTokenProperties, bNewPara, bNewSent, uiNewlines );
+ pWordStart = NULL;
+ }
+ break;
+ case CH_LWR: // all the following cases are handled by the if-branch at the top of the loop
+ case CH_UPR:
+ case CH_NUM:
+ case CH_USC:
+ case CH_CUR:
+ default:
+ assert( false );
+ break;
+ }
+ ++pText;
+ }
+
+ // if end of text and still in a word
+ // send the word to UIMA (pText is one behind text_end here)
+ if ( pWordStart ) {
+ // end of current word
+ tokenEntry( pWordStart, pWordStart-text_start, pText-pWordStart, clTokenProperties, bNewPara, bNewSent, uiNewlines );
+ pWordStart = NULL;
+ }
+ }
+
+//lint -restore : Implicit conversion from enum/pointer to bool
+
+} // namespace uima
+
+/* <EOF> */
+
Propchange: incubator/uima/uimacpp/trunk/src/test/src/ss_tokenizer.cpp
------------------------------------------------------------------------------
svn:eol-style = native