You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by ea...@apache.org on 2007/02/03 18:19:59 UTC
svn commit: r503266 [1/2] - /incubator/uima/uimacpp/trunk/src/test/src/uima/
Author: eae
Date: Sat Feb 3 09:19:57 2007
New Revision: 503266
URL: http://svn.apache.org/viewvc?view=rev&rev=503266
Log:
Initial entry
Added:
incubator/uima/uimacpp/trunk/src/test/src/uima/
incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_dump.hpp (with props)
incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_tok.hpp (with props)
incubator/uima/uimacpp/trunk/src/test/src/uima/conui.hpp (with props)
incubator/uima/uimacpp/trunk/src/test/src/uima/doc_buffer.hpp (with props)
incubator/uima/uimacpp/trunk/src/test/src/uima/parse_handlers.hpp (with props)
incubator/uima/uimacpp/trunk/src/test/src/uima/parser_config.hpp (with props)
incubator/uima/uimacpp/trunk/src/test/src/uima/parser_interface.hpp (with props)
incubator/uima/uimacpp/trunk/src/test/src/uima/ss_tokenizer.hpp (with props)
incubator/uima/uimacpp/trunk/src/test/src/uima/strptrlenpair.hpp (with props)
incubator/uima/uimacpp/trunk/src/test/src/uima/tt_types.hpp (with props)
incubator/uima/uimacpp/trunk/src/test/src/uima/xmlparse_handlers.hpp (with props)
Added: incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_dump.hpp
URL: http://svn.apache.org/viewvc/incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_dump.hpp?view=auto&rev=503266
==============================================================================
--- incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_dump.hpp (added)
+++ incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_dump.hpp Sat Feb 3 09:19:57 2007
@@ -0,0 +1,159 @@
+#ifndef UIMA_ANNOTATOR_DUMP_H$
+#define UIMA_ANNOTATOR_DUMP_H$
+/** \file annotator_dump.hpp .
+-----------------------------------------------------------------------------
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+
+-------------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- */
+/* Include dependencies */
+/* ----------------------------------------------------------------------- */
+
+#include "uima/api.hpp" /* UIMA API */
+
+#include <fstream>
+#include <vector>
+#include <deque>
+
+
+#include "uima/filename.hpp"
+
+using namespace uima;
+
+/* ----------------------------------------------------------------------- */
+/* Forward declarations */
+/* ----------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- */
+/* Types / Classes */
+/* ----------------------------------------------------------------------- */
+
+
+/** @name AnnotatorDump
+ The class <TT>AnnotatorDump</TT> is a TextAnnotator that writes a dump (output style Xml or XCas; optionally the document buffer) to an output file.
+ Example:
+ \code
+ \endcode
+ @see
+*/
+class AnnotatorDump : public TextAnnotator {
+public:
+ /** @name Constructors */
+ /*@{*/
+ /** Default Constructor:
+ */
+ AnnotatorDump();
+ /*@}*/
+
+ ~AnnotatorDump(void); //lint !e1908 !e1509: base class destructor for class 'AnnotatorABase' is not virtual : 'virtual' assumed for ~AnnotatorDump() (inherited from base class AnnotatorABase)
+
+ /** @name Annotator Processing Functions */
+ /*@{*/
+ /** call the UIMA Annotator to initialize itself based on the given
+ AnnotatorContext and return a UIMA error code */
+ TyErrorId
+ initialize(
+ AnnotatorContext & rclAnnotatorContext
+ ); //lint !e1909: 'virtual' assumed, see: AnnotatorABase::init(Engine &, ConfigAnnotator &) (line 79, file g:\projects\UIMAcurrent\code\engine\include\annotator_abase.hpp)
+
+ TyErrorId typeSystemInit(uima::TypeSystem const &); // receives the type system and returns a UIMA error code
+
+ /** call the UIMA Annotator to deinitialize itself (no arguments are passed)
+ and return a UIMA error code */
+ TyErrorId
+ destroy(); //lint !e1909: 'virtual' assumed, see: AnnotatorABase::deInit(Engine &) (line 83, file g:\projects\UIMAcurrent\code\engine\include\annotator_abase.hpp)
+
+ /** call the UIMA Annotator to reconfigure itself (no arguments; the configuration
+ source is not visible in this header) and return a UIMA error code */
+ TyErrorId
+ reconfigure(
+ ); //lint !e1909: 'virtual' assumed, see: AnnotatorABase::config(ConfigAnnotator &) (line 87, file g:\projects\UIMAcurrent\code\engine\include\annotator_abase.hpp)
+
+ /** call the UIMA Annotator to perform its document related duty on the given CAS
+ and ResultSpecification and return a UIMA error code */
+ TyErrorId
+ process(
+ CAS & tcas,
+ ResultSpecification const &
+ ); //lint !e1909: 'virtual' assumed, see: AnnotatorABase::processDocument(Engine &, const TargetSetAT &, const TargetSetTT &) (line 91, file g:\projects\UIMAcurrent\code\engine\include\annotator_abase.hpp)
+
+ /*@}*/
+
+ /** @name Properties */
+ /*@{*/
+ /*@}*/
+ /** @name Miscellaneous */
+ /*@{*/
+ /*@}*/
+protected:
+private:
+
+ /* --- types ---------------------------------------------------------------*/
+
+ enum EnOutputStyle {
+ Xml, XCas
+ };
+
+ /* --- variables -------------------------------------------------------- */
+ util::Filename iv_clOutputFilename;
+ bool iv_bDumpDocBuffer; // When set to 'True', the Annotator dumps the Doc Buffer
+ bool iv_bSaveDocBuffer; // When set to 'True', the Annotator dumps the Doc Buffer in binary format, too
+
+ ofstream iv_clOutputStream;
+ bool iv_bAppendFile;
+
+ EnOutputStyle iv_enOutputStyle;
+
+ //vector<uima::Type> iv_vecOutputTypes;
+
+ // The annotator may be invoked in several sections within one config-file.
+ // If all output gets dumped into one file, the names of the sections serve
+ // as headers. We can't access the ConfigAnnotator-Object in 'processDocument'
+ // (presumably 'process' in this class - TODO confirm), hence, we need a member var.
+ string iv_cpszSectionName;
+ /* --- functions -------------------------------------------------------- */
+
+ TyErrorId
+ openOutputFile( void );
+
+ void
+ closeOutputFile( void );
+
+ void outputDocBuffer(UnicodeStringRef const & crclDoc);
+
+
+ AnnotatorDump & operator=( // declared private, no body in this header - presumably to forbid assignment (TODO confirm)
+ const AnnotatorDump &
+ );
+
+ AnnotatorDump( // declared private, no body in this header - presumably to forbid copying (TODO confirm)
+ const AnnotatorDump &
+ );
+
+}
+; /* AnnotatorDump */
+
+/* ----------------------------------------------------------------------- */
+#endif
+
+/* <EOF> */
+
Propchange: incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_dump.hpp
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_tok.hpp
URL: http://svn.apache.org/viewvc/incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_tok.hpp?view=auto&rev=503266
==============================================================================
--- incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_tok.hpp (added)
+++ incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_tok.hpp Sat Feb 3 09:19:57 2007
@@ -0,0 +1,253 @@
+#ifndef UIMA_ANNOTATOR_TOK_H$
+#define UIMA_ANNOTATOR_TOK_H$
+/** \file annotator_tok.hpp .
+-----------------------------------------------------------------------------
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+
+
+ \brief Contains AnnotatorTokenizer, a Unicode UIMA Tokenizer Annotator.
+
+-------------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- */
+/* Include dependencies */
+/* ----------------------------------------------------------------------- */
+
+/* We want the timers in this annotator to be only active if the annotator specific
+ define ANNOTATOR_TIMERS is set (e.g. in the automake.pro file for this annotator)
+ in all other cases we don't want the timers.
+ Since all the timers depend on the more generic define DEBUG_TIMING we
+ define DEBUG_TIMING if and only if ANNOTATOR_TIMERS is set.
+ Specifically we don't want the timers set when DEBUG_TIMING is defined to
+ build a generic timing driver of the whole system.
+ Our internal annotator timers would bias the whole timing driver with the
+ overhead involved in calling them in this annotator. This is why we specifically
+ undefine DEBUG_TIMING even if it might be set in the makefile to build this
+ annotator.
+ If you want timing in this annotator use ANNOTATOR_TIMERS not DEBUG_TIMING
+ */
+#ifdef ANNOTATOR_TIMERS
+# ifndef DEBUG_TIMING
+# define DEBUG_TIMING
+# endif
+#else
+# ifdef DEBUG_TIMING
+# undef DEBUG_TIMING
+# endif
+#endif
+
+#include "uima/timedatetools.hpp"
+#include "uima/api.hpp" /* UIMA API */
+///////#include "uima/u2cpcnvrtbuff.hpp" /* U2CpConvertBuffer */
+#include "uima/ss_tokenizer.hpp"
+#include "uima/internal_casimpl.hpp"
+
+#define STEMMER_BUF_LEN 50
+
+using namespace uima;
+
+/** @name AnnotatorTokenizer
+ The class <TT>AnnotatorTokenizer</TT> is used as a universal Unicode Tokenizer.
+
+ It uses a little trick to check API consistency via an abstract base class,
+ without having the overhead of virtual functions in our ship version.
+ <TT>AnnotatorABase</TT> defines all non-static member functions a plug-in needs
+ to define as pure virtual functions. By making this class inherit from
+ this base class we can make sure that compilation will fail if the
+ interfaces change.
+ Since we don't really use the inheritance relationship we don't define
+ it in the ship version. NOTE(review): the declaration below derives from Tokenizer and TextAnnotator unconditionally, so this paragraph looks stale - confirm.
+
+ Example:
+ \code
+ \endcode
+ @see AnnotatorABase
+*/
+class AnnotatorTokenizer : public Tokenizer , public TextAnnotator {
+public:
+ /** @name Constructors */
+ /*@{*/
+ /** Default Constructor.
+ */
+ AnnotatorTokenizer(void);
+
+ /*@}*/
+ virtual ~AnnotatorTokenizer(void); //lint !e1509: base class destructor for class 'AnnotatorABase' is not virtual
+
+ /** @name Annotator Processing Functions */
+ /*@{*/
+ /** call the UIMA Annotator to initialize itself based on the given
+ AnnotatorContext and return a UIMA error code */
+ TyErrorId
+ initialize(
+ AnnotatorContext & rclAnnotatorContext
+ ); //lint !e1909: 'virtual' assumed, see: AnnotatorABase::init(Engine &, ConfigAnnotator &) (line 79, file d:\develop\uima\current\code\engine\include\annotator_abase.hpp)
+
+ TyErrorId typeSystemInit(TypeSystem const &); // receives the type system and returns a UIMA error code
+
+ /** call the UIMA Annotator to deinitialize itself (no arguments are passed)
+ and return a UIMA error code */
+ TyErrorId
+ destroy(); //lint !e1909: 'virtual' assumed, see: AnnotatorABase::deInit(Engine &) (line 83, file d:\develop\uima\current\code\engine\include\annotator_abase.hpp)
+
+ /** call the UIMA Annotator to reconfigure itself (no arguments; the configuration
+ source is not visible in this header) and return a UIMA error code */
+ TyErrorId
+ reconfigure(
+ ); //lint !e1909: 'virtual' assumed, see: AnnotatorABase::config(ConfigAnnotator &) (line 87, file d:\develop\uima\current\code\engine\include\annotator_abase.hpp)
+
+protected:
+ /** call the UIMA Annotator to perform its document related duty on the given CAS
+ and ResultSpecification and return a UIMA error code (declared protected here) */
+ TyErrorId
+ process(
+ CAS &,
+ const ResultSpecification & crclTargetSet
+ ); //lint !e1909: 'virtual' assumed, see: AnnotatorABase::processDocument(Engine &, const TargetSetAT &, const TargetSetTT &) (line 91, file d:\develop\uima\current\code\engine\include\annotator_abase.hpp)
+
+ /*@}*/
+protected:
+
+ virtual int tokenCallback( unsigned long ulLocation, unsigned long ulLength,
+ TokenProperties & crclTokenProperties,
+ bool bNewPara, bool bNewSent ); // presumably invoked by the Tokenizer base class once per token - TODO confirm
+
+ TyDocIndex iv_uiParagraphStartIndex;
+ TyDocIndex iv_uiSentenceStartIndex;
+ // segment numbers
+ size_t iv_uiTokenNbr;
+ size_t iv_uiSentenceNbr;
+ size_t iv_uiParagraphNbr;
+private:
+ // number of the first token/sentence/paragraph
+ const size_t iv_cuiCOUNTER_START;
+
+ /* --------------------------------------------------*/
+ /* config values we use */
+ /* --------------------------------------------------*/
+
+ /// Enum listing all the config options we support
+ enum EnAnnotatorConfigOptions {
+ enConfigOption_TokenNumbersIncludeStopwords,
+ enConfigOption_UseRelativeTokenAndSentenceNumbers,
+ enConfigOption_IgnorePunctuationTokens,
+ // (drop inifile support) enConfigOption_CharMapConfigFilename,
+ enNumberOfConfigOptions // must be last in enum
+ };
+
+ // our config table (one entry per value of EnAnnotatorConfigOptions)
+ static const ConfigOptionInfo::StOptionInfo cv_astConfigOptionInfo[enNumberOfConfigOptions];
+
+ // Variables the config options are stored in:
+
+ // if this is true the token numbers are counted including stopwords
+ bool iv_bTokenNumbersIncludeStopwords;
+ // if this is true token and sentence number are reset to 1
+ // for each new sentence/paragraph
+ bool iv_bUseRelativeTokenAndSentenceNumbers;
+ // If true, punctuation tokens are ignored
+ bool iv_bIgnorePunctuationTokens;
+ // trace component ID
+ uima::TyComponentId iv_iTraceCompID;
+
+ // Some pointers for quick access to UIMA objects. Original note: "Initialized in init()"; no init() is declared here - presumably initialize() (TODO confirm)
+ uima::internal::CASImpl * iv_pCASImpl;
+ lowlevel::FSHeap * iv_pFSHeap;
+ lowlevel::IndexRepository * iv_pIndexRepository;
+ // FSTypes and corresponding sizes
+ lowlevel::TyFSType iv_tyTokenType;
+ lowlevel::TyFeatureOffset iv_tyTokenTypeSize;
+ lowlevel::TyFSType iv_tySentenceType;
+ lowlevel::TyFeatureOffset iv_tySentenceTypeSize;
+ lowlevel::TyFSType iv_tyParagraphType;
+ lowlevel::TyFeatureOffset iv_tyParagraphTypeSize;
+
+ // FSFeatures
+ lowlevel::TyFeatureOffset iv_tySofaFeatureOffset;
+ lowlevel::TyFeatureOffset iv_tyBeginPositionFeatureOffset;
+ lowlevel::TyFeatureOffset iv_tyEndPositionFeatureOffset;
+
+ lowlevel::TyFeatureOffset iv_tyTokenPropertiesFeatureOffset;
+ lowlevel::TyFeatureOffset iv_tyTokenNbrFeatureOffset;
+ lowlevel::TyFeatureOffset iv_tySentenceNbrFeatureOffset;
+ lowlevel::TyFeatureOffset iv_tyParagraphNbrFeatureOffset;
+ lowlevel::TyFSFeature iv_stemFeature;
+
+ // needed output types
+ bool iv_bIsTokenReq;
+ bool iv_bIsSentenceReq;
+ bool iv_bIsParagraphReq;
+ bool iv_stemsRequired;
+ TokenProperties iv_clTokenProperties;
+
+#ifdef DEBUG_TIMING
+ uima::Timer iv_clTotalTimer;
+ uima::Timer iv_clSSTokTimer;
+ uima::Timer iv_clUimaAnCreateTimer;
+ uima::Timer iv_clUimaAnSetValTimer;
+#endif
+
+
+ /* --- functions --- */
+ /* COPY CONSTRUCTOR NOT SUPPORTED */
+ AnnotatorTokenizer(const AnnotatorTokenizer & ); //lint !e1704
+ /* ASSIGNMENT OPERATOR NOT SUPPORTED */
+ AnnotatorTokenizer & operator=(const AnnotatorTokenizer & crclObject);
+
+ /// (Re-) Access config values from the given AnnotatorContext. Original note: "Used in init() and config()" - presumably initialize() and reconfigure() here (TODO confirm).
+ TyErrorId
+ getConfigValues(
+ AnnotatorContext & rclConfig
+ );
+
+ /// add a new token annotation for the given begin/end document positions
+ void
+ addNewTokenAnnotation(
+ TyDocIndex tyBeginPos,
+ TyDocIndex tyEndPos
+ );
+
+ /// add a new sentence annotation for the given begin/end document positions
+ void
+ addNewSentenceAnnotation(
+ TyDocIndex tyBeginPos,
+ TyDocIndex tyEndPos
+ );
+
+ /// add a new paragraph annotation for the given begin/end document positions
+ void
+ addNewParagraphAnnotation(
+ TyDocIndex tyBeginPos,
+ TyDocIndex tyEndPos
+ );
+
+#if defined(DEBUG_TIMING)
+ void
+ dumpTimingData( void ) const;
+#endif
+
+}
+; /* AnnotatorTokenizer */
+
+/* ----------------------------------------------------------------------- */
+#endif /* UIMA_ANNOTATOR_TOK_H */
+
+/* <EOF> */
+
Propchange: incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_tok.hpp
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/uima/uimacpp/trunk/src/test/src/uima/conui.hpp
URL: http://svn.apache.org/viewvc/incubator/uima/uimacpp/trunk/src/test/src/uima/conui.hpp?view=auto&rev=503266
==============================================================================
--- incubator/uima/uimacpp/trunk/src/test/src/uima/conui.hpp (added)
+++ incubator/uima/uimacpp/trunk/src/test/src/uima/conui.hpp Sat Feb 3 09:19:57 2007
@@ -0,0 +1,71 @@
+/** \file conui.hpp .
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+
+ \brief functions to let UIMA types interact with util::ConsoleUI
+
+-------------------------------------------------------------------------- */
+
+#ifndef UIMA_CONUI_HPP
+#define UIMA_CONUI_HPP
+
+/* ----------------------------------------------------------------------- */
+/* Include dependencies */
+/* ----------------------------------------------------------------------- */
+
+#include "uima/pragmas.hpp" // must be first file to be included to get pragmas
+#include "uima/exceptions.hpp"
+#include "uima/err_ids.h"
+#include "uima/consoleui.hpp"
+
+/* ----------------------------------------------------------------------- */
+/* Constants */
+/* ----------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------- */
+/* Forward declarations */
+/* ----------------------------------------------------------------------- */
+
+namespace uima {
+
+ /** display the specified UIMA exception on the console object */
+ void uimaToolDisplayException(uima::util::ConsoleUI & rclConsole, const uima::Exception & crclException);
+
+ /** display the specified UIMA error id on the console object (cpszLastErrorMsg: presumably the text of the last error message - TODO confirm) */
+ void uimaToolDisplayErrorId(uima::util::ConsoleUI const & rclConsole, const uima::TyErrorId utErrorId, const TCHAR * cpszLastErrorMsg);
+
+ /** display the specified UIMA error id on the console object and
+ call uima::util::ConsoleUI::fatal() if the error id is not UIMA_ERR_NONE. NOTE(review): utErrorIdExpected (default 0) and cpszFunction are not described by the original comment; presumably the expected id is tolerated and cpszFunction names the failing call - confirm against the implementation */
+ void uimaToolHandleErrorId(uima::util::ConsoleUI & rclConsole, const uima::TyErrorId utErrorId, const TCHAR * cpszLastErrorMsg, const TCHAR * cpszFunction, uima::TyErrorId utErrorIdExpected = 0);
+
+ /* ----------------------------------------------------------------------- */
+ /* Types / Classes */
+ /* ----------------------------------------------------------------------- */
+
+ /* ----------------------------------------------------------------------- */
+ /* Implementation */
+ /* ----------------------------------------------------------------------- */
+
+ /* ----------------------------------------------------------------------- */
+
+} //namespace uima
+
+#endif /* UIMA_CONUI_HPP */
+
+/* <EOF> */
+
Propchange: incubator/uima/uimacpp/trunk/src/test/src/uima/conui.hpp
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/uima/uimacpp/trunk/src/test/src/uima/doc_buffer.hpp
URL: http://svn.apache.org/viewvc/incubator/uima/uimacpp/trunk/src/test/src/uima/doc_buffer.hpp?view=auto&rev=503266
==============================================================================
--- incubator/uima/uimacpp/trunk/src/test/src/uima/doc_buffer.hpp (added)
+++ incubator/uima/uimacpp/trunk/src/test/src/uima/doc_buffer.hpp Sat Feb 3 09:19:57 2007
@@ -0,0 +1,160 @@
+/** \file doc_buffer.hpp .
+-----------------------------------------------------------------------------
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+
+
+ \brief Contains DocBuffer, a document buffer for storing a document
+
+-------------------------------------------------------------------------- */
+
+#ifndef UIMA_DOC_BUFFER_HPP
+#define UIMA_DOC_BUFFER_HPP
+
+/* ----------------------------------------------------------------------- */
+/* Include dependencies */
+/* ----------------------------------------------------------------------- */
+
+#include "uima/pragmas.hpp" //must be included first to disable warnings
+#include "unicode/uchar.h"
+#include "uima/types.h"
+#include "uima/exceptions.hpp"
+#include "uima/unistrref.hpp"
+
+/* ----------------------------------------------------------------------- */
+/* Constants */
+/* ----------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------- */
+/* Forward declarations */
+/* ----------------------------------------------------------------------- */
+
+namespace uima {
+ class CodePage2UnicodeConverter;
+}
+
+/* ----------------------------------------------------------------------- */
+/* Types / Classes */
+/* ----------------------------------------------------------------------- */
+
+
+namespace uima {
+
+ /**
+ * The class <TT>DocBuffer</TT> is used to store the text of a document as UChar characters.
+ * \code
+ \endcode
+ * @see
+ */
+ class DocBuffer {
+ public:
+ /** @name Constructors */
+ /*@{*/
+ DocBuffer();
+ DocBuffer(size_t uMemPoolInitialSize, size_t uMemPoolGrowSize);
+ /*@}*/
+ ~DocBuffer(void);
+ /** @name Properties */
+ /*@{*/
+ /** Return TRUE, if the specified index is valid for this buffer. */
+ bool isValidIndex(TyDocIndex uIndex) const;
+ /** Return TRUE, if the document buffer is empty. */
+ bool isEmpty(void) const {
+ return(iv_uLength == 0);
+ }
+ /** Return TRUE if this could be initialized correctly */
+ bool isValid(void) const;
+ /** Return the number of characters as stored in this document buffer. */
+ size_t getLength(void) const {
+ return(iv_uLength);
+ }
+ /** Return the pointer to the document text held by this buffer (iv_cpw16Document).
+ This function takes no index and has no length out-parameter; use getLength()
+ for the number of characters stored. */
+ const UChar * getDocBuffer(void) const {
+ return(iv_cpw16Document);
+ }
+ /** Return a reference to the text for the range given by uIndexBegin and uIndexEnd (whether uIndexEnd is inclusive is not visible here - TODO confirm). */
+ UnicodeStringRef getText(TyDocIndex uIndexBegin,
+ TyDocIndex uIndexEnd) const UIMA_THROW(ExcDocBuffer);
+ /*@}*/
+ /** @name Miscellaneous */
+ /*@{*/
+ /** Add part of a document as a buffer starting at address <TT>cpacDocPartText</TT>,
+ with size <TT>uDocPartSize</TT>; <TT>crclCCSID</TT> presumably names the source encoding - TODO confirm.
+ (The original text described the commented-out addDocInMemory below; no user-data handle parameter is declared.) */
+// void addDocInMemory(const UChar * cpclDocText,
+// size_t uDocLength);
+ void addDocPart(const char * cpacDocPartText,
+ size_t uDocPartSize,
+ const char * crclCCSID);
+
+ /** Add part of a document as a buffer starting at address <TT>cpacDocPartText</TT>,
+ with size <TT>uDocPartSize</TT> in bytes, and using the specified converter
+ <TT>crclConverter</TT> for codepage conversion.
+ (No user-data handle parameter is declared, contrary to the original comment.) */
+ void addDocPart(const char * cpacDocPartText,
+ size_t uDocPartSize,
+ CodePage2UnicodeConverter & crclConverter);
+ /** Add part of a document as a buffer starting at address <TT>cpclDocPartText</TT>,
+ with size <TT>uDocPartLength</TT> in characters, and with CCSID EnCCSID_UCS2.
+ (No user-data handle parameter is declared, contrary to the original comment.) */
+ void addDocPart(const UChar * cpclDocPartText,
+ size_t uDocPartLength);
+ /** This method clears the document buffer, removes all document data
+ and resets all internal settings. */
+ void reset(void);
+ /*@}*/
+ protected:
+ /* --- functions --- */
+ private:
+ /* --- variables --- */
+ size_t iv_uMemPoolInitialSize;
+ size_t iv_uMemPoolReserve;
+ const UChar * iv_cpw16Document; // returned by getDocBuffer()
+ size_t iv_uLength; // returned by getLength(); 0 means empty (see isEmpty())
+ size_t iv_uSizeAllocated;
+ /* --- functions --- */
+ void init();
+ void addDocPartImp(const char * cpacDocPartText, size_t uDocPartSize, CodePage2UnicodeConverter & crclConverter);
+ void resetMemPool(void);
+ /* COPY CONSTRUCTOR NOT SUPPORTED */
+ DocBuffer(const DocBuffer & ); //lint !e1704
+ /* ASSIGNMENT OPERATOR NOT SUPPORTED */
+ DocBuffer & operator=(const DocBuffer & crclObject);
+ }
+ ; /* DocBuffer */
+
+}
+
+/* ----------------------------------------------------------------------- */
+/* Implementation */
+/* ----------------------------------------------------------------------- */
+
+namespace uima {
+
+ inline bool DocBuffer::isValidIndex(TyDocIndex uIndex) const
+ /* An index is valid iff the buffer is non-empty and 0 <= uIndex <= iv_uLength - 1. */
+ {
+ return(!((uIndex < 0) || (iv_uLength == 0) || (uIndex > iv_uLength - 1)));
+ }
+}
+#endif /* UIMA_DOC_BUFFER_HPP */
+
+/* <EOF> */
+
Propchange: incubator/uima/uimacpp/trunk/src/test/src/uima/doc_buffer.hpp
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/uima/uimacpp/trunk/src/test/src/uima/parse_handlers.hpp
URL: http://svn.apache.org/viewvc/incubator/uima/uimacpp/trunk/src/test/src/uima/parse_handlers.hpp?view=auto&rev=503266
==============================================================================
--- incubator/uima/uimacpp/trunk/src/test/src/uima/parse_handlers.hpp (added)
+++ incubator/uima/uimacpp/trunk/src/test/src/uima/parse_handlers.hpp Sat Feb 3 09:19:57 2007
@@ -0,0 +1,159 @@
+/** \file parse_handlers.hpp .
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+
+
+ \brief Generic SAX-like parse handler class definitions
+
+-------------------------------------------------------------------------- */
+#ifndef __UIMA_PARSE_HANDLERS_HPP
+#define __UIMA_PARSE_HANDLERS_HPP
+
+
+// ---------------------------------------------------------------------------
+// Includes
+// ---------------------------------------------------------------------------
+
+#include "uima/pragmas.hpp" //must be first to suppress warnings
+#include <map>
+#include <stack>
+#include <utility>
+#include "uima/parser_config.hpp"
+#include "uima/doc_buffer.hpp"
+#include "uima/tcas.hpp"
+#include "uima/parser_interface.hpp"
+
+
+namespace uima {
+
+ /**
+ The class <TT>ParseHandlers</TT> is used as a generic SAX-like
+ parse handler class.
+
+ @see XMLParseHandlers
+ */
+ class ParseHandlers {
+ public:
+ /**
+ * Typedefs for data structure for communication between the beginElement()
+ * and endElement() function.
+ * We get attribute information in beginElement() and need to pass this
+ * information to endElement() because we can only do the mapping
+ * once we know the end of an annotation
+ * @{*/
+
+ /// a struct to hold information about a single XML attribute
+ class StXMLAttrInfo {
+ public:
+ icu::UnicodeString ustrName;
+ icu::UnicodeString ustrType;
+ icu::UnicodeString ustrValue;
+ // OS STL needs this to be a fully STL compliant class; both operators compare ustrName only
+ bool operator < (const StXMLAttrInfo & crclRHS) const {
+ return(bool)(ustrName < crclRHS.ustrName);
+ }
+ bool operator ==(const StXMLAttrInfo & crclRHS) const {
+ return(bool)(ustrName == crclRHS.ustrName);
+ }
+ };
+ /// a container to hold the list of all XML attributes of a given XML element (NOTE(review): unqualified vector; no <vector> include or using-declaration is visible in this header)
+ typedef vector< StXMLAttrInfo > TyXMLAttrInfoList;
+ /*@}*/
+
+ public:
+ // -----------------------------------------------------------------------
+ // Constructors and Destructor
+ // -----------------------------------------------------------------------
+ ParseHandlers();
+ virtual ~ParseHandlers();
+
+ // -----------------------------------------------------------------------
+ // init method
+ // -----------------------------------------------------------------------
+
+ bool
+ init(
+ TCAS & rTCAS,
+ ParserConfiguration const & rclConfig,
+ bool bVerbose = false
+ );
+
+ bool deInit();
+
+ void setMultiDocCallback(ParserInterface::MultiDocCallbackInterface &);
+
+ TyErrorId beginDoc();
+ TyErrorId endDoc();
+
+ // -----------------------------------------------------------------------
+ // Getter methods
+ // -----------------------------------------------------------------------
+ size_t getNumberOfDocumentsParsed() const;
+
+ size_t getNumberOfBytesParsed() const;
+
+ bool isMultiDocFile() const;
+
+ // -----------------------------------------------------------------------
+ // Handlers for the DocumentHandler interface
+ // -----------------------------------------------------------------------
+ void endElement(const UChar* cpuCName, size_t uiLength);
+ void startElement(const UChar* cpucName, size_t uiLength, const TyXMLAttrInfoList & crvecAttributes);
+ void characters(const UChar* cpucChars, size_t uiLength);
+
+
+ void processWarning(const char* cpszErrorId, const UChar * cpszErrorContext);
+
+ UnicodeStringRef getDocumentText() const;
+
+ protected:
+
+ AnnotationFS findLastAnnOfType(size_t uiBeginPos, Type type) const;
+
+ // -----------------------------------------------------------------------
+ // we need a stack of those containers for each XML element
+ // so we define a map from the XML element name to a pair of
+ // 1: the begin index of the element with those attrs
+ // 2: the attr of the element at that position
+ typedef pair< TyDocIndex, TyXMLAttrInfoList > TyIndexAttrsPair;
+ typedef stack< TyIndexAttrsPair, deque< TyIndexAttrsPair > > TyStack;
+ typedef map< icu::UnicodeString, TyStack, less< icu::UnicodeString > >
+ TyPosStack;
+ // -----------------------------------------------------------------------
+ // Data members (declared under 'protected:', despite the original "Private" heading)
+ // -----------------------------------------------------------------------
+ ParserConfiguration const * iv_pclConfig;
+
+ DocBuffer iv_docBuffer;
+ TCAS * iv_pTCAS;
+
+ bool iv_bVerbose;
+ bool iv_bIsMultiDocFile;
+ size_t iv_uiMultiDocNbr;
+ size_t iv_uiMultiDocOffset;
+
+ size_t iv_uiInputSize;
+ long iv_lLastEndIndex;
+ TyPosStack iv_clPosStack;
+ size_t iv_uiInIgnoreTag;
+ ParserInterface::MultiDocCallbackInterface * iv_pCallbackObject;
+ };
+
+} // namespace uima
+
+#endif //__UIMA_PARSE_HANDLERS_HPP
Propchange: incubator/uima/uimacpp/trunk/src/test/src/uima/parse_handlers.hpp
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/uima/uimacpp/trunk/src/test/src/uima/parser_config.hpp
URL: http://svn.apache.org/viewvc/incubator/uima/uimacpp/trunk/src/test/src/uima/parser_config.hpp?view=auto&rev=503266
==============================================================================
--- incubator/uima/uimacpp/trunk/src/test/src/uima/parser_config.hpp (added)
+++ incubator/uima/uimacpp/trunk/src/test/src/uima/parser_config.hpp Sat Feb 3 09:19:57 2007
@@ -0,0 +1,395 @@
+/** \file parser_config.hpp .
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+
+
+ \brief Configuration class for parsers.
+
+-------------------------------------------------------------------------- */
+#ifndef UIMA_PARSER_CONFIG_HPP
+#define UIMA_PARSER_CONFIG_HPP
+
+
+// ---------------------------------------------------------------------------
+// Includes
+// ---------------------------------------------------------------------------
+
+#include "uima/pragmas.hpp" //must be first to suppress warnings
+#include <map>
+#include <stack>
+#include <utility>
+#include "uima/typesystem.hpp"
+
+namespace uima {
+
+ class TCAS;
+ class TextAnalysisEngineSpecifier;
+
+ /**
+ * The class ParserConfiguration is used to instruct a parser that puts
+ * a tagged document into a CAS on how to map document information contained
+ * in tags to annotations and features of annotations.
+ * This configuration is mainly intended to configure HTML and XML parsers
+ * but could be used for any format that uses labeled tags and tag attributes
+ *
+ * For XML input this configuration object also allows to specify if and how
+ * multiple documents are delimited within one physical XML file.
+ * Also the content of certain tags can be excluded from being processed at
+ * all.
+ */
+ class ParserConfiguration {
+ public:
+   ParserConfiguration();
+   /**
+    * Initialize this config object from settings specified in
+    * the parser configuration file.
+    *
+    * @param parserConfigFilename  file name of the configuration file
+    * @param cas                   CAS object defining the types relevant
+    *                              for the tag2type mappings
+    * @param err                   error information object
+    *                              (NOTE(review): presumably filled in when
+    *                              initialization fails - confirm against the
+    *                              implementation)
+    *
+    * @return UIMA_ERR_NONE if OK
+    */
+   TyErrorId init(icu::UnicodeString const & parserConfigFilename, CAS & cas, ErrorInfo & err);
+
+   /**
+    * Returns the tag that is used to delimit/separate multiple
+    * logical documents within the same physical file.
+    * If no such tag is defined - i.e. there is only one document
+    * per file - an empty string is returned.
+    *
+    * Note: This option will always return the empty string for HTML.
+    */
+   icu::UnicodeString const & getDocumentDelimiterTag() const;
+
+   /**
+    * Sets the tag that is used to delimit/separate multiple
+    * logical documents within the same physical file.
+    *
+    * @param tag  the delimiter tag
+    *
+    * @see getDocumentDelimiterTag
+    */
+   void setDocumentDelimiterTag(icu::UnicodeString const & tag);
+
+
+   /**
+    * Returns the tags that are to be ignored when parsing an (XML) document.
+    *
+    * Only the textual content of tags not included in the returned container
+    * will be part of the CAS document.
+    */
+   std::vector<icu::UnicodeString> const & getExcludedTags() const;
+
+   /**
+    * Sets the tags that are to be ignored when parsing an (XML) document.
+    *
+    * @param tags  the tags to exclude
+    *
+    * @see getExcludedTags
+    */
+   void setExcludedTags(std::vector<icu::UnicodeString> const & tags);
+
+   /**
+    * Returns the type of the annotation that the parser is supposed to
+    * create for each occurrence of a tag with name <code>tagName</code>.
+    * If no annotation is to be created for occurrences of this tag
+    * <code>tagName</code> the return value is an invalid type object.
+    *
+    * This function returns the value of getDefaultTypeForTags if no
+    * explicit mapping has been specified.
+    *
+    * Note: The returned Type object will be subsumed by type Annotation.
+    *
+    * @param tagName The name of the tag to look up
+    *
+    * @return The mapped type for tagName or an invalid type
+    *         if no mapping specified
+    *
+    * @see setDefaultTypeForTags
+    */
+   Type getTypeForTag(icu::UnicodeString const & tagName) const;
+
+   /**
+    * Sets the type of the annotation that the parser is supposed to
+    * create for each occurrence of a tag with name <code>tagName</code>.
+    *
+    * @param tagName the name of a tag to map
+    * @param type    a CAS type (must be valid!)
+    *
+    * @see getTypeForTag
+    */
+   void setTypeForTag(icu::UnicodeString const & tagName, Type type);
+
+
+   /**
+    * This option can be used to specify a default mapping in case no
+    * explicit mapping is available for a tag.
+    *
+    * If the default type is set (is valid) getTypeForTag will return
+    * this default type whenever no explicit mapping is specified.
+    * In this case every tag will be mapped to some type without
+    * having to specify many mappings.
+    *
+    * Since this default value is optional the result of
+    * getDefaultTypeForTags() may be invalid.
+    * In this case some tags are not mapped to types.
+    *
+    * @param type The type to use as default mapping type
+    *
+    * @see getTypeForTag
+    * @see getDefaultTypeForTags
+    *
+    */
+   void setDefaultTypeForTags(Type type);
+
+   /**
+    * @return The default type to use for tags not directly mapped
+    *         by getTypeForTag() (may be invalid if none specified)
+    *
+    * @see setDefaultTypeForTags
+    */
+   Type getDefaultTypeForTags() const;
+
+   /**
+    * This allows to specify a feature where the name of the tag is stored
+    * for each annotation created by the parser.
+    * If this feature is invalid the parser will take no action.
+    *
+    * @param f The feature where the tag name will be stored.
+    *          f must be of type string.
+    *          Also f must be appropriate for all types mapped to
+    *          tag.
+    *
+    * @see getFeatureForTagName
+    */
+   void setFeatureForTagName(Feature f);
+
+   /**
+    * @return The feature where the name of the tag is stored for each
+    *         annotation created by the parser (may be invalid if none
+    *         specified).
+    *
+    * @see setFeatureForTagName
+    */
+   Feature getFeatureForTagName() const;
+
+   /**
+    * For a given tag name and attribute name this function returns an
+    * annotation type and a feature of that type to which the value
+    * of the attribute is to be mapped.
+    *
+    * The returned type and feature will be invalid if the attribute
+    * <code>attrName</code> at tag <code>tagName</code> is not to be
+    * mapped to the CAS.
+    *
+    * If the returned type and feature are valid the parser is supposed to
+    * look for the "last" annotation of the returned type and set the
+    * value of the returned feature at this annotation to the value of
+    * the attribute <code>attrName</code>.
+    *
+    * Note that the parser does not necessarily have to create an annotation
+    * of the returned type. This mapping can be used to set features of
+    * existing annotations: E.g. the attribute "name" of the "meta" tag
+    * in HTML could be mapped to the feature "DocumentName" of
+    * type "Document"
+    *
+    * The "last" occurrence of an annotation of the returned type is
+    * determined by starting with the annotation corresponding to the current
+    * tag (if there is one) and searching from there towards the beginning
+    * of the text. The annotation corresponding to the current tag is included
+    * in the search.
+    *
+    * For each occurrence of a tag the parser is supposed to first check
+    * the function <code>getTypeForTag()</code> and create a corresponding
+    * annotation if <code>getTypeForTag()</code> returns a valid type.
+    *
+    * Only after annotations are created for mapped tags the attributes are
+    * being mapped to features.
+    * This execution order guarantees that for tags that are used in
+    * <code>getTypeForTag()</code> and in <code>getFeatureForAttribute()</code>
+    * the "last" annotation will be the newly created one.
+    *
+    * The returned Type object must be subsumed by type Annotation.
+    *
+    * The returnedFeature must be of type string, integer or float.
+    * A conforming parser is supposed to convert the attribute value
+    * from its string representation to an appropriate value before
+    * setting the feature value.
+    *
+    * taph 04.12.2002: There is currently a limitation for returnedFeature
+    * to be of type string only. This will be removed in the future.
+    *
+    * @param tagName         The name of the tag to look up
+    * @param attrName        The name of the attribute of tag tagName to look up
+    * @param returnedType    Output param: the type corresponding to tag tagName
+    * @param returnedFeature Output param: the feature corresponding to attrName
+    */
+   void getFeatureForAttribute(
+     icu::UnicodeString const & tagName,
+     icu::UnicodeString const & attrName,
+     Type & returnedType,
+     Feature & returnedFeature
+   ) const;
+
+
+   /**
+    * The break properties that can be specified in getBreakPropertyForTag()
+    *
+    * Except for enNoBreak and enNumberOfBreakProperties the enumerator
+    * values are the Unicode code points of the replacement characters
+    * described at getBreakPropertyForTag().
+    *
+    * Note that enSentenceBreak and enParagraphBreak share the value 0x2029:
+    * the two enumerators compare equal and therefore cannot be told apart,
+    * e.g. as keys of a map keyed by EnBreakProperty.
+    *
+    * NOTE(review): enNumberOfBreakProperties is 6 although only five break
+    * properties (four distinct values) are declared - confirm the intended
+    * value before relying on it as a count.
+    */
+   enum EnBreakProperty {
+     enNoBreak = 0,
+     enWordBreak = 0x200B,
+     enSentenceBreak = 0x2029,
+     enLineBreak = 0x2028,
+     enParagraphBreak = 0x2029,
+     enNumberOfBreakProperties = 6
+   };
+   /**
+    * Returns the break property the parser is supposed to
+    * associate for each occurrence of a tag with name <code>tagName</code>.
+    *
+    * You can think of break properties as instructions on how to replace
+    * a tag with white space content during de-tagging an HTML/XML document.
+    * - tags with enNoBreak property will be replaced by the empty string
+    *   (e.g. bold <b>F</b>irst Letter becomes
+    *   First Letter)
+    * - tags with enWordBreak property will be replaced by a Unicode
+    *   U+200B ZERO WIDTH SPACE (e.g. <label>)
+    * - tags with enSentenceBreak property will be replaced by a Unicode
+    *   paragraph separator character U+2029 PARAGRAPH SEPARATOR ???
+    * - tags with enLineBreak property will be replaced by a Unicode
+    *   line separator character U+2028 LINE SEPARATOR
+    *   (e.g. <br> and <li>)
+    * - tags with enParagraphBreak property will be replaced by a Unicode
+    *   paragraph separator character U+2029 PARAGRAPH SEPARATOR
+    *   (e.g. headings like <h1>)
+    *
+    * Note that all break properties only apply to the end tag.
+    * The begin tag is always replaced by the empty string.
+    * For HTML tags that don't have an end tag (e.g. <br>), or where the
+    * end tag is optional (e.g. <li>) the parser should introduce the
+    * replacement character before the next opening tag that can
+    * conceptually close the tag (e.g. <li>) or the next end tag that
+    * closes the tag (e.g. </li>)
+    *
+    * If no explicit break property has been given
+    * getDefaultBreakProperty() is returned.
+    *
+    * @param tagName The name of the tag to look up
+    */
+   EnBreakProperty
+   getBreakPropertyForTag(icu::UnicodeString const & tagName) const;
+
+   /**
+    * Returns the default break property for tags where no explicit
+    * break property has been configured.
+    */
+   EnBreakProperty getDefaultBreakProperty() const;
+
+   /**
+    * Sets the default break property for tags where no explicit
+    * break property has been configured.
+    *
+    * @param enBreakProp  the break property to use as default
+    */
+   void setDefaultBreakProperty(EnBreakProperty enBreakProp);
+
+   /**
+    * Returns the (annotation) type corresponding to a break property.
+    * The return value may be invalid if no specific type is set.
+    * If a type is specified a conforming parser is supposed to
+    * create an annotation of that type for each tag with a given
+    * break property (in addition to inserting the corresponding break
+    * character)
+    *
+    * In general this is only useful for paragraphs.
+    * For all other break types (especially tokens and sentences)
+    * a parser can not and should not directly create the entity
+    * corresponding to the break (token, sentence) as annotation.
+    * The reason for this is that the parser only knows that
+    * such an annotation must begin at the position of the begin tag and
+    * end at the end tag but not how many entities (tokens, sentences)
+    * may be spanned by the tag (e.g. an h1 tag).
+    * In general the parser should not create a few instances of
+    * annotation types (like tokens or sentences) that are mainly created
+    * by annotators (like the tokenizer). Instead it should leave traces in
+    * the text that guide the downstream annotator.
+    *
+    * But paragraphs are an exception to this as they should be
+    * created by the parser and only by the parser. For HTML a tokenizer
+    * applying plain text paragraph finding heuristics (double newline) would
+    * produce incorrect results.
+    * So for HTML it would make sense to specify that tags like <p> and <h1>
+    * etc. are paragraph breaks and specify the appropriate paragraph
+    * type as the value of getTypeForBreakProperty()
+    * Note that you should not map <p> to a type if you do that otherwise
+    * two annotations would be created for each <p> tag.
+    * If this is done then to ensure consistency <em>only</em> the parser
+    * should create paragraphs and no annotator should try to do this.
+    *
+    * @param enBreakProp  the break property to look up
+    */
+   Type getTypeForBreakProperty(EnBreakProperty enBreakProp) const;
+ private:
+   // -----------------------------------------------------------------------
+   // data structures to store mapping information between XML and UIMA
+   //
+   // Standard library names are qualified with std:: so that this header
+   // does not depend on a using-directive leaking in from another header.
+
+   // map holding information about XML Elements which are mapped to UIMA
+   // types (new annotations are created for each occurrence of such an element)
+   typedef std::map< icu::UnicodeString, Type, std::less<icu::UnicodeString> >
+   TyXMLNameToTypeMap;
+
+   // map holding information about XML Attributes which are mapped to UIMA
+   // attributes (UIMA Attributes of existing annotations are set to the values
+   // of the XML Attributes)
+   // Since each attribute (XML or UIMA) occurs at a certain anchor
+   // (XML Element or UIMA Type) we need to store 4 pieces of information
+   // Since more than one attr can be mapped from the same element we need
+   // a nested map (see TyXMLNameToAttrMap below)
+   typedef struct StFeatureInfo_ {
+     Type type;       // 3: Type for Feature
+     Feature feature; // 4: Feature
+   }
+   StFeatureInfo;
+
+   // map from 2: the XML Attribute name to the type/feature pair
+   typedef std::map< icu::UnicodeString, StFeatureInfo, std::less<icu::UnicodeString> >
+   TyXMLAttrToFeatureMap;
+
+   // actual map from 1: the name of XML Element and
+   // 2: the XML Attribute name to the rest of the mapping info
+   typedef std::map< icu::UnicodeString, TyXMLAttrToFeatureMap, std::less<icu::UnicodeString> >
+   TyXMLNameToAttrMap;
+
+   // map holding information about XML Elements and their break properties
+   typedef std::map< icu::UnicodeString, EnBreakProperty, std::less<icu::UnicodeString> >
+   TyXMLNameToBreakMap;
+
+   // map holding information about break properties and the (annotation)
+   // types associated with them
+   typedef std::map< EnBreakProperty, Type, std::less<EnBreakProperty> >
+   TyBreakToTypeMap;
+
+   TyXMLNameToTypeMap iv_mapXMLNameToType;
+   TyXMLNameToAttrMap iv_mapXMLNameToAttr;
+   TyXMLNameToBreakMap iv_mapXMLNameToBreak;
+   TyBreakToTypeMap iv_mapBreakToType;
+
+   icu::UnicodeString iv_ustrDocumentDelimiterTag;
+   std::vector<icu::UnicodeString> iv_vecustrExcludedTags;
+
+   EnBreakProperty iv_enDefaultBreakProperty;
+
+   Type iv_defaultTypeForTags;
+   Feature iv_featureForTagName;
+
+ }
+ ; /* ParserConfiguration */
+
+} // namespace uima
+
+#endif //UIMA_PARSER_CONFIG_HPP
Propchange: incubator/uima/uimacpp/trunk/src/test/src/uima/parser_config.hpp
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/uima/uimacpp/trunk/src/test/src/uima/parser_interface.hpp
URL: http://svn.apache.org/viewvc/incubator/uima/uimacpp/trunk/src/test/src/uima/parser_interface.hpp?view=auto&rev=503266
==============================================================================
--- incubator/uima/uimacpp/trunk/src/test/src/uima/parser_interface.hpp (added)
+++ incubator/uima/uimacpp/trunk/src/test/src/uima/parser_interface.hpp Sat Feb 3 09:19:57 2007
@@ -0,0 +1,165 @@
+/** \file parser_interface.hpp .
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+
+
+ \brief Interface class for parsers.
+
+-------------------------------------------------------------------------- */
+#ifndef UIMA_PARSER_INTERFACE_HPP
+#define UIMA_PARSER_INTERFACE_HPP
+
+
+// ---------------------------------------------------------------------------
+// Includes
+// ---------------------------------------------------------------------------
+
+#include "uima/pragmas.hpp" //must be first to suppress warnings
+#include <iostream>
+
+namespace uima {
+
+ class ParserConfiguration;
+ class TextAnalysisEngine;
+ class CAS;
+
+ /**
+ * The class ParserInterface is used as an abstract base class for all
+ * document parsers.
+ * This is just a sketch of how this interface should be used.
+ * Things like (the facade?) creating instances of objects implementing
+ * this interface are not thought through yet.
+ * <pre>
+ * TAE engine = createTAE...
+ * ParserFacade parserFacade(engine);
+ * parserFacade.setMultiDocCallback(...); // optional for multi-doc formats
+ * // for each supported parser beyond the pre-defined ones
+ * parserFacade.registerParserForType(p, t, config);
+ * for each document {
+ * parserFacade.parseDocument(d,[config]); // pre-fills the CAS
+ * engine.process(...); // annotators fill the CAS
+ * ... read out results ...
+ * engine.reset(...) // flush CAS
+ * }
+ * engine.destroy();
+ * </pre>
+ */
+ class ParserInterface {
+ public:
+   /**
+    * Callback interface to make it possible for applications to get
+    * notified every time an embedded document is done, so that
+    * they can retrieve their results
+    *
+    * @see ParserInterface::setMultiDocCallback
+    */
+   class MultiDocCallbackInterface {
+   public:
+     /**
+      * Virtual destructor so that a callback implementation can be
+      * destroyed safely through a pointer to this interface.
+      */
+     virtual ~MultiDocCallbackInterface() {}
+
+     /**
+      * Called <em>after</em> a multi-doc parser detects the end
+      * of document.
+      *
+      * An application should call Engine::processDocument() there
+      * and retrieve the results of document processing after that
+      * using iterators over the CAS or TCAS.
+      * Finally an application should call resetDocument().
+      *
+      * An application should <em>not</em> call addDocPartsFinish()
+      * in this function since a conforming
+      * parser is supposed to do any doc part processing.
+      *
+      * @param cpBuffer  buffer handed over by the parser
+      *                  (NOTE(review): presumably the text of the
+      *                  document that just ended - confirm)
+      * @param uiLength  length belonging to cpBuffer
+      *                  (NOTE(review): presumably counted in UChar
+      *                  units - confirm)
+      */
+     virtual
+     void documentBoundaryReached(UChar const * cpBuffer, size_t uiLength) = 0;
+   };
+
+   /**
+    * Virtual destructor so that a parser can be destroyed safely
+    * through a pointer to this interface.
+    */
+   virtual ~ParserInterface() {}
+
+   /**
+    * Initialize the parser.
+    * The parser is being passed an engine object and not a CAS because
+    * for (XML) files that contain multiple documents the parser must be
+    * able to call the process functions for each embedded document.
+    *
+    * An implementation needs to store the argument objects for later
+    * use in function parse().
+    *
+    * Called once per session.
+    *
+    * @param config    The configuration for the parser
+    * @param engine    The engine object into which the results go
+    * @param fallback  An encoding to use in
+    *                  case the parser can't determine the encoding
+    *                  by other means
+    *
+    * @return UIMA_ERR_NONE if OK, error code otherwise
+    */
+   virtual
+   TyErrorId init(ParserConfiguration const & config, TextAnalysisEngine & engine, const char * fallback = "Latin1" ) = 0;
+   /**
+    * @see MultiDocCallbackInterface
+    */
+   virtual
+   void setMultiDocCallback(MultiDocCallbackInterface & callbackObject) = 0;
+
+   /**
+    * Do the parsing, add the tag free text to the CAS and potentially
+    * translate tag information to CAS annotations.
+    *
+    * Called once per document.
+    *
+    * NOTE(review): the file name overload takes a pointer to non-const
+    * char; it is left unchanged here because implementations override
+    * exactly this signature.
+    *
+    * @param inputFileName The input to process
+    * @return UIMA_ERR_NONE if OK, error code otherwise
+    */
+   virtual
+   TyErrorId parseDocument(char* const inputFileName) = 0;
+   /**
+    * Same as above, but reads the input from a stream.
+    *
+    * @param inputFileStream The input to process
+    * @return UIMA_ERR_NONE if OK, error code otherwise
+    */
+   virtual
+   TyErrorId parseDocument(std::istream & inputFileStream) = 0;
+
+   /**
+    * Returns the number of documents parsed by the parser.
+    * This will always be 1 for HTML but can be more for XML.
+    *
+    * Optionally called once per document.
+    */
+   virtual
+   size_t getNumberOfDocumentsParsed() const = 0;
+
+   /**
+    * Returns the number of bytes parsed by the parser.
+    * Information function to allow throughput computation by calling
+    * environment.
+    *
+    * Optionally called once per document.
+    */
+   virtual
+   size_t getNumberOfBytesParsed() const = 0;
+
+   /**
+    * De-initialize the parser (free resources etc.)
+    *
+    * Called once per session.
+    *
+    * @return UIMA_ERR_NONE if OK, error code otherwise
+    */
+   virtual
+   TyErrorId deInit() = 0;
+ }
+ ; /* ParserInterface */
+
+} // namespace uima
+
+#endif //UIMA_PARSER_INTERFACE_HPP
+
Propchange: incubator/uima/uimacpp/trunk/src/test/src/uima/parser_interface.hpp
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/uima/uimacpp/trunk/src/test/src/uima/ss_tokenizer.hpp
URL: http://svn.apache.org/viewvc/incubator/uima/uimacpp/trunk/src/test/src/uima/ss_tokenizer.hpp?view=auto&rev=503266
==============================================================================
--- incubator/uima/uimacpp/trunk/src/test/src/uima/ss_tokenizer.hpp (added)
+++ incubator/uima/uimacpp/trunk/src/test/src/uima/ss_tokenizer.hpp Sat Feb 3 09:19:57 2007
@@ -0,0 +1,118 @@
+/** \file ss_tokenizer.hpp .
+-----------------------------------------------------------------------------
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+
+-------------------------------------------------------------------------- */
+
+#ifndef _INCLUDE_UIMASS
+#define _INCLUDE_UIMASS
+
+#include "uima/language.hpp"
+#include "uima/token_properties.hpp"
+
+namespace uima {
+
+ class ResourceABR;
+
+
+
+ // Highest first-dimension index of TyCharmap (the table has MAXWARD+1 rows).
+ static const int MAXWARD = 6;
+
+ // Character map: (MAXWARD+1) rows of 256 unsigned short entries each.
+ // NOTE(review): presumably each row covers one block ("ward") of 256 code
+ // points and each entry holds EnCharClass bits - confirm against the
+ // tokenizer implementation.
+ typedef unsigned short TyCharmap [MAXWARD+1][256];
+
+ /**character types used in our char map*/
+ /**
+  * Character classes used in our char map.
+  * Every class except CH_INVALID occupies its own bit.
+  */
+ typedef enum {
+   CH_INVALID = 0,
+   CH_LWR = 1 << 0,   // lowercase characters
+   CH_UPR = 1 << 1,   // uppercase characters
+   CH_NUM = 1 << 2,   // number or currency symbol
+   CH_USC = 1 << 3,   // underscore: like a character, no upper/lower information
+   CH_PRD = 1 << 4,   // period (full stop)
+   CH_SND = 1 << 5,   // sentence end: '?' and '!'
+   CH_BLK = 1 << 6,   // blank
+   CH_NWL = 1 << 7,   // newline
+   CH_SPC = 1 << 8,   // special character (or whitespace)
+   CH_CWS = 1 << 9,   // conditional whitespace: part of the word when it
+                      // sits between two alphanumeric characters
+                      // (e.g. / @ -), otherwise treated as whitespace
+   CH_NSP = 1 << 10,  // number separator ':' and ',': part of the number
+                      // if between digits
+   CH_APS = 1 << 11,  // apostrophe
+   CH_NPA = 1 << 12,  // new paragraph
+   CH_CUR = 1 << 13   // currency and degree symbol: part of the number if
+                      // directly after or before a digit
+ }
+ EnCharClass;
+
+ // Evaluates to true for every value numerically below CH_PRD, i.e. for
+ // CH_LWR, CH_UPR, CH_NUM, CH_USC (and combinations of them) as well as
+ // for CH_INVALID.
+#define CHAR_CLASS_IS_TOKEN_PART(x) ((x) < CH_PRD)
+
+ /** @name Tokenizer
+ The class <TT>Tokenizer</TT> is the implementation of a universal Unicode
+ Tokenizer which is used in the UIMA tokenizer annotator.
+ @see AnnotatorTokenizer
+ */
+ class Tokenizer {
+ public:
+   /// Default constructor.
+   Tokenizer();
+   virtual ~Tokenizer();
+
+   /// Main tokenization function for the text delimited by
+   /// cpszStart and cpszEnd.
+   void process(const UChar * cpszStart, const UChar * cpszEnd);
+
+   /// Specify language to use (needed for stopword recognition only).
+   void setLanguage(const Language & crclLanguage);
+
+   /// Callback function triggered on token recognition; has to be
+   /// implemented by every concrete tokenizer.
+   /// NOTE(review): the meaning of the int result is not visible in this
+   /// header - confirm against the implementation.
+   virtual int tokenCallback(unsigned long ulLocation, unsigned long ulLength,
+                             TokenProperties & crclTokenProperties,
+                             bool bNewPara, bool bNewSent) = 0;
+
+   /// Get the character class for a character.
+   EnCharClass getCharClass(UChar c);
+
+   /// Change the character class for a code point.
+   void setCharClass(WORD16 uiUnicodeCodePoint, EnCharClass enCharClass);
+
+   /// Reset the char class table to its initial values.
+   void resetCharClasses();
+
+ protected:
+   int tokenEntry(const UChar *, size_t ulLocation, size_t ulLength,
+                  TokenProperties & crclTokenProperties,
+                  bool & bNewPara, bool & bNewSent,
+                  size_t & rulNewlines);
+
+ private:
+   // (the spelling "isAbreviation" is kept on purpose: the definition
+   // lives outside this header)
+   bool isAbreviation(const UChar * pw16String, size_t uiLength) const;
+   // NOTE(review): presumably the inline counterpart of getCharClass()
+   // - confirm against the implementation.
+   EnCharClass getCharClassInl(UChar c);
+
+   bool iv_bUseAlternateTerritories;
+   Language iv_clLanguageABR;
+   ResourceABR * iv_pclResourceABR;
+   // Points either to our constant static map or, once setCharClass has
+   // been called, to a freshly allocated writable map.
+   // NOTE(review): no copy constructor/assignment operator is declared
+   // here, so a copy would share this pointer - confirm that tokenizers
+   // are never copied.
+   TyCharmap * iv_pauiCharMapWard;
+
+ };
+
+} // namespace uima
+
+#endif /* _INCLUDE_UIMASS */
Propchange: incubator/uima/uimacpp/trunk/src/test/src/uima/ss_tokenizer.hpp
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/uima/uimacpp/trunk/src/test/src/uima/strptrlenpair.hpp
URL: http://svn.apache.org/viewvc/incubator/uima/uimacpp/trunk/src/test/src/uima/strptrlenpair.hpp?view=auto&rev=503266
==============================================================================
--- incubator/uima/uimacpp/trunk/src/test/src/uima/strptrlenpair.hpp (added)
+++ incubator/uima/uimacpp/trunk/src/test/src/uima/strptrlenpair.hpp Sat Feb 3 09:19:57 2007
@@ -0,0 +1,532 @@
+#ifndef UIMA_STRPTRLENPAIR_HPP
+#define UIMA_STRPTRLENPAIR_HPP
+/** \file strptrlenpair.hpp .
+-----------------------------------------------------------------------------
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+
+ \brief Shallow string object consisting of a pair of
+ string pointer and a length
+
+-----------------------------------------------------------------------------
+*/
+
+#include "uima/pragmas.hpp" //must be included first to disable warnings
+
+#include <vector>
+#include <utility>
+#include <string>
+#include <iostream>
+
+#include "uima/assertmsg.h"
+//#include "uima/ccsid.hpp"
+//#include "uima/u2cpcnvrt.hpp"
+#include "unicode/uchar.h"
+
+/* ----------------------------------------------------------------------- */
+/* Interface dependencies */
+/* ----------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------- */
+/* Types / Classes */
+/* ----------------------------------------------------------------------- */
+
+namespace uima {
+
+ /**
+    The class <TT>BasicStrPtrLenPair</TT> provides support for non zero-terminated strings
+    that are presented as pointers to string arrays with an associated length.
+    The object is shallow: it stores only the pointer and the length and never
+    copies or owns the characters it refers to.
+    As this type of string is used only as string reference into read-only buffers,
+    the string pointer is constant.
+    The member functions are named as in an ANSI basic_string.
+    This enables a limited use of these strings in template functions that
+    are designed for basic_strings (the hash functions will work, for example).
+    Note: This is why previous function <TT>set()</TT> has been renamed
+    <TT>assign()</TT>
+
+    Implementation note: <TT>first</TT> and <TT>second</TT> live in the dependent
+    base class <TT>std::pair</TT>, so they are always accessed through
+    <TT>this-></TT>; unqualified use is not found by two-phase name lookup.
+  */
+ template < class CharT >
+ class BasicStrPtrLenPair : public std::pair< CharT const *, size_t > {
+ public:
+   ///(Default) Constructor: NULL pointer and length 0
+   BasicStrPtrLenPair( void ) :
+       std::pair< CharT const * , size_t >(NULL, 0) {}
+
+   ///Constructor from zero terminated string (length is taken from <TT>strlen_templ()</TT>)
+   BasicStrPtrLenPair(
+     const CharT * cpacString
+   ) :
+       std::pair< CharT const * , size_t >(cpacString, strlen_templ(cpacString)) {
+     assert( (EXISTS(this->first) )
+             || ((this->first == NULL ) && (this->second == 0)) );
+   }
+
+   ///Constructor from string and length
+   BasicStrPtrLenPair(
+     const CharT * cpacString,
+     size_t uiLength
+   ) :
+       std::pair< CharT const * , size_t >(cpacString, uiLength) {
+     assert( (EXISTS(this->first) )
+             || ((this->first == NULL ) && (this->second == 0)) );
+   }
+
+   /// Constructor from two pointers (begin/end). Note: end points to the first char <em>behind</em> the string.
+   BasicStrPtrLenPair(
+     const CharT * paucStringBegin,
+     const CharT * paucStringEnd
+   ) :
+       std::pair< CharT const * , size_t >(paucStringBegin, paucStringEnd - paucStringBegin ) //lint !e613: Possible use of null pointer 'paucStringEnd' in left argument to operator 'ptr-ptr'
+   {
+     assert(EXISTS(paucStringBegin));
+     assert(EXISTS(paucStringEnd));
+     assert(paucStringEnd >= paucStringBegin);
+     assert( (EXISTS(this->first) )
+             || ((this->first == NULL ) && (this->second == 0)) );
+   }
+
+   ///Constructor from basic_string<CharT> (refers to the string's buffer, does not copy it)
+   BasicStrPtrLenPair(
+     const std::basic_string< CharT > & crclBasicString
+   ) :
+       std::pair< CharT const * , size_t >(crclBasicString.data(), crclBasicString.length()) {
+     assert( (EXISTS(this->first) )
+             || ((this->first == NULL ) && (this->second == 0)) );
+   }
+
+   ///Constructor from pair
+   BasicStrPtrLenPair(
+     std::pair< CharT const * , size_t > const & crclPair
+   ) : //lint !e1724: Argument to copy constructor for class 'uima::BasicStrPtrLenPair<<1>>' should be a const reference
+       std::pair< CharT const * , size_t >(crclPair.first, crclPair.second) {
+     assert( (EXISTS(this->first) )
+             || ((this->first == NULL ) && (this->second == 0)) );
+   }
+
+   ///Accessor for the string length in logical characters
+   size_t
+   length( void ) const {
+     return this->second;
+   }
+
+   ///Accessor for the string length in bytes
+   size_t
+   getSizeInBytes( void ) const {
+     return (this->second * sizeof(CharT));
+   }
+
+   ///CONST Accessor for the string content (NOT ZERO DELIMITED!).
+   const CharT *
+   data( void ) const {
+     return this->first;
+   }
+
+   ///CONST Accessor to the begin of string content (NOT ZERO DELIMITED!).
+   const CharT *
+   begin( void ) const {
+     return (this->first);
+   }
+
+   ///Accessor to position AFTER the end of string content (NULL if the string pointer is NULL).
+   const CharT *
+   end( void ) const {
+     return (this->first == NULL ? NULL : this->first + this->second);
+   }
+
+   /**
+      Finds the first occurrence of key character <TT>cPattern</TT> in the string.
+      The result is whatever <TT>str_find_first()</TT> reports for the whole string.
+    */
+   size_t
+   find( CharT cPattern ) const {
+     return str_find_first( cPattern, this->first, this->second);
+   } //lint !e1746: parameter 'cPattern' in function 'BasicStrPtrLenPair<UChar>::find(UChar) const' could be made const reference
+
+   /**
+      Finds the first occurrence of key string <TT>crlstrPattern</TT> in the string.
+    */
+   size_t
+   find(
+     const BasicStrPtrLenPair< CharT > & crlstrPattern  // pattern to search for
+   ) const {
+     return str_find_first( crlstrPattern.first, crlstrPattern.second,
+                            this->first, this->second);
+   }
+
+   /**
+      Finds the first occurrence of key string <TT>cpacPattern</TT> (with length
+      <TT>uiPatternLen</TT>) in the string
+    */
+   size_t
+   find(
+     const CharT * cpacPattern,  // pattern to search for
+     size_t uiPatternLen         // length of pattern
+   ) const {
+     return str_find_first( cpacPattern, uiPatternLen, this->first, this->second);
+   }
+
+   /**
+      Finds the first occurrence of key string <TT>cpacPattern</TT> (with length
+      <TT>uiPatternLen</TT>), searching from <TT>uiStartPos</TT> to the end of
+      the string. Returns <TT>STRING_NPOS</TT> if <TT>uiStartPos</TT> is at or
+      past the end of the string.
+      Note: the fourth parameter (<TT>uiStartLength</TT>) is accepted for
+      interface compatibility but is not evaluated.
+      NOTE(review): the result is passed through from <TT>str_find_first()</TT>
+      for the shortened range, so it is presumably relative to
+      <TT>uiStartPos</TT> -- confirm against str_find_first.
+    */
+   size_t
+   find(
+     const CharT * cpacPattern,  // pattern to search for
+     size_t uiPatternLen,        // length of pattern
+     size_t uiStartPos,          // from this pos
+     size_t /*uiStartLength*/    // up to uiStartPos+uiStartLength
+   ) const {
+     if ( uiStartPos >= this->second ) {
+       return STRING_NPOS; // If search starts past end of str, indicate "not found".
+     }
+
+     assert(EXISTS(cpacPattern));
+     assert(EXISTS(this->first));
+     // Only (second - uiStartPos) characters remain behind the start position;
+     // searching the full length from there would read past the end of the string.
+     return str_find_first( cpacPattern, uiPatternLen,
+                            (this->first + uiStartPos), this->second - uiStartPos);
+   }
+
+   /** Return a sub-string of this string starting from position <TT>uiStartPos</TT>
+       and including the following <TT>uiLength</TT> characters.
+       The sub-string may extend up to (and including) the last character of
+       this string. The result refers to the same buffer as this string.
+    */
+   BasicStrPtrLenPair< CharT >
+   sub_str(
+     size_t uiStartPos,
+     size_t uiLength
+   ) const {
+     assert(uiStartPos <= this->second);
+     // written as a subtraction so that the check cannot overflow
+     assert(uiLength <= this->second - uiStartPos);
+     assert(EXISTS(this->first));
+     return BasicStrPtrLenPair< CharT >(this->first + uiStartPos, uiLength);
+   }
+
+   ///Set the string to new value. (used to be named <TT>set()</TT>)
+   BasicStrPtrLenPair< CharT > &
+   assign(
+     const CharT * cpacString,
+     size_t uiLength
+   ) {
+     this->first = cpacString;
+     this->second = uiLength;
+     assert( (EXISTS(this->first) )
+             || ((this->first == NULL ) && (this->second == 0)) );
+
+     return (*this);
+   }
+
+   ///Set the string to new value. (used to be named <TT>set()</TT>)
+   BasicStrPtrLenPair< CharT > &
+   assign(
+     const std::basic_string< CharT > & crclBasicString
+   ) {
+     this->first = crclBasicString.data();
+     this->second = crclBasicString.length();
+     assert( (EXISTS(this->first) )
+             || ((this->first == NULL ) && (this->second == 0)) );
+
+     return (*this);
+   }
+
+   ///Set the string from a pointer/length pair
+   BasicStrPtrLenPair< CharT > &
+   assign( const std::pair< CharT const *, size_t > & crclPair ) {
+     this->first = crclPair.first;
+     this->second = crclPair.second;
+     return (*this);
+   }
+
+   /** Accessor for the string content (CharT dependent string return type).
+       Returns a copy of the referenced characters.
+    */
+   std::basic_string< CharT >
+   copyToBasicString(
+     void
+   ) const {
+     return std::basic_string< CharT >(this->first, this->second);
+   }
+
+#ifdef NEVER
+   /// convert to single byte string. crclCCSID specifies the target encoding)
+   std::string
+   prv_asSingleByteString(
+     const uima::CCSID & crclCCSID
+   ) const {
+     if (sizeof(CharT) == 1) { //lint !e774: Boolean within 'if' always evaluates to True
+       // single byte lstrings
+       return string((char*)data(), length());
+     }
+     if (length() == 0) {
+       return string();
+     }
+     assert(sizeof(CharT) == 2); // unicode lstrings
+     assert(EXISTS(data())); //lint !e527 !e666: Expression with side effects passed to repeated parameter 1 in macro EXISTS
+     // Small values are copied in a stack based buffer, larger are allocated
+     // Max string length to handle stack based
+     const size_t STACK_BUFF_LIMIT = 64;
+     // Our stack buffer
+     char acStackBuff [STACK_BUFF_LIMIT];
+     // A pointer to either the stack buffer of dynamic storage
+     char * pcCharBuff;
+
+     Unicode2CodePageConverter clConverter(crclCCSID);
+     size_t uiMaxNewLength = clConverter.getMaximumSizeForLength(length());
+
+     if (uiMaxNewLength < STACK_BUFF_LIMIT) {
+       pcCharBuff = acStackBuff; //use stack buffer
+     } else {
+       pcCharBuff = new char [uiMaxNewLength]; //allocate
+     }
+     // Now convert UChar into char array
+     size_t uiCharsWritten = clConverter.convertCharacters(pcCharBuff, uiMaxNewLength, (const UChar*)data(), length());
+
+     // Construct our string
+     string strRetVal(pcCharBuff, uiCharsWritten);
+
+
+     if (uiMaxNewLength >= STACK_BUFF_LIMIT) { // if allocated ...
+       delete [] pcCharBuff; //lint !e673 Possibly inappropriate deallocation (delete[]) for 'auto' data
+     }
+     return strRetVal;
+   }
+#endif
+
+   ///CONST Array Index Access operator (index must be less than length())
+   const CharT &
+   operator[]( size_t uiIndex ) const {
+     assert(uiIndex < this->second);
+     assert(EXISTS(this->first));
+     return this->first[uiIndex]; //lint !e613: Possible use of null pointer 'BasicStrPtrLenPair<wchar_t>::first' in left argument to operator '['
+   }
+
+
+   ///Equality operator: same length and same characters (compared with <TT>strncmp_templ()</TT>)
+   int
+   operator==( const BasicStrPtrLenPair< CharT > & crclRHS ) const {
+     if (this->second != crclRHS.second) {
+       return false;
+     }
+     return strncmp_templ(this->first, crclRHS.first, this->second) == 0;
+   }
+
+   ///Assignment operator
+   BasicStrPtrLenPair< CharT > &
+   operator=( BasicStrPtrLenPair< CharT > const & crclRHS ) {
+     this->first = crclRHS.first;
+     this->second = crclRHS.second;
+     return (*this);
+   }
+
+   ///Assignment operator
+   BasicStrPtrLenPair< CharT > &
+   operator=( std::pair< CharT const *, size_t > const & crclPair ) { //lint !e1520 !e1720 :multiple assignment ops assignment operator for class 'uima::BasicStrPtrLenPair<<1>>' has non-const parameter
+     this->first = crclPair.first;
+     this->second = crclPair.second;
+     return (*this);
+   }
+
+   /**
+      less operator: character-wise lexicographic comparison.
+      A string that is a proper prefix of the other one is the smaller one;
+      nothing is smaller than an empty string.
+    */
+   bool operator <( BasicStrPtrLenPair< CharT > const & crclRHS ) const {
+     size_t uiLen1 = length();
+     size_t uiLen2 = crclRHS.length();
+     if (uiLen2 == 0) {
+       return(false);
+     }
+     if (uiLen1 == 0) {
+       return(true);
+     }
+     const CharT * cpszString1 = data();
+     const CharT * cpszString2 = crclRHS.data();
+     // skip the common prefix
+     while (uiLen1 != 0 && uiLen2 != 0 && *cpszString1 == *cpszString2) {
+       ++cpszString1;
+       ++cpszString2;
+       --uiLen1;
+       --uiLen2;
+     }
+     if (uiLen2 == 0) {
+       return(false);  // right side exhausted: equal, or right side is a prefix of this string
+     }
+     if (uiLen1 == 0) {
+       return(true);   // this string is a proper prefix of the right side
+     }
+     return (*cpszString1 < *cpszString2);
+   }
+
+ };
+
+///Pointer/length string (BasicStrPtrLenPair) over single byte characters (char).
+ typedef BasicStrPtrLenPair< char > StrPtrLenPair;
+
+///Pointer/length string (BasicStrPtrLenPair) over wide characters (wchar_t).
+ typedef BasicStrPtrLenPair< wchar_t > WStrPtrLenPair;
+
+///Pointer/length string (BasicStrPtrLenPair) over UChar characters (type from unicode/uchar.h).
+ typedef BasicStrPtrLenPair< UChar > UStrPtrLenPair;
+
+#if defined(UNDECLARED_FUNCTION_TEMPLATES_LINK_BUG)
+// To work around "unsatisfied symbols" during linking, we need a declaration
+// in addition to the definition below (that definition is currently inside an #ifdef NEVER section).
+ template < class CharT >
+ std::ostream &
+ operator << (
+ std::ostream & rclOStream,
+ const BasicStrPtrLenPair< CharT > & crclLString
+ );
+#endif
+
+#ifdef NEVER
+///Output stream support for BasicStrPtrLenPair: writes the string converted to a single byte string (disabled code, inside #ifdef NEVER).
+ template < class CharT >
+ inline std::ostream &
+ operator << (
+ std::ostream & rclOStream,
+ const BasicStrPtrLenPair< CharT > & crclLString
+ ) {
+ if (rclOStream == cout || rclOStream == cerr) { //lint !e1912: Implicit call of conversion function from class 'basic_ostream' to type 'void *'
+ rclOStream << crclLString.prv_asSingleByteString(CosClCCSID::getConsoleCCSID()).c_str(); // console streams: convert to the console CCSID
+ } else {
+ rclOStream << crclLString.prv_asSingleByteString(CosClCCSID::CosEnCCSID_UTF8).c_str(); // any other stream: convert to UTF-8
+ }
+ return rclOStream;
+ }
+
+
+///Output stream support for (UChar pointer, length) pairs: wraps the pair in a BasicStrPtrLenPair< UChar > and writes it as a single byte string (disabled code, inside #ifdef NEVER).
+ inline std::ostream &
+ operator << (
+ std::ostream & rclOStream,
+ const std::pair< UChar const *, size_t > & crclPair
+ ) {
+ BasicStrPtrLenPair< UChar > const lString(crclPair);
+ if (rclOStream == std::cout || rclOStream == std::cerr) { //lint !e1912: Implicit call of conversion function from class 'basic_ostream' to type 'void *'
+ rclOStream << lString.prv_asSingleByteString(CosClCCSID::getConsoleCCSID()).c_str(); // console streams: convert to the console CCSID
+ } else {
+ rclOStream << lString.prv_asSingleByteString(CosClCCSID::CosEnCCSID_UTF8).c_str(); // any other stream: convert to UTF-8
+ }
+ return rclOStream;
+ }
+
+#endif
+
+
+ /* ----------------------------------------------------------------------- */
+ /** @name vector to/from delimited string conversion routines */
+ /* ----------------------------------------------------------------------- */
+ /*@{*/
+
+ /**
+    Removes whitespace from both ends of a string.
+    Template function using <TT>isspace_templ()</TT>.
+
+    The result is a shallow string that refers to the same buffer as
+    <TT>s</TT>; no characters are copied. An empty input is returned
+    unchanged; a string consisting of whitespace only yields a string of
+    length 0 that points to the begin of <TT>s</TT>.
+
+    @param s  The string to trim.
+    @return   The trimmed (shallow) string.
+  */
+ template < class CharT >
+ inline BasicStrPtrLenPair< CharT >
+ strtrim(
+   const BasicStrPtrLenPair< CharT > & s
+ ) {
+   if (s.length() == 0) {
+     return s;
+   }
+   // Work on the half-open range [pcFirst, pcPastLast) so that no pointer
+   // in front of the buffer is ever formed, even for whitespace-only input.
+   const CharT * pcFirst = s.data();
+   const CharT * pcPastLast = s.data() + s.length();
+   // strip trailing whitespace
+   while (pcPastLast > pcFirst && isspace_templ(*(pcPastLast - 1)) ) {
+     --pcPastLast;
+   }
+   // strip leading whitespace
+   while (pcFirst < pcPastLast && isspace_templ(*pcFirst) ) {
+     ++pcFirst;
+   }
+   return BasicStrPtrLenPair< CharT >(pcFirst, static_cast< size_t >(pcPastLast - pcFirst));
+ }
+
+ /**
+    Splits a delimited string into pieces and stores the results in a vector
+    of strings. Delimiters are passed as a zero terminated string.
+    The pieces are shallow strings that refer into <TT>pcInput</TT>; no
+    characters are copied.
+
+    Note: a delimiter at the very end of the input does not produce a
+    trailing empty piece, even if <TT>bInsertEmptyStrings</TT> is true.
+
+    @param rveclstrOutput       (Output) The vector where the results are appended
+    @param pcInput              The delimited string to split.
+    @param uiInputLength        The number of chars in pcInput
+    @param cpszDelimiters       The delimiters. CharT* are interpreted as a set of delimiters.
+    @param bTrimString          Flag: If true, all pieces will be trimmed before storing in <TT>rveclstrOutput</TT>
+    @param bInsertEmptyStrings  Flag: If false, pieces that have length 0 will not be stored in <TT>rveclstrOutput</TT>
+
+    @return The number of strings added to <TT>rveclstrOutput</TT>
+  */
+ template < class CharT >
+ inline size_t
+ delimitedStrPtrLenPair2Vector(
+   std::vector< uima::BasicStrPtrLenPair< CharT > > & rveclstrOutput,
+   const CharT * pcInput,
+   size_t uiInputLength,
+   const CharT * cpszDelimiters,
+   bool bTrimString,
+   bool bInsertEmptyStrings
+ ) {
+   size_t uiNumFound = 0;
+   size_t uiDelimitersLen = strlen_templ(cpszDelimiters);
+
+   if (uiInputLength == 0) {
+     return 0;
+   }
+   const CharT * pcBegin = pcInput;
+   const CharT * const pcInputEnd = pcInput + uiInputLength;
+   BasicStrPtrLenPair< CharT > clPiece;
+
+   while (pcBegin < pcInputEnd) {
+     // offset of the next delimiter, counted from pcBegin
+     const size_t uiDelimPos = str_find_first_of(cpszDelimiters, uiDelimitersLen, pcBegin, (size_t)(pcInputEnd-pcBegin));
+     const CharT * pcPieceEnd;  // first char behind the current piece
+     const CharT * pcNext;      // where the search for the next piece starts
+     if (uiDelimPos == STRING_NPOS) {
+       // no further delimiter: the rest of the input is the last piece
+       pcPieceEnd = pcInputEnd;
+       pcNext = pcInputEnd;
+     } else {
+       pcPieceEnd = pcBegin + uiDelimPos;
+       pcNext = pcPieceEnd + 1;  // continue behind the delimiter
+     }
+     assert(pcPieceEnd >= pcBegin);
+     clPiece.assign(pcBegin, (size_t)(pcPieceEnd - pcBegin));
+     if (bTrimString) {
+       clPiece = strtrim(clPiece);
+     }
+     if (bInsertEmptyStrings || clPiece.length() > 0) {
+       rveclstrOutput.push_back(clPiece);
+       uiNumFound++;
+     }
+     pcBegin = pcNext;
+   }
+   return uiNumFound;
+ }
+
+ /**
+    Overload for zero terminated input: determines the length of
+    <TT>pcInput</TT> with <TT>strlen_templ()</TT> and forwards all arguments to
+    the overload of <TT>delimitedStrPtrLenPair2Vector()</TT> that takes an
+    explicit input length.
+
+    @return The value returned by the forwarded call
+  */
+ template < class CharT >
+ inline size_t
+ delimitedStrPtrLenPair2Vector(
+   std::vector< BasicStrPtrLenPair< CharT > > & veclstrOutput,
+   const CharT * pcInput,
+   const CharT * cpszDelimiters,
+   bool bTrimString,
+   bool bInsertEmptyStrings
+ ) {
+   const size_t uiInputLength = strlen_templ(pcInput);
+   return delimitedStrPtrLenPair2Vector(veclstrOutput,
+                                        pcInput,
+                                        uiInputLength,
+                                        cpszDelimiters,
+                                        bTrimString,
+                                        bInsertEmptyStrings);
+ }
+
+//@}
+
+} // namespace uima
+
+#endif /* UIMA_STRPTRLENPAIR_HPP */
+
+/* <EOF> */
+
Propchange: incubator/uima/uimacpp/trunk/src/test/src/uima/strptrlenpair.hpp
------------------------------------------------------------------------------
svn:eol-style = native