You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by ea...@apache.org on 2007/02/03 18:19:59 UTC
svn commit: r503266 [1/2] - /incubator/uima/uimacpp/trunk/src/test/src/uima/

Author: eae
Date: Sat Feb  3 09:19:57 2007
New Revision: 503266

URL: http://svn.apache.org/viewvc?view=rev&rev=503266
Log:
Initial entry

Added:
    incubator/uima/uimacpp/trunk/src/test/src/uima/
    incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_dump.hpp   (with props)
    incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_tok.hpp   (with props)
    incubator/uima/uimacpp/trunk/src/test/src/uima/conui.hpp   (with props)
    incubator/uima/uimacpp/trunk/src/test/src/uima/doc_buffer.hpp   (with props)
    incubator/uima/uimacpp/trunk/src/test/src/uima/parse_handlers.hpp   (with props)
    incubator/uima/uimacpp/trunk/src/test/src/uima/parser_config.hpp   (with props)
    incubator/uima/uimacpp/trunk/src/test/src/uima/parser_interface.hpp   (with props)
    incubator/uima/uimacpp/trunk/src/test/src/uima/ss_tokenizer.hpp   (with props)
    incubator/uima/uimacpp/trunk/src/test/src/uima/strptrlenpair.hpp   (with props)
    incubator/uima/uimacpp/trunk/src/test/src/uima/tt_types.hpp   (with props)
    incubator/uima/uimacpp/trunk/src/test/src/uima/xmlparse_handlers.hpp   (with props)

Added: incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_dump.hpp
URL: http://svn.apache.org/viewvc/incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_dump.hpp?view=auto&rev=503266
==============================================================================
--- incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_dump.hpp (added)
+++ incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_dump.hpp Sat Feb  3 09:19:57 2007
@@ -0,0 +1,159 @@
+#ifndef UIMA_ANNOTATOR_DUMP_H$
+#define UIMA_ANNOTATOR_DUMP_H$
+/** \file annotator_dump.hpp .
+-----------------------------------------------------------------------------
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+
+-------------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- */
+/*       Include dependencies                                              */
+/* ----------------------------------------------------------------------- */
+
+#include "uima/api.hpp"                               /* UIMA API */
+
+#include <fstream>
+#include <vector>
+#include <deque>
+
+
+#include "uima/filename.hpp"
+
+using namespace uima;
+
+/* ----------------------------------------------------------------------- */
+/*       Forward declarations                                              */
+/* ----------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- */
+/*       Types / Classes                                                   */
+/* ----------------------------------------------------------------------- */
+
+
+/** @name AnnotatorDump
+   The class <TT>AnnotatorDump</TT> is a TextAnnotator that, judging by its members, writes a dump to an output file in Xml or XCas style, optionally including the document buffer.
+   Example:
+   \code
+   \endcode
+   @see TextAnnotator
+*/
+class AnnotatorDump : public TextAnnotator {
+public:
+  /** @name Constructors */
+  /*@{*/
+  /** Default Constructor.
+  */
+  AnnotatorDump();
+  /*@}*/
+
+  ~AnnotatorDump(void);  //lint !e1908 !e1509: base class destructor for class 'AnnotatorABase' is not virtual : 'virtual' assumed for ~AnnotatorDump() (inherited from base class AnnotatorABase)
+
+  /** @name Annotator Processing Functions */
+  /*@{*/
+  /** Initialize the annotator from the given AnnotatorContext
+      and return a UIMA error code */
+  TyErrorId
+  initialize(
+    AnnotatorContext & rclAnnotatorContext
+  );  //lint !e1909: 'virtual' assumed, see: AnnotatorABase::init(Engine &, ConfigAnnotator &) (line 79, file g:\projects\UIMAcurrent\code\engine\include\annotator_abase.hpp)
+
+  TyErrorId typeSystemInit(uima::TypeSystem const &);  // type-system initialization; receives the TypeSystem and returns a UIMA error code
+
+  /** Deinitialize the annotator (takes no arguments)
+      and return a UIMA error code */
+  TyErrorId
+  destroy();  //lint !e1909: 'virtual' assumed, see: AnnotatorABase::deInit(Engine &) (line 83, file g:\projects\UIMAcurrent\code\engine\include\annotator_abase.hpp)
+
+  /** Reconfigure the annotator and return a UIMA error code. No configuration object
+      is passed in; NOTE(review): presumably the settings are re-read from the annotator context - confirm */
+  TyErrorId
+  reconfigure(
+  );  //lint !e1909: 'virtual' assumed, see: AnnotatorABase::config(ConfigAnnotator &) (line 87, file g:\projects\UIMAcurrent\code\engine\include\annotator_abase.hpp)
+
+  /** Perform the annotator's per-document work on the given CAS (a ResultSpecification
+      is passed as well) and return a UIMA error code */
+  TyErrorId
+  process(
+    CAS & tcas,
+    ResultSpecification const &
+  );  //lint !e1909: 'virtual' assumed, see: AnnotatorABase::processDocument(Engine &, const TargetSetAT &, const TargetSetTT &) (line 91, file g:\projects\UIMAcurrent\code\engine\include\annotator_abase.hpp)
+
+  /*@}*/
+
+  /** @name Properties */
+  /*@{*/
+  /*@}*/
+  /** @name Miscellaneous */
+  /*@{*/
+  /*@}*/
+protected:
+private:
+
+  /* --- types ---------------------------------------------------------------*/
+
+  enum EnOutputStyle {  // output styles selectable for the dump (stored in iv_enOutputStyle)
+    Xml, XCas
+  };
+
+  /* --- variables -------------------------------------------------------- */
+  util::Filename iv_clOutputFilename;  // name of the output file
+  bool     iv_bDumpDocBuffer; // When set to 'True', the Annotator dumps the Doc Buffer
+  bool     iv_bSaveDocBuffer; // When set to 'True', the Annotator dumps the Doc Buffer in binary format, too
+
+  ofstream iv_clOutputStream;  // output file stream; NOTE(review): presumably managed by openOutputFile()/closeOutputFile() - confirm
+  bool     iv_bAppendFile;  // NOTE(review): presumably selects appending to an existing output file - confirm in the .cpp
+
+  EnOutputStyle iv_enOutputStyle;  // currently selected output style (Xml or XCas)
+
+  //vector<uima::Type> iv_vecOutputTypes;
+
+  // The annotator may be invoked in several sections within one config-file.
+  // If all output gets dumped into one file, the names of the sections serve
+  // as headers. We can't access the ConfigAnnotator-Object in 'processDocument'
+  // (NOTE(review): presumably process() in the current API - confirm), hence, we need a member var.
+  string iv_cpszSectionName;
+  /* --- functions -------------------------------------------------------- */
+
+  TyErrorId
+  openOutputFile( void );  // returns a UIMA error code
+
+  void
+  closeOutputFile( void );
+
+  void outputDocBuffer(UnicodeStringRef const & crclDoc);  // outputs the document text passed as crclDoc
+
+
+  AnnotatorDump & operator=(
+    const AnnotatorDump &
+  );  // declared private, so code outside the class cannot assign AnnotatorDump objects
+
+  AnnotatorDump(
+    const AnnotatorDump &
+  );  // declared private, so code outside the class cannot copy-construct AnnotatorDump objects
+
+}
+; /* AnnotatorDump */
+
+/* ----------------------------------------------------------------------- */
+#endif
+
+/* <EOF> */
+

Propchange: incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_dump.hpp
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_tok.hpp
URL: http://svn.apache.org/viewvc/incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_tok.hpp?view=auto&rev=503266
==============================================================================
--- incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_tok.hpp (added)
+++ incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_tok.hpp Sat Feb  3 09:19:57 2007
@@ -0,0 +1,253 @@
+#ifndef UIMA_ANNOTATOR_TOK_H$
+#define UIMA_ANNOTATOR_TOK_H$
+/** \file annotator_tok.hpp .
+-----------------------------------------------------------------------------
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+
+
+   \brief  Contains AnnotatorTokenizer, a Unicode UIMA Tokenizer Annotator.
+
+-------------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------- */
+/*       Include dependencies                                              */
+/* ----------------------------------------------------------------------- */
+
+/* We want the timers in this annotator to be only active if the annotator specific
+   define ANNOTATOR_TIMERS is set (e.g. in the automake.pro file for this annotator)
+   in all other cases we don't want the timers.
+   Since all the timers depend on the more generic define DEBUG_TIMING we
+   define DEBUG_TIMING if and only if ANNOTATOR_TIMERS is set.
+   Specifically we don't want the timers set when DEBUG_TIMING is defined to
+   build a generic timing driver of the whole system.
+   Our internal annotator timers would bias the whole timing driver with the
+   overhead involved in calling them in this annotator. This is why we specifically
+   undefine DEBUG_TIMING even if it might be set in the makefile to build this
+   annotator.
+   If you want timing in this annotator use ANNOTATOR_TIMERS not DEBUG_TIMING
+ */
+#ifdef ANNOTATOR_TIMERS
+#  ifndef DEBUG_TIMING
+#     define DEBUG_TIMING
+#  endif
+#else
+#  ifdef DEBUG_TIMING
+#     undef DEBUG_TIMING
+#  endif
+#endif
+
+#include "uima/timedatetools.hpp"
+#include "uima/api.hpp"                          /* UIMA API */
+///////#include "uima/u2cpcnvrtbuff.hpp"                 /* U2CpConvertBuffer */
+#include "uima/ss_tokenizer.hpp"
+#include "uima/internal_casimpl.hpp"
+
+#define STEMMER_BUF_LEN 50
+
+using namespace uima;
+
+/** @name AnnotatorTokenizer
+   The class <TT>AnnotatorTokenizer</TT> is used as a universal Unicode Tokenizer.
+
+   It uses a little trick to check API consistency via an abstract base class,
+   without having the overhead of virtual functions in our ship version.
+   <TT>AnnotatorABase</TT> defines all non-static member functions a plug-in needs
+   to define as pure virtual functions. By making this class inherit from
+   this base class we can make sure that compilation will fail if the
+   interfaces change.
+   Since we don't really use the inheritance relationship we don't define
+   it in the ship version. NOTE(review): the declaration below always derives from Tokenizer and TextAnnotator, so this paragraph may be outdated - confirm.
+
+   Example:
+   \code
+   \endcode
+   @see AnnotatorABase
+*/
+class AnnotatorTokenizer : public Tokenizer , public TextAnnotator {
+public:
+  /** @name Constructors */
+  /*@{*/
+  /** Default Constructor.
+  */
+  AnnotatorTokenizer(void);
+
+  /*@}*/
+  virtual ~AnnotatorTokenizer(void);  //lint !e1509: base class destructor for class 'AnnotatorABase' is not virtual
+
+  /** @name Annotator Processing Functions */
+  /*@{*/
+  /** Initialize the annotator from the given AnnotatorContext
+      and return a UIMA error code */
+  TyErrorId
+  initialize(
+    AnnotatorContext & rclAnnotatorContext
+  );  //lint !e1909: 'virtual' assumed, see: AnnotatorABase::init(Engine &, ConfigAnnotator &) (line 79, file d:\develop\uima\current\code\engine\include\annotator_abase.hpp)
+
+  TyErrorId typeSystemInit(TypeSystem const &);  // type-system initialization; receives the TypeSystem and returns a UIMA error code
+
+  /** Deinitialize the annotator (takes no arguments)
+      and return a UIMA error code */
+  TyErrorId
+  destroy();  //lint !e1909: 'virtual' assumed, see: AnnotatorABase::deInit(Engine &) (line 83, file d:\develop\uima\current\code\engine\include\annotator_abase.hpp)
+
+  /** Reconfigure the annotator and return a UIMA error code. No configuration object
+      is passed in; NOTE(review): presumably the settings are re-read from the annotator context - confirm */
+  TyErrorId
+  reconfigure(
+  );  //lint !e1909: 'virtual' assumed, see: AnnotatorABase::config(ConfigAnnotator &) (line 87, file d:\develop\uima\current\code\engine\include\annotator_abase.hpp)
+
+protected:
+  /** Perform the annotator's per-document work on the given CAS for the requested
+      ResultSpecification and return a UIMA error code (declared protected in this class) */
+  TyErrorId
+  process(
+    CAS &,
+    const ResultSpecification & crclTargetSet
+  );  //lint !e1909: 'virtual' assumed, see: AnnotatorABase::processDocument(Engine &, const TargetSetAT &, const TargetSetTT &) (line 91, file d:\develop\uima\current\code\engine\include\annotator_abase.hpp)
+
+  /*@}*/
+protected:
+
+  virtual int tokenCallback( unsigned long ulLocation, unsigned long ulLength,
+                             TokenProperties & crclTokenProperties,
+                             bool bNewPara, bool bNewSent );  // NOTE(review): presumably invoked by the Tokenizer base class for each token found - confirm in ss_tokenizer.hpp
+
+  TyDocIndex                 iv_uiParagraphStartIndex;
+  TyDocIndex                 iv_uiSentenceStartIndex;
+  // running segment numbers (token / sentence / paragraph)
+  size_t                     iv_uiTokenNbr;
+  size_t                     iv_uiSentenceNbr;
+  size_t                     iv_uiParagraphNbr;
+private:
+  // number assigned to the first token/sentence/paragraph
+  const size_t               iv_cuiCOUNTER_START;
+
+  /* --------------------------------------------------*/
+  /*   config values we use                            */
+  /* --------------------------------------------------*/
+
+  /// Enum listing all the config options we support
+  enum EnAnnotatorConfigOptions {
+    enConfigOption_TokenNumbersIncludeStopwords,
+    enConfigOption_UseRelativeTokenAndSentenceNumbers,
+    enConfigOption_IgnorePunctuationTokens,
+    // (drop inifile support) enConfigOption_CharMapConfigFilename,
+    enNumberOfConfigOptions  // must be last in enum
+  };
+
+  // our config table, with one entry per value of EnAnnotatorConfigOptions
+  static const ConfigOptionInfo::StOptionInfo cv_astConfigOptionInfo[enNumberOfConfigOptions];
+
+  // Variables the config options are stored in:
+
+  // if this is true the token numbers are counted including stopwords
+  bool                    iv_bTokenNumbersIncludeStopwords;
+  // if this is true token and sentence number are reset to 1
+  // for each new sentence/paragraph
+  bool                    iv_bUseRelativeTokenAndSentenceNumbers;
+  // If true, punctuation tokens are ignored
+  bool                    iv_bIgnorePunctuationTokens;
+  // trace component ID
+  uima::TyComponentId                   iv_iTraceCompID;
+
+  // Some pointers for quick access to UIMA objects. Originally documented as initialized in init(); NOTE(review): presumably initialize() or typeSystemInit() in this API - confirm
+  uima::internal::CASImpl * iv_pCASImpl;
+  lowlevel::FSHeap * iv_pFSHeap;
+  lowlevel::IndexRepository * iv_pIndexRepository;
+  // FSTypes and corresponding sizes
+  lowlevel::TyFSType iv_tyTokenType;
+  lowlevel::TyFeatureOffset iv_tyTokenTypeSize;
+  lowlevel::TyFSType iv_tySentenceType;
+  lowlevel::TyFeatureOffset iv_tySentenceTypeSize;
+  lowlevel::TyFSType iv_tyParagraphType;
+  lowlevel::TyFeatureOffset iv_tyParagraphTypeSize;
+
+  // FSFeatures (stored as feature offsets, except iv_stemFeature)
+  lowlevel::TyFeatureOffset  iv_tySofaFeatureOffset;
+  lowlevel::TyFeatureOffset  iv_tyBeginPositionFeatureOffset;
+  lowlevel::TyFeatureOffset  iv_tyEndPositionFeatureOffset;
+
+  lowlevel::TyFeatureOffset  iv_tyTokenPropertiesFeatureOffset;
+  lowlevel::TyFeatureOffset  iv_tyTokenNbrFeatureOffset;
+  lowlevel::TyFeatureOffset  iv_tySentenceNbrFeatureOffset;
+  lowlevel::TyFeatureOffset  iv_tyParagraphNbrFeatureOffset;
+  lowlevel::TyFSFeature  iv_stemFeature;
+
+  // flags recording which output types are needed
+  bool                       iv_bIsTokenReq;
+  bool                       iv_bIsSentenceReq;
+  bool                       iv_bIsParagraphReq;
+  bool                       iv_stemsRequired;
+  TokenProperties            iv_clTokenProperties;
+
+#ifdef DEBUG_TIMING
+  uima::Timer                 iv_clTotalTimer;
+  uima::Timer                 iv_clSSTokTimer;
+  uima::Timer                 iv_clUimaAnCreateTimer;
+  uima::Timer                 iv_clUimaAnSetValTimer;
+#endif
+
+
+  /* --- functions --- */
+  /* COPY CONSTRUCTOR NOT SUPPORTED */
+  AnnotatorTokenizer(const AnnotatorTokenizer & ); //lint !e1704
+  /* ASSIGNMENT OPERATOR NOT SUPPORTED */
+  AnnotatorTokenizer & operator=(const AnnotatorTokenizer & crclObject);
+
+  /// (Re-) Access config values from rclConfig. Used in init() and config() (NOTE(review): presumably initialize() and reconfigure() here - confirm).
+  TyErrorId
+  getConfigValues(
+    AnnotatorContext          & rclConfig
+  );
+
+  /// add a new token annotation for the given begin and end positions
+  void
+  addNewTokenAnnotation(
+    TyDocIndex           tyBeginPos,
+    TyDocIndex           tyEndPos
+  );
+
+  /// add a new sentence annotation for the given begin and end positions
+  void
+  addNewSentenceAnnotation(
+    TyDocIndex           tyBeginPos,
+    TyDocIndex           tyEndPos
+  );
+
+  /// add a new paragraph annotation for the given begin and end positions
+  void
+  addNewParagraphAnnotation(
+    TyDocIndex           tyBeginPos,
+    TyDocIndex           tyEndPos
+  );
+
+#if defined(DEBUG_TIMING)
+  void
+  dumpTimingData( void ) const;  // only declared when DEBUG_TIMING is defined, like the timer members above
+#endif
+
+}
+; /* AnnotatorTokenizer */
+
+/* ----------------------------------------------------------------------- */
+#endif /* UIMA_ANNOTATOR_TOK_H */
+
+/* <EOF> */
+

Propchange: incubator/uima/uimacpp/trunk/src/test/src/uima/annotator_tok.hpp
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/uima/uimacpp/trunk/src/test/src/uima/conui.hpp
URL: http://svn.apache.org/viewvc/incubator/uima/uimacpp/trunk/src/test/src/uima/conui.hpp?view=auto&rev=503266
==============================================================================
--- incubator/uima/uimacpp/trunk/src/test/src/uima/conui.hpp (added)
+++ incubator/uima/uimacpp/trunk/src/test/src/uima/conui.hpp Sat Feb  3 09:19:57 2007
@@ -0,0 +1,71 @@
+/** \file conui.hpp .
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+
+   \brief  functions to let UIMA types interact with util::ConsoleUI
+
+-------------------------------------------------------------------------- */
+
+#ifndef UIMA_CONUI_HPP
+#define UIMA_CONUI_HPP
+
+/* ----------------------------------------------------------------------- */
+/*       Include dependencies                                              */
+/* ----------------------------------------------------------------------- */
+
+#include "uima/pragmas.hpp" // must be first file to be included to get pragmas
+#include "uima/exceptions.hpp"
+#include "uima/err_ids.h"
+#include "uima/consoleui.hpp"
+
+/* ----------------------------------------------------------------------- */
+/*       Constants                                                         */
+/* ----------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------- */
+/*       Forward declarations                                              */
+/* ----------------------------------------------------------------------- */
+
+namespace uima {
+
+  /** display the UIMA exception crclException on the console object rclConsole */
+  void uimaToolDisplayException(uima::util::ConsoleUI & rclConsole, const uima::Exception & crclException);
+
+  /** display the UIMA error id utErrorId on the console object rclConsole; cpszLastErrorMsg is presumably the accompanying last error message text (TODO confirm) */
+  void uimaToolDisplayErrorId(uima::util::ConsoleUI const & rclConsole, const uima::TyErrorId utErrorId, const TCHAR * cpszLastErrorMsg);
+
+  /** display the specified UIMA error id on the console object and
+      call uima::util::ConsoleUI::fatal() if the error id is not UIMA_ERR_NONE. NOTE(review): presumably the comparison is against utErrorIdExpected (default 0) and cpszFunction names the failing call - confirm in the implementation */
+  void uimaToolHandleErrorId(uima::util::ConsoleUI & rclConsole, const uima::TyErrorId utErrorId, const TCHAR * cpszLastErrorMsg, const TCHAR * cpszFunction, uima::TyErrorId utErrorIdExpected = 0);
+
+  /* ----------------------------------------------------------------------- */
+  /*       Types / Classes                                                   */
+  /* ----------------------------------------------------------------------- */
+
+  /* ----------------------------------------------------------------------- */
+  /*       Implementation                                                    */
+  /* ----------------------------------------------------------------------- */
+
+  /* ----------------------------------------------------------------------- */
+
+} //namespace uima
+
+#endif /* UIMA_CONUI_HPP */
+
+/* <EOF> */
+

Propchange: incubator/uima/uimacpp/trunk/src/test/src/uima/conui.hpp
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/uima/uimacpp/trunk/src/test/src/uima/doc_buffer.hpp
URL: http://svn.apache.org/viewvc/incubator/uima/uimacpp/trunk/src/test/src/uima/doc_buffer.hpp?view=auto&rev=503266
==============================================================================
--- incubator/uima/uimacpp/trunk/src/test/src/uima/doc_buffer.hpp (added)
+++ incubator/uima/uimacpp/trunk/src/test/src/uima/doc_buffer.hpp Sat Feb  3 09:19:57 2007
@@ -0,0 +1,160 @@
+/** \file doc_buffer.hpp .
+-----------------------------------------------------------------------------
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+
+
+   \brief  Contains DocBuffer, a document buffer for storing a document
+
+-------------------------------------------------------------------------- */
+
+#ifndef UIMA_DOC_BUFFER_HPP
+#define UIMA_DOC_BUFFER_HPP
+
+/* ----------------------------------------------------------------------- */
+/*       Include dependencies                                              */
+/* ----------------------------------------------------------------------- */
+
+#include "uima/pragmas.hpp" //must be included first to disable warnings
+#include "unicode/uchar.h"
+#include "uima/types.h"
+#include "uima/exceptions.hpp"
+#include "uima/unistrref.hpp"
+
+/* ----------------------------------------------------------------------- */
+/*       Constants                                                         */
+/* ----------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------- */
+/*       Forward declarations                                              */
+/* ----------------------------------------------------------------------- */
+
+namespace uima {
+  class CodePage2UnicodeConverter;
+}
+
+/* ----------------------------------------------------------------------- */
+/*       Types / Classes                                                   */
+/* ----------------------------------------------------------------------- */
+
+
+namespace uima {
+
+  /**
+  * The class <TT>DocBuffer</TT> is a document buffer that holds the text of a document as UChar data.
+  * \code
+  \endcode
+  * @see
+  */
+  class DocBuffer {
+  public:
+    /** @name Constructors */
+    /*@{*/
+    DocBuffer();
+    DocBuffer(size_t uMemPoolInitialSize, size_t uMemPoolGrowSize);
+    /*@}*/
+    ~DocBuffer(void);
+    /** @name Properties */
+    /*@{*/
+    /** Return TRUE, if the specified index is valid for this buffer. */
+    bool                    isValidIndex(TyDocIndex uIndex) const;
+    /** Return TRUE, if the document buffer is empty (its length is 0). */
+    bool                    isEmpty(void) const                          {
+      return(iv_uLength == 0);
+    }
+    /** Return TRUE if this could be initialized correctly */
+    bool                    isValid(void) const;
+    /** Return the number of characters as stored in this document buffer. */
+    size_t                  getLength(void) const                        {
+      return(iv_uLength);
+    }
+    /** Return the stored pointer to the document text (iv_cpw16Document).
+        This accessor takes no index and reports no length; use getLength()
+        for the number of characters stored in the buffer. */
+    const UChar *           getDocBuffer(void) const                     {
+      return(iv_cpw16Document);
+    }
+    /** Return a reference to the text between uIndexBegin and uIndexEnd; declared with UIMA_THROW(ExcDocBuffer). */
+    UnicodeStringRef        getText(TyDocIndex uIndexBegin,
+                                    TyDocIndex uIndexEnd) const UIMA_THROW(ExcDocBuffer);
+    /*@}*/
+    /** @name Miscellaneous */
+    /*@{*/
+    /** Add part of a document as a buffer starting at address <TT>cpacDocPartText</TT>,
+        with size <TT>uDocPartSize</TT> (presumably in bytes - TODO confirm), for the CCSID named by
+        <TT>crclCCSID</TT>. The addDocInMemory declaration directly below is commented out. */
+//      void                    addDocInMemory(const UChar * cpclDocText,
+//                                             size_t uDocLength);
+    void                    addDocPart(const char * cpacDocPartText,
+                                       size_t uDocPartSize,
+                                       const char * crclCCSID);
+
+    /** Add part of a document as a buffer starting at address <TT>cpacDocPartText</TT>,
+        with size <TT>uDocPartSize</TT> in bytes, and using the specified converter
+        <TT>crclConverter</TT> for codepage conversion.
+        (This overload has no user-data handle parameter.) */
+    void                    addDocPart(const char * cpacDocPartText,
+                                       size_t uDocPartSize,
+                                       CodePage2UnicodeConverter & crclConverter);
+    /** Add part of a document as a buffer starting at address <TT>cpclDocPartText</TT>,
+        with size <TT>uDocPartLength</TT> in characters, and with CCSID EnCCSID_UCS2.
+        (This overload has no user-data handle parameter.) */
+    void                    addDocPart(const UChar * cpclDocPartText,
+                                       size_t uDocPartLength);
+    /** This method clears the document buffer, removes all document data
+        and resets all internal settings. */
+    void                    reset(void);
+    /*@}*/
+  protected:
+    /* --- functions --- */
+  private:
+    /* --- variables --- */
+    size_t                  iv_uMemPoolInitialSize;
+    size_t                  iv_uMemPoolReserve;
+    const UChar *           iv_cpw16Document;  // document text pointer, returned by getDocBuffer()
+    size_t                  iv_uLength;  // number of characters stored, returned by getLength()
+    size_t                  iv_uSizeAllocated;
+    /* --- functions --- */
+    void init();
+    void                    addDocPartImp(const char * cpacDocPartText, size_t uDocPartSize, CodePage2UnicodeConverter & crclConverter);
+    void                    resetMemPool(void);
+    /* COPY CONSTRUCTOR NOT SUPPORTED */
+    DocBuffer(const DocBuffer & ); //lint !e1704
+    /* ASSIGNMENT OPERATOR NOT SUPPORTED */
+    DocBuffer & operator=(const DocBuffer & crclObject);
+  }
+  ; /* DocBuffer */
+
+}
+
+/* ----------------------------------------------------------------------- */
+/*       Implementation                                                    */
+/* ----------------------------------------------------------------------- */
+
+namespace uima {
+
+  inline bool DocBuffer::isValidIndex(TyDocIndex uIndex) const
+  /* Returns true only if the buffer is non-empty and uIndex lies within 0 .. iv_uLength - 1 (inclusive). */
+  {
+    return((uIndex >= 0) && (iv_uLength > 0) && (uIndex <= iv_uLength - 1));  // iv_uLength > 0 is tested first, so iv_uLength - 1 cannot wrap around here
+  }
+}
+#endif /* UIMA_DOC_BUFFER_HPP */
+
+/* <EOF> */
+

Propchange: incubator/uima/uimacpp/trunk/src/test/src/uima/doc_buffer.hpp
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/uima/uimacpp/trunk/src/test/src/uima/parse_handlers.hpp
URL: http://svn.apache.org/viewvc/incubator/uima/uimacpp/trunk/src/test/src/uima/parse_handlers.hpp?view=auto&rev=503266
==============================================================================
--- incubator/uima/uimacpp/trunk/src/test/src/uima/parse_handlers.hpp (added)
+++ incubator/uima/uimacpp/trunk/src/test/src/uima/parse_handlers.hpp Sat Feb  3 09:19:57 2007
@@ -0,0 +1,159 @@
+/** \file parse_handlers.hpp .
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+
+
+   \brief  Generic SAX-like parse handler class definitions
+
+-------------------------------------------------------------------------- */
+#ifndef __UIMA_PARSE_HANDLERS_HPP
+#define __UIMA_PARSE_HANDLERS_HPP
+
+
+// ---------------------------------------------------------------------------
+//  Includes
+// ---------------------------------------------------------------------------
+
+#include "uima/pragmas.hpp" //must be first to surpress warnings
+#include <map>
+#include <stack>
+#include <utility>
+#include "uima/parser_config.hpp"
+#include "uima/doc_buffer.hpp"
+#include "uima/tcas.hpp"
+#include "uima/parser_interface.hpp"
+
+
+namespace uima {
+
+  /**
+     The class <TT>ParseHandlers</TT> is used as a generic SAX-like
+     parse handler class.
+
+     @see XMLParseHandlers
+  */
+  class ParseHandlers {
+  public:
+    /**
+     * Typedefs for data structure for communication between the startElement()
+     * and endElement() function.
+     * We get attribute information in startElement() and need to pass this
+     * information to endElement() because we can only do the mapping
+     * once we know the end of an annotation
+     * @{*/
+
+    /// a struct to hold information about a single XML attribute
+    class StXMLAttrInfo {
+    public:
+      icu::UnicodeString  ustrName;
+      icu::UnicodeString  ustrType;
+      icu::UnicodeString  ustrValue;
+      // OS STL need this to be a full STL compliant class (both operators compare ustrName only)
+      bool operator < (const StXMLAttrInfo & crclRHS) const {
+        return(bool)(ustrName < crclRHS.ustrName);
+      }
+      bool operator ==(const StXMLAttrInfo & crclRHS) const {
+        return(bool)(ustrName == crclRHS.ustrName);
+      }
+    };
+    /// a container to hold the list of all XML attributes of a given XML element
+    typedef vector< StXMLAttrInfo > TyXMLAttrInfoList;
+    /*@}*/
+
+  public:
+    // -----------------------------------------------------------------------
+    //  Constructors and Destructor
+    // -----------------------------------------------------------------------
+    ParseHandlers();
+    virtual ~ParseHandlers();
+
+    // -----------------------------------------------------------------------
+    //  init method (bVerbose defaults to false)
+    // -----------------------------------------------------------------------
+
+    bool
+    init(
+      TCAS                    & rTCAS,
+      ParserConfiguration const & rclConfig,
+      bool                        bVerbose = false
+    );
+
+    bool deInit();
+
+    void setMultiDocCallback(ParserInterface::MultiDocCallbackInterface &);
+
+    TyErrorId beginDoc();
+    TyErrorId endDoc();
+
+    // -----------------------------------------------------------------------
+    //  Getter methods
+    // -----------------------------------------------------------------------
+    size_t getNumberOfDocumentsParsed() const;
+
+    size_t getNumberOfBytesParsed() const;
+
+    bool   isMultiDocFile() const;
+
+    // -----------------------------------------------------------------------
+    //  Handlers for the DocumentHandler interface
+    // -----------------------------------------------------------------------
+    void endElement(const UChar* cpuCName, size_t uiLength);
+    void startElement(const UChar* cpucName, size_t uiLength, const TyXMLAttrInfoList & crvecAttributes);
+    void characters(const UChar* cpucChars, size_t uiLength);
+
+
+    void processWarning(const char* cpszErrorId, const UChar * cpszErrorContext);
+
+    UnicodeStringRef getDocumentText() const;
+
+  protected:
+
+    AnnotationFS findLastAnnOfType(size_t uiBeginPos, Type type) const;
+
+    // -----------------------------------------------------------------------
+    // we need a stack of those containers for each XML element
+    // so we define a map from the XML element name to a stack of pairs of
+    // 1: the begin index of the element with those attrs
+    // 2: the attr of the element at that position
+    typedef pair< TyDocIndex, TyXMLAttrInfoList > TyIndexAttrsPair;
+    typedef stack< TyIndexAttrsPair, deque< TyIndexAttrsPair > > TyStack;
+    typedef map< icu::UnicodeString, TyStack, less< icu::UnicodeString > >
+    TyPosStack;
+    // -----------------------------------------------------------------------
+    //  Protected data members
+    // -----------------------------------------------------------------------
+    ParserConfiguration const * iv_pclConfig;
+
+    DocBuffer             iv_docBuffer;
+    TCAS *                iv_pTCAS;
+
+    bool                  iv_bVerbose;
+    bool                  iv_bIsMultiDocFile;
+    size_t                iv_uiMultiDocNbr;
+    size_t                iv_uiMultiDocOffset;
+
+    size_t                iv_uiInputSize;
+    long                  iv_lLastEndIndex;
+    TyPosStack            iv_clPosStack;
+    size_t                iv_uiInIgnoreTag;
+    ParserInterface::MultiDocCallbackInterface * iv_pCallbackObject;
+  };
+
+} // namespace uima
+
+#endif //__UIMA_PARSE_HANDLERS_HPP

Propchange: incubator/uima/uimacpp/trunk/src/test/src/uima/parse_handlers.hpp
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/uima/uimacpp/trunk/src/test/src/uima/parser_config.hpp
URL: http://svn.apache.org/viewvc/incubator/uima/uimacpp/trunk/src/test/src/uima/parser_config.hpp?view=auto&rev=503266
==============================================================================
--- incubator/uima/uimacpp/trunk/src/test/src/uima/parser_config.hpp (added)
+++ incubator/uima/uimacpp/trunk/src/test/src/uima/parser_config.hpp Sat Feb  3 09:19:57 2007
@@ -0,0 +1,395 @@
+/** \file parser_config.hpp .
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+
+
+   \brief  Configuration class for parsers.
+
+-------------------------------------------------------------------------- */
+#ifndef UIMA_PARSER_CONFIG_HPP
+#define UIMA_PARSER_CONFIG_HPP
+
+
+// ---------------------------------------------------------------------------
+//  Includes
+// ---------------------------------------------------------------------------
+
+#include "uima/pragmas.hpp" //must be first to surpress warnings
+#include <map>
+#include <stack>
+#include <utility>
+#include "uima/typesystem.hpp"
+
+namespace uima {
+
+  class TCAS;
+  class TextAnalysisEngineSpecifier;
+
+  /**
+   * The class ParserConfiguration is used to instruct a parser that puts
+   * tagged document into a CAS on how to map document information contained
+   * in tags to annotations and features of annotations.
+   * This configuration is mainly intended to configure HTML and XML parsers
+   * but could be used for any format that uses labeled tags and tag attributes
+   *
+   * For XML input this configuration object also allows specifying if and how
+   * multiple documents are delimited within one physical XML file.
+   * Also the content of certain tags can be excluded from being processed at
+   * all.
+   */
+  class ParserConfiguration {
+  public:
+    ParserConfiguration();
+    /**
+     * Initialize this config object from settings specified in
+     * the parser configuration file.
+     *
+     * @param parserConfigFilename   file name of the configuration file
+     * @param cas                    CAS object defining the types relevant
+     *                               for the tag2type mappings
+     * @param err                    NOTE(review): presumably receives error details on failure - confirm
+     * @return UIMA_ERR_NONE if OK
+     */
+    TyErrorId init(icu::UnicodeString const & parserConfigFilename, CAS & cas, ErrorInfo & err);
+
+    /**
+     * Returns the tag that is used to delimit/separate multiple
+     * logical documents within the same physical file.
+     * If no such tag is defined - i.e. there is only one document
+     * per file - an empty string is returned.
+     *
+     * Note: This option will always return the empty string for HTML.
+     */
+    icu::UnicodeString const & getDocumentDelimiterTag() const;
+
+    /**
+     * Sets the tag that is used to delimit/separate multiple
+     * logical documents within the same physical file.
+     *
+     * @see getDocumentDelimiterTag
+     */
+    void setDocumentDelimiterTag(icu::UnicodeString const & tag);
+
+
+    /**
+     * Returns the tags that are to be ignored when parsing an (XML) document.
+     *
+     * Only the textual content of tags not included in the returned container
+     * will be part of the CAS document.
+     */
+    vector<icu::UnicodeString> const & getExcludedTags() const;
+
+    /**
+     * Sets the tags that are to be ignored when parsing an (XML) document.
+     *
+     * @see getExcludedTags
+     */
+    void setExcludedTags( vector<icu::UnicodeString> const & tags);
+
+    /**
+     * Returns the type of the annotation that the parser is supposed to
+     * create for each occurrence of a tag with name <code>tagName</code>.
+     * If no annotation is to be created for occurrences of this tag
+     * <code>tagName</code> the return value is an invalid type object.
+     *
+     * This function returns the value of getDefaultTypeForTags if no
+     * explicit mapping has been specified.
+     *
+     * Note: The returned Type object will be subsumed by type Annotation.
+     *
+     * @param tagName    The name of the tag to look up
+     *
+     * @return           The mapped type for tagName or an invalid type
+     *                   if no mapping specified
+     *
+     * @see setDefaultTypeForTags
+     */
+    Type getTypeForTag(icu::UnicodeString const & tagName) const;
+
+    /**
+     * Sets the type of the annotation that the parser is supposed to
+     * create for each occurrence of a tag with name <code>tagName</code>.
+     *
+     * @param tagName    the name of a tag to map
+     * @param type       a CAS type (must be valid!)
+     *
+     * @see getTypeForTag
+     */
+    void setTypeForTag(icu::UnicodeString const & tagName, Type type);
+
+
+    /**
+     * This option can be used to specify a default mapping in case no
+     * explicit mapping is available for a tag.
+     *
+     * If the default type is set (is valid) getTypeForTag will return
+     * this default type whenever no explicit mapping is specified.
+     * In this case every tag will be mapped to some type without
+     * having to specify many mappings.
+     *
+     * Since this default value is optional the result of
+     * getDefaultTypeForTags() may be invalid.
+     * In this case some tags are not mapped to types.
+     *
+     * @param type    The type to use as default mapping type
+     *
+     * @see getTypeForTag
+     * @see getDefaultTypeForTags
+     *
+     */
+    void setDefaultTypeForTags(Type type);
+
+    /**
+     * @return  The default type to use for tags not directly mapped
+     *          by getTypeForTag() (may be invalid if none specified)
+     *
+     * @see setDefaultTypeForTags
+     */
+    Type getDefaultTypeForTags() const;
+
+    /**
+     * This allows to specify a feature where the name of the tag is stored
+     * for each annotation created by the parser.
+     * If this feature is invalid the parser will take no action.
+     *
+     * @param f    The feature where the tag name will be stored
+     *             f must be of type string.
+     *             Also f must be appropriate for all types mapped to
+     *             tag.
+     *
+     * @see getFeatureForTagName
+     */
+    void setFeatureForTagName(Feature f);
+
+    /**
+     * @return  The feature where the name of the tag is stored for each
+     *          annotation created by the parser (may be invalid if none
+     *          specified).
+     *
+     * @see setFeatureForTagName
+     */
+    Feature getFeatureForTagName() const;
+
+    /**
+     * For a given tag name and attribute name this function returns an
+     * annotation type and a feature of that type to which the value
+     * of the attribute is to be mapped.
+     *
+     * The returned type and feature will be invalid if the attribute
+     * <code>attrName</code> at tag <code>tagName</code> is not to be
+     * mapped to the CAS.
+     *
+     * If the returned type and feature are valid the parser is supposed to
+     * look for the "last" annotation of the returned type and set the
+     * value of the returned feature at this annotation to the value of
+     * the attribute <code>attrName</code>.
+     *
+     * Note that the parser does not necessarily have to create an annotation
+     * of the returned type. This mapping can be used to set features of
+     * existing annotations: E.g. the attribute "name" of the "meta" tag
+     * in HTML could be mapped to the feature "DocumentName" of
+     * type "Document"
+     *
+     * The "last" occurrence of an annotation of the returned type is
+     * determined by starting with the annotation corresponding to the current
+     * tag (if there is one) and searching from there towards the beginning
+     * of the text. The annotation corresponding to the current tag is included
+     * in the search.
+     *
+     * For each occurrence of a tag the parser is supposed to first check
+     * the function <code>getTypeForTag()</code> and create a corresponding
+     * annotation if <code>getTypeForTag()</code> returns a valid type.
+     *
+     * Only after annotations are created for mapped tags the attributes are
+     * being mapped to features.
+     * This execution order guarantees that for tags that are used in
+     * <code>getTypeForTag()</code> and in <code>getFeatureForAttribute()</code>
+     * the "last" annotation will be the newly created one.
+     *
+     * The returned Type object must be subsumed by type Annotation.
+     *
+     * The returnedFeature must be of type string, integer or float.
+     * A conforming parser is supposed to convert the attribute value
+     * from its string representation to an appropriate value before
+     * setting the feature value.
+     *
+     * taph 04.12.2002:  There is currently a limitation for returnedFeature
+     * to be of type string only. This will be removed in the future.
+     *
+     * @param tagName         The name of the tag to look up
+     * @param attrName        The name of the attribute of tag tagName to look up
+     * @param returnedType    Output param: the type corresponding to tag tagName
+     * @param returnedFeature Output param: the feature corresponding to attrName
+     */
+    void getFeatureForAttribute(
+      icu::UnicodeString const & tagName,
+      icu::UnicodeString const & attrName,
+      Type     & returnedType,
+      Feature  & returnedFeature
+    ) const;
+
+
+    /**
+     * The break properties that can be returned by getBreakPropertyForTag()
+     */
+    enum EnBreakProperty {
+      enNoBreak                  = 0,
+      enWordBreak                = 0x200B,
+      enSentenceBreak            = 0x2029,  // NOTE(review): same value as enParagraphBreak below
+      enLineBreak                = 0x2028,
+      enParagraphBreak           = 0x2029,
+      enNumberOfBreakProperties  = 6        // NOTE(review): only five properties are listed above - confirm
+    };
+    /**
+     * Returns the break property the parser is supposed to
+     * associate for each occurrence of a tag with name <code>tagName</code>.
+     *
+     * You can think of break properties as instructions on how to replace
+     * a tag with white space content during de-tagging an HTML/XML document.
+     * - tags with enNoBreak property will be replaced by the empty string
+     *   (e.g. bold &lt;b>F&lt;/b>irst Letter becomes
+     *   First Letter)
+     * - tags with enWordBreak property will be replaced by a Unicode
+     *   U+200B ZERO WIDTH SPACE (e.g. &lt;label>)
+     * - tags with enSentenceBreak property will be replaced by a Unicode
+     *   paragraph separator character U+2029 PARAGRAPH SEPARATOR ???
+     * - tags with enLineBreak property will be replaced by a Unicode
+     *   line separator character U+2028 LINE SEPARATOR
+     *   (e.g. &lt;br> and &lt;li>)
+     * - tags with enParagraphBreak property will be replaced by a Unicode
+     *   paragraph separator character U+2029 PARAGRAPH SEPARATOR
+     *   (e.g. headings like &lt;h1>)
+     *
+     * Note that all break properties only apply to the end tag.
+     * The begin tag is always replaced by the empty string.
+     * For HTML tags that don't have an end tag (e.g. &lt;br>), or where the
+     * end tag is optional (e.g. &lt;li>) the parser should introduce the
+     * replacement character before the next opening tag that can
+     * conceptually close the tag (e.g. &lt;li>) or the next end tag that
+     * closes the tag (e.g. &lt;/li>)
+     *
+     * If no explicit break property has been given
+     * getDefaultBreakProperty() is returned.
+     *
+     * @param tagName    The name of the tag to look up
+     */
+    EnBreakProperty
+    getBreakPropertyForTag(icu::UnicodeString const & tagName) const;
+
+    /**
+     * Returns the default break property for tags where no explicit
+     * break property has been configured.
+     */
+    EnBreakProperty getDefaultBreakProperty() const;
+
+    /**
+     * Sets the default break property for tags where no explicit
+     * break property has been configured.
+     */
+    void setDefaultBreakProperty(EnBreakProperty enBreakProp);
+
+    /**
+     * Returns the (annotation) type corresponding to a break property.
+     * The return value may be invalid if no specific type is set.
+     * If a type is specified a conforming parser is supposed to
+     * create an annotation of that type for each annotation with a given
+     * break property (in addition to inserting the corresponding break
+     * character)
+     *
+     * In general this is only useful for paragraphs.
+     * For all other break types (especially tokens and sentences)
+     * a parser can not and should not directly create the entity
+     * corresponding to the break (token, sentence) as annotation.
+     * The reason for this is that the parser only knows that
+     * such an annotation must begin at the position of the begin tag and
+     * end at the begin tag but not how many entities (tokens, sentences)
+     * may be spanned by the tag (e.g. an h1 tag).
+     * In general the parser should not create a few instances of
+     * annotation types (like tokens or sentences) that are mainly created
+     * by annotators (like the tokenizer). Instead it should leave traces in
+     * the text that guide the downstream annotator.
+     *
+     * But paragraphs are an exception to this as they should be
+     * created by the parser and only by the parser. For HTML a tokenizer
+     * applying plain text paragraph finding heuristics (double newline) would
+     * produce incorrect results.
+     * So for HTML it would make sense to specify that tags like &lt;p> and &lt;h1>
+     * etc. are paragraph breaks and specify the appropriate paragraph
+     * type as the value of getTypeForBreakProperty()
+     * Note that you should not map &lt;p> to a type if you do that otherwise
+     * two annotations would be created for each &lt;p> tag.
+     * If this is done then to ensure consistency <em>only</em> the parser
+     * should create paragraphs and no annotator should try to do this.
+     */
+    Type getTypeForBreakProperty(EnBreakProperty enBreakProp) const;
+  private:
+    // -----------------------------------------------------------------------
+    // data structures to store mapping information between XML and UIMA
+
+    // map holding information about XML Elements which are mapped to UIMA
+    // types (new annotations are created for each occurrence of such an element)
+    typedef map< icu::UnicodeString, Type, less<icu::UnicodeString> >
+    TyXMLNameToTypeMap;
+
+    // map holding information about XML Attributes which are mapped to UIMA
+    // attributes (UIMA Attributes of existing annotations are set to the values
+    // of the XML Attributes)
+    // Since each attribute (XML or UIMA) occurs at a certain anchor
+    // (XML Element or UIMA Type) we need to store 4 pieces of information
+    // Since more than one attr can be mapped from the same element we need a vector
+    typedef struct StFeatureInfo_ {
+      Type    type;    // 3: Type for Feature
+      Feature feature; // 4: Feature
+    }
+    StFeatureInfo;
+
+    typedef map< icu::UnicodeString, StFeatureInfo, less<icu::UnicodeString> >
+    TyXMLAttrToFeatureMap;
+
+    // actual map from 1: the name of XML Element and
+    // 2: the XML Attribute name to the rest of the mapping info
+    typedef map< icu::UnicodeString, TyXMLAttrToFeatureMap, less<icu::UnicodeString> >
+    TyXMLNameToAttrMap;
+
+    // map holding information about XML Elements and their break properties
+    typedef map< icu::UnicodeString, EnBreakProperty, less<icu::UnicodeString> >
+    TyXMLNameToBreakMap;
+
+    // map holding information about break properties and the types mapped to them
+    typedef map< EnBreakProperty, Type, less<EnBreakProperty> >
+    TyBreakToTypeMap;
+
+    TyXMLNameToTypeMap         iv_mapXMLNameToType;
+    TyXMLNameToAttrMap         iv_mapXMLNameToAttr;
+    TyXMLNameToBreakMap        iv_mapXMLNameToBreak;
+    TyBreakToTypeMap           iv_mapBreakToType;
+
+    icu::UnicodeString         iv_ustrDocumentDelimiterTag;
+    vector<icu::UnicodeString> iv_vecustrExcludedTags;
+
+    EnBreakProperty            iv_enDefaultBreakProperty;
+
+    Type                       iv_defaultTypeForTags;
+    Feature                    iv_featureForTagName;
+
+  }
+  ; /* ParserConfiguration */
+
+} // namespace uima
+
+#endif //UIMA_PARSER_CONFIG_HPP

Propchange: incubator/uima/uimacpp/trunk/src/test/src/uima/parser_config.hpp
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/uima/uimacpp/trunk/src/test/src/uima/parser_interface.hpp
URL: http://svn.apache.org/viewvc/incubator/uima/uimacpp/trunk/src/test/src/uima/parser_interface.hpp?view=auto&rev=503266
==============================================================================
--- incubator/uima/uimacpp/trunk/src/test/src/uima/parser_interface.hpp (added)
+++ incubator/uima/uimacpp/trunk/src/test/src/uima/parser_interface.hpp Sat Feb  3 09:19:57 2007
@@ -0,0 +1,165 @@
+/** \file parser_interface.hpp .
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+
+
+   \brief  Interface class for parsers.
+
+-------------------------------------------------------------------------- */
+#ifndef UIMA_PARSER_INTERFACE_HPP
+#define UIMA_PARSER_INTERFACE_HPP
+
+
+// ---------------------------------------------------------------------------
+//  Includes
+// ---------------------------------------------------------------------------
+
+#include "uima/pragmas.hpp" //must be first to surpress warnings
+#include <iostream>
+
+namespace uima {
+
+  class ParserConfiguration;
+  class TextAnalysisEngine;
+  class CAS;
+
+  /**
+   * The class ParserInterface is used as an abstract base class for all
+   * document parsers.
+   * This is just a sketch of how this interface should be used.
+   * Things like (the facade?) creating instances of objects implementing
+   * this interface are not thought through yet.
+   * <pre>
+   * TAE engine = createTAE...
+   * ParserFacade parserFacade(engine);
+   * parserFacade.setMultiDocCallback(...); // optional for multi-doc formats
+   * // for each supported parser beyond the pre-defined ones
+   * parserFacade.registerParserForType(p, t, config);
+   * for each document {
+   *    parserFacade.parseDocument(d,[config]);    // pre-fills the CAS
+   *    engine.process(...);  // annotators fill the CAS
+   *    ... read out results ...
+   *    engine.reset(...)     // flush CAS
+   * }
+   * engine.destroy();
+   * </pre>
+   */
+  class ParserInterface {  // NOTE(review): all members are pure virtual; no virtual destructor is declared
+  public:
+    /**
+     * Callback interface to make it possible for applications to get
+     * notified every time an embedded document is done, so that
+     * they can retrieve their results
+     *
+     * @see ParserInterface::setMultiDocCallback
+     */
+    class MultiDocCallbackInterface {  // NOTE(review): no virtual destructor is declared here either
+    public:
+      /**
+       * Called <em>after</em> a multi-doc parser detects the end
+       * of document.
+       *
+       * An application should call Engine::processDocument() there
+       * and retrieve the results of document processing after that
+       * using iterators over the CAS or TCAS.
+       * Finally an application should call resetDocument().
+       *
+       * An application should <em>not</em> call addDocPartsFinish()
+       * or addDocPartsFinish() in this function since a conforming
+       * parser is supposed to do any doc part processing.
+       */
+      virtual
+      void documentBoundaryReached(UChar const * cpBuffer, size_t uiLength) = 0;
+    };
+
+    /**
+     * Initialize the parser.
+     * The parser is being passed an engine object and not a CAS because
+     * for (XML) files that contain multiple documents the parser must be
+     * able to call the process functions for each embedded document.
+     *
+     * An implementation needs to store the argument objects for later
+     * use in function parse().
+     *
+     * Called once per session.
+     *
+     * @param config     The configuration for the parser
+     * @param engine     The engine object into which the results go
+     * @param fallback   An encoding to use in
+     *                   case the parser can't determine the encoding
+     *                   by other means (default argument is "Latin1")
+     *
+     * @return           UIMA_ERR_NONE if OK, error code otherwise
+     */
+    virtual
+    TyErrorId init(ParserConfiguration const & config, TextAnalysisEngine & engine, const char * = "Latin1" ) = 0;
+    /**
+     * @see MultiDocCallbackInterface
+     */
+    virtual
+    void setMultiDocCallback(MultiDocCallbackInterface & callbackObject) = 0;
+
+    /**
+     * Do the parsing, add the tag-free text to the CAS and potentially
+     * translate tag information to CAS annotations.
+     *
+     * Called once per document.
+     *
+     * @param inputFileName The input to process
+     * @return              UIMA_ERR_NONE if OK, error code otherwise
+     */
+    virtual
+    TyErrorId parseDocument(char* const inputFileName) = 0;
+    virtual
+    TyErrorId parseDocument(std::istream & inputFileStream) = 0;
+
+    /**
+     * Returns the number of documents parsed by the parser.
+     * This will always be 1 for HTML but can be more for XML.
+     *
+     * Optionally called once per document.
+     */
+    virtual
+    size_t getNumberOfDocumentsParsed() const = 0;
+
+    /**
+     * Returns the number of bytes parsed by the parser.
+     * Information function to allow throughput computation by calling
+     * environment.
+     *
+     * Optionally called once per document.
+     */
+    virtual
+    size_t getNumberOfBytesParsed() const = 0;
+
+    /**
+     * De-initialize the parser (free resources etc.)
+     *
+     * Called once per session.
+     *
+     * @return           UIMA_ERR_NONE if OK, error code otherwise
+     */
+    virtual
+    TyErrorId deInit() = 0;
+  }
+  ; /* ParserInterface */
+
+} // namespace uima
+
+#endif //UIMA_PARSER_INTERFACE_HPP
+

Propchange: incubator/uima/uimacpp/trunk/src/test/src/uima/parser_interface.hpp
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/uima/uimacpp/trunk/src/test/src/uima/ss_tokenizer.hpp
URL: http://svn.apache.org/viewvc/incubator/uima/uimacpp/trunk/src/test/src/uima/ss_tokenizer.hpp?view=auto&rev=503266
==============================================================================
--- incubator/uima/uimacpp/trunk/src/test/src/uima/ss_tokenizer.hpp (added)
+++ incubator/uima/uimacpp/trunk/src/test/src/uima/ss_tokenizer.hpp Sat Feb  3 09:19:57 2007
@@ -0,0 +1,118 @@
+/** \file ss_tokenizer.hpp .
+-----------------------------------------------------------------------------
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+
+-------------------------------------------------------------------------- */
+
+#ifndef _INCLUDE_UIMASS
+#define _INCLUDE_UIMASS
+
+#include "uima/language.hpp"
+#include "uima/token_properties.hpp"
+
+namespace uima {
+
+  class ResourceABR;
+
+
+
+  /** Highest first-dimension index of the character map (the map has MAXWARD+1 rows). */
+  static const int MAXWARD = 6;
+
+  /** Character map: (MAXWARD+1) rows of 256 character class entries each. */
+  typedef unsigned short TyCharmap [MAXWARD+1][256];
+
+  /** Character types used in our char map. Every class occupies its own bit. */
+  enum EnCharClass {
+    CH_INVALID = 0,
+    CH_LWR = 1 << 0,    // lowercase characters
+    CH_UPR = 1 << 1,    // uppercase characters
+    CH_NUM = 1 << 2,    // number or currency symbol
+    CH_USC = 1 << 3,    // underscore: like a character, no upper/lower information
+    CH_PRD = 1 << 4,    // period (full stop)
+    CH_SND = 1 << 5,    // sentence end: '?' and '!'
+    CH_BLK = 1 << 6,    // blank
+    CH_NWL = 1 << 7,    // newline
+    CH_SPC = 1 << 8,    // special character (or whitespace)
+    CH_CWS = 1 << 9,    // conditional whitespace: if the character is between two
+                        // alphanumeric characters it becomes part of the word
+                        // (e.g. / @ -), otherwise it is treated as whitespace
+    CH_NSP = 1 << 10,   // number separator ':' and ',': part of the number
+                        // if between digits
+    CH_APS = 1 << 11,   // apostrophe
+    CH_NPA = 1 << 12,   // new paragraph
+    CH_CUR = 1 << 13    // currency and degree symbol: part of the number if
+                        // directly after or before a digit
+  };
+
+// True for every class value numerically below CH_PRD
+// (CH_INVALID, CH_LWR, CH_UPR, CH_NUM, CH_USC and combinations of them).
+#define CHAR_CLASS_IS_TOKEN_PART(x)     ((x) < CH_PRD)
+
+  /** @name Tokenizer
+     <TT>Tokenizer</TT> is the implementation of a universal Unicode tokenizer
+     which is used in the UIMA tokenizer annotator. The class is abstract:
+     <TT>tokenCallback()</TT> is pure virtual and has to be supplied by a
+     derived class.
+     @see AnnotatorTokenizer
+  */
+  class Tokenizer {
+  public:
+    /// Default constructor
+    Tokenizer();
+    virtual ~Tokenizer();
+
+    /// Main tokenization function; the text is delimited by the two pointers
+    void process( const UChar *cpszStart, const UChar *cpszEnd );
+
+    /// Specify language to use (needed for stopword recognition only)
+    void setLanguage( const Language & crclLanguage );
+
+    /// Callback function triggered on token recognition
+    virtual int tokenCallback( unsigned long ulLocation,
+                               unsigned long ulLength,
+                               TokenProperties & crclTokenProperties,
+                               bool bNewPara,
+                               bool bNewSent ) = 0;
+
+    /// Returns the character class of a character
+    EnCharClass getCharClass(UChar c);
+
+    /// Change the character class for a code point
+    void setCharClass(WORD16 uiUnicodeCodePoint, EnCharClass enCharClass);
+
+    /// Reset char class table to initial values
+    void resetCharClasses();
+
+  protected:
+    // NOTE(review): presumably the internal entry point that ends up in
+    // tokenCallback(); the implementation is not part of this header - confirm.
+    int tokenEntry( const UChar *,
+                    size_t ulLocation,
+                    size_t ulLength,
+                    TokenProperties & crclTokenProperties,
+                    bool &bNewPara,
+                    bool &bNewSent,
+                    size_t & rulNewlines);
+
+  private:
+    bool isAbreviation(const UChar * pw16String, size_t uiLength) const;
+
+    // get character class to a character
+    EnCharClass getCharClassInl( UChar c );
+
+    bool          iv_bUseAlternateTerritories;
+    Language      iv_clLanguageABR;
+    ResourceABR * iv_pclResourceABR;
+    // this will either point to our constant static map or
+    // to a freshly allocated writable map if setCharClass has been called
+    TyCharmap *   iv_pauiCharMapWard;
+
+  };
+
+} // namespace uima
+
+#endif /* _INCLUDE_UIMASS */

Propchange: incubator/uima/uimacpp/trunk/src/test/src/uima/ss_tokenizer.hpp
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/uima/uimacpp/trunk/src/test/src/uima/strptrlenpair.hpp
URL: http://svn.apache.org/viewvc/incubator/uima/uimacpp/trunk/src/test/src/uima/strptrlenpair.hpp?view=auto&rev=503266
==============================================================================
--- incubator/uima/uimacpp/trunk/src/test/src/uima/strptrlenpair.hpp (added)
+++ incubator/uima/uimacpp/trunk/src/test/src/uima/strptrlenpair.hpp Sat Feb  3 09:19:57 2007
@@ -0,0 +1,532 @@
+#ifndef UIMA_STRPTRLENPAIR_HPP
+#define UIMA_STRPTRLENPAIR_HPP
+/** \file strptrlenpair.hpp .
+-----------------------------------------------------------------------------
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+
+   \brief  Shallow string object consisting of a pair of
+           string pointer and a length
+
+-----------------------------------------------------------------------------
+*/
+
+#include "uima/pragmas.hpp" //must be included first to disable warnings
+
+#include <vector>
+#include <utility>
+#include <string>
+#include <iostream>
+
+#include "uima/assertmsg.h"
+//#include "uima/ccsid.hpp"
+//#include "uima/u2cpcnvrt.hpp"
+#include "unicode/uchar.h"
+
+/* ----------------------------------------------------------------------- */
+/*       Interface dependencies                                            */
+/* ----------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------- */
+/*       Types / Classes                                                   */
+/* ----------------------------------------------------------------------- */
+
+namespace uima {
+
+  /**
+     The class <TT>BasicStrPtrLenPair</TT> provides support for non zero-terminated strings
+     that are presented as pointers to string arrays with an associated length.
+     It is a shallow reference: only the pointer and the length are stored, the
+     characters themselves are never copied. As this type of string is used only
+     as string reference into read-only buffers, the string pointer is constant.
+     The member functions are named like those of an ANSI basic_string.
+     This enables a limited use of basic-l-strings in template functions that
+     are designed for basic_strings (the hash functions will work, for example).
+     Note: This is why the previous function <TT>set()</TT> has been renamed
+     <TT>assign()</TT>.
+
+     Implementation note: <TT>first</TT> (pointer) and <TT>second</TT> (length)
+     are members of a base class that depends on <TT>CharT</TT>. Unqualified
+     names are not looked up in dependent base classes, therefore they are
+     always accessed as <TT>this->first</TT> and <TT>this->second</TT>.
+  */
+  template < class CharT >
+  class BasicStrPtrLenPair : public std::pair< CharT const *, size_t > {
+  public:
+    ///(Default) Constructor: the empty string (NULL pointer, length 0)
+    BasicStrPtrLenPair( void ) :
+        std::pair< CharT const * , size_t >(NULL, 0) {}
+
+    ///Constructor from zero terminated string (length taken from <TT>strlen_templ()</TT>)
+    BasicStrPtrLenPair(
+      const CharT * cpacString
+    ) :
+        std::pair< CharT const * , size_t >(cpacString, strlen_templ(cpacString)) {
+      assert(   (EXISTS(this->first) )
+                || ((this->first == NULL ) && (this->second == 0)) );
+    }
+
+    ///Constructor from string and length
+    BasicStrPtrLenPair(
+      const CharT * cpacString,
+      size_t        uiLength
+    ) :
+        std::pair< CharT const * , size_t >(cpacString, uiLength) {
+      assert(   (EXISTS(this->first) )
+                || ((this->first == NULL ) && (this->second == 0)) );
+    }
+
+    /// Constructor from two pointers (begin/end). Note: end points to the first char <em>behind</em> the string.
+    BasicStrPtrLenPair(
+      const CharT * paucStringBegin,
+      const CharT * paucStringEnd
+    ) :
+        std::pair< CharT const * , size_t >(paucStringBegin, paucStringEnd - paucStringBegin )  //lint !e613: Possible use of null pointer 'paucStringEnd' in left argument to operator 'ptr-ptr'
+    {
+      assert(EXISTS(paucStringBegin));
+      assert(EXISTS(paucStringEnd));
+      assert(paucStringEnd >= paucStringBegin);
+      assert(   (EXISTS(this->first) )
+                || ((this->first == NULL ) && (this->second == 0)) );
+    }
+
+    ///Constructor from basic_string<CharT>; references the buffer of <TT>crclBasicString</TT>, no copy is made
+    BasicStrPtrLenPair(
+      const std::basic_string< CharT > & crclBasicString
+    ) :
+        std::pair< CharT const * , size_t >(crclBasicString.data(), crclBasicString.length()) {
+      assert(   (EXISTS(this->first) )
+                || ((this->first == NULL ) && (this->second == 0)) );
+    }
+
+    ///Constructor from pair
+    BasicStrPtrLenPair(
+      std::pair< CharT const * , size_t > const & crclPair
+    ) :  //lint !e1724: Argument to copy constructor for class 'uima::BasicStrPtrLenPair<<1>>' should be a const reference
+        std::pair< CharT const * , size_t >(crclPair.first, crclPair.second) {
+      assert(   (EXISTS(this->first) )
+                || ((this->first == NULL ) && (this->second == 0)) );
+    }
+
+    ///Accessor for the string length in logical characters
+    size_t
+    length( void ) const {
+      return this->second;
+    }
+
+    ///Accessor for the string length in bytes
+    size_t
+    getSizeInBytes( void ) const {
+      return (this->second * sizeof(CharT));
+    }
+
+    ///CONST Accessor for the string content (NOT ZERO DELIMITED!).
+    const CharT *
+    data( void ) const {
+      return this->first;
+    }
+
+    ///CONST Accessor to the begin of string content (NOT ZERO DELIMITED!).
+    const CharT *
+    begin( void ) const {
+      return (this->first);
+    }
+
+    ///Accessor to position AFTER the end of string content (NULL for a NULL string).
+    const CharT *
+    end( void ) const {
+      return (this->first == NULL ? NULL : this->first + this->second);
+    }
+
+    /**
+       Finds the first occurrence of key character <TT>cPattern</TT> in the string.
+       The result is whatever <TT>str_find_first()</TT> returns for the whole string.
+    */
+    size_t
+    find( CharT cPattern ) const {
+      return str_find_first( cPattern, this->first, this->second);
+    }  //lint !e1746: parameter 'cPattern' in function 'BasicStrPtrLenPair<UChar>::find(UChar) const' could be made const reference
+
+    /**
+       Finds the first occurrence of the key string <TT>crlstrPattern</TT>
+       in the string.
+    */
+    size_t
+    find(
+      const BasicStrPtrLenPair< CharT > & crlstrPattern // pattern to search for
+    ) const {
+      return str_find_first( crlstrPattern.first, crlstrPattern.second,
+                             this->first, this->second);
+    }
+
+    /**
+       Finds the first occurrence of key string <TT>cpacPattern</TT> (with length
+       <TT>uiPatternLen</TT>) in the string
+    */
+    size_t
+    find(
+      const CharT * cpacPattern,   // pattern to search for
+      size_t        uiPatternLen   // length of pattern
+    ) const {
+      return str_find_first( cpacPattern, uiPatternLen, this->first, this->second);
+    }
+
+    /**
+       Finds the first occurrence of key string <TT>cpacPattern</TT> (with length
+       <TT>uiPatternLen</TT>), searching from <TT>uiStartPos</TT> up to the
+       end of the string. Returns <TT>STRING_NPOS</TT> if <TT>uiStartPos</TT>
+       is not inside the string.
+
+       The fourth parameter (a length limit for the searched range) is
+       accepted for interface compatibility but is not evaluated.
+       NOTE(review): the value returned by <TT>str_find_first()</TT> is passed
+       through unchanged; presumably it is relative to <TT>uiStartPos</TT> -
+       confirm against <TT>str_find_first()</TT>.
+    */
+    size_t
+    find(
+      const CharT * cpacPattern,   // pattern to search for
+      size_t        uiPatternLen,  // length of pattern
+      size_t        uiStartPos,    // from this pos
+      size_t        /*uiStartLength*/  // up to uiStartPos+uiStartLength
+    ) const {
+      if ( uiStartPos >= this->second ) {
+        return STRING_NPOS; // If search starts past end of str, indicate "not found".
+      }
+
+      assert(EXISTS(cpacPattern));
+      assert(EXISTS(this->first));
+      // Only (second - uiStartPos) characters are left behind uiStartPos;
+      // passing the full length would let the search run past the buffer end.
+      return str_find_first( cpacPattern, uiPatternLen,
+                             (this->first + uiStartPos), this->second - uiStartPos);
+    }
+
+    /** Return a sub-string of this string starting from position <TT>uiStartPos</TT>
+        and including the following <TT>uiLength</TT> characters.
+        The requested range must lie inside the string; it may end exactly
+        at the end of the string.
+    */
+    BasicStrPtrLenPair< CharT >
+    sub_str(
+      size_t uiStartPos,
+      size_t uiLength
+    ) const {
+      assert(uiStartPos <= this->second);
+      assert(uiStartPos + uiLength <= this->second);
+      assert(EXISTS(this->first));
+      return BasicStrPtrLenPair< CharT >(this->first + uiStartPos, uiLength);
+    }
+
+    ///Set the string to new value. (used to be named <TT>set()</TT>)
+    BasicStrPtrLenPair< CharT > &
+    assign(
+      const CharT * cpacString,
+      size_t        uiLength
+    ) {
+      this->first  = cpacString;
+      this->second = uiLength;
+      assert(   (EXISTS(this->first) )
+                || ((this->first == NULL ) && (this->second == 0)) );
+
+      return (*this);
+    }
+
+    ///Set the string to reference the content of a basic_string. (used to be named <TT>set()</TT>)
+    BasicStrPtrLenPair< CharT > &
+    assign(
+      const std::basic_string< CharT > & crclBasicString
+    ) {
+      this->first  = crclBasicString.data();
+      this->second = crclBasicString.length();
+      assert(   (EXISTS(this->first) )
+                || ((this->first == NULL ) && (this->second == 0)) );
+
+      return (*this);
+    }
+
+    ///Set the string to the pointer and length of a pair
+    BasicStrPtrLenPair< CharT > &
+    assign( const std::pair< CharT const *, size_t > & crclPair ) {
+      this->first  = crclPair.first;
+      this->second = crclPair.second;
+      return (*this);
+    }
+
+    /** Accessor for the string content (CharT dependant string return type).
+        Returns a copy of the referenced characters; a NULL string yields
+        an empty basic_string.
+    */
+    std::basic_string< CharT >
+    copyToBasicString(
+      void
+    ) const {
+      if (this->first == NULL) {
+        return std::basic_string< CharT >();
+      }
+      return std::basic_string< CharT >(this->first, this->second);
+    }
+
+#ifdef NEVER
+    /// convert to single byte string. crclCCSID specifies the target encoding)
+    std::string
+    prv_asSingleByteString(
+      const uima::CCSID & crclCCSID
+    ) const {
+      if (sizeof(CharT) == 1) {  //lint !e774: Boolean within 'if' always evaluates to True
+        // single byte lstrings
+        return string((char*)data(), length());
+      }
+      if (length() == 0) {
+        return string();
+      }
+      assert(sizeof(CharT) == 2); // unicode lstrings
+      assert(EXISTS(data()));  //lint !e527 !e666: Expression with side effects passed to repeated parameter 1 in macro EXISTS
+      // Small values are copied in a stack based buffer, larger are allocated
+      // Max string length to handle stack based
+      const size_t STACK_BUFF_LIMIT = 64;
+      // Our stack buffer
+      char   acStackBuff [STACK_BUFF_LIMIT];
+      // A pointer to either the stack buffer of dynamic storage
+      char * pcCharBuff;
+
+      Unicode2CodePageConverter clConverter(crclCCSID);
+      size_t uiMaxNewLength = clConverter.getMaximumSizeForLength(length());
+
+      if (uiMaxNewLength < STACK_BUFF_LIMIT) {
+        pcCharBuff = acStackBuff;     //use stack buffer
+      } else {
+        pcCharBuff = new char [uiMaxNewLength];  //allocate
+      }
+      // Now convert UChar into char array
+      size_t uiCharsWritten = clConverter.convertCharacters(pcCharBuff, uiMaxNewLength, (const UChar*)data(), length());
+
+      // Construct our string
+      string strRetVal(pcCharBuff, uiCharsWritten);
+
+
+      if (uiMaxNewLength >= STACK_BUFF_LIMIT) { // if allocated ...
+        delete [] pcCharBuff; //lint !e673 Possibly inappropriate deallocation (delete[]) for 'auto' data
+      }
+      return strRetVal;
+    }
+#endif
+
+    ///CONST Array Index Access operator (index must be smaller than the length)
+    const CharT &
+    operator[]( size_t uiIndex ) const {
+      assert(uiIndex < this->second);
+      assert(EXISTS(this->first));
+      return this->first[uiIndex];  //lint !e613: Possible use of null pointer 'BasicStrPtrLenPair<wchar_t>::first' in left argument to operator '['
+    }
+
+
+    ///Equality operator: same length and same characters (compared with <TT>strncmp_templ()</TT>)
+    int
+    operator==( const BasicStrPtrLenPair< CharT > & crclRHS ) const {
+      if (this->second != crclRHS.second) {
+        return false;
+      }
+      return strncmp_templ(this->first, crclRHS.first, this->second) == 0;
+    }
+
+    ///Assignment operator (shallow: copies pointer and length only)
+    BasicStrPtrLenPair< CharT > &
+    operator=( BasicStrPtrLenPair< CharT > const & crclRHS ) {
+      this->first  = crclRHS.first;
+      this->second = crclRHS.second;
+      return (*this);
+    }
+
+    ///Assignment operator from a pointer/length pair
+    BasicStrPtrLenPair< CharT > &
+    operator=( std::pair< CharT const *, size_t > const & crclPair ) { //lint !e1520 !e1720 :multiple assignment ops  assignment operator for class 'uima::BasicStrPtrLenPair<<1>>' has non-const parameter
+      this->first  = crclPair.first;
+      this->second = crclPair.second;
+      return (*this);
+    }
+
+    /** less operator: character-wise comparison; a string that is a proper
+        prefix of another one is the smaller of the two.
+    */
+    bool operator <( BasicStrPtrLenPair< CharT > const & crclRHS ) const {
+      size_t uiLen1 = length();
+      size_t uiLen2 = crclRHS.length();
+      if (uiLen2 == 0) {
+        return(false);  // nothing is smaller than the empty string
+      }
+      if (uiLen1 == 0) {
+        return(true);
+      }
+      const CharT * cpszString1 = data();
+      const CharT * cpszString2 = crclRHS.data();
+      // skip the common prefix
+      while (uiLen1 != 0 && uiLen2 != 0 && *cpszString1 == *cpszString2) {
+        ++cpszString1;
+        ++cpszString2;
+        --uiLen1;
+        --uiLen2;
+      }
+      if (uiLen2 == 0) {
+        return(false);  // right side exhausted: equal, or right is a prefix of left
+      }
+      if (uiLen1 == 0) {
+        return(true);   // left side is a proper prefix of the right side
+      }
+      return (*cpszString1 < *cpszString2);
+    }
+
+  };
+
+///Pointer/length string over single byte characters (char).
+  typedef BasicStrPtrLenPair< char >    StrPtrLenPair;
+
+///Pointer/length string over wide characters (wchar_t).
+  typedef BasicStrPtrLenPair< wchar_t > WStrPtrLenPair;
+
+///Pointer/length string over UChar characters (UChar is not declared in this file; "unicode/uchar.h" is included above).
+  typedef BasicStrPtrLenPair< UChar >   UStrPtrLenPair;
+
+#if defined(UNDECLARED_FUNCTION_TEMPLATES_LINK_BUG)
+// To work around "unsatisfied symbols" during linking, the stream output operator
+// needs a declaration in addition to its definition (NOTE: the definition below is only compiled when NEVER is defined)
+  template < class CharT >
+  std::ostream &
+  operator << (
+    std::ostream &                           rclOStream,
+    const BasicStrPtrLenPair< CharT > & crclLString
+  );
+#endif
+
+#ifdef NEVER
+///Output stream support for BasicStrPtrLenPair (only compiled when NEVER is defined): writes the string converted by prv_asSingleByteString(), using the console CCSID for cout/cerr and UTF-8 for every other stream
+  template < class CharT >
+  inline std::ostream &
+  operator << (
+    std::ostream &                      rclOStream,
+    const BasicStrPtrLenPair< CharT > & crclLString
+  ) {
+    if (rclOStream == cout || rclOStream == cerr) {  //lint !e1912: Implicit call of conversion function from class 'basic_ostream' to type 'void *'
+      rclOStream << crclLString.prv_asSingleByteString(CosClCCSID::getConsoleCCSID()).c_str();
+    } else {
+      rclOStream << crclLString.prv_asSingleByteString(CosClCCSID::CosEnCCSID_UTF8).c_str();
+    }
+    return rclOStream;
+  }
+
+
+///Output stream support for UChar pointer/length pairs (only compiled when NEVER is defined): wraps the pair in a BasicStrPtrLenPair and applies the same CCSID selection as above
+  inline std::ostream &
+  operator << (
+    std::ostream &                             rclOStream,
+    const std::pair< UChar const *, size_t > & crclPair
+  ) {
+    BasicStrPtrLenPair< UChar > const lString(crclPair);
+    if (rclOStream == std::cout || rclOStream == std::cerr) {  //lint !e1912: Implicit call of conversion function from class 'basic_ostream' to type 'void *'
+      rclOStream << lString.prv_asSingleByteString(CosClCCSID::getConsoleCCSID()).c_str();
+    } else {
+      rclOStream << lString.prv_asSingleByteString(CosClCCSID::CosEnCCSID_UTF8).c_str();
+    }
+    return rclOStream;
+  }
+
+#endif
+
+
+  /* ----------------------------------------------------------------------- */
+  /** @name vector to/from delimited string conversion routines              */
+  /* ----------------------------------------------------------------------- */
+  /*@{*/
+
+  /**
+     Removes whitespace from both ends of a string.
+     Template function using <TT>isspace_templ()</TT>.
+
+     The result references the same buffer as <TT>s</TT>; no characters are
+     copied. An empty input is returned unchanged, an input consisting of
+     whitespace only yields a string of length 0 that starts at the begin
+     of <TT>s</TT>.
+
+     @param s  The string to trim
+     @return   The trimmed sub-string of <TT>s</TT>
+  */
+  template < class CharT >
+  inline BasicStrPtrLenPair< CharT >
+  strtrim(
+    const BasicStrPtrLenPair< CharT > & s
+  ) {
+    if (s.length() == 0) {
+      return s;
+    }
+    // Work on the half-open range [beg, end): this way <TT>end</TT> never has
+    // to be moved in front of the buffer, even if every character is whitespace.
+    const CharT * beg = s.data();
+    const CharT * end = beg + s.length();
+    while (end > beg && isspace_templ(*(end - 1)) ) {
+      --end;
+    }
+    while (beg < end && isspace_templ(*beg) ) {
+      ++beg;
+    }
+    return BasicStrPtrLenPair< CharT >(beg, static_cast< size_t >(end - beg));
+  }
+
+  /**
+     Splits a delimited string into pieces and stores the results in a vector
+     of strings. Delimiters are passed as a zero terminated string.
+     The pieces reference the input buffer; no characters are copied.
+
+     Every delimiter ends the current piece, so leading and consecutive
+     delimiters produce empty pieces. A delimiter that is the very last
+     character of the input does not produce an additional empty piece.
+
+     @param rveclstrOutput      (Output) The vector where the results are appended
+     @param pcInput             The delimited string to split.
+     @param uiInputLength       The number of chars in pcInput
+     @param cpszDelimiters      The delimiters. CharT* are interpreted as a set of delimiters.
+     @param bTrimString         Flag: If true, all pieces will be trimmed before storing in <TT>rveclstrOutput</TT>
+     @param bInsertEmptyStrings Flag: If false, pieces that have length 0 will not be stored in <TT>rveclstrOutput</TT>
+
+     @return The number of strings added to <TT>rveclstrOutput</TT>
+  */
+  template < class CharT >
+  inline size_t
+  delimitedStrPtrLenPair2Vector(
+    std::vector< uima::BasicStrPtrLenPair< CharT > > & rveclstrOutput,
+    const CharT                           * pcInput,
+    size_t                                  uiInputLength,
+    const CharT                           * cpszDelimiters,
+    bool                                    bTrimString,
+    bool                                    bInsertEmptyStrings
+  ) {
+    size_t uiNumFound = 0;
+    size_t uiDelimitersLen = strlen_templ(cpszDelimiters);
+
+    if (uiInputLength == 0) {
+      return 0;
+    }
+    const CharT * pcBegin    = pcInput;
+    const CharT * pcInputEnd = pcInput + uiInputLength;
+    BasicStrPtrLenPair< CharT > _s;
+
+    while (pcBegin < pcInputEnd) {
+      // Offset of the next delimiter relative to pcBegin, or STRING_NPOS.
+      const size_t uiDelimPos =
+        str_find_first_of(cpszDelimiters, uiDelimitersLen, pcBegin, (size_t)(pcInputEnd-pcBegin));
+      const bool bDelimFound = (uiDelimPos != STRING_NPOS);
+      // The offset is only added to the pointer if a delimiter was found;
+      // without one the piece simply runs up to the end of the input.
+      const CharT * pcPieceEnd = bDelimFound ? pcBegin + uiDelimPos : pcInputEnd;
+
+      _s.assign(pcBegin, (size_t)(pcPieceEnd - pcBegin));
+      if (bTrimString) {
+        _s = strtrim(_s);
+      }
+      if (bInsertEmptyStrings || _s.length() > 0) {
+        rveclstrOutput.push_back(_s);
+        uiNumFound++;
+      }
+      // Continue behind the delimiter, or stop if the input is used up.
+      pcBegin = bDelimFound ? pcPieceEnd + 1 : pcInputEnd;
+    }
+    return uiNumFound;
+  }
+
+  /**
+     Convenience overload for a zero terminated input string: the input length
+     is determined with <TT>strlen_templ()</TT>, everything else is forwarded
+     to the overload that takes an explicit input length.
+
+     @return The number of strings added to <TT>veclstrOutput</TT>
+  */
+  template < class CharT >
+  inline size_t
+  delimitedStrPtrLenPair2Vector(
+    std::vector< BasicStrPtrLenPair< CharT > > & veclstrOutput,
+    const CharT                           * pcInput,
+    const CharT                           * cpszDelimiters,
+    bool                                    bTrimString,
+    bool                                    bInsertEmptyStrings
+  ) {
+    const size_t uiInputLength = strlen_templ(pcInput);
+    return delimitedStrPtrLenPair2Vector(veclstrOutput,
+                                         pcInput,
+                                         uiInputLength,
+                                         cpszDelimiters,
+                                         bTrimString,
+                                         bInsertEmptyStrings);
+  }
+
+//@}
+
+} // namespace uima
+
+#endif /* UIMA_STRPTRLENPAIR_HPP */
+
+/* <EOF> */
+

Propchange: incubator/uima/uimacpp/trunk/src/test/src/uima/strptrlenpair.hpp
------------------------------------------------------------------------------
    svn:eol-style = native