Posted to commits@doris.apache.org by mo...@apache.org on 2023/01/09 03:29:50 UTC

[doris-thirdparty] branch clucene updated: [Update] initialize clucene (#23)

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git


The following commit(s) were added to refs/heads/clucene by this push:
     new f62d011  [Update] initialize clucene (#23)
f62d011 is described below

commit f62d011d0ae1915a7f560467064ce642ab218ca4
Author: airborne12 <ai...@gmail.com>
AuthorDate: Mon Jan 9 11:29:44 2023 +0800

    [Update] initialize clucene (#23)
    
    Co-authored-by: airborne12 <ai...@gmail.com>
---
 APACHE.license                                     |   201 +
 AUTHORS                                            |    23 +
 CMakeLists.txt                                     |   218 +
 COPYING                                            |   126 +
 ChangeLog                                          |    17 +
 INSTALL                                            |   265 +
 LGPL.license                                       |   475 +
 NEWS                                               |     0
 README                                             |    63 +
 README.PACKAGE                                     |    11 +
 REQUESTS                                           |     4 +
 cmake/CLuceneBoost.cmake                           |    23 +
 cmake/CLuceneDocs.cmake                            |   151 +
 cmake/CreateClucenePackages.cmake                  |    91 +
 cmake/DefineOptions.cmake                          |    53 +
 cmake/Toolchain-g++32.cmake                        |    20 +
 cmake/Toolchain-llvm.cmake                         |     8 +
 cmake/Toolchain-mingw32.cmake                      |    32 +
 cmake/TurboPFOR.cmake                              |    15 +
 cmake/cmake_uninstall.cmake.in                     |    23 +
 dist-test.sh                                       |   272 +
 doc/Doxyfile.cmake                                 |   237 +
 doc/coding standards.txt                           |   113 +
 doc/doxygen.css.cmake                              |   163 +
 doc/helpfooter.htm.cmake                           |     4 +
 doc/helpheader.htm.cmake                           |    24 +
 .../CLucene/analysis/LanguageBasedAnalyzer.cpp     |    69 +
 .../CLucene/analysis/LanguageBasedAnalyzer.h       |    26 +
 .../CLucene/analysis/PorterStemmer.cpp             |   313 +
 src/contribs-lib/CLucene/analysis/PorterStemmer.h  |   151 +
 .../CLucene/analysis/cjk/CJKAnalyzer.cpp           |   190 +
 .../CLucene/analysis/cjk/CJKAnalyzer.h             |    94 +
 .../CLucene/analysis/de/GermanAnalyzer.cpp         |   149 +
 .../CLucene/analysis/de/GermanAnalyzer.h           |   108 +
 .../CLucene/analysis/de/GermanStemFilter.cpp       |    60 +
 .../CLucene/analysis/de/GermanStemFilter.h         |    54 +
 .../CLucene/analysis/de/GermanStemmer.cpp          |   213 +
 .../CLucene/analysis/de/GermanStemmer.h            |    98 +
 .../CLucene/analysis/jieba/ChineseTokenizer.cpp    |    48 +
 .../CLucene/analysis/jieba/ChineseTokenizer.h      |    54 +
 .../CLucene/analysis/jieba/DictTrie.hpp            |   286 +
 .../CLucene/analysis/jieba/FullSegment.hpp         |    93 +
 .../CLucene/analysis/jieba/HMMModel.hpp            |   129 +
 .../CLucene/analysis/jieba/HMMSegment.hpp          |   190 +
 src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp  |   134 +
 .../CLucene/analysis/jieba/KeywordExtractor.hpp    |   153 +
 .../CLucene/analysis/jieba/LocalVector.hpp         |   139 +
 .../CLucene/analysis/jieba/Logging.hpp             |    90 +
 .../CLucene/analysis/jieba/MPSegment.hpp           |   137 +
 .../CLucene/analysis/jieba/MixSegment.hpp          |   109 +
 .../CLucene/analysis/jieba/PosTagger.hpp           |    77 +
 .../CLucene/analysis/jieba/PreFilter.hpp           |    54 +
 .../CLucene/analysis/jieba/QuerySegment.hpp        |    89 +
 .../CLucene/analysis/jieba/SegmentBase.hpp         |    46 +
 .../CLucene/analysis/jieba/SegmentTagged.hpp       |    23 +
 .../CLucene/analysis/jieba/StdExtension.hpp        |   157 +
 .../CLucene/analysis/jieba/StringUtil.hpp          |   380 +
 .../CLucene/analysis/jieba/TextRankExtractor.hpp   |   190 +
 src/contribs-lib/CLucene/analysis/jieba/Trie.hpp   |   200 +
 .../CLucene/analysis/jieba/Unicode.hpp             |   227 +
 src/contribs-lib/CLucene/highlighter/Encoder.cpp   |    19 +
 src/contribs-lib/CLucene/highlighter/Encoder.h     |    56 +
 src/contribs-lib/CLucene/highlighter/Formatter.cpp |    18 +
 src/contribs-lib/CLucene/highlighter/Formatter.h   |    49 +
 .../CLucene/highlighter/Fragmenter.cpp             |    19 +
 src/contribs-lib/CLucene/highlighter/Fragmenter.h  |    52 +
 .../CLucene/highlighter/HighlightScorer.h          |    63 +
 .../CLucene/highlighter/Highlighter.cpp            |   525 +
 src/contribs-lib/CLucene/highlighter/Highlighter.h |   226 +
 .../CLucene/highlighter/QueryScorer.cpp            |   118 +
 src/contribs-lib/CLucene/highlighter/QueryScorer.h |   114 +
 .../CLucene/highlighter/QueryTermExtractor.cpp     |   135 +
 .../CLucene/highlighter/QueryTermExtractor.h       |    82 +
 src/contribs-lib/CLucene/highlighter/Scorer.h      |    61 +
 .../CLucene/highlighter/SimpleFragmenter.cpp       |    55 +
 .../CLucene/highlighter/SimpleFragmenter.h         |    70 +
 .../CLucene/highlighter/SimpleHTMLEncoder.cpp      |    83 +
 .../CLucene/highlighter/SimpleHTMLEncoder.h        |    45 +
 .../CLucene/highlighter/SimpleHTMLFormatter.cpp    |    56 +
 .../CLucene/highlighter/SimpleHTMLFormatter.h      |    59 +
 .../CLucene/highlighter/TextFragment.cpp           |    78 +
 .../CLucene/highlighter/TextFragment.h             |    87 +
 .../CLucene/highlighter/TokenGroup.cpp             |   123 +
 src/contribs-lib/CLucene/highlighter/TokenGroup.h  |    83 +
 .../CLucene/highlighter/TokenSources.cpp           |   229 +
 .../CLucene/highlighter/TokenSources.h             |    86 +
 .../CLucene/highlighter/WeightedTerm.cpp           |   101 +
 .../CLucene/highlighter/WeightedTerm.h             |    79 +
 src/contribs-lib/CLucene/snowball/SNOWBALL_README  |    82 +
 src/contribs-lib/CLucene/snowball/Snowball.cpp     |   137 +
 .../CLucene/snowball/SnowballAnalyzer.h            |    44 +
 src/contribs-lib/CLucene/snowball/SnowballFilter.h |    41 +
 .../CLucene/snowball/include/libstemmer.h          |    82 +
 src/contribs-lib/CLucene/snowball/libstemmer.h     |    79 +
 .../CLucene/snowball/libstemmer/libstemmer.c       |    92 +
 .../CLucene/snowball/libstemmer/modules.h          |   166 +
 src/contribs-lib/CLucene/snowball/runtime/api.c    |    69 +
 src/contribs-lib/CLucene/snowball/runtime/api.h    |    30 +
 src/contribs-lib/CLucene/snowball/runtime/header.h |    61 +
 .../CLucene/snowball/runtime/utilities.c           |   446 +
 src/contribs-lib/CLucene/snowball/snowball.version |     2 +
 .../snowball/src_c/stem_ISO_8859_1_danish.c        |   338 +
 .../snowball/src_c/stem_ISO_8859_1_danish.h        |    16 +
 .../CLucene/snowball/src_c/stem_ISO_8859_1_dutch.c |   635 +
 .../CLucene/snowball/src_c/stem_ISO_8859_1_dutch.h |    16 +
 .../snowball/src_c/stem_ISO_8859_1_english.c       |  1156 ++
 .../snowball/src_c/stem_ISO_8859_1_english.h       |    16 +
 .../snowball/src_c/stem_ISO_8859_1_finnish.c       |   792 +
 .../snowball/src_c/stem_ISO_8859_1_finnish.h       |    16 +
 .../snowball/src_c/stem_ISO_8859_1_french.c        |  1276 ++
 .../snowball/src_c/stem_ISO_8859_1_french.h        |    16 +
 .../snowball/src_c/stem_ISO_8859_1_german.c        |   512 +
 .../snowball/src_c/stem_ISO_8859_1_german.h        |    16 +
 .../snowball/src_c/stem_ISO_8859_1_italian.c       |  1091 ++
 .../snowball/src_c/stem_ISO_8859_1_italian.h       |    16 +
 .../snowball/src_c/stem_ISO_8859_1_norwegian.c     |   296 +
 .../snowball/src_c/stem_ISO_8859_1_norwegian.h     |    16 +
 .../snowball/src_c/stem_ISO_8859_1_porter.c        |   776 +
 .../snowball/src_c/stem_ISO_8859_1_porter.h        |    16 +
 .../snowball/src_c/stem_ISO_8859_1_portuguese.c    |  1035 +
 .../snowball/src_c/stem_ISO_8859_1_portuguese.h    |    16 +
 .../snowball/src_c/stem_ISO_8859_1_spanish.c       |  1119 ++
 .../snowball/src_c/stem_ISO_8859_1_spanish.h       |    16 +
 .../snowball/src_c/stem_ISO_8859_1_swedish.c       |   307 +
 .../snowball/src_c/stem_ISO_8859_1_swedish.h       |    16 +
 .../CLucene/snowball/src_c/stem_KOI8_R_russian.c   |   701 +
 .../CLucene/snowball/src_c/stem_KOI8_R_russian.h   |    16 +
 .../CLucene/snowball/src_c/stem_UTF_8_danish.c     |   344 +
 .../CLucene/snowball/src_c/stem_UTF_8_danish.h     |    16 +
 .../CLucene/snowball/src_c/stem_UTF_8_dutch.c      |   653 +
 .../CLucene/snowball/src_c/stem_UTF_8_dutch.h      |    16 +
 .../CLucene/snowball/src_c/stem_UTF_8_english.c    |  1178 ++
 .../CLucene/snowball/src_c/stem_UTF_8_english.h    |    16 +
 .../CLucene/snowball/src_c/stem_UTF_8_finnish.c    |   808 +
 .../CLucene/snowball/src_c/stem_UTF_8_finnish.h    |    16 +
 .../CLucene/snowball/src_c/stem_UTF_8_french.c     |  1296 ++
 .../CLucene/snowball/src_c/stem_UTF_8_french.h     |    16 +
 .../CLucene/snowball/src_c/stem_UTF_8_german.c     |   526 +
 .../CLucene/snowball/src_c/stem_UTF_8_german.h     |    16 +
 .../CLucene/snowball/src_c/stem_UTF_8_italian.c    |  1113 ++
 .../CLucene/snowball/src_c/stem_UTF_8_italian.h    |    16 +
 .../CLucene/snowball/src_c/stem_UTF_8_norwegian.c  |   302 +
 .../CLucene/snowball/src_c/stem_UTF_8_norwegian.h  |    16 +
 .../CLucene/snowball/src_c/stem_UTF_8_porter.c     |   794 +
 .../CLucene/snowball/src_c/stem_UTF_8_porter.h     |    16 +
 .../CLucene/snowball/src_c/stem_UTF_8_portuguese.c |  1055 +
 .../CLucene/snowball/src_c/stem_UTF_8_portuguese.h |    16 +
 .../CLucene/snowball/src_c/stem_UTF_8_russian.c    |   709 +
 .../CLucene/snowball/src_c/stem_UTF_8_russian.h    |    16 +
 .../CLucene/snowball/src_c/stem_UTF_8_spanish.c    |  1137 ++
 .../CLucene/snowball/src_c/stem_UTF_8_spanish.h    |    16 +
 .../CLucene/snowball/src_c/stem_UTF_8_swedish.c    |   313 +
 .../CLucene/snowball/src_c/stem_UTF_8_swedish.h    |    16 +
 src/contribs-lib/CLucene/util/arrayinputstream.h   |    68 +
 src/contribs-lib/CLucene/util/byteinputstream.h    |    17 +
 .../CLucene/util/gzipcompressstream.cpp            |   126 +
 src/contribs-lib/CLucene/util/gzipcompressstream.h |    34 +
 src/contribs-lib/CLucene/util/gzipinputstream.cpp  |   184 +
 src/contribs-lib/CLucene/util/gzipinputstream.h    |    39 +
 src/contribs-lib/CLucene/util/streamarray.h        |    32 +
 src/contribs-lib/CMakeLists.txt                    |   128 +
 src/contribs-lib/cmake/FindIconv.cmake             |    57 +
 src/contribs/CMakeLists.txt                        |     2 +
 .../bashscripts/findPatchThatBrokeUnitTest.sh      |   113 +
 src/contribs/bashscripts/simpleupdate.sh           |    41 +
 src/contribs/bashscripts/twofileupdate.sh          |    42 +
 src/contribs/benchmarker/Benchmarker.cpp           |    39 +
 src/contribs/benchmarker/Benchmarker.h             |    23 +
 src/contribs/benchmarker/CMakeLists.txt            |    20 +
 src/contribs/benchmarker/Main.cpp                  |   109 +
 src/contribs/benchmarker/TestCLString.cpp          |    57 +
 src/contribs/benchmarker/TestCLString.h            |    23 +
 src/contribs/benchmarker/Timer.h                   |    48 +
 src/contribs/benchmarker/Unit.cpp                  |    94 +
 src/contribs/benchmarker/Unit.h                    |    29 +
 src/contribs/benchmarker/stdafx.cpp                |    11 +
 src/contribs/benchmarker/stdafx.h                  |    37 +
 src/contribs/contribs-lib-test/CMakeLists.txt      |    33 +
 src/contribs/contribs-lib-test/CuTest.cpp          |   536 +
 src/contribs/contribs-lib-test/CuTest.h            |   122 +
 src/contribs/contribs-lib-test/TestAnalysis.cpp    |   190 +
 src/contribs/contribs-lib-test/TestHighlight.cpp   |   269 +
 src/contribs/contribs-lib-test/TestSnowball.cpp    |    36 +
 src/contribs/contribs-lib-test/TestStreams.cpp     |    66 +
 src/contribs/contribs-lib-test/TestUtf8.cpp        |   189 +
 src/contribs/contribs-lib-test/contribTests.cpp    |    16 +
 src/contribs/contribs-lib-test/test.h              |    50 +
 src/contribs/contribs-lib-test/testall.cpp         |   216 +
 src/core/CLucene.h                                 |    48 +
 src/core/CLucene/CLConfig.h                        |   236 +
 src/core/CLucene/CLMonolithic.cpp                  |   137 +
 src/core/CLucene/StdHeader.cpp                     |    44 +
 src/core/CLucene/StdHeader.h                       |    40 +
 src/core/CLucene/_ApiHeader.h                      |    24 +
 src/core/CLucene/analysis/AnalysisHeader.cpp       |    85 +
 src/core/CLucene/analysis/AnalysisHeader.h         |   362 +
 src/core/CLucene/analysis/Analyzers.cpp            |   595 +
 src/core/CLucene/analysis/Analyzers.h              |   462 +
 src/core/CLucene/analysis/CachingTokenFilter.cpp   |    66 +
 src/core/CLucene/analysis/CachingTokenFilter.h     |    33 +
 .../CLucene/analysis/standard/StandardAnalyzer.cpp |   106 +
 .../CLucene/analysis/standard/StandardAnalyzer.h   |    75 +
 .../CLucene/analysis/standard/StandardFilter.cpp   |    58 +
 .../CLucene/analysis/standard/StandardFilter.h     |    35 +
 .../analysis/standard/StandardTokenizer.cpp        |   465 +
 .../CLucene/analysis/standard/StandardTokenizer.h  |    88 +
 .../analysis/standard/StandardTokenizerConstants.h |    27 +
 src/core/CLucene/debug/error.cpp                   |   108 +
 src/core/CLucene/debug/error.h                     |   105 +
 src/core/CLucene/debug/lucenebase.h                |    44 +
 src/core/CLucene/debug/mem.h                       |    64 +
 src/core/CLucene/document/DateField.cpp            |    61 +
 src/core/CLucene/document/DateField.h              |    59 +
 src/core/CLucene/document/DateTools.cpp            |   292 +
 src/core/CLucene/document/DateTools.h              |   101 +
 src/core/CLucene/document/Document.cpp             |   183 +
 src/core/CLucene/document/Document.h               |   178 +
 src/core/CLucene/document/Field.cpp                |   336 +
 src/core/CLucene/document/Field.h                  |   328 +
 src/core/CLucene/document/FieldSelector.cpp        |    65 +
 src/core/CLucene/document/FieldSelector.h          |   142 +
 src/core/CLucene/document/NumberTools.cpp          |    99 +
 src/core/CLucene/document/NumberTools.h            |    78 +
 src/core/CLucene/index/CompoundFile.cpp            |   456 +
 src/core/CLucene/index/DirectoryIndexReader.cpp    |   302 +
 src/core/CLucene/index/DirectoryIndexReader.h      |   139 +
 src/core/CLucene/index/DocumentsWriter.cpp         |  1683 ++
 .../CLucene/index/DocumentsWriterThreadState.cpp   |  1326 ++
 src/core/CLucene/index/FieldInfos.cpp              |   236 +
 src/core/CLucene/index/FieldsReader.cpp            |   565 +
 src/core/CLucene/index/FieldsWriter.cpp            |   269 +
 src/core/CLucene/index/IndexDeletionPolicy.cpp     |   140 +
 src/core/CLucene/index/IndexDeletionPolicy.h       |   152 +
 src/core/CLucene/index/IndexFileDeleter.cpp        |   530 +
 src/core/CLucene/index/IndexFileNameFilter.cpp     |    84 +
 src/core/CLucene/index/IndexFileNames.cpp          |   179 +
 src/core/CLucene/index/IndexModifier.cpp           |   277 +
 src/core/CLucene/index/IndexModifier.h             |   332 +
 src/core/CLucene/index/IndexReader.cpp             |   532 +
 src/core/CLucene/index/IndexReader.h               |   688 +
 src/core/CLucene/index/IndexWriter.cpp             |  2917 +++
 src/core/CLucene/index/IndexWriter.h               |  1370 ++
 src/core/CLucene/index/MergePolicy.cpp             |   521 +
 src/core/CLucene/index/MergePolicy.h               |   438 +
 src/core/CLucene/index/MergeScheduler.cpp          |    34 +
 src/core/CLucene/index/MergeScheduler.h            |    50 +
 src/core/CLucene/index/MultiReader.cpp             |   363 +
 src/core/CLucene/index/MultiReader.h               |   129 +
 src/core/CLucene/index/MultiSegmentReader.cpp      |   904 +
 src/core/CLucene/index/MultipleTermPositions.cpp   |   199 +
 src/core/CLucene/index/MultipleTermPositions.h     |    92 +
 src/core/CLucene/index/Payload.cpp                 |   108 +
 src/core/CLucene/index/Payload.h                   |   128 +
 src/core/CLucene/index/SDocumentWriter.cpp         |  1342 ++
 src/core/CLucene/index/SDocumentWriter.h           |   786 +
 src/core/CLucene/index/SegmentInfos.cpp            |  1133 ++
 src/core/CLucene/index/SegmentMergeInfo.cpp        |   107 +
 src/core/CLucene/index/SegmentMergeQueue.cpp       |    76 +
 src/core/CLucene/index/SegmentMerger.cpp           |   813 +
 src/core/CLucene/index/SegmentReader.cpp           |  1126 ++
 src/core/CLucene/index/SegmentTermDocs.cpp         |   210 +
 src/core/CLucene/index/SegmentTermEnum.cpp         |   395 +
 src/core/CLucene/index/SegmentTermPositions.cpp    |   169 +
 src/core/CLucene/index/SegmentTermVector.cpp       |   156 +
 src/core/CLucene/index/SkipListReader.cpp          |   357 +
 src/core/CLucene/index/SkipListWriter.cpp          |   187 +
 src/core/CLucene/index/Term.cpp                    |   450 +
 src/core/CLucene/index/Term.h                      |   177 +
 src/core/CLucene/index/TermInfo.cpp                |    98 +
 src/core/CLucene/index/TermInfosReader.cpp         |   477 +
 src/core/CLucene/index/TermInfosWriter.cpp         |   418 +
 src/core/CLucene/index/TermVector.h                |   159 +
 src/core/CLucene/index/TermVectorReader.cpp        |   551 +
 src/core/CLucene/index/TermVectorWriter.cpp        |   230 +
 src/core/CLucene/index/Terms.cpp                   |    30 +
 src/core/CLucene/index/Terms.h                     |   189 +
 src/core/CLucene/index/_CompoundFile.h             |   144 +
 src/core/CLucene/index/_DocumentsWriter.h          |  1005 +
 src/core/CLucene/index/_FieldInfo.h                |    13 +
 src/core/CLucene/index/_FieldInfos.h               |   199 +
 src/core/CLucene/index/_FieldsReader.h             |   169 +
 src/core/CLucene/index/_FieldsWriter.h             |    62 +
 src/core/CLucene/index/_IndexFileDeleter.h         |   224 +
 src/core/CLucene/index/_IndexFileNameFilter.h      |    54 +
 src/core/CLucene/index/_IndexFileNames.h           |    55 +
 src/core/CLucene/index/_MultiSegmentReader.h       |   232 +
 src/core/CLucene/index/_SegmentHeader.h            |   451 +
 src/core/CLucene/index/_SegmentInfos.h             |   532 +
 src/core/CLucene/index/_SegmentMergeInfo.h         |    48 +
 src/core/CLucene/index/_SegmentMergeQueue.h        |    34 +
 src/core/CLucene/index/_SegmentMerger.h            |   198 +
 src/core/CLucene/index/_SegmentTermEnum.h          |   133 +
 src/core/CLucene/index/_SkipListReader.h           |   183 +
 src/core/CLucene/index/_SkipListWriter.h           |   130 +
 src/core/CLucene/index/_Term.h                     |    36 +
 src/core/CLucene/index/_TermInfo.h                 |    52 +
 src/core/CLucene/index/_TermInfosReader.h          |   131 +
 src/core/CLucene/index/_TermInfosWriter.h          |   148 +
 src/core/CLucene/index/_TermVector.h               |   330 +
 src/core/CLucene/index/bpacking.cpp                | 13859 +++++++++++++
 src/core/CLucene/index/bpacking.h                  |   726 +
 src/core/CLucene/index/common.h                    |    26 +
 src/core/CLucene/index/compression.h               |    88 +
 src/core/CLucene/index/turbocompression.h          |   183 +
 src/core/CLucene/index/turbopacking32.h            |  3836 ++++
 src/core/CLucene/index/turbopacking64.h            |  9695 +++++++++
 src/core/CLucene/index/util.h                      |    83 +
 src/core/CLucene/queryParser/FastCharStream.cpp    |   121 +
 .../CLucene/queryParser/MultiFieldQueryParser.cpp  |   181 +
 .../CLucene/queryParser/MultiFieldQueryParser.h    |   163 +
 src/core/CLucene/queryParser/QueryParser.cpp       |  1504 ++
 src/core/CLucene/queryParser/QueryParser.h         |   530 +
 .../CLucene/queryParser/QueryParserConstants.h     |    68 +
 .../queryParser/QueryParserTokenManager.cpp        |  1265 ++
 .../CLucene/queryParser/QueryParserTokenManager.h  |   112 +
 src/core/CLucene/queryParser/QueryToken.cpp        |    51 +
 src/core/CLucene/queryParser/QueryToken.h          |    94 +
 src/core/CLucene/queryParser/_CharStream.h         |   121 +
 src/core/CLucene/queryParser/_FastCharStream.h     |    66 +
 src/core/CLucene/queryParser/legacy/Lexer.cpp      |   371 +
 .../queryParser/legacy/MultiFieldQueryParser.cpp   |   216 +
 .../queryParser/legacy/MultiFieldQueryParser.h     |   131 +
 .../CLucene/queryParser/legacy/QueryParser.cpp     |   507 +
 src/core/CLucene/queryParser/legacy/QueryParser.h  |   336 +
 .../CLucene/queryParser/legacy/QueryParserBase.cpp |   384 +
 src/core/CLucene/queryParser/legacy/QueryToken.cpp |    72 +
 src/core/CLucene/queryParser/legacy/QueryToken.h   |    70 +
 src/core/CLucene/queryParser/legacy/TokenList.cpp  |    79 +
 src/core/CLucene/queryParser/legacy/_Lexer.h       |    65 +
 src/core/CLucene/queryParser/legacy/_TokenList.h   |    36 +
 src/core/CLucene/search/BooleanClause.h            |   107 +
 src/core/CLucene/search/BooleanQuery.cpp           |   581 +
 src/core/CLucene/search/BooleanQuery.h             |   159 +
 src/core/CLucene/search/BooleanScorer.cpp          |   299 +
 src/core/CLucene/search/BooleanScorer2.cpp         |   688 +
 src/core/CLucene/search/CachingSpanFilter.cpp      |   120 +
 src/core/CLucene/search/CachingSpanFilter.h        |    60 +
 src/core/CLucene/search/CachingWrapperFilter.cpp   |   114 +
 src/core/CLucene/search/CachingWrapperFilter.h     |    66 +
 src/core/CLucene/search/ChainedFilter.cpp          |   219 +
 src/core/CLucene/search/ChainedFilter.h            |    86 +
 src/core/CLucene/search/Compare.cpp                |   120 +
 src/core/CLucene/search/Compare.h                  |    74 +
 src/core/CLucene/search/ConjunctionScorer.cpp      |   133 +
 src/core/CLucene/search/ConstantScoreQuery.cpp     |   313 +
 src/core/CLucene/search/ConstantScoreQuery.h       |   125 +
 src/core/CLucene/search/DateFilter.cpp             |    98 +
 src/core/CLucene/search/DateFilter.h               |    61 +
 src/core/CLucene/search/DisjunctionSumScorer.cpp   |   201 +
 src/core/CLucene/search/ExactPhraseScorer.cpp      |    91 +
 src/core/CLucene/search/Explanation.cpp            |   176 +
 src/core/CLucene/search/Explanation.h              |   120 +
 src/core/CLucene/search/FieldCache.cpp             |    65 +
 src/core/CLucene/search/FieldCache.h               |   174 +
 src/core/CLucene/search/FieldCacheImpl.cpp         |   577 +
 src/core/CLucene/search/FieldDoc.h                 |    52 +
 src/core/CLucene/search/FieldDocSortedHitQueue.cpp |   171 +
 src/core/CLucene/search/FieldSortedHitQueue.cpp    |   258 +
 src/core/CLucene/search/FieldSortedHitQueue.h      |   196 +
 src/core/CLucene/search/Filter.h                   |    42 +
 src/core/CLucene/search/FilterResultCache.cpp      |    48 +
 src/core/CLucene/search/FilterResultCache.h        |    68 +
 src/core/CLucene/search/FilteredTermEnum.cpp       |   119 +
 src/core/CLucene/search/FilteredTermEnum.h         |    58 +
 src/core/CLucene/search/FuzzyQuery.cpp             |   425 +
 src/core/CLucene/search/FuzzyQuery.h               |   204 +
 src/core/CLucene/search/HitQueue.cpp               |   108 +
 src/core/CLucene/search/Hits.cpp                   |   232 +
 src/core/CLucene/search/Hits.h                     |   102 +
 src/core/CLucene/search/IndexSearcher.cpp          |   441 +
 src/core/CLucene/search/IndexSearcher.h            |   100 +
 src/core/CLucene/search/MatchAllDocsQuery.cpp      |   201 +
 src/core/CLucene/search/MatchAllDocsQuery.h        |    74 +
 src/core/CLucene/search/MultiPhraseQuery.cpp       |   441 +
 src/core/CLucene/search/MultiPhraseQuery.h         |   116 +
 src/core/CLucene/search/MultiSearcher.cpp          |   244 +
 src/core/CLucene/search/MultiSearcher.h            |    81 +
 src/core/CLucene/search/MultiTermQuery.cpp         |   104 +
 src/core/CLucene/search/MultiTermQuery.h           |    64 +
 src/core/CLucene/search/PhrasePositions.cpp        |   114 +
 src/core/CLucene/search/PhraseQuery.cpp            |   485 +
 src/core/CLucene/search/PhraseQuery.h              |   107 +
 src/core/CLucene/search/PhraseScorer.cpp           |   223 +
 src/core/CLucene/search/PrefixQuery.cpp            |   311 +
 src/core/CLucene/search/PrefixQuery.h              |    80 +
 src/core/CLucene/search/Query.h                    |   154 +
 src/core/CLucene/search/QueryFilter.cpp            |    88 +
 src/core/CLucene/search/QueryFilter.h              |    40 +
 src/core/CLucene/search/RangeFilter.cpp            |   148 +
 src/core/CLucene/search/RangeFilter.h              |    75 +
 src/core/CLucene/search/RangeQuery.cpp             |   194 +
 src/core/CLucene/search/RangeQuery.h               |    92 +
 src/core/CLucene/search/Scorer.cpp                 |    41 +
 src/core/CLucene/search/Scorer.h                   |   134 +
 src/core/CLucene/search/ScorerDocQueue.cpp         |   191 +
 src/core/CLucene/search/ScorerDocQueue.h           |    54 +
 src/core/CLucene/search/SearchHeader.cpp           |   232 +
 src/core/CLucene/search/SearchHeader.h             |   152 +
 src/core/CLucene/search/Searchable.h               |   182 +
 src/core/CLucene/search/Similarity.cpp             |   243 +
 src/core/CLucene/search/Similarity.h               |   279 +
 src/core/CLucene/search/SloppyPhraseScorer.cpp     |   175 +
 src/core/CLucene/search/Sort.cpp                   |   340 +
 src/core/CLucene/search/Sort.h                     |   426 +
 src/core/CLucene/search/SpanFilter.h               |    39 +
 src/core/CLucene/search/SpanFilterResult.h         |   130 +
 src/core/CLucene/search/SpanQueryFilter.cpp        |    89 +
 src/core/CLucene/search/SpanQueryFilter.h          |    74 +
 src/core/CLucene/search/TermQuery.cpp              |   251 +
 src/core/CLucene/search/TermQuery.h                |    52 +
 src/core/CLucene/search/TermScorer.cpp             |   129 +
 src/core/CLucene/search/WildcardQuery.cpp          |   151 +
 src/core/CLucene/search/WildcardQuery.h            |    70 +
 src/core/CLucene/search/WildcardTermEnum.cpp       |   182 +
 src/core/CLucene/search/WildcardTermEnum.h         |    65 +
 src/core/CLucene/search/_BooleanScorer.h           |   101 +
 src/core/CLucene/search/_BooleanScorer2.h          |    46 +
 src/core/CLucene/search/_ConjunctionScorer.h       |    41 +
 src/core/CLucene/search/_DisjunctionSumScorer.h    |   139 +
 src/core/CLucene/search/_ExactPhraseScorer.h       |    28 +
 src/core/CLucene/search/_FieldCacheImpl.h          |   119 +
 src/core/CLucene/search/_FieldDocSortedHitQueue.h  |   120 +
 src/core/CLucene/search/_HitQueue.h                |    57 +
 src/core/CLucene/search/_PhrasePositions.h         |    44 +
 src/core/CLucene/search/_PhraseQueue.h             |    38 +
 src/core/CLucene/search/_PhraseScorer.h            |    78 +
 src/core/CLucene/search/_SloppyPhraseScorer.h      |    78 +
 src/core/CLucene/search/_TermScorer.h              |    82 +
 src/core/CLucene/search/spans/NearSpansOrdered.cpp |   285 +
 .../CLucene/search/spans/NearSpansUnordered.cpp    |   271 +
 src/core/CLucene/search/spans/SpanFirstQuery.cpp   |   205 +
 src/core/CLucene/search/spans/SpanFirstQuery.h     |    66 +
 src/core/CLucene/search/spans/SpanNearQuery.cpp    |   204 +
 src/core/CLucene/search/spans/SpanNearQuery.h      |   112 +
 src/core/CLucene/search/spans/SpanNotQuery.cpp     |   274 +
 src/core/CLucene/search/spans/SpanNotQuery.h       |    67 +
 src/core/CLucene/search/spans/SpanOrQuery.cpp      |   304 +
 src/core/CLucene/search/spans/SpanOrQuery.h        |   104 +
 src/core/CLucene/search/spans/SpanQuery.h          |    40 +
 src/core/CLucene/search/spans/SpanScorer.cpp       |   122 +
 src/core/CLucene/search/spans/SpanScorer.h         |    51 +
 src/core/CLucene/search/spans/SpanTermQuery.cpp    |   104 +
 src/core/CLucene/search/spans/SpanTermQuery.h      |    59 +
 src/core/CLucene/search/spans/SpanWeight.cpp       |   180 +
 src/core/CLucene/search/spans/SpanWeight.h         |    46 +
 src/core/CLucene/search/spans/Spans.h              |    54 +
 src/core/CLucene/search/spans/TermSpans.cpp        |    99 +
 src/core/CLucene/search/spans/_EmptySpans.h        |    35 +
 src/core/CLucene/search/spans/_NearSpansOrdered.h  |   106 +
 .../CLucene/search/spans/_NearSpansUnordered.h     |   103 +
 src/core/CLucene/search/spans/_TermSpans.h         |    47 +
 src/core/CLucene/store/ByteArrayDataInput.cpp      |   148 +
 src/core/CLucene/store/ByteArrayDataInput.h        |    68 +
 src/core/CLucene/store/Directory.cpp               |    78 +
 src/core/CLucene/store/Directory.h                 |   100 +
 src/core/CLucene/store/FSDirectory.cpp             |   693 +
 src/core/CLucene/store/FSDirectory.h               |   168 +
 src/core/CLucene/store/IndexInput.cpp              |   323 +
 src/core/CLucene/store/IndexInput.h                |   202 +
 src/core/CLucene/store/IndexOutput.cpp             |   240 +
 src/core/CLucene/store/IndexOutput.h               |   175 +
 src/core/CLucene/store/Lock.cpp                    |   174 +
 src/core/CLucene/store/Lock.h                      |    50 +
 src/core/CLucene/store/LockFactory.cpp             |   159 +
 src/core/CLucene/store/LockFactory.h               |    77 +
 src/core/CLucene/store/MMapInput.cpp               |   301 +
 src/core/CLucene/store/RAMDirectory.cpp            |   579 +
 src/core/CLucene/store/RAMDirectory.h              |    98 +
 src/core/CLucene/store/_Lock.h                     |   134 +
 src/core/CLucene/store/_MMapIndexInput.h           |    40 +
 src/core/CLucene/store/_RAMDirectory.h             |   159 +
 src/core/CLucene/util/Array.h                      |   340 +
 src/core/CLucene/util/BitSet.cpp                   |   198 +
 src/core/CLucene/util/BitSet.h                     |    99 +
 src/core/CLucene/util/BitUtil.cpp                  |    24 +
 src/core/CLucene/util/BitUtil.h                    |    28 +
 src/core/CLucene/util/BytesRef.cpp                 |    73 +
 src/core/CLucene/util/BytesRef.h                   |    95 +
 src/core/CLucene/util/BytesRefBuilder.cpp          |    50 +
 src/core/CLucene/util/BytesRefBuilder.h            |    78 +
 src/core/CLucene/util/CLStreams.h                  |   381 +
 src/core/CLucene/util/CodecUtil.cpp                |    53 +
 src/core/CLucene/util/CodecUtil.h                  |    36 +
 src/core/CLucene/util/Equators.cpp                 |   172 +
 src/core/CLucene/util/Equators.h                   |   283 +
 src/core/CLucene/util/FastCharStream.cpp           |   109 +
 src/core/CLucene/util/FixedBitSet.cpp              |    37 +
 src/core/CLucene/util/FixedBitSet.h                |    24 +
 src/core/CLucene/util/FutureArrays.cpp             |   246 +
 src/core/CLucene/util/FutureArrays.h               |   134 +
 src/core/CLucene/util/IntroSorter.cpp              |    66 +
 src/core/CLucene/util/IntroSorter.h                |    45 +
 src/core/CLucene/util/LongBitSet.cpp               |    41 +
 src/core/CLucene/util/LongBitSet.h                 |    25 +
 src/core/CLucene/util/MD5Digester.cpp              |   337 +
 src/core/CLucene/util/MSBRadixSorter.cpp           |   207 +
 src/core/CLucene/util/MSBRadixSorter.h             |   134 +
 src/core/CLucene/util/NumericUtils.cpp             |   124 +
 src/core/CLucene/util/NumericUtils.h               |    60 +
 src/core/CLucene/util/OfflineSorter.h              |    53 +
 src/core/CLucene/util/PriorityQueue.h              |   206 +
 src/core/CLucene/util/Reader.cpp                   |   589 +
 src/core/CLucene/util/Reader.h                     |    19 +
 src/core/CLucene/util/Sorter.cpp                   |   200 +
 src/core/CLucene/util/Sorter.h                     |    89 +
 src/core/CLucene/util/StringIntern.cpp             |   122 +
 src/core/CLucene/util/ThreadLocal.cpp              |   266 +
 src/core/CLucene/util/Time.h                       |    20 +
 src/core/CLucene/util/VoidList.h                   |   186 +
 src/core/CLucene/util/VoidMap.h                    |   327 +
 src/core/CLucene/util/_Arrays.h                    |   155 +
 src/core/CLucene/util/_FastCharStream.h            |    54 +
 src/core/CLucene/util/_MD5Digester.h               |   110 +
 src/core/CLucene/util/_StringIntern.h              |    56 +
 src/core/CLucene/util/_ThreadLocal.h               |    89 +
 src/core/CLucene/util/_VoidList.h                  |    19 +
 src/core/CLucene/util/_VoidMap.h                   |    19 +
 src/core/CLucene/util/_bufferedstream.h            |   185 +
 src/core/CLucene/util/_streambase.h                |   210 +
 src/core/CLucene/util/_streambuffer.h              |   167 +
 src/core/CLucene/util/bkd/bkd_docid_iterator.h     |   110 +
 src/core/CLucene/util/bkd/bkd_msb_radix_sorter.cpp |    79 +
 src/core/CLucene/util/bkd/bkd_msb_radix_sorter.h   |    29 +
 src/core/CLucene/util/bkd/bkd_reader.cpp           |   569 +
 src/core/CLucene/util/bkd/bkd_reader.h             |   214 +
 src/core/CLucene/util/bkd/bkd_writer.cpp           |   861 +
 src/core/CLucene/util/bkd/bkd_writer.h             |   167 +
 src/core/CLucene/util/bkd/docIds_writer.cpp        |   244 +
 src/core/CLucene/util/bkd/docIds_writer.h          |    37 +
 src/core/CLucene/util/bkd/heap_point_reader.cpp    |    66 +
 src/core/CLucene/util/bkd/heap_point_reader.h      |    47 +
 src/core/CLucene/util/bkd/heap_point_writer.cpp    |   242 +
 src/core/CLucene/util/bkd/heap_point_writer.h      |    58 +
 src/core/CLucene/util/bkd/index_tree.cpp           |    91 +
 src/core/CLucene/util/bkd/index_tree.h             |    41 +
 src/core/CLucene/util/bkd/legacy_index_tree.cpp    |    74 +
 src/core/CLucene/util/bkd/legacy_index_tree.h      |    29 +
 src/core/CLucene/util/bkd/packed_index_tree.cpp    |   130 +
 src/core/CLucene/util/bkd/packed_index_tree.h      |    36 +
 src/core/CLucene/util/bkd/point_reader.cpp         |    45 +
 src/core/CLucene/util/bkd/point_reader.h           |    25 +
 src/core/CLucene/util/bkd/point_writer.h           |    24 +
 src/core/CLucene/util/croaring/LICENSE             |   235 +
 src/core/CLucene/util/croaring/README.md           |    44 +
 src/core/CLucene/util/croaring/roaring.c           | 19542 +++++++++++++++++++
 src/core/CLucene/util/croaring/roaring.h           |  1031 +
 src/core/CLucene/util/croaring/roaring.hh          |  2016 ++
 src/core/CLucene/util/stringUtil.cpp               |    51 +
 src/core/CLucene/util/stringUtil.h                 |    27 +
 src/core/CMakeLists.txt                            |   348 +
 src/core/files_list.txt                            |   314 +
 src/core/libclucene-core.pc.cmake                  |    11 +
 src/core/vp4.h                                     |   355 +
 src/demo/CMakeLists.txt                            |    44 +
 src/demo/DeleteFiles.cpp                           |    42 +
 src/demo/IndexFiles.cpp                            |   207 +
 src/demo/Main.cpp                                  |    87 +
 src/demo/Main_Index.cpp                            |    79 +
 src/demo/README                                    |     2 +
 src/demo/SearchFiles.cpp                           |   100 +
 src/demo/Statistics.cpp                            |    47 +
 src/demo/TestAnalyzer.cpp                          |    83 +
 src/ext/CMakeLists.txt                             |    15 +
 src/ext/boost/assert.hpp                           |    50 +
 src/ext/boost/checked_delete.hpp                   |    69 +
 src/ext/boost/config.hpp                           |    70 +
 src/ext/boost/config/abi/borland_prefix.hpp        |    27 +
 src/ext/boost/config/abi/borland_suffix.hpp        |    12 +
 src/ext/boost/config/abi/msvc_prefix.hpp           |    22 +
 src/ext/boost/config/abi/msvc_suffix.hpp           |     8 +
 src/ext/boost/config/abi_prefix.hpp                |    25 +
 src/ext/boost/config/abi_suffix.hpp                |    27 +
 src/ext/boost/config/auto_link.hpp                 |   373 +
 src/ext/boost/config/compiler/borland.hpp          |   267 +
 src/ext/boost/config/compiler/codegear.hpp         |   163 +
 src/ext/boost/config/compiler/comeau.hpp           |    59 +
 src/ext/boost/config/compiler/common_edg.hpp       |    97 +
 src/ext/boost/config/compiler/compaq_cxx.hpp       |    19 +
 src/ext/boost/config/compiler/digitalmars.hpp      |    93 +
 src/ext/boost/config/compiler/gcc.hpp              |   204 +
 src/ext/boost/config/compiler/gcc_xml.hpp          |    30 +
 src/ext/boost/config/compiler/greenhills.hpp       |    28 +
 src/ext/boost/config/compiler/hp_acc.hpp           |   127 +
 src/ext/boost/config/compiler/intel.hpp            |   173 +
 src/ext/boost/config/compiler/kai.hpp              |    33 +
 src/ext/boost/config/compiler/metrowerks.hpp       |   139 +
 src/ext/boost/config/compiler/mpw.hpp              |    81 +
 src/ext/boost/config/compiler/pgi.hpp              |    62 +
 src/ext/boost/config/compiler/sgi_mipspro.hpp      |    29 +
 src/ext/boost/config/compiler/sunpro_cc.hpp        |   130 +
 src/ext/boost/config/compiler/vacpp.hpp            |    88 +
 src/ext/boost/config/compiler/visualc.hpp          |   258 +
 src/ext/boost/config/no_tr1/cmath.hpp              |    28 +
 src/ext/boost/config/no_tr1/complex.hpp            |    28 +
 src/ext/boost/config/no_tr1/functional.hpp         |    28 +
 src/ext/boost/config/no_tr1/memory.hpp             |    28 +
 src/ext/boost/config/no_tr1/utility.hpp            |    28 +
 src/ext/boost/config/platform/aix.hpp              |    33 +
 src/ext/boost/config/platform/amigaos.hpp          |    15 +
 src/ext/boost/config/platform/beos.hpp             |    26 +
 src/ext/boost/config/platform/bsd.hpp              |    86 +
 src/ext/boost/config/platform/cygwin.hpp           |    51 +
 src/ext/boost/config/platform/hpux.hpp             |    87 +
 src/ext/boost/config/platform/irix.hpp             |    31 +
 src/ext/boost/config/platform/linux.hpp            |    98 +
 src/ext/boost/config/platform/macos.hpp            |    86 +
 src/ext/boost/config/platform/qnxnto.hpp           |    31 +
 src/ext/boost/config/platform/solaris.hpp          |    28 +
 src/ext/boost/config/platform/vxworks.hpp          |    31 +
 src/ext/boost/config/platform/win32.hpp            |    58 +
 src/ext/boost/config/posix_features.hpp            |    95 +
 src/ext/boost/config/requires_threads.hpp          |    92 +
 src/ext/boost/config/select_compiler_config.hpp    |   119 +
 src/ext/boost/config/select_platform_config.hpp    |    94 +
 src/ext/boost/config/select_stdlib_config.hpp      |    77 +
 src/ext/boost/config/stdlib/dinkumware.hpp         |   138 +
 src/ext/boost/config/stdlib/libcomo.hpp            |    71 +
 src/ext/boost/config/stdlib/libstdcpp3.hpp         |   127 +
 src/ext/boost/config/stdlib/modena.hpp             |    55 +
 src/ext/boost/config/stdlib/msl.hpp                |    83 +
 src/ext/boost/config/stdlib/roguewave.hpp          |   179 +
 src/ext/boost/config/stdlib/sgi.hpp                |   136 +
 src/ext/boost/config/stdlib/stlport.hpp            |   236 +
 src/ext/boost/config/stdlib/vacpp.hpp              |    43 +
 src/ext/boost/config/suffix.hpp                    |   601 +
 src/ext/boost/config/user.hpp                      |   124 +
 src/ext/boost/config/warning_disable.hpp           |    47 +
 src/ext/boost/current_function.hpp                 |    67 +
 src/ext/boost/detail/algorithm.hpp                 |   222 +
 src/ext/boost/detail/allocator_utilities.hpp       |   212 +
 src/ext/boost/detail/atomic_count.hpp              |    21 +
 src/ext/boost/detail/binary_search.hpp             |   216 +
 src/ext/boost/detail/call_traits.hpp               |   164 +
 src/ext/boost/detail/catch_exceptions.hpp          |   146 +
 src/ext/boost/detail/compressed_pair.hpp           |   443 +
 src/ext/boost/detail/container_fwd.hpp             |    99 +
 src/ext/boost/detail/dynamic_bitset.hpp            |   229 +
 src/ext/boost/detail/endian.hpp                    |    73 +
 src/ext/boost/detail/has_default_constructor.hpp   |    29 +
 src/ext/boost/detail/identifier.hpp                |    89 +
 src/ext/boost/detail/indirect_traits.hpp           |   487 +
 src/ext/boost/detail/interlocked.hpp               |   142 +
 src/ext/boost/detail/is_function_ref_tester.hpp    |   135 +
 src/ext/boost/detail/is_incrementable.hpp          |   134 +
 src/ext/boost/detail/is_xxx.hpp                    |    61 +
 src/ext/boost/detail/iterator.hpp                  |   494 +
 src/ext/boost/detail/lcast_precision.hpp           |   184 +
 src/ext/boost/detail/lightweight_mutex.hpp         |    22 +
 src/ext/boost/detail/lightweight_test.hpp          |    91 +
 src/ext/boost/detail/lightweight_thread.hpp        |   135 +
 src/ext/boost/detail/limits.hpp                    |   449 +
 src/ext/boost/detail/named_template_params.hpp     |   177 +
 src/ext/boost/detail/no_exceptions_support.hpp     |    87 +
 src/ext/boost/detail/none_t.hpp                    |    28 +
 src/ext/boost/detail/numeric_traits.hpp            |   191 +
 src/ext/boost/detail/ob_call_traits.hpp            |   168 +
 src/ext/boost/detail/ob_compressed_pair.hpp        |   510 +
 src/ext/boost/detail/quick_allocator.hpp           |    23 +
 src/ext/boost/detail/reference_content.hpp         |   141 +
 src/ext/boost/detail/scoped_enum_emulation.hpp     |    56 +
 src/ext/boost/detail/select_type.hpp               |    36 +
 src/ext/boost/detail/sp_typeinfo.hpp               |   129 +
 src/ext/boost/detail/templated_streams.hpp         |    74 +
 src/ext/boost/detail/utf8_codecvt_facet.hpp        |   190 +
 src/ext/boost/detail/workaround.hpp                |   262 +
 src/ext/boost/exception/all.hpp                    |    36 +
 src/ext/boost/exception/current_exception_cast.hpp |    43 +
 .../boost/exception/detail/attribute_noreturn.hpp  |    17 +
 src/ext/boost/exception/detail/error_info_impl.hpp |    75 +
 src/ext/boost/exception/detail/exception_ptr.hpp   |   490 +
 .../exception/detail/is_output_streamable.hpp      |    47 +
 src/ext/boost/exception/detail/object_hex_dump.hpp |    50 +
 src/ext/boost/exception/detail/type_info.hpp       |    79 +
 src/ext/boost/exception/diagnostic_information.hpp |   182 +
 .../boost/exception/enable_current_exception.hpp   |     6 +
 src/ext/boost/exception/enable_error_info.hpp      |     6 +
 src/ext/boost/exception/errinfo_api_function.hpp   |    22 +
 src/ext/boost/exception/errinfo_at_line.hpp        |    18 +
 src/ext/boost/exception/errinfo_errno.hpp          |    44 +
 src/ext/boost/exception/errinfo_file_handle.hpp    |    20 +
 src/ext/boost/exception/errinfo_file_name.hpp      |    26 +
 src/ext/boost/exception/errinfo_file_open_mode.hpp |    26 +
 .../boost/exception/errinfo_nested_exception.hpp   |    17 +
 src/ext/boost/exception/errinfo_type_info_name.hpp |    23 +
 src/ext/boost/exception/error_info.hpp             |     6 +
 src/ext/boost/exception/exception.hpp              |   422 +
 src/ext/boost/exception/get_error_info.hpp         |   130 +
 src/ext/boost/exception/info.hpp                   |   167 +
 src/ext/boost/exception/info_tuple.hpp             |    76 +
 src/ext/boost/exception/to_string.hpp              |    83 +
 src/ext/boost/exception/to_string_stub.hpp         |   109 +
 src/ext/boost/memory_order.hpp                     |    53 +
 src/ext/boost/shared_ptr.hpp                       |    19 +
 src/ext/boost/smart_ptr/bad_weak_ptr.hpp           |    59 +
 src/ext/boost/smart_ptr/detail/atomic_count.hpp    |   119 +
 .../boost/smart_ptr/detail/atomic_count_gcc.hpp    |    72 +
 .../smart_ptr/detail/atomic_count_gcc_x86.hpp      |    77 +
 .../smart_ptr/detail/atomic_count_pthreads.hpp     |    96 +
 .../smart_ptr/detail/atomic_count_solaris.hpp      |    59 +
 .../boost/smart_ptr/detail/atomic_count_sync.hpp   |    61 +
 .../boost/smart_ptr/detail/atomic_count_win32.hpp  |    63 +
 .../boost/smart_ptr/detail/lightweight_mutex.hpp   |    42 +
 src/ext/boost/smart_ptr/detail/lwm_nop.hpp         |    37 +
 src/ext/boost/smart_ptr/detail/lwm_pthreads.hpp    |    87 +
 src/ext/boost/smart_ptr/detail/lwm_win32_cs.hpp    |   108 +
 src/ext/boost/smart_ptr/detail/operator_bool.hpp   |    56 +
 src/ext/boost/smart_ptr/detail/quick_allocator.hpp |   199 +
 .../boost/smart_ptr/detail/shared_array_nmt.hpp    |   151 +
 src/ext/boost/smart_ptr/detail/shared_count.hpp    |   444 +
 src/ext/boost/smart_ptr/detail/shared_ptr_nmt.hpp  |   182 +
 src/ext/boost/smart_ptr/detail/sp_convertible.hpp  |    76 +
 src/ext/boost/smart_ptr/detail/sp_counted_base.hpp |    70 +
 .../smart_ptr/detail/sp_counted_base_acc_ia64.hpp  |   150 +
 .../smart_ptr/detail/sp_counted_base_cw_ppc.hpp    |   170 +
 .../smart_ptr/detail/sp_counted_base_cw_x86.hpp    |   158 +
 .../smart_ptr/detail/sp_counted_base_gcc_ia64.hpp  |   157 +
 .../smart_ptr/detail/sp_counted_base_gcc_mips.hpp  |   172 +
 .../smart_ptr/detail/sp_counted_base_gcc_ppc.hpp   |   181 +
 .../smart_ptr/detail/sp_counted_base_gcc_sparc.hpp |   166 +
 .../smart_ptr/detail/sp_counted_base_gcc_x86.hpp   |   173 +
 .../boost/smart_ptr/detail/sp_counted_base_nt.hpp  |   107 +
 .../boost/smart_ptr/detail/sp_counted_base_pt.hpp  |   135 +
 .../smart_ptr/detail/sp_counted_base_solaris.hpp   |   113 +
 .../smart_ptr/detail/sp_counted_base_spin.hpp      |   131 +
 .../smart_ptr/detail/sp_counted_base_sync.hpp      |   155 +
 .../boost/smart_ptr/detail/sp_counted_base_w32.hpp |   130 +
 src/ext/boost/smart_ptr/detail/sp_counted_impl.hpp |   231 +
 src/ext/boost/smart_ptr/detail/sp_has_sync.hpp     |    49 +
 src/ext/boost/smart_ptr/detail/spinlock.hpp        |    53 +
 .../boost/smart_ptr/detail/spinlock_gcc_arm.hpp    |    85 +
 src/ext/boost/smart_ptr/detail/spinlock_nt.hpp     |    89 +
 src/ext/boost/smart_ptr/detail/spinlock_pool.hpp   |    87 +
 src/ext/boost/smart_ptr/detail/spinlock_pt.hpp     |    79 +
 src/ext/boost/smart_ptr/detail/spinlock_sync.hpp   |    87 +
 src/ext/boost/smart_ptr/detail/spinlock_w32.hpp    |   113 +
 src/ext/boost/smart_ptr/detail/yield_k.hpp         |   149 +
 .../boost/smart_ptr/enable_shared_from_this.hpp    |    79 +
 .../boost/smart_ptr/enable_shared_from_this2.hpp   |   132 +
 src/ext/boost/smart_ptr/intrusive_ptr.hpp          |   299 +
 src/ext/boost/smart_ptr/make_shared.hpp            |   506 +
 src/ext/boost/smart_ptr/scoped_array.hpp           |   107 +
 src/ext/boost/smart_ptr/scoped_ptr.hpp             |   131 +
 src/ext/boost/smart_ptr/shared_array.hpp           |   147 +
 src/ext/boost/smart_ptr/shared_ptr.hpp             |   701 +
 src/ext/boost/smart_ptr/weak_ptr.hpp               |   230 +
 src/ext/boost/throw_exception.hpp                  |    75 +
 src/ext/boost/version.hpp                          |    35 +
 src/ext/for/README.md                              |   585 +
 src/ext/for/bitpack.c                              |   426 +
 src/ext/for/bitpack.h                              |   310 +
 src/ext/for/bitpack_.h                             |  5190 +++++
 src/ext/for/bitunpack.c                            |  1231 ++
 src/ext/for/bitunpack_.h                           |  6049 ++++++
 src/ext/for/bitutil.c                              |   689 +
 src/ext/for/bitutil.h                              |   547 +
 src/ext/for/conf.h                                 |   282 +
 src/ext/for/eliasfano.c                            |   208 +
 src/ext/for/eliasfano.h                            |    61 +
 src/ext/for/fp.c                                   |   684 +
 src/ext/for/fp.h                                   |   125 +
 src/ext/for/icapp.c                                |  1909 ++
 src/ext/for/icbench.c                              |  1736 ++
 src/ext/for/idx.h                                  |    53 +
 src/ext/for/idxcr.c                                |   175 +
 src/ext/for/idxqry.c                               |   684 +
 src/ext/for/idxseg.c                               |   133 +
 src/ext/for/index.md                               |   567 +
 src/ext/for/jic.c                                  |   172 +
 src/ext/for/lz.c                                   |   442 +
 src/ext/for/makefile                               |   156 +
 src/ext/for/makefile.vs                            |    78 +
 src/ext/for/plugins.cc                             |   798 +
 src/ext/for/plugins.h                              |    74 +
 src/ext/for/sse_neon.h                             |   355 +
 src/ext/for/time_.h                                |   252 +
 src/ext/for/transpose.c                            |  1223 ++
 src/ext/for/transpose.h                            |   113 +
 src/ext/for/trle.h                                 |    72 +
 src/ext/for/trle_.h                                |    60 +
 src/ext/for/trlec.c                                |   343 +
 src/ext/for/trled.c                                |   413 +
 src/ext/for/v8.c                                   |  1465 ++
 src/ext/for/vint.c                                 |   407 +
 src/ext/for/vint.h                                 |   401 +
 src/ext/for/vp4.h                                  |   355 +
 src/ext/for/vp4c.c                                 |   423 +
 src/ext/for/vp4d.c                                 |   534 +
 src/ext/for/vs/bitpack_avx2.c                      |     2 +
 src/ext/for/vs/bitpack_sse.c                       |     2 +
 src/ext/for/vs/bitunpack_avx2.c                    |     2 +
 src/ext/for/vs/bitunpack_sse.c                     |     2 +
 src/ext/for/vs/getopt.c                            |   562 +
 src/ext/for/vs/getopt.h                            |    97 +
 src/ext/for/vs/inttypes.h                          |   306 +
 src/ext/for/vs/stdint.h                            |   259 +
 src/ext/for/vs/transpose_avx2.c                    |     2 +
 src/ext/for/vs/transpose_sse.c                     |     2 +
 src/ext/for/vs/vp4c_avx2.c                         |     2 +
 src/ext/for/vs/vp4c_sse.c                          |     2 +
 src/ext/for/vs/vp4d_avx2.c                         |     2 +
 src/ext/for/vs/vp4d_sse.c                          |     2 +
 src/ext/for/vs/vs2017/TurboPFor.sln                |    41 +
 src/ext/for/vs/vs2017/TurboPFor.vcxproj            |   226 +
 src/ext/for/vs/vs2017/TurboPFor.vcxproj.filters    |   101 +
 src/ext/for/vs/vs2017/icapp.vcxproj                |   175 +
 src/ext/for/vs/vs2017/icapp.vcxproj.filters        |    21 +
 src/ext/for/vsimple.c                              |   536 +
 src/ext/for/vsimple.h                              |    47 +
 src/ext/zlib/ChangeLog                             |   855 +
 src/ext/zlib/FAQ                                   |   339 +
 src/ext/zlib/INDEX                                 |    51 +
 src/ext/zlib/README                                |   125 +
 src/ext/zlib/adler32.c                             |   149 +
 src/ext/zlib/algorithm.txt                         |   209 +
 src/ext/zlib/compress.c                            |    79 +
 src/ext/zlib/crc32.c                               |   423 +
 src/ext/zlib/crc32.h                               |   441 +
 src/ext/zlib/deflate.c                             |  1736 ++
 src/ext/zlib/deflate.h                             |   331 +
 src/ext/zlib/gzio.c                                |  1026 +
 src/ext/zlib/inffast.c                             |   318 +
 src/ext/zlib/inffast.h                             |    11 +
 src/ext/zlib/inffixed.h                            |    94 +
 src/ext/zlib/inflate.c                             |  1368 ++
 src/ext/zlib/inflate.h                             |   115 +
 src/ext/zlib/inftrees.c                            |   329 +
 src/ext/zlib/inftrees.h                            |    55 +
 src/ext/zlib/trees.c                               |  1219 ++
 src/ext/zlib/trees.h                               |   128 +
 src/ext/zlib/zconf.h                               |   332 +
 src/ext/zlib/zlib.h                                |  1357 ++
 src/ext/zlib/zutil.c                               |   318 +
 src/ext/zlib/zutil.h                               |   269 +
 src/gtest/CMakeLists.txt                           |    26 +
 src/gtest/parser/main.cpp                          |    14 +
 src/htdocs/README                                  |     9 +
 src/htdocs/_footer.html                            |    55 +
 src/htdocs/_header.html                            |    37 +
 src/htdocs/_index.php                              |     9 +
 src/htdocs/clucene.jpg                             |   Bin 0 -> 7432 bytes
 src/htdocs/contribute.shtml                        |    36 +
 src/htdocs/download.shtml                          |   168 +
 src/htdocs/images/disk.png                         |   Bin 0 -> 620 bytes
 src/htdocs/images/img01.gif                        |   Bin 0 -> 1945 bytes
 src/htdocs/images/img02.gif                        |   Bin 0 -> 11174 bytes
 src/htdocs/images/img03.gif                        |   Bin 0 -> 47 bytes
 src/htdocs/images/img04.jpg                        |   Bin 0 -> 5009 bytes
 src/htdocs/images/img05.jpg                        |   Bin 0 -> 564 bytes
 src/htdocs/images/img06.jpg                        |   Bin 0 -> 2438 bytes
 src/htdocs/images/img07.gif                        |   Bin 0 -> 469 bytes
 src/htdocs/images/img08.jpg                        |   Bin 0 -> 10067 bytes
 src/htdocs/images/img09.jpg                        |   Bin 0 -> 1571 bytes
 src/htdocs/images/img10.jpg                        |   Bin 0 -> 1071 bytes
 src/htdocs/images/img11.gif                        |   Bin 0 -> 272 bytes
 src/htdocs/images/spacer.gif                       |   Bin 0 -> 43 bytes
 src/htdocs/index.shtml                             |    19 +
 src/htdocs/style.css                               |   344 +
 src/shared/CLucene/CLSharedMonolithic.cpp          |    26 +
 src/shared/CLucene/LuceneThreads.h                 |   176 +
 src/shared/CLucene/SharedHeader.cpp                |    32 +
 src/shared/CLucene/SharedHeader.h                  |   213 +
 src/shared/CLucene/_SharedHeader.h                 |    71 +
 src/shared/CLucene/_clucene-config.h.cmake         |   112 +
 src/shared/CLucene/clucene-config.h.cmake          |   148 +
 src/shared/CLucene/config/_gunichartables.h        | 11264 +++++++++++
 src/shared/CLucene/config/_threads.h               |   125 +
 src/shared/CLucene/config/gunichartables.cpp       |   377 +
 src/shared/CLucene/config/repl_lltot.cpp           |    47 +
 src/shared/CLucene/config/repl_tchar.h             |   181 +
 src/shared/CLucene/config/repl_tcscasecmp.cpp      |    21 +
 src/shared/CLucene/config/repl_tcslwr.cpp          |    15 +
 src/shared/CLucene/config/repl_tcstod.cpp          |    24 +
 src/shared/CLucene/config/repl_tcstoll.cpp         |    53 +
 src/shared/CLucene/config/repl_tprintf.cpp         |   149 +
 src/shared/CLucene/config/repl_wchar.h             |    90 +
 src/shared/CLucene/config/repl_wctype.h            |    76 +
 src/shared/CLucene/config/threads.cpp              |   292 +
 src/shared/CLucene/config/utf8.cpp                 |   261 +
 src/shared/CLucene/debug/_condition.h              |    69 +
 src/shared/CLucene/debug/condition.cpp             |    78 +
 src/shared/CLucene/util/Misc.cpp                   |   739 +
 src/shared/CLucene/util/Misc.h                     |   116 +
 src/shared/CLucene/util/StringBuffer.cpp           |   430 +
 src/shared/CLucene/util/StringBuffer.h             |    99 +
 src/shared/CLucene/util/deflate.cpp                |  1714 ++
 src/shared/CLucene/util/dirent.cpp                 |   224 +
 src/shared/CLucene/util/dirent.h                   |   109 +
 src/shared/CMakeLists.txt                          |   375 +
 src/shared/README                                  |     5 +
 src/shared/cmake/CheckAtomicFunctions.cmake        |    25 +
 src/shared/cmake/CheckErrorHandling.cmake          |    12 +
 src/shared/cmake/CheckFloatByte.cmake              |    37 +
 src/shared/cmake/CheckFloatByte.cpp.in             |    81 +
 src/shared/cmake/CheckHashmaps.cmake               |    72 +
 src/shared/cmake/CheckNamespace.cmake              |     8 +
 src/shared/cmake/CheckPthread.cmake                |    27 +
 src/shared/cmake/CheckSnprintf.cmake               |    42 +
 src/shared/cmake/CheckStdCallFunctionExists.cmake  |    89 +
 src/shared/cmake/CheckStdCallFunctionExists.cpp.in |    12 +
 src/shared/cmake/DefineDword.cmake                 |    17 +
 src/shared/cmake/DefineFloat.cmake                 |    29 +
 src/shared/cmake/DefineLongLongSyntax.cmake        |    14 +
 src/shared/cmake/DefineMAXPATHValue.cmake          |    30 +
 src/shared/cmake/DefineStaticSyntax.cmake          |    13 +
 src/shared/cmake/MacroCheckGccVisibility.cmake     |    58 +
 src/shared/cmake/MacroChooseFunction.cmake         |    49 +
 src/shared/cmake/MacroChooseMisc.cmake             |    82 +
 src/shared/cmake/MacroChooseSymbol.cmake           |    62 +
 src/shared/cmake/MacroChooseType.cmake             |    49 +
 src/shared/cmake/MacroEnsureVersion.cmake          |    71 +
 src/shared/cmake/MacroGetVariableValue.c.in        |    20 +
 src/shared/cmake/MacroGetVariableValue.cmake       |    33 +
 src/shared/cmake/MacroMustDefine.cmake             |    69 +
 src/shared/cmake/Macro_ChooseStatus.cmake          |    20 +
 src/test/CLMonolithic_Test.cpp                     |    67 +
 src/test/CMakeLists.txt                            |   276 +
 src/test/CuTest.cpp                                |   639 +
 src/test/CuTest.h                                  |   139 +
 src/test/README                                    |   239 +
 src/test/analysis/TestAnalysis.cpp                 |   110 +
 src/test/analysis/TestAnalyzers.cpp                |   524 +
 .../analysis/de/TestGermanStemFilter.cpp           |    65 +
 src/test/contribs-lib/analysis/testChinese.cpp     |   593 +
 src/test/data/StopWords.test                       |    12 +
 src/test/data/contribs-lib/analysis/de/data.txt    |    51 +
 src/test/data/french_unicode.bin                   |   Bin 0 -> 5604 bytes
 src/test/data/readme.txt                           |    41 +
 src/test/data/reuters-21578-index/_z.f0            |     1 +
 src/test/data/reuters-21578-index/_z.f1            |     1 +
 src/test/data/reuters-21578-index/_z.fdt           |   Bin 0 -> 616 bytes
 src/test/data/reuters-21578-index/_z.fdx           |   Bin 0 -> 248 bytes
 src/test/data/reuters-21578-index/_z.fnm           |     1 +
 src/test/data/reuters-21578-index/_z.frq           |  1047 +
 src/test/data/reuters-21578-index/_z.prx           |   Bin 0 -> 388750 bytes
 src/test/data/reuters-21578-index/_z.tii           |   Bin 0 -> 3462 bytes
 src/test/data/reuters-21578-index/_z.tis           |   Bin 0 -> 248046 bytes
 src/test/data/reuters-21578-index/deletable        |   Bin 0 -> 4 bytes
 src/test/data/reuters-21578-index/segments         |   Bin 0 -> 27 bytes
 src/test/data/reuters-21578/LEWIS.DTD              |    60 +
 src/test/data/reuters-21578/README.TXT             |   816 +
 .../reuters-21578/all-exchanges-strings.lc.txt     |    39 +
 .../data/reuters-21578/all-orgs-strings.lc.txt     |    56 +
 .../data/reuters-21578/all-people-strings.lc.txt   |   267 +
 .../data/reuters-21578/all-places-strings.lc.txt   |   175 +
 .../data/reuters-21578/all-topics-strings.lc.txt   |   135 +
 .../data/reuters-21578/cat-descriptions_120396.txt |  1203 ++
 .../feldman-cia-worldfactbook-data.txt             |  5199 +++++
 src/test/data/reuters-21578/reut2-000.sgm          |  2032 ++
 src/test/data/reuters-21578/reut2-001.sgm          |  2010 ++
 src/test/data/reuters-21578/reut2-002.sgm          |  2013 ++
 src/test/data/reuters-21578/reut2-003.sgm          |  2006 ++
 src/test/data/reuters-21578/reut2-004.sgm          |  2023 ++
 src/test/data/reuters-21578/reut2-005.sgm          |  2012 ++
 src/test/data/reuters-21578/reut2-006.sgm          |  2012 ++
 src/test/data/reuters-21578/reut2-007.sgm          |  2010 ++
 src/test/data/reuters-21578/reut2-008.sgm          |  2019 ++
 src/test/data/reuters-21578/reut2-009.sgm          |  2003 ++
 src/test/data/reuters-21578/reut2-010.sgm          |  2002 ++
 src/test/data/reuters-21578/reut2-011.sgm          |  2010 ++
 src/test/data/reuters-21578/reut2-012.sgm          |  2010 ++
 src/test/data/reuters-21578/reut2-013.sgm          |  2011 ++
 src/test/data/reuters-21578/reut2-014.sgm          |  2029 ++
 src/test/data/reuters-21578/reut2-015.sgm          |  2003 ++
 src/test/data/reuters-21578/reut2-016.sgm          |  2030 ++
 src/test/data/reuters-21578/reut2-017.sgm          |  2014 ++
 src/test/data/reuters-21578/reut2-018.sgm          |  2004 ++
 src/test/data/reuters-21578/reut2-019.sgm          |  2013 ++
 src/test/data/reuters-21578/reut2-020.sgm          |  2041 ++
 src/test/data/reuters-21578/reut2-021.sgm          |  2002 ++
 src/test/data/utf8text/arabic_utf8.txt             |    15 +
 src/test/data/utf8text/chinese_utf8.txt            |    32 +
 src/test/data/utf8text/czech_utf8.txt              |    78 +
 src/test/data/utf8text/english_utf8.txt            |    70 +
 src/test/data/utf8text/french_utf8.txt             |    30 +
 src/test/data/utf8text/german_utf8.txt             |    20 +
 src/test/data/utf8text/greek_utf8.txt              |    14 +
 src/test/data/utf8text/hebrew_utf8.txt             |   408 +
 src/test/data/utf8text/japanese_utf8.txt           |    28 +
 src/test/data/utf8text/korean_utf8.txt             |    51 +
 src/test/data/utf8text/polish_utf8.txt             |    65 +
 src/test/data/utf8text/russian_utf8.txt            |    11 +
 src/test/debug/TestError.cpp                       |   100 +
 src/test/document/TestDateTools.cpp                |   122 +
 src/test/document/TestDocument.cpp                 |   480 +
 src/test/document/TestField.cpp                    |    35 +
 src/test/document/TestNumberTools.cpp              |    80 +
 src/test/index/IndexWriter4Test.cpp                |    44 +
 src/test/index/IndexWriter4Test.h                  |    35 +
 src/test/index/TestAddIndexesNoOptimize.cpp        |   592 +
 src/test/index/TestHighFreqTerms.cpp               |    86 +
 src/test/index/TestIndexModifier.cpp               |   215 +
 src/test/index/TestIndexReader.cpp                 |   309 +
 src/test/index/TestIndexWriter.cpp                 |   668 +
 src/test/index/TestReuters.cpp                     |   234 +
 src/test/index/TestTermVectorsReader.cpp           |   466 +
 src/test/index/TestThreading.cpp                   |   161 +
 src/test/index/TestUtf8.cpp                        |   189 +
 src/test/nanobench.h                               |  3367 ++++
 src/test/queryParser/TestMultiFieldQueryParser.cpp |   171 +
 src/test/queryParser/TestQueryParser.cpp           |   885 +
 src/test/search/BaseTestRangeFilter.cpp            |   115 +
 src/test/search/BaseTestRangeFilter.h              |    47 +
 src/test/search/CheckHits.cpp                      |   542 +
 src/test/search/CheckHits.h                        |   120 +
 src/test/search/MockHitCollector.h                 |    40 +
 src/test/search/MockScorer.h                       |    93 +
 src/test/search/QueryUtils.cpp                     |   376 +
 src/test/search/QueryUtils.h                       |    54 +
 src/test/search/TestBoolean.cpp                    |   197 +
 src/test/search/TestConstantScoreRangeQuery.cpp    |   533 +
 src/test/search/TestDateFilter.cpp                 |   201 +
 src/test/search/TestExplanations.cpp               |   237 +
 src/test/search/TestExplanations.h                 |   151 +
 src/test/search/TestExtractTerms.cpp               |   309 +
 src/test/search/TestForDuplicates.cpp              |   164 +
 src/test/search/TestIndexSearcher.cpp              |    99 +
 src/test/search/TestQueries.cpp                    |   384 +
 src/test/search/TestRangeFilter.cpp                |   342 +
 src/test/search/TestSearch.cpp                     |   483 +
 src/test/search/TestSort.cpp                       |   552 +
 src/test/search/TestTermVector.cpp                 |   272 +
 src/test/search/TestWildcard.cpp                   |   132 +
 src/test/search/spans/TestBasics.cpp               |   588 +
 src/test/search/spans/TestBasics.h                 |    64 +
 src/test/search/spans/TestNearSpansOrdered.cpp     |   221 +
 src/test/search/spans/TestNearSpansOrdered.h       |    50 +
 src/test/search/spans/TestSpanExplanations.cpp     |   269 +
 src/test/search/spans/TestSpanExplanations.h       |    57 +
 .../spans/TestSpanExplanationsOfNonMatches.cpp     |    22 +
 .../spans/TestSpanExplanationsOfNonMatches.h       |    28 +
 src/test/search/spans/TestSpanQueries.cpp          |   186 +
 src/test/search/spans/TestSpans.cpp                |   300 +
 src/test/search/spans/TestSpans.h                  |    60 +
 src/test/search/spans/TestSpansAdvanced.cpp        |   136 +
 src/test/search/spans/TestSpansAdvanced.h          |    59 +
 src/test/search/spans/TestSpansAdvanced2.cpp       |    85 +
 src/test/search/spans/TestSpansAdvanced2.h         |    33 +
 src/test/store/MockRAMDirectory.cpp                |   326 +
 src/test/store/MockRAMDirectory.h                  |   176 +
 src/test/store/TestRAMDirectory.cpp                |   218 +
 src/test/store/TestStore.cpp                       |   123 +
 src/test/test.h                                    |   160 +
 src/test/testall.cpp                               |   345 +
 src/test/tests.cpp                                 |    52 +
 src/test/util/English.cpp                          |   134 +
 src/test/util/TestBKD.cpp                          |   960 +
 src/test/util/TestBKD.h                            |   110 +
 src/test/util/TestBitSet.cpp                       |   266 +
 src/test/util/TestMSBRadixSorter.cpp               |   123 +
 src/test/util/TestMSBRadixSorter.h                 |    71 +
 src/test/util/TestPriorityQueue.cpp                |    77 +
 src/test/util/TestStringBuffer.cpp                 |   116 +
 1052 files changed, 323119 insertions(+)

diff --git a/APACHE.license b/APACHE.license
new file mode 100644
index 0000000..261eeb9
--- /dev/null
+++ b/APACHE.license
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..a9ed2f9
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,23 @@
+As with most development projects, contributions come from many people and in
+many forms. The CLucene project would like to thank its many contributors.
+Omissions are merely accidental; please e-mail ustramooner@users.sourceforge.net
+if you have been left out or a contribution is not mentioned.
+
+CLucene was originally ported to C++ by Ben van Klinken (ustramooner@users.sourceforge.net)
+from Doug Cutting's popular java search engine, Lucene (see http://lucene.apache.org).
+
+Here is a list of contributors. Please send me an email at ustramooner@users.sourceforge.net
+if I have left you out.
+
+Doug Cutting  	         cutting@users.sourceforge.net
+John Wheeler 	         j_wheeler@users.sourceforge.net
+Robert G. Ristroph 	   rgristroph@users.sourceforge.net
+David Rushby 	         woodsplitter@users.sourceforge.net
+Jimmy Pritts            jpritts@sdf.lonestar.org
+Peter Edwards           peter@dragonstaff.co.uk
+Jorge Sabater Redondo   jsabater@elderecho.com
+Daniel Glassey          danglassey@ntlworld.com
+Peter Gladkikh          batyi@mail.ru
+Pedja                   amigo@max3d.com
+Peter Hodges            hodges.peter@gmail.com
+Itamar Syn-Hershko		synhershko@users.sourceforge.net
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..6e80332
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,218 @@
+PROJECT (clucene)
+
+#Rules for version:
+#MAJOR and MINOR versions are purely political (tracks JLucene compatibility)
+#REVISION version MUST be revised if the headers or compatibility change
+#PATCH should be 0 unless a patch is made that doesn't affect the public signature (i.e. clients don't need to re-compile).
+SET(CLUCENE_VERSION_MAJOR "2")
+SET(CLUCENE_VERSION_MINOR "3")
+SET(CLUCENE_VERSION_REVISION "3")
+SET(CLUCENE_VERSION_PATCH "4")
+
+# SOVERSION information
+#Must be incremented for releases if the api is not backwards compatible
+SET(CLUCENE_SOVERSION "1")
+
+MATH(EXPR CLUCENE_INT_VERSION "(${CLUCENE_VERSION_MAJOR} * 1000000) + (${CLUCENE_VERSION_MINOR} * 10000) + (${CLUCENE_VERSION_REVISION} * 100) + (${CLUCENE_VERSION_PATCH} * 1)" )
+SET(CLUCENE_VERSION "${CLUCENE_VERSION_MAJOR}.${CLUCENE_VERSION_MINOR}.${CLUCENE_VERSION_REVISION}.${CLUCENE_VERSION_PATCH}")
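+#e.g. version 2.3.3.4 gives CLUCENE_INT_VERSION = (2 * 1000000) + (3 * 10000) + (3 * 100) + 4 = 2030304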
+
+#CMake 2.6+ is recommended due to its improved Boost module
+CMAKE_MINIMUM_REQUIRED(VERSION 2.4.0 FATAL_ERROR)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
+
+if(COMMAND cmake_policy)
+  cmake_policy(SET CMP0003 NEW)
+  cmake_policy(SET CMP0043 NEW)
+  cmake_policy(SET CMP0054 NEW)
+endif(COMMAND cmake_policy)
+
+#set various platform specific global options
+if(WIN32)
+ set(CMAKE_DEBUG_POSTFIX "d")
+endif(WIN32)
+
+
+# include specific modules
+set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+
+#removed from cmake; use the shell script instead
+include(cmake/TurboPFOR.cmake)
+
+#define options...
+
+include(cmake/CLuceneDocs.cmake)
+Include (FindThreads)
+
+IF(NOT CMAKE_BUILD_TYPE)
+    SET(CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING
+      "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel."
+      FORCE)
+ELSE(NOT CMAKE_BUILD_TYPE)
+    MESSAGE( "Compiling as ${CMAKE_BUILD_TYPE}" )
+ENDIF(NOT CMAKE_BUILD_TYPE)
+
+OPTION(ENABLE_DEBUG
+  "enable debug support"
+  OFF)
+OPTION(ENABLE_MMAP
+  "enable mmap support (experimental)"
+  OFF)
+OPTION(DISABLE_MULTITHREADING
+  "disable multithreading - remove all locking code"
+  OFF)
+OPTION(ENABLE_DMALLOC
+  "enable dmalloc memory leak checker"
+  OFF)
+OPTION(ENABLE_ASCII_MODE
+  "enable ascii support"
+  OFF)
+  
+SET(ENABLE_ANSI_MODE OFF)
+IF(CMAKE_COMPILER_IS_GNUCXX)
+  SET(ENABLE_ANSI_MODE ON)
+  
+  #exceptions:
+  IF(MINGW OR CYGWIN)
+    SET(ENABLE_ANSI_MODE OFF)
+  ENDIF(MINGW OR CYGWIN)
+ENDIF(CMAKE_COMPILER_IS_GNUCXX)
+
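+#NOTE: ANSI mode is forced off unconditionally here, overriding the compiler detection above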
+SET(ENABLE_ANSI_MODE OFF)
+
+OPTION(ENABLE_ANSI_MODE
+  "compile with -ansi flag"
+  ${ENABLE_ANSI_MODE})
+OPTION(LUCENE_USE_INTERNAL_CHAR_FUNCTIONS
+  "use internal character functions. required to run tests correctly"
+  ON)
+OPTION(ENABLE_PACKAGING
+  "create build scripts for creating clucene packages"
+  OFF)
+OPTION(BUILD_STATIC_LIBRARIES
+  "create targets for building static libraries"
+  ON)
+OPTION(BUILD_CONTRIBS
+  "create targets for building the clucene-contribs"
+  OFF)
+OPTION(BUILD_CONTRIBS_LIB
+  "create targets for building the clucene-contribs-lib"
+  OFF)
+SET(LUCENE_SYS_INCLUDES "" CACHE PATH
+      "location for non-system independent files. defaults to CMAKE_INSTALL_PREFIX. see INSTALL documentation for further information."
+      )
+#install path options
+SET(LIB_SUFFIX "" CACHE STRING "Define suffix of directory name (32/64)" )
+SET(LIB_DESTINATION "lib${LIB_SUFFIX}")
+
+
+SET ( ENABLE_COMPILE_TESTS_VALUE ON )
+IF ( MSVC_IDE )
+	#this is annoying...
+	SET ( ENABLE_COMPILE_TESTS_VALUE OFF )
+ENDIF( MSVC_IDE )
+
+OPTION(ENABLE_COMPILE_TESTS
+  "enable various projects that test alternative build switches"
+  ${ENABLE_COMPILE_TESTS_VALUE})
+
+if (__COMPILER_CLANG)
+    SET(CXX_FLAGS_ASAN "${CXX_GCC_FLAGS} -O0 -fsanitize=address -DADDRESS_SANITIZER")
+    SET(CXX_FLAGS_LSAN "${CXX_GCC_FLAGS} -O0 -fsanitize=leak -DLEAK_SANITIZER")
+else ()
+    SET(CXX_FLAGS_ASAN "${CXX_GCC_FLAGS} -O0 -fsanitize=address -DADDRESS_SANITIZER -static-libasan")
+    SET(CXX_FLAGS_LSAN "${CXX_GCC_FLAGS} -O0 -fsanitize=leak -DLEAK_SANITIZER -static-liblsan")
+endif ()
+
+# Set compile flags based on the build type.
+if ("${CMAKE_BUILD_TYPE}" STREQUAL "ASAN")
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_ASAN}")
+elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "LSAN")
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_LSAN}")
+endif()
+
+if (USE_AVX2)
+	SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_AVX2")
+endif()
+if (__COMPILER_CLANG)
+	SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-c++11-narrowing")
+else ()
+	SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
+endif ()
+
+#check flags...
+INCLUDE (TestCXXAcceptsFlag)
+IF ( CMAKE_COMPILER_IS_GNUCC )
+    CHECK_CXX_ACCEPTS_FLAG(-pg GccFlagPg)
+    IF ( GccFlagPg )
+        OPTION(ENABLE_GPROF
+          "turn on gprof profiling support"
+          OFF)
+
+        IF ( ENABLE_GPROF )
+            SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pg")
+            SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -pg")
+            SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -pg")
+            SET(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -pg")
+        ENDIF ( ENABLE_GPROF )
+    ENDIF ( GccFlagPg )
+    
+    IF("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
+        SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC"  )
+    ENDIF("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
+
+   #IF( ENABLE_ANSI_MODE )
+   # SET (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ansi")
+   #ENDIF ( ENABLE_ANSI_MODE )
+ENDIF(CMAKE_COMPILER_IS_GNUCC) 
+
+
+#Single output directory for building all executables and libraries.
+SET(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}/bin CACHE PATH "Executable Output Directory" FORCE)
+SET(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}/bin CACHE PATH "Library Output Directory" FORCE)
+
+option(ENABLE_TESTS "Enable tests" OFF)
+#add tests
+ENABLE_TESTING()
+ADD_TEST(SimpleTest ${EXECUTABLE_OUTPUT_PATH}/cl_test )
+
+#use single output directory
+INCLUDE_DIRECTORIES( ${clucene_SOURCE_DIR}/src/shared )
+INCLUDE_DIRECTORIES( ${clucene_BINARY_DIR}/src/shared )
+INCLUDE_DIRECTORIES( ${clucene_SOURCE_DIR}/src/core )
+
+#set boost path. we need src/ext to be defined before this works...
+include(cmake/CLuceneBoost.cmake)
+GET_BOOST_INCLUDE_PATH(_CL_BOOST_INCLUDE_PATH)
+INCLUDE_DIRECTORIES( ${_CL_BOOST_INCLUDE_PATH} )
+
+#include the projects
+ADD_SUBDIRECTORY (src/ext)
+ADD_SUBDIRECTORY (src/shared)
+ADD_SUBDIRECTORY (src/core)
+ADD_SUBDIRECTORY (src/test)
+ADD_SUBDIRECTORY (src/demo EXCLUDE_FROM_ALL)
+
+IF ( BUILD_CONTRIBS )
+  ADD_SUBDIRECTORY (src/contribs EXCLUDE_FROM_ALL)
+  SET(BUILD_CONTRIBS_LIB 1)
+ENDIF ( BUILD_CONTRIBS )
+IF ( BUILD_CONTRIBS_LIB )
+  ADD_SUBDIRECTORY (src/contribs-lib)
+ENDIF ( BUILD_CONTRIBS_LIB )
+
+#add uninstall command
+CONFIGURE_FILE(
+  "${CMAKE_MODULE_PATH}/cmake_uninstall.cmake.in"
+  "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake"
+  IMMEDIATE @ONLY)
+  
+#ADD_CUSTOM_TARGET(uninstall "${CMAKE_COMMAND}" -P "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake")
+
+#this must go last...
+IF (ENABLE_PACKAGING)
+  INCLUDE(CreateClucenePackages)
+ENDIF ( ENABLE_PACKAGING)
+
+set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
+
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..13dc166
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,126 @@
+License
+
+The CLucene code is split into two sections for licensing reasons. The 'Core'
+is strictly dual licensed Apache 2 and LGPL. The CLucene Contributions code
+could not be licensed under Apache because of the subcomponents it uses.
+
+CLUCENE CORE:
+
+The CLucene Core Library uses a dual license strategy for the source code. 
+These licenses are the GNU Lesser General Public License (LGPL) and the Apache
+License (Version 2.0). Users can choose the license they wish to distribute
+their software under. This means that you do not need to abide by *both*
+licenses, but rather that you can choose the license which most suits your
+needs.
+
+For details of these licenses see APACHE.license and LGPL.license in the root
+of the source distribution.
+
+Some components of CLucene Core use other licenses. See the CLUCENE CORE SUBCOMPONENTS
+section for details. We understand that these licenses are compatible with LGPL and/or
+Apache v2, but please consult with a lawyer to be certain of this.
+
+To rephrase the licensing and to make it perfectly clear:
+CLucene is distributed under the GNU Lesser General Public License (LGPL) 
+	*or*
+the Apache License, Version 2.0
+
+However, we are an open source project, and we encourage users to participate fully 
+in the free software community by contributing their code back to the project. 
+Dual licensing of the CLucene source code provides open and free access to the 
+technology both for the GPL community and for other developers or companies 
+that cannot use the GPL license.
+
+You can freely modify, extend, and improve the CLucene source code. The only
+question is whether or not you must provide the source code and contribute
+modifications to the community. The GNU and Apache licenses allow different
+ranges of flexibility in this regard, but in the end, regardless of the license
+used, we highly recommend that you submit any bugs, incompatibilities or
+added features.
+
+Note that this same license does *not* apply to the CLucene Contributions
+package. You should read the COPYING file in that directory or package for
+more information.
+
+
+CLUCENE CORE SUBCOMPONENTS: 
+
+CLucene includes a number of subcomponents with separate copyright 
+notices and license terms. Your use of the source code for
+these subcomponents is subject to the terms and conditions of the
+following licenses. 
+
+For the src\CLucene\util\MD5Digester.cpp component:
+/*
+ * This work is derived from material Copyright RSA Data Security, Inc.
+ *
+ * The RSA copyright statement and Licence for that original material are
+ * included below. This is followed by the Apache copyright statement and
+ * licence for the modifications made to that material.
+ */
+/* Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
+   rights reserved.
+
+   License to copy and use this software is granted provided that it
+   is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+   Algorithm" in all material mentioning or referencing this software
+   or this function.
+
+   License is also granted to make and use derivative works provided
+   that such works are identified as "derived from the RSA Data
+   Security, Inc. MD5 Message-Digest Algorithm" in all material
+   mentioning or referencing the derived work.
+
+   RSA Data Security, Inc. makes no representations concerning either
+   the merchantability of this software or the suitability of this
+   software for any particular purpose. It is provided "as is"
+   without express or implied warranty of any kind.
+
+   These notices must be retained in any copies of any part of this
+   documentation and/or software.
+ */
+ 
+
+
+For the cmake/MacroCheckGccVisibility.cmake and MacroEnsureVersion.cmake components:
+#
+# Copyright (c) 2006, Alexander Neundorf <ne...@kde.org>
+# Copyright (c) 2006, Laurent Montel, <mo...@kde.org>
+#
+# Redistribution and use is allowed according to the terms of the BSD license.
+# For details see the accompanying COPYING-CMAKE-SCRIPTS file.
+
+For the src/core/util/Compress.cpp component:
+/* zlib.h -- interface of the 'zlib' general purpose compression library
+ version 1.2.3, July 18th, 2005
+
+ Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler
+
+ This software is provided 'as-is', without any express or implied
+ warranty.  In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+    claim that you wrote the original software. If you use this software
+    in a product, an acknowledgment in the product documentation would be
+    appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+    misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ Jean-loup Gailly jloup@gzip.org
+ Mark Adler madler@alumni.caltech.edu
+
+*/
+
+
+
+CLUCENE CONTRIBUTIONS CODE:
+
+PorterStemmer code: couldn't find license. This component is deprecated and will be removed very soon.
+
+Snowball code: needs to be researched.
diff --git a/ChangeLog b/ChangeLog
new file mode 100644
index 0000000..137d92d
--- /dev/null
+++ b/ChangeLog
@@ -0,0 +1,17 @@
+Removed jstreams namespace. Sorry, I couldn't think of a way to nicely deprecate jstreams.
+
+version 0.9.23:
+Changes:
+* Static object fields have been changed to method accessors (SortField::FIELDDOC should now be accessed as SortField::FIELDDOC(), for example). Classes changed: FieldCache, ScoreDocComparator.
+  This was necessary to make static libraries work on certain platforms.
+* Folders were reorganised; this seemed like a good time to do it.
+* Some deprecated functions were removed.
+* Moved platform-configuration code and utility code into the 'shared' project. This enables tests to be built with a shared library on Windows.
+* Moved most of the platform-specific logic into cmake in order to reduce #ifdefs in the code (I love cmake!).
+* Added the contributions code into the trunk. This will hopefully give the contributions more exposure. The licensing still needs to be made clear, though.
+* Deletor::Array was renamed to Deletor::vArray.
+* Re-worked the install location for system-dependent files (clucene-config.h). This was a confusing issue, and I think it is better to stick to the standards rather than push the (in my opinion) more compatible way of doing things; this one had been getting many complaints from downstream. However, LUCENE_SYS_INCLUDES is available to install the clucene-config.h type files into the library directory (or any other place).
+
+Here is a summary of changes that you'll need to look at for this release:
+* Act on deprecated features. Some features that had been deprecated for a long time have now finally been removed.
+* Fix things that may affect you, such as the LUCENE_SYS_INCLUDES change and the reorganisation of the code (install locations are still the same, though). The removal of autotools may also affect your work, depending on how you use clucene.
diff --git a/INSTALL b/INSTALL
new file mode 100644
index 0000000..71f3329
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1,265 @@
+* There are packages available for most Linux distributions through the usual channels.
+* The CLucene SourceForge website also has some distributions available.
+
+This document also contains information on how to build from source, troubleshooting,
+performance, and how to create a new distribution.
+
+
+Building from source:
+--------------------
+
+Dependencies:
+* CMake version 2.4.2 or later.
+* A functioning and fairly new C++ compiler. We test mostly on GCC and Visual Studio 6+.
+Anything other than that may not work.
+* Something to unzip/untar the source code.
+
+Build instructions:
+1.) Download the latest sourcecode from http://www.sourceforge.net/projects/clucene
+    [Choose stable if you want the 'time tested' version of the code. However, often
+    the unstable version will suit your needs better since it is newer and has had
+    more work put into it. The decision is up to you.]
+2.) Unpack the tarball/zip/bzip/whatever
+3.) Open a command prompt, terminal window, or cygwin session.
+4.) Change directory into the root of the sourcecode (from now on referred to as <clucene>)
+# cd <clucene>
+5.) Create and change directory into an 'out-of-source' directory for your build. 
+    [This is by far the easiest way to build; it has the benefit of being able to
+    create different types of builds in the same source-tree.]
+# mkdir <clucene>/build-name
+# cd <clucene>/build-name
+6.) Configure using cmake. This can be done many different ways, but the basic syntax is
+# cmake [-G "Script name"] ..
+    [Where "Script name" is the name of the scripts to build (e.g. Visual Studio 8 2005).
+    A list of supported build scripts can be found by]
+# cmake --help
+7.) You can configure several options such as the build type, debugging information, 
+    mmap support, etc., by using the CMake GUI or by calling
+# ccmake ..
+    Make sure you call configure again if you make any changes.
+8.) Start the build. This depends on which build script you specified, but it would be something like
+# make
+or
+# nmake
+    Or open the solution files with your IDE.
+
+    [You can also build just a certain target (such as cl_test, cl_demo,
+    clucene-core (shared library), or clucene-core-static (static library)).]
+9.) The binary files will be available in <clucene>/build-name/bin
+10.)Test the code. (After building the tests - this is done by default, or by calling make cl_test)
+# ctest -V
+11.)At this point you can install the library:
+# make install
+    [There are options to do this from the IDE, but I find it easier to create a 
+    distribution (see instructions below) and install that instead.]
+or
+# make cl_demo
+    [This creates the demo application, which demonstrates simple text indexing and searching.]
+or
+	Adjust build values using ccmake or the Cmake GUI and rebuild.
+	
+12.)Now you can develop your own code. This is beyond the scope of this document.
+    Read the README for information about documentation or to get help on the mailing list.
+
+Other platforms:
+----------------
+Some platforms require specific actions to get cmake working. Here are some general tips:
+
+Solaris:
+I had problems when using the standard STL library. Using the -stlport4 switch worked. I had
+to specify the compiler from the command line: cmake -DCXX_COMPILER=xxx -stlport4
+
+Building Performance
+--------------------
+Use of ccache will speed up build times a lot. I found it easiest to add the /usr/lib/ccache directory to the beginning of your PATH. This works for most common compilers.
+
+PATH=/usr/lib/ccache:$PATH
+
+Note: you must do this BEFORE you configure the build, since you cannot change the compiler path after the build is configured.
+
+Installing:
+-----------
+CLucene is installed in CMAKE_INSTALL_PREFIX by default. 
+
+CLucene used to put config headers next to the library. This was done
+because these headers are generated and are relevant to the library.
+CMAKE_INSTALL_PREFIX was for system-independent files. The idea is that
+you could have several versions of the library installed (an ASCII version,
+a UCS2 version, multithreaded, etc.) and have only one set of headers.
+In version 0.9.24+ we allow this feature, but you have to use
+LUCENE_SYS_INCLUDES to specify where to install these files.
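+
+For example, to place them under the library directory (the path here is only
+illustrative):
+# cmake -DLUCENE_SYS_INCLUDES=/usr/lib/clucene ..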
+
+Troubleshooting:
+----------------
+
+'Too many open files'
+Some platforms don't provide enough file handles to run CLucene properly.
+To solve this, increase the open file limit:
+
+On Solaris:
+ulimit -n 1024
+set rlim_fd_cur=1024
+
+GDB - GNU debugging tool (linux only)
+------------------------
+If you get an error, try doing this. More information on GDB can be found on the internet.
+
+# gdb bin/cl_test
+# gdb> run
+When gdb shows a crash, run
+# gdb> bt
+A backtrace will be printed. This may help to solve any problems.
+
+Code layout
+--------------
+File locations:
+* clucene-config.h is required and is distributed next to the library, so that multiple libraries can exist on the
+  same machine, but use the same header files.
+* _HeaderFile.h files are private, and are not to be used or distributed by anything besides the clucene-core library.
+* _clucene-config.h should NOT be used; it is also internal.
+* HeaderFile.h files are public and are distributed, and the classes within should be exported using CLUCENE_EXPORT.
+* The exception to the internal/public conventions is if you use the static library. In this case the internal
+  symbols will be available (this is the way the test program tests internal code). However, this is not recommended.
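+
+For example (paths are illustrative, following the conventions above):
+  #include "CLucene/index/IndexReader.h"    // public header: fine for client code
+  // CLucene/index/_IndexReader.h is private; do not include it outside clucene-core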
+
+Memory management
+------------------
+Memory in CLucene has been a bit of a difficult thing to manage because of the
+unclear specification about who owns what memory. This was mostly a result of
+CLucene's Java-esque coding style, which came from porting from Java to C++ without
+too much re-writing of the API. However, CLucene is slowly improving
+in this respect, and we try to follow these development and coding rules (though
+we don't guarantee that they are all met at this stage):
+
+1. Whenever possible the caller must create the object that is being filled. For example:
+IndexReader->getDocument(id, document);
+As opposed to the old method of document = IndexReader->getDocument(id);
+
+2. Clone always returns a new object that must be cleaned up manually.
+
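+A minimal sketch of rules 1 and 2 (illustrative only: 'reader', 'id' and the
+clone() method are assumptions here; _CLDELETE is CLucene's delete macro):
+
+   #include "CLucene.h"
+   using namespace lucene::document;
+
+   Document doc;                     // rule 1: the caller creates the object...
+   reader->getDocument(id, doc);     // ...and the reader fills it
+   Document* copy = doc.clone();     // rule 2: clone returns a new object...
+   _CLDELETE(copy);                  // ...which must be cleaned up manually
+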
+Questions:
+1. What should be the convention for an object taking ownership of memory?
+   Some documentation is available on this, but not much.
+
+Working with valgrind
+----------------------
+Valgrind reports memory leaks and memory problems. Tests should always pass
+valgrind before being accepted.
+
+#valgrind --leak-check=full <program>
+
+Memory leak tracking with dmalloc
+---------------------------------
+dmalloc (http://dmalloc.com/) is also a nice tool for finding memory leaks. 
+To enable, set the ENABLE_DMALLOC flag to ON in cmake. You will of course
+have to have the dmalloc lib installed for this to work.
+
+The cl_test file will by default print a low number of errors and leaks into
+the dmalloc.log.txt file (however, this has a tendency to print false positives). 
+You can override this by setting your environment variable DMALLOC_OPTIONS. 
+See http://dmalloc.com/ or dmalloc --usage for more information on how to use dmalloc.
+
+For example:
+# DMALLOC_OPTIONS=medium,log=dmalloc.log.txt
+# export DMALLOC_OPTIONS
+
+UPDATE: when I upgraded my machine to Ubuntu 9.04, dmalloc stopped working (it
+caused clucene to crash).
+
+Performance with callgrind
+--------------------------
+Really simple:
+
+# valgrind --tool=callgrind <command: e.g. bin/cl_test>
+This will create a file like callgrind.out.12345. You can open this with kcachegrind or some
+tool like that.
+
+
+Performance with gprof
+----------------------
+Note: I recommend callgrind; it works much better.
+
+Compile with gprof turned on (ENABLE_GPROF in the cmake gui or using ccmake).
+I've found (at least on Windows cygwin) that gprof wasn't working over
+DLL boundaries; running the cl_test-pedantic monolithic build worked better.
+
+This is typically what I use to produce some meaningful output after a -pg
+compiled application has exited:
+# gprof bin/cl_test-pedantic.exe gmon.out >gprof.txt
+
+Code coverage with gcov
+-----------------------
+To create a code coverage report of the test, you can use gcov. Here are the
+steps I followed to create a nice html report. You'll need the lcov package
+installed to generate html. Also, I recommend using an out-of-source build
+directory as there are lots of files that will be generated.
+
+NOTE: you must have lcov installed for this to work
+
+* It is normally recommended to compile with no optimisations, so change CMAKE_BUILD_TYPE
+to Debug.
+
+* I have created a cl_test-gcov target which contains the necessary gcc switches
+already. So all you need to do is
+# make test-gcov
+
+If everything goes well, there will be a directory called code-coverage containing the report.
+
+If you want to do this process manually, then:
+# lcov --directory ./src/test/CMakeFiles/cl_test-gcov.dir/__/core/CLucene -c -o clucene-coverage.info
+# lcov --remove clucene-coverage.info "/usr/*" > clucene-coverage.clean
+# genhtml -o clucene-coverage clucene-coverage.clean
+
+If both those commands pass, then there will be a clucene coverage report in the 
+clucene-coverage directory.
+
+Benchmarks
+----------
+Very little benchmarking has been done on clucene. Andi Vajda posted some 
+limited statistics on the clucene list a while ago with the following results.
+
+There are 250 HTML files under $JAVA_HOME/docs/api/java/util for about
+6108kb of HTML text. 
+org.apache.lucene.demo.IndexFiles with java and gcj: 
+on mac os x 10.3.1 (panther) powerbook g4 1ghz 1gb:
+    . running with java 1.4.1_01-99 : 20379 ms
+    . running with gcj 3.3.2 -O2    : 17842 ms
+    . running clucene 0.8.9's demo  :  9930 ms 
+
+I recently did some more tests and came up with these rough results:
+663mb (797 files) of Gutenberg texts
+on a Pentium 4 running Windows XP with 1 GB of RAM. Indexing max 100,000 fields:
+- Jlucene: 646453ms. peak mem usage ~72mb, avg ~14mb ram
+- Clucene: 232141ms. peak mem usage ~60mb, avg ~4mb ram
+
+Searching the index using 10,000 single-word queries:
+- Jlucene: ~60078ms and used ~13mb ram
+- Clucene: ~48359ms and used ~4.2mb ram
+
+Distribution
+------------
+CPack is used for creating distributions.
+* Create a out-of-source build as per usual
+* Make sure the version number is correct (see <clucene>/CMakeLists.txt, right at the top of the file)
+* Make sure you are compiling in the correct release mode (check ccmake or the cmake gui)
+* Make sure you enable ENABLE_PACKAGING (check ccmake or the cmake gui)
+* Next, check that the package is compliant using several tests (must be done from a linux terminal, or cygwin):
+# cd <clucene>/build-name
+# ../dist-test.sh
+* Make sure the source directory is clean. Make sure there are no unknown svn files:
+# svn stat .. 
+* Run the tests to make sure that the code is ok (documented above)
+* If all tests pass, then run
+# make package
+for the binary package (and header files). This will only create a tar.gz package.
+and/or
+# make package_source
+for the source package. This will create a ZIP on windows, and tar.bz2 and tar.gz packages on other platforms.
+
+There are also options to create RPM, Cygwin, NSIS, Debian packages, etc. It depends on your version of CPack.
+Call 
+# cpack --help
+to get a list of generators. 
+
+Then create a special package by calling
+# cpack -G <GENERATOR> CPackConfig.cmake
+
diff --git a/LGPL.license b/LGPL.license
new file mode 100644
index 0000000..422c760
--- /dev/null
+++ b/LGPL.license
@@ -0,0 +1,475 @@
+		  GNU LESSER GENERAL PUBLIC LICENSE
+		       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+     59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+-------------------------------------------------------------------------------
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it becomes
+a de-facto standard.  To achieve this, non-free programs must be
+allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+-------------------------------------------------------------------------------
+
+		  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+  
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+-------------------------------------------------------------------------------
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+-------------------------------------------------------------------------------
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+-------------------------------------------------------------------------------
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at
+    least three years, to give the same user the materials
+    specified in Subsection 6a, above, for a charge no more
+    than the cost of performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+-------------------------------------------------------------------------------
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+-------------------------------------------------------------------------------
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded.  In such case, this License incorporates the limitation as if
+written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+-------------------------------------------------------------------------------
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+			    NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
diff --git a/NEWS b/NEWS
new file mode 100644
index 0000000..e69de29
diff --git a/README b/README
new file mode 100644
index 0000000..9b411e7
--- /dev/null
+++ b/README
@@ -0,0 +1,63 @@
+CLucene README
+==============
+
+------------------------------------------------------
+CLucene is a C++ port of Lucene: a high-performance,
+full-featured text search engine. Because it is written
+in C++ rather than Java, CLucene can be faster than
+Java Lucene.
+------------------------------------------------------
+
+CLucene has contributions from many people; see the AUTHORS file.
+
+CLucene is distributed under the GNU Lesser General Public License (LGPL) 
+	*or*
+the Apache License, Version 2.0
+See the LGPL.license and APACHE.license for the respective license information.
+Read COPYING for more about the license.
+
+
+Installation
+------------
+Read the INSTALL file
+
+
+Mailing List
+------------
+Questions and discussion should be directed to the CLucene mailing list
+  at clucene-developers@lists.sourceforge.net  
+Find subscription instructions at 
+  http://lists.sourceforge.net/lists/listinfo/clucene-developers
+Suggestions and bug reports can be made on our bug tracking database
+  (http://sourceforge.net/tracker/?group_id=80013&atid=558446)
+
+
+The latest version
+------------------
+Details of the latest version can be found on the CLucene sourceforge project
+web site: http://www.sourceforge.net/projects/clucene
+
+
+Documentation
+-------------
+You can build your own documentation by running 'make doc' from your 
+'out-of-source' cmake-configured build directory.
+CLucene is a very close port of Java Lucene, so you can also try looking at the
+Java Docs on http://lucene.apache.org/java/
+There is an online version (which won't be as up-to-date as one you build
+yourself) at http://clucene.sourceforge.net/doc/html/
+
+
+Acknowledgments
+---------------
+The Apache Lucene project is the basis for this software, so the biggest
+acknowledgment goes to that project.
+
+We wish to acknowledge the following copyrighted works that
+make up portions of the CLucene software:
+
+This software contains code derived from the RSA Data Security
+Inc. MD5 Message-Digest Algorithm.
+
+CLucene relies heavily on the use of cmake to provide a stable build environment.
+
diff --git a/README.PACKAGE b/README.PACKAGE
new file mode 100644
index 0000000..e33b1d4
--- /dev/null
+++ b/README.PACKAGE
@@ -0,0 +1,11 @@
+CLucene is a C++ port of the popular Apache Lucene search engine
+(http://lucene.apache.org/java). It is released under LGPL or the Apache
+License.
+
+CLucene aims to be a high-speed alternative to Java Lucene; its API is very
+similar to that of the Java version. CLucene has recently been brought up to
+date with Lucene 2.3.2. It contains most of the same functionality as the
+Java version.
+
+This package contains the files necessary for running applications that
+use the libclucene library.
diff --git a/REQUESTS b/REQUESTS
new file mode 100644
index 0000000..762fce5
--- /dev/null
+++ b/REQUESTS
@@ -0,0 +1,4 @@
+The todo list has been moved to the tracker at
+http://sourceforge.net/tracker/?func=browse&group_id=80013&atid=558449
+
+You need to be logged into sourceforge to view this list.
diff --git a/cmake/CLuceneBoost.cmake b/cmake/CLuceneBoost.cmake
new file mode 100644
index 0000000..f6e2558
--- /dev/null
+++ b/cmake/CLuceneBoost.cmake
@@ -0,0 +1,23 @@
+#Locate Boost libs. Windows users: make sure BOOST_ROOT and BOOST_PATH are set correctly in your environment.
+#See the site FAQ for more details.
+
+MACRO (GET_BOOST_INCLUDE_PATH path)
+  #todo: allow this to fall back on a local distributed copy, so the user doesn't have to download Boost separately
+  SET(Boost_USE_MULTITHREAD ON)
+  message(STATUS "old Boost_INCLUDE_DIR    : ${Boost_INCLUDE_DIR}")
+  FIND_PACKAGE( Boost )
+  
+  #todo: limit Boost version?
+  #todo: use COMPONENTS threads to locate boost_threads without breaking the current support
+  IF(Boost_FOUND)
+    IF (NOT _boost_IN_CACHE)
+      MESSAGE( "Boost found" )
+      message(STATUS "Boost_INCLUDE_DIR    : ${Boost_INCLUDE_DIR}")
+    ENDIF (NOT _boost_IN_CACHE)
+    SET(${path} ${Boost_INCLUDE_DIRS} )
+  ELSE()
+    MESSAGE( "Boost not found, using local: ${clucene_SOURCE_DIR}/src/ext" )
+    SET(${path} ${clucene_SOURCE_DIR}/src/ext )
+  ENDIF()
+ENDMACRO (GET_BOOST_INCLUDE_PATH path)
+
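As context for the macro above: GET_BOOST_INCLUDE_PATH writes the chosen include path into whatever variable name it is given, falling back to the bundled copy under src/ext when no system Boost is found. A minimal sketch of how a top-level CMakeLists.txt might consume it (the BOOST_INC_PATH variable name is hypothetical):

    INCLUDE(cmake/CLuceneBoost.cmake)
    GET_BOOST_INCLUDE_PATH(BOOST_INC_PATH)     # fills the hypothetical BOOST_INC_PATH variable
    INCLUDE_DIRECTORIES(${BOOST_INC_PATH})     # make the Boost headers visible to all targets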
diff --git a/cmake/CLuceneDocs.cmake b/cmake/CLuceneDocs.cmake
new file mode 100644
index 0000000..d46eed9
--- /dev/null
+++ b/cmake/CLuceneDocs.cmake
@@ -0,0 +1,151 @@
+# - CLuceneDocs.cmake
+# This file provides support for building the CLucene Documentation.
+# To build the documentation, you will have to enable it
+# and then do the equivalent of "make doc".
+OPTION(ENABLE_CLDOCS "Build the clucene documentation." OFF)
+
+MACRO(SET_YESNO)
+    FOREACH(param ${ARGV})
+        IF ( ${param} )
+            SET(${param} "YES")
+        ELSE ( ${param} )
+            SET(${param} "NO")
+        ENDIF ( ${param} )
+    ENDFOREACH(param)
+ENDMACRO(SET_YESNO)
+MACRO(SET_BLANK)
+    FOREACH(param ${ARGV})
+        IF ( NOT ${param} )
+            SET(${param} "")
+        ENDIF ( NOT ${param} )
+    ENDFOREACH(param)
+ENDMACRO(SET_BLANK)
+
+IF (ENABLE_CLDOCS)
+    OPTION(CLDOCS_HTML_HELP 
+        "Doxygen should compile HTML into a Help file (CHM)." NO)
+        
+    OPTION(CLDOCS_HTML
+        "Doxygen should build HTML documentation." YES)
+    OPTION(CLDOCS_XML
+        "Doxygen should build XML documentation." NO)
+    OPTION(CLDOCS_RTF
+        "Doxygen should build RTF documentation." NO)
+    OPTION(CLDOCS_MAN
+        "Doxygen should build man documentation." NO)
+    OPTION(CLDOCS_TAGFILE
+        "Doxygen should build a tagfile." NO)
+        
+    OPTION(CLDOCS_LATEX
+        "Doxygen should build Latex documentation." NO )
+
+    MARK_AS_ADVANCED(
+        CLDOCS_HTML_HELP
+        CLDOCS_LATEX
+        CLDOCS_XML
+        CLDOCS_HTML
+        CLDOCS_RTF
+        CLDOCS_MAN
+        CLDOCS_TAGFILE
+    )
+    
+    #
+    # Check for the tools
+    #
+    FIND_PACKAGE(Doxygen)
+    
+    IF ( DOXYGEN_FOUND )
+        # This creates a new target to build documentation.
+        # It runs ${DOXYGEN_EXECUTABLE}, which is the full path to the
+        # Doxygen executable on your system, as set by the
+        # FindDoxygen.cmake module.
+        # It runs the final generated Doxyfile against it.
+        # The DOT_PATH is substituted into the Doxyfile.
+        ADD_CUSTOM_TARGET(doc
+            ${DOXYGEN_EXECUTABLE} ${PROJECT_BINARY_DIR}/doc/doxyfile
+        )
+        
+        IF ( CLDOCS_HTML_HELP )
+            IF ( NOT CLDOCS_HTML )
+                MESSAGE ( FATAL_ERROR "CLDOCS_HTML is required to build CLDOCS_HTML_HELP" )
+            ENDIF ( NOT CLDOCS_HTML )
+            FIND_PACKAGE(HTMLHelp)
+            IF ( NOT HTML_HELP_COMPILER )
+                MESSAGE(FATAL_ERROR "HTML Help compiler not found, turn CLDOCS_HTML_HELP off to proceed")
+            ENDIF ( NOT HTML_HELP_COMPILER )
+            
+            #make cygwin work with hhc...
+            IF ( CYGWIN )
+                EXECUTE_PROCESS ( COMMAND cygpath "${HTML_HELP_COMPILER}"
+                    OUTPUT_VARIABLE HTML_HELP_COMPILER_EX )
+                STRING ( REPLACE "\n" "" HTML_HELP_COMPILER_EX ${HTML_HELP_COMPILER_EX} )
+                STRING ( REPLACE "\r" "" HTML_HELP_COMPILER_EX ${HTML_HELP_COMPILER_EX} )
+                SET ( HTML_HELP_COMPILER_EX "\"${HTML_HELP_COMPILER_EX}\"" )
+            ELSE ( CYGWIN )
+                SET ( HTML_HELP_COMPILER_EX ${HTML_HELP_COMPILER} )
+            ENDIF ( CYGWIN )
+        ENDIF ( CLDOCS_HTML_HELP )
+        
+        IF ( CLDOCS_LATEX )
+            FIND_PACKAGE(LATEX)
+            IF ( NOT LATEX_COMPILER )
+                MESSAGE(FATAL_ERROR "Latex compiler not found, turn CLDOCS_LATEX off to proceed")
+            ENDIF ( NOT LATEX_COMPILER )
+        ENDIF ( CLDOCS_LATEX )
+    
+        FIND_PACKAGE(Perl)
+        
+        IF ( DOXYGEN_DOT_EXECUTABLE )
+            SET ( HAVE_DOT "YES" )
+        ELSE ( DOXYGEN_DOT_EXECUTABLE )
+            SET ( HAVE_DOT "NO" )
+        ENDIF ( DOXYGEN_DOT_EXECUTABLE )
+        
+        #doxygen expects YES/NO parameters
+        SET_YESNO(
+            CLDOCS_HTML_HELP
+            CLDOCS_LATEX
+            CLDOCS_XML
+            CLDOCS_HTML
+            CLDOCS_RTF
+            CLDOCS_MAN
+        )
+        #empty out paths if not found
+        SET_BLANK(
+            PERL_EXECUTABLE
+            DOXYGEN_DOT_EXECUTABLE
+            HTML_HELP_COMPILER
+            LATEX_COMPILER
+        )
+        
+        IF ( CLDOCS_TAGFILE )
+            SET ( CLDOCS_TAGFILE_LOCATION "${PROJECT_BINARY_DIR}/doc/tag/clucene.tag"  )
+        ENDIF ( CLDOCS_TAGFILE )
+        
+        # This processes our Doxyfile.cmake and substitutes paths to generate a final Doxyfile
+        CONFIGURE_FILE(${PROJECT_SOURCE_DIR}/doc/Doxyfile.cmake ${PROJECT_BINARY_DIR}/doc/doxyfile )
+        CONFIGURE_FILE(${PROJECT_SOURCE_DIR}/doc/helpheader.htm.cmake ${PROJECT_BINARY_DIR}/doc/helpheader.htm )
+        CONFIGURE_FILE(${PROJECT_SOURCE_DIR}/doc/helpfooter.htm.cmake ${PROJECT_BINARY_DIR}/doc/helpfooter.htm )
+        CONFIGURE_FILE(${PROJECT_SOURCE_DIR}/doc/doxygen.css.cmake ${PROJECT_BINARY_DIR}/doc/html/doxygen.css )
+        
+        #create a target for tar.gz html help
+        FIND_PACKAGE(UnixCommands)
+        IF ( TAR AND GZIP )
+            ADD_CUSTOM_TARGET(doc-tarz
+                COMMAND "${TAR}" "-cf" "doc/clucene-core-doc.tar" "${PROJECT_BINARY_DIR}/doc/html/"
+                COMMAND "${GZIP}" "doc/clucene-core-doc.tar"
+                #DEPENDS doc-doxygen
+            )
+        ENDIF ( TAR AND GZIP )
+        
+        #install man if it was built
+        IF ( CLDOCS_MAN )
+            INSTALL(DIRECTORY ${PROJECT_BINARY_DIR}/doc/man/ DESTINATION man)
+        ENDIF ( CLDOCS_MAN )
+        
+    ELSE ( DOXYGEN_FOUND )
+        MESSAGE(FATAL_ERROR "Doxygen not found, turn ENABLE_CLDOCS off to proceed")
+    ENDIF ( DOXYGEN_FOUND )
+
+    
+ENDIF (ENABLE_CLDOCS)
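The SET_YESNO and SET_BLANK helpers above exist because Doxygen wants literal YES/NO values and empty strings rather than CMake's ON/OFF booleans. A minimal sketch of their effect, using hypothetical variable names:

    SET(FLAG_A ON)
    SET(FLAG_B OFF)
    SET_YESNO(FLAG_A FLAG_B)     # FLAG_A is now "YES", FLAG_B is now "NO"
    SET(TOOL_PATH "")
    SET_BLANK(TOOL_PATH)         # any false/unset value becomes the empty string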
diff --git a/cmake/CreateClucenePackages.cmake b/cmake/CreateClucenePackages.cmake
new file mode 100644
index 0000000..b9de7b1
--- /dev/null
+++ b/cmake/CreateClucenePackages.cmake
@@ -0,0 +1,91 @@
+#Creates all the relevant packages
+
+#Rules for version:
+#MAJOR and MINOR versions are purely political
+#REVISION version MUST be revised if the headers or compatibility change
+#PATCH should be 0 unless a patch is made that doesn't affect the public signature (i.e. clients don't need to re-compile).
+
+SET(CPACK_PACKAGE_VERSION_MAJOR ${CLUCENE_VERSION_MAJOR})
+SET(CPACK_PACKAGE_VERSION_MINOR ${CLUCENE_VERSION_MINOR})
+SET(CPACK_PACKAGE_VERSION_REVISION ${CLUCENE_VERSION_REVISION})
+SET(CPACK_PACKAGE_VERSION_PATCH ${CLUCENE_VERSION_PATCH})
+
+SET(CPACK_PACKAGE_VERSION ${CLUCENE_VERSION})
+SET(CPACK_PACKAGE_SOVERSION ${CLUCENE_SOVERSION})
+
+#SET(CPACK_PACKAGE_DESCRIPTION_SUMMARY "library for full-featured text search engine (runtime)")
+SET(CPACK_PACKAGE_VENDOR "Ben van Klinken")
+SET(CPACK_PACKAGE_CONTACT "clucene-developers@lists.sourceforge.net")
+SET(CPACK_PACKAGE_NAME "libclucene1")
+
+SET(CPACK_PACKAGE_DESCRIPTION_FILE "${CMAKE_CURRENT_SOURCE_DIR}/README.PACKAGE")
+SET(CPACK_PACKAGE_DESCRIPTION_SUMMARY "CLucene - a C++ search engine, ported from the popular Apache Lucene")
+
+SET(CPACK_RESOURCE_FILE_README "${CMAKE_CURRENT_SOURCE_DIR}/README.PACKAGE")
+SET(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/COPYING")
+#SET(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/README.PACKAGE")
+
+#so, what are we going to install?
+SET(CPACK_INSTALL_CMAKE_PROJECTS
+  "${CMAKE_BINARY_DIR};clucene-core;ALL;/"
+  "${CMAKE_BINARY_DIR};clucene-shared;ALL;/")
+SET(CPACK_COMPONENTS_ALL development runtime)
+SET(CPACK_GENERATOR "TGZ")
+SET(CPACK_PACKAGE_FILE_NAME "clucene-core-${CPACK_PACKAGE_VERSION}-${CMAKE_SYSTEM_NAME}")
+
+IF(WIN32 AND NOT UNIX)
+	SET(CPACK_SOURCE_GENERATOR "ZIP")
+ELSE(WIN32 AND NOT UNIX)
+	SET(CPACK_SOURCE_GENERATOR "TBZ2;TGZ")
+ENDIF(WIN32 AND NOT UNIX)
+SET(CPACK_SOURCE_PACKAGE_FILE_NAME "clucene-core-${CPACK_PACKAGE_VERSION}-Source")
+
+#specific packaging requirements:
+SET(CPACK_DEBIAN_PACKAGE_DEPENDS "libc6 (>= 2.4), libgcc1 (>= 1:4.1.1-21), libstdc++6 (>= 4.1.1-21), zlib1g")
+SET(CPACK_DEBIAN_PACKAGE_SECTION "libs")
+SET(CPACK_RPM_PACKAGE_LICENSE "Apache 2.0")
+SET(CPACK_RPM_PACKAGE_GROUP "libs")
+SET(CPACK_RPM_PACKAGE_REQUIRES "libz")
+
+#don't include the current binary dir.
+get_filename_component(clucene_BINARY_DIR_name ${clucene_BINARY_DIR} NAME)
+SET(CPACK_SOURCE_IGNORE_FILES
+  "/\\\\.svn/"
+  "/\\\\.git/"
+  "\\\\.swp$"
+  "\\\\.#;/#"
+  ".*~"
+  ".*\\\\.tmp"
+  ".*\\\\.save"
+  "/${clucene_BINARY_DIR_name}/"
+)
+
+IF(WIN32 AND NOT UNIX)
+  # There is a bug in NSIS that does not handle full unix paths properly. Make
+  # sure there is at least one set of four (4) backslashes.
+  SET(CPACK_GENERATOR "${CPACK_GENERATOR};NSIS")
+  #SET(CPACK_PACKAGE_ICON "${CMake_SOURCE_DIR}/Utilities/Release\\\\InstallIcon.bmp")
+  #SET(CPACK_NSIS_INSTALLED_ICON_NAME "bin\\\\MyExecutable.exe")
+  SET(CPACK_NSIS_DISPLAY_NAME "${CPACK_PACKAGE_INSTALL_DIRECTORY} CLucene Core Library")
+  SET(CPACK_NSIS_HELP_LINK "http:\\\\\\\\clucene.sourceforge.net")
+  SET(CPACK_NSIS_URL_INFO_ABOUT "http:\\\\\\\\clucene.sourceforge.net")
+  SET(CPACK_NSIS_CONTACT "clucene-developers@lists.sourceforge.net")
+  #SET(CPACK_NSIS_MODIFY_PATH ON)
+ELSE(WIN32 AND NOT UNIX)
+#  SET(CPACK_STRIP_FILES "bin/xxx")
+  SET(CPACK_SOURCE_STRIP_FILES "")
+ENDIF(WIN32 AND NOT UNIX)
+#SET(CPACK_PACKAGE_EXECUTABLES "MyExecutable" "My Executable")
+
+
+ADD_CUSTOM_TARGET(dist-package
+    COMMAND rsync -avP -e ssh ${CPACK_PACKAGE_FILE_NAME}.* ustramooner@frs.sourceforge.net:uploads/
+#    DEPENDS package
+)
+ADD_CUSTOM_TARGET(dist-package_source
+    COMMAND rsync -avP -e ssh ${CPACK_SOURCE_PACKAGE_FILE_NAME}.* ustramooner@frs.sourceforge.net:uploads/
+#    DEPENDS package_source
+)
+
+#this must be last
+INCLUDE(CPack)
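Under the versioning rules in the header comment, a change to headers or binary compatibility bumps REVISION, while PATCH moves only for fixes that leave the public signature untouched. Purely as an illustration (these numbers are hypothetical, not the project's actual version):

    SET(CLUCENE_VERSION_MAJOR    2)
    SET(CLUCENE_VERSION_MINOR    3)
    SET(CLUCENE_VERSION_REVISION 3)
    SET(CLUCENE_VERSION_PATCH    1)   # signature unchanged; clients need not recompile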
diff --git a/cmake/DefineOptions.cmake b/cmake/DefineOptions.cmake
new file mode 100644
index 0000000..97d6377
--- /dev/null
+++ b/cmake/DefineOptions.cmake
@@ -0,0 +1,53 @@
+#define global options, this makes it easy to use ccmake, or the cmake gui
+MACRO (DEFINE_OPTIONS extraOptions extraLibs)
+  IF(ENABLE_DEBUG)
+    SET (${extraOptions} "${${extraOptions}} -D_DEBUG")
+  ENDIF(ENABLE_DEBUG)
+
+  IF(ENABLE_MMAP)
+    SET (${extraOptions} "${${extraOptions}} -DLUCENE_FS_MMAP")
+  ENDIF(ENABLE_MMAP)
+
+  IF(ENABLE_DMALLOC)
+    SET (${extraOptions} "${${extraOptions}} -DDMALLOC")
+    IF ( DISABLE_MULTITHREADING )
+      SET (${extraLibs} ${${extraLibs}} "dmalloccxx")
+    ELSE( DISABLE_MULTITHREADING )
+      SET (${extraLibs} ${${extraLibs}} "dmallocthcxx")
+    ENDIF ( DISABLE_MULTITHREADING )
+  ENDIF(ENABLE_DMALLOC)
+
+  IF(DISABLE_MULTITHREADING)
+    SET (${extraOptions} "${${extraOptions}} -D_CL_DISABLE_MULTITHREADING")
+  ELSE(DISABLE_MULTITHREADING)
+    SET(${extraOptions} "${${extraOptions}} -D_REENTRANT")
+  ENDIF(DISABLE_MULTITHREADING)
+
+  IF(ENABLE_ASCII_MODE)
+    SET (${extraOptions} "${${extraOptions}} -D_ASCII")
+  ELSE(ENABLE_ASCII_MODE)
+    SET (${extraOptions} "${${extraOptions}} -D_UCS2")
+    SET (${extraOptions} "${${extraOptions}} -D_UNICODE")
+  ENDIF(ENABLE_ASCII_MODE)
+
+  IF ( MSVC80 OR MSVC90 )
+    #todo: remove this once crt functions are fixed...
+    SET (${extraOptions} "${${extraOptions}} -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE")
+  ENDIF ( MSVC80 OR MSVC90 )
+
+  IF(CYGWIN)
+    ADD_DEFINITIONS(-D__LARGE64_FILES)
+  ENDIF(CYGWIN)
+
+  # calm down msvc
+  IF(MSVC)
+    IF ( NOT MSVC60 )
+      #ADD_DEFINITIONS(-wd4251) # 'identifier' : class 'type' needs to have dll-interface to be used by clients of class 'type2'
+      #ADD_DEFINITIONS(-wd4275) # non DLL-interface classkey 'identifier' used as base for DLL-interface classkey 'identifier'
+      #ADD_DEFINITIONS(-wd4309) # 'conversion' : truncation of constant value
+      #ADD_DEFINITIONS(-wd4503) # decorated name length exceeded
+      #ADD_DEFINITIONS(-wd4786) # identifier was truncated to '255' characters in the debug information
+    ENDIF ( NOT MSVC60 )
+  ENDIF(MSVC)
+
+ENDMACRO (DEFINE_OPTIONS)
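DEFINE_OPTIONS only appends to the two variable names handed to it; the caller decides how the accumulated flags and libraries get applied. A minimal usage sketch (CLUCENE_DEFS and CLUCENE_LIBS are hypothetical names, and the target name is illustrative):

    DEFINE_OPTIONS(CLUCENE_DEFS CLUCENE_LIBS)
    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CLUCENE_DEFS}")   # options accumulate as one string
    TARGET_LINK_LIBRARIES(clucene-core ${CLUCENE_LIBS})         # extra libs accumulate as a list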
diff --git a/cmake/Toolchain-g++32.cmake b/cmake/Toolchain-g++32.cmake
new file mode 100644
index 0000000..e891324
--- /dev/null
+++ b/cmake/Toolchain-g++32.cmake
@@ -0,0 +1,20 @@
+# Cross compiling from linux using g++-multilib to create 32 bit output
+# On ubuntu, you'll need to install the packages: g++-multilib gcc-multilib
+#
+# Use of this file:
+# cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchain-g++32.cmake ..
+
+SET(CMAKE_CXX_FLAGS "-m32")
+SET(CMAKE_C_FLAGS "-m32")
+SET(CMAKE_EXE_LINKER_FLAGS "-m32")
+SET(CMAKE_MODULE_LINKER_FLAGS "-m32")
+
+# here is where the target environment is located
+SET(CMAKE_FIND_ROOT_PATH  /usr/lib32 )
+
+# adjust the default behaviour of the FIND_XXX() commands:
+# search headers and libraries in the target environment, search 
+# programs in the host environment
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
diff --git a/cmake/Toolchain-llvm.cmake b/cmake/Toolchain-llvm.cmake
new file mode 100644
index 0000000..b8616bb
--- /dev/null
+++ b/cmake/Toolchain-llvm.cmake
@@ -0,0 +1,8 @@
+# Use of this file:
+# cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchain-llvm.cmake ..
+
+# which compilers to use for C and C++
+SET(CMAKE_C_COMPILER clang)
+SET(CMAKE_CXX_COMPILER clang++)
+
+
diff --git a/cmake/Toolchain-mingw32.cmake b/cmake/Toolchain-mingw32.cmake
new file mode 100644
index 0000000..ec5e729
--- /dev/null
+++ b/cmake/Toolchain-mingw32.cmake
@@ -0,0 +1,32 @@
+# Cross compiling from linux using mingw32 tools
+# On ubuntu, you'll need to install the packages: mingw32, mingw32-binutils, mingw32-runtime
+#
+# Use of this file:
+# cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchain-mingw32.cmake -C ../cmake/Define-mingw32.cmake ..
+
+# the name of the target operating system
+SET(CMAKE_SYSTEM_NAME Windows)
+
+# which compilers to use for C and C++
+SET(CMAKE_C_COMPILER i586-mingw32msvc-gcc)
+SET(CMAKE_CXX_COMPILER i586-mingw32msvc-g++)
+
+# here is where the target environment is located
+SET(CMAKE_FIND_ROOT_PATH  /usr/i586-mingw32msvc /home/alex/mingw-install )
+
+INCLUDE_DIRECTORIES(/usr/lib/gcc/i586-mingw32msvc/4.2.1-sjlj/include/c++)
+
+# adjust the default behaviour of the FIND_XXX() commands:
+# search headers and libraries in the target environment, search 
+# programs in the host environment
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+SET(_CL_HAVE_GCCVISIBILITYPATCH 0)
+SET(_CL_HAVE_NAMESPACES_EXITCODE 0)
+SET(_CL_HAVE_NO_SNPRINTF_BUG_EXITCODE 0)
+SET(_CL_HAVE_NO_SNWPRINTF_BUG_EXITCODE 0)
+SET(LUCENE_STATIC_CONSTANT_SYNTAX_EXITCODE 1)
+SET(_CL_HAVE_TRY_BLOCKS_EXITCODE 0)
+
diff --git a/cmake/TurboPFOR.cmake b/cmake/TurboPFOR.cmake
new file mode 100644
index 0000000..d84daff
--- /dev/null
+++ b/cmake/TurboPFOR.cmake
@@ -0,0 +1,15 @@
+PROJECT(turbo-pfor)
+set(PFOR_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src/ext/for)
+
+add_custom_target(build_ic ALL
+        COMMAND make USE_AVX2=${USE_AVX2} libic.a -j 8
+        WORKING_DIRECTORY ${PFOR_SOURCE_DIR}
+        COMMENT "Original Turbo-PFOR makefile target")
+
+add_library(ic STATIC IMPORTED)
+set_target_properties(ic PROPERTIES IMPORTED_LOCATION "${PFOR_SOURCE_DIR}/libic.a")
+add_dependencies(ic build_ic)
+
+install(FILES ${PFOR_SOURCE_DIR}/libic.a
+        DESTINATION "lib"
+        COMPONENT development)
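Because libic.a is wrapped as an IMPORTED static library, and add_dependencies ties it to the build_ic custom target, consumers can treat it as an ordinary CMake target. A sketch with a hypothetical consumer:

    add_executable(pfor_demo demo.cpp)    # hypothetical target and source file
    target_link_libraries(pfor_demo ic)   # links libic.a; build_ic runs first via the dependency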
diff --git a/cmake/cmake_uninstall.cmake.in b/cmake/cmake_uninstall.cmake.in
new file mode 100644
index 0000000..eee28b6
--- /dev/null
+++ b/cmake/cmake_uninstall.cmake.in
@@ -0,0 +1,23 @@
+IF(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+  MESSAGE(FATAL_ERROR "Cannot find install manifest: \"@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt\"")
+ENDIF(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+
+FILE(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files)
+STRING(REGEX REPLACE "\n" ";" files "${files}")
+FOREACH(file ${files})
+  MESSAGE(STATUS "Uninstalling \"${file}\"")
+  IF(EXISTS "${file}")
+    EXEC_PROGRAM(
+      "@CMAKE_COMMAND@" ARGS "-E remove \"${file}\""
+      OUTPUT_VARIABLE rm_out
+      RETURN_VALUE rm_retval
+     )
+    IF("${rm_retval}" STREQUAL 0)
+    ELSE("${rm_retval}" STREQUAL 0)
+      MESSAGE(FATAL_ERROR "Problem when removing \"${file}\"")
+    ENDIF("${rm_retval}" STREQUAL 0)
+    
+  #ELSE(EXISTS "${file}")
+  #  MESSAGE(STATUS "File \"${file}\" does not exist.")
+  ENDIF(EXISTS "${file}")
+ENDFOREACH(file)
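This template is only half of the uninstall pattern: the top-level CMakeLists.txt still has to configure it and register a target that runs the generated script. A sketch of the conventional wiring, assuming the template lives at cmake/cmake_uninstall.cmake.in:

    CONFIGURE_FILE(
        "${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in"
        "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake"
        IMMEDIATE @ONLY)
    ADD_CUSTOM_TARGET(uninstall
        "${CMAKE_COMMAND}" -P "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake")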
diff --git a/dist-test.sh b/dist-test.sh
new file mode 100755
index 0000000..e556d74
--- /dev/null
+++ b/dist-test.sh
@@ -0,0 +1,272 @@
+#!/bin/bash
+#Check compliance with Coding standards...
+
+#where to keep the temp files...
+TMP=disttest
+
+function usage {
+    echo "usage: ../dist-test.sh [all | "
+    echo "    <env - creates environment>"
+    echo "    <c_all - compile all headers together>"
+    echo "    <compile - compile and test>"
+    echo "    <inline - test for inline using doxygen documents>]"
+    echo "    <c_header - test that each header compiles independently of each other>]"
+    echo "    <license - test that each header has a valid license>]"
+    echo "    <ifdefs - test that each header doesn't have invalid ifdefs>]"
+    echo "    <exports - test that each header exports all its classes>]"
+    exit 1;
+}
+t_all=0
+t_env=0
+t_c_all=0
+t_inline=0
+t_compile=0
+t_c_h=0
+t_license=0
+t_ifdefs=0
+t_exports=0
+FAIL=0
+
+if [ $# -eq 0 ]; then
+    usage
+else
+    while [ "$1" != "" ]; do
+        if [ "$1" == "all" ]; then
+            t_all=1
+        elif [ "$1" == "env" ]; then
+            t_env=1
+        elif [ "$1" == "c_all" ]; then
+            t_c_all=1
+        elif [ "$1" == "inline" ]; then
+            t_inline=1
+        elif [ "$1" == "compile" ]; then
+            t_compile=1
+        elif [ "$1" == "c_header" ]; then
+            t_c_h=1
+        elif [ "$1" == "license" ]; then
+            t_license=1
+        elif [ "$1" == "ifdefs" ]; then
+            t_ifdefs=1
+        elif [ "$1" == "exports" ]; then
+            t_exports=1
+        else
+            usage
+        fi
+        shift
+    done
+fi
+
+if [ $t_all -eq 1 ]; then
+    t_env=1
+    t_c_all=1
+    t_c_h=1
+    t_inline=1
+    t_compile=1
+    t_license=1
+    t_ifdefs=1
+    t_exports=1
+fi
+
+
+#check to see that no #ifdefs exist in headers that don't belong
+function checkForIfdefs {
+    I=0
+    grep "#if" $1| grep -v "_UCS2" |grep -v "_CL_HAVE_" |grep -v "_ASCII" |grep -v "_WIN32"|grep -v "_MSC_"|grep -v "__MINGW32__" |grep -v "_WIN64" | while read line; do
+        I=`expr $I + 1`
+        if [ $I -gt 1 ]; then
+            echo $1 might have invalid ifdef: $line
+        fi
+    done
+}
+
+
+if [ $t_env -eq 1 ]; then
+    rm -fdr $TMP 2>/dev/null
+    mkdir $TMP
+    
+    #create header file for testing of symbols in headers.
+    echo "#include \"CLucene/StdHeader.h"\" >$TMP/pub-headers.cpp
+
+		#iterate all headers
+    for H in `find ../src/shared/CLucene| grep "\.h$"` `find ../src/core/CLucene| grep "\.h$"`; do
+        BH=`basename "$H"`
+        DN=`dirname "$H"`
+        if [ "${BH:0:1}" != "_" ]; then
+            DH=`dirname "${H:3}"`
+        
+            if [ "${H:7}" != "core/CLucene/util/Reader.h" ]; then
+	            #move headers somewhere to compile
+	            mkdir -p "$TMP/$DH" 2>/dev/null
+	            ln -s "`cd "$DN" && pwd`/$BH" "$TMP/${H:3}" 2>/dev/null
+	            
+	            #create pub-headers.cpp
+              echo "#include \"${H:7}\"" >>$TMP/pub-headers.cpp
+            fi
+        fi
+    done
+    
+    echo "int main(){return 0;}"  >>$TMP/pub-headers.cpp
+fi
+
+
+################################################
+#now the environment is finished being setup...
+################################################
+echo "Starting tests..."
+
+if [ $t_c_h -eq 1 ] || [ $t_ifdefs -eq 1 ] || [ $t_exports -eq 1 ]; then
+    for H in `find $TMP/src | grep "\.h$"`; do
+        BH=`basename "$H"`
+        DH=`dirname "${H:3}"`
+        
+        if [ $t_ifdefs -eq 1 ]; then
+            checkForIfdefs $H
+        fi
+    
+        #check that all classes are exported
+        if [ $t_exports -eq 1 ]; then
+          if [ "${BH:0:1}" == "_" ]; then
+              #internal headers... none must be exported
+              XX=`awk '/^[ \t]*(class|struct)/ { print $line }' $H| grep -v ";$"| grep -e CLUCENE_EXPORT -e CLUCENE_INLINE_EXPORT -e CLUCENE_SHARED_EXPORT -e CLUCENE_SHARED_INLINE_EXPORT`
+              if [ "$XX" != "" ]; then
+                  echo "$H is internal but has exported class: $XX"
+                  echo ""
+                  FAIL=1
+              fi
+          else
+              #external headers... all must be exported
+              XX=`awk '/^[ \t]*(class|struct)/ { print $line }' $H| grep -v ";$"| grep -v CLUCENE_EXPORT| grep -v CLUCENE_INLINE_EXPORT| grep -v CLUCENE_SHARED_EXPORT| grep -v CLUCENE_SHARED_INLINE_EXPORT`
+              if [ "$XX" != "" ]; then
+                  echo "$H has unexported class: $XX"
+                  echo ""
+                  FAIL=1
+              fi
+          fi
+        fi
+        
+        #test that each header compiles independently...
+        if [ $t_c_h -eq 1 ]; then
+            echo "#include \"CLucene/StdHeader.h\"" >$TMP/pub-header.cpp
+            echo "#include \"$H\"" >>$TMP/pub-header.cpp
+            echo "int main(){ return 0; }" >>"$TMP/pub-header.cpp"
+            ERROR=`g++ -I. -I$TMP/src/shared -I./src/shared -I../src/ext -I$TMP/src/core $TMP/pub-header.cpp 2>&1`
+            if [ $? -ne 0 ]; then
+                echo ""
+                echo "$H doesn't compile separately..."
+                echo $ERROR
+                FAIL=1
+            fi
+        fi
+    done
+    
+    
+    if [ $t_ifdefs -eq 1 ]; then
+      echo "Not all ifdefs are invalid, you have to figure it out for yourself :-)"
+      echo "If defs in classes which change depending on a user setting can cause big problems due to offset changes"
+      echo "for example:"
+      echo "class X {"
+      echo " #ifdef _DEBUG"
+      echo "  int x;"
+      echo " #endif"
+      echo " int y;"
+      echo "}"
+      echo "If the library is compiled with _DEBUG, and then a user calls y without _DEBUG defined, unexpected behaviour will occur"
+    fi
+fi
+
+#iterate all our code...
+if [ $t_license -eq 1 ]; then
+    for H in `find ../src`; do
+        BH=`basename "$H"`
+        BH_len=${#BH}
+        
+        if [ "${BH:BH_len-2}" == ".h" ] || [ "${BH:BH_len-2}" == ".c" ] || [ "${BH:BH_len-4}" == ".cpp" ]; then
+
+            #snowball has its own license...
+            if [ "`echo $H|grep 'snowball/src_c'`" != "" ]; then
+                continue
+            fi
+            #libstemmer has its own license...
+            if [ "`echo $H|grep 'libstemmer'`" != "" ]; then
+                continue
+            fi
+            #zlib has its own license...
+            if [ "`echo $H|grep 'CLucene/util/zlib'`" != "" ]; then
+                continue
+            fi
+		        
+            if [ "`awk '/\* Copyright \(C\) [0-9]*-[0-9]* .*$/ { print $line }' $H`" == "" ]; then
+                if [ "`awk '/\* Copyright [0-9]*-[0-9]* .*$/ { print $line }' $H`" == "" ]; then
+                    echo "$H ($BH) has invalid license"
+                    FAIL=1
+                fi
+            fi
+        fi
+    done
+fi
+
+
+#test if headers can compile together by themselves:
+if [ $t_c_all -eq 1 ]; then
+    g++ -I$TMP/src -I../src/ext -I$TMP/src/shared -I$TMP/src/core $TMP/pub-headers.cpp -I./src/shared
+fi
+
+if [ $t_inline -eq 1 ]; then
+    if [ ! -d "./doc" ]; then
+        echo "Couldn't find docs, run:"
+        echo "# cmake -DENABLE_CLDOCS:BOOLEAN=TRUE ."
+        echo "# make doc"
+        echo "and then try again"
+        exit 1
+    fi
+	
+    INLINES=0
+    for line in `grep -c "\[inline" doc/html/*.html|grep -v ":0$"|grep -v "util"|grep -v "jstreams"`; do
+    
+        #ignore some actual inlines...
+        if [ "doc/html/classlucene_1_1index_1_1Term.html:1" == $line ]; then
+            continue;
+        fi
+        if [ "doc/html/classlucene_1_1search_1_1Similarity.html:1" == $line ]; then
+            continue;
+        fi
+        if [ "doc/html/classlucene_1_1store_1_1BufferedIndexInput.html:1" == $line ]; then
+            continue;
+        fi
+        
+        if [ $INLINES -eq 0 ]; then
+            echo "These files report inline code:"
+            INLINES=1
+            FAIL=1
+        fi
+        echo $line
+    done
+fi
+
+if [ $t_compile -eq 1 ]; then
+    #compile separately
+    make cl_test
+    if [ $? -ne 0 ]; then 
+        FAIL=1; 
+    fi
+    
+    echo "Undefines for shared lib:"
+    nm -u --demangle bin/libclucene-shared.so |grep -E "lucene::"
+    echo "Undefines for core lib:"
+    nm -u --demangle bin/libclucene-core.so |grep -E "lucene::"|grep -v "lucene::util::Misc" |grep -v "lucene::util::mutex" |grep -v "lucene::util::StringBuffer"|grep -v "lucene::util::shared_condition"
+
+    #compile together
+    make test-all
+    if [ $? -ne 0 ]; then 
+        FAIL=1; 
+    fi
+    
+    
+fi
+
+
+if [ $FAIL == 1 ]; then
+    echo "There were errors, please correct them and re-run"
+    exit 1
+fi
+exit 0
diff --git a/doc/Doxyfile.cmake b/doc/Doxyfile.cmake
new file mode 100644
index 0000000..ab090e3
--- /dev/null
+++ b/doc/Doxyfile.cmake
@@ -0,0 +1,237 @@
+# Doxyfile 1.2.18
+
+#---------------------------------------------------------------------------
+# General configuration options
+#---------------------------------------------------------------------------
+
+PROJECT_NAME           = CLucene-core
+PROJECT_NUMBER         = @CPACK_PACKAGE_VERSION@
+
+OUTPUT_DIRECTORY       = @PROJECT_BINARY_DIR@/doc
+OUTPUT_LANGUAGE        = English
+
+EXTRACT_ALL            = YES
+EXTRACT_PRIVATE        = NO
+EXTRACT_STATIC         = YES
+EXTRACT_LOCAL_CLASSES  = NO
+HIDE_UNDOC_MEMBERS     = NO
+HIDE_UNDOC_CLASSES     = NO
+HIDE_FRIEND_COMPOUNDS  = NO
+BRIEF_MEMBER_DESC      = YES
+REPEAT_BRIEF           = YES
+ALWAYS_DETAILED_SEC    = NO
+INLINE_INHERITED_MEMB  = NO
+FULL_PATH_NAMES        = NO
+STRIP_FROM_PATH        = 
+INTERNAL_DOCS          = NO
+STRIP_CODE_COMMENTS    = YES
+CASE_SENSE_NAMES       = YES
+SHORT_NAMES            = NO
+HIDE_SCOPE_NAMES       = NO
+VERBATIM_HEADERS       = YES
+SHOW_INCLUDE_FILES     = YES
+JAVADOC_AUTOBRIEF      = YES
+MULTILINE_CPP_IS_BRIEF = NO
+DETAILS_AT_TOP         = NO
+INHERIT_DOCS           = YES
+INLINE_INFO            = YES
+SORT_MEMBER_DOCS       = YES
+DISTRIBUTE_GROUP_DOC   = NO
+TAB_SIZE               = 8
+GENERATE_TODOLIST      = YES
+GENERATE_TESTLIST      = YES
+GENERATE_BUGLIST       = YES
+GENERATE_DEPRECATEDLIST= YES
+ALIASES                = "memory=\par Memory management:\n"
+ENABLED_SECTIONS       = 
+MAX_INITIALIZER_LINES  = 30
+OPTIMIZE_OUTPUT_FOR_C  = YES
+OPTIMIZE_OUTPUT_JAVA   = NO
+SHOW_USED_FILES        = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+QUIET                  = NO
+WARNINGS               = YES
+WARN_IF_UNDOCUMENTED   = YES
+WARN_FORMAT            = "$file:$line: $text"
+WARN_LOGFILE           = @PROJECT_BINARY_DIR@/doc/doxygen.warnings.log
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+INPUT                  = @PROJECT_SOURCE_DIR@/src/core/CLucene
+INPUT                  += @PROJECT_SOURCE_DIR@/src/shared/CLucene
+FILE_PATTERNS          = *.h
+RECURSIVE              = YES
+#EXCLUDE                = mem.h bufferedstream.h fileinputstream.h stringreader.h Misc.h LuceneThreads.h jstreamconfig.h
+EXCLUDE_SYMLINKS       = NO
+EXCLUDE_PATTERNS       = "**/config/**" \
+                         "**/.svn/**" \
+                         "**/debug/**" \
+                         "_*.h"
+EXAMPLE_PATH           = 
+EXAMPLE_PATTERNS       = 
+EXAMPLE_RECURSIVE      = NO
+IMAGE_PATH             = 
+INPUT_FILTER           = 
+FILTER_SOURCE_FILES    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+SOURCE_BROWSER         = NO
+INLINE_SOURCES         = NO
+REFERENCED_BY_RELATION = YES
+REFERENCES_RELATION    = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+ALPHABETICAL_INDEX     = NO
+COLS_IN_ALPHA_INDEX    = 5
+IGNORE_PREFIX          = 
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+GENERATE_HTML          = @CLDOCS_HTML@
+HTML_OUTPUT            = html
+HTML_FILE_EXTENSION    = .html
+HTML_HEADER            = @PROJECT_BINARY_DIR@/doc/helpheader.htm
+HTML_FOOTER            = @PROJECT_BINARY_DIR@/doc/helpfooter.htm
+HTML_STYLESHEET        = 
+HTML_ALIGN_MEMBERS     = YES
+
+GENERATE_HTMLHELP      = @CLDOCS_HTML_HELP@
+CHM_FILE               = ../clucene.chm
+HHC_LOCATION           = @HTML_HELP_COMPILER_EX@
+GENERATE_CHI           = YES
+BINARY_TOC             = YES
+TOC_EXPAND             = NO
+DISABLE_INDEX          = NO
+ENUM_VALUES_PER_LINE   = 4
+GENERATE_TREEVIEW      = NO
+TREEVIEW_WIDTH         = 250
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+GENERATE_LATEX         = @CLDOCS_LATEX@
+LATEX_OUTPUT           = latex
+LATEX_CMD_NAME         = @LATEX_COMPILER@
+MAKEINDEX_CMD_NAME     = makeindex
+COMPACT_LATEX          = NO
+PAPER_TYPE             = a4wide
+EXTRA_PACKAGES         = 
+LATEX_HEADER           = 
+PDF_HYPERLINKS         = NO
+USE_PDFLATEX           = NO
+LATEX_BATCHMODE        = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+GENERATE_RTF           = @CLDOCS_RTF@
+RTF_OUTPUT             = rtf
+COMPACT_RTF            = NO
+RTF_HYPERLINKS         = NO
+RTF_STYLESHEET_FILE    = 
+RTF_EXTENSIONS_FILE    = 
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+GENERATE_MAN           = @CLDOCS_MAN@
+MAN_OUTPUT             = man
+MAN_EXTENSION          = .3
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+GENERATE_XML           = @CLDOCS_XML@
+XML_SCHEMA             = 
+XML_DTD                = 
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor   
+#---------------------------------------------------------------------------
+
+ENABLE_PREPROCESSING   = YES
+MACRO_EXPANSION        = YES
+EXPAND_ONLY_PREDEF     = NO
+SEARCH_INCLUDES        = YES
+INCLUDE_PATH           = @PROJECT_SOURCE_DIR@/src/core
+INCLUDE_PATH           += @PROJECT_SOURCE_DIR@/src/shared
+INCLUDE_PATH           += @PROJECT_BINARY_DIR@/src/shared
+INCLUDE_FILE_PATTERNS  = 
+
+PREDEFINED             = "_MSC_VER=1400"
+PREDEFINED             += "WIN32"
+PREDEFINED             += "_CL_DISABLE_MULTITHREADING"
+PREDEFINED             += "_CL_DEPRECATED(x)="
+
+#namespaces
+PREDEFINED             += "CL_NS(sub)=lucene::sub"
+PREDEFINED             += "CL_NS2(sub,sub2)=lucene::sub::sub2"
+PREDEFINED             += "CL_NS_DEF(sub)=namespace lucene{ namespace sub{"
+PREDEFINED             += "CL_NS_DEF2(sub,sub2)=namespace lucene{ namespace sub{ namespace sub2 {"
+PREDEFINED             += "CL_NS_END=}}"
+PREDEFINED             += "CL_NS_END2=}}}"
+PREDEFINED             += "CL_NS_USE(sub)=using namespace lucene::sub"
+PREDEFINED             += "CL_NS_USE2(sub,sub2)=using namespace lucene::sub::sub2"
+PREDEFINED             += "CL_NS_STD(func)=std::func"
+PREDEFINED             += "CL_NS_HASHING(func)=std::func"
+
+EXPAND_AS_DEFINED      = 
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references   
+#---------------------------------------------------------------------------
+
+TAGFILES               = 
+GENERATE_TAGFILE       = @CLDOCS_TAGFILE_LOCATION@
+ALLEXTERNALS           = NO
+EXTERNAL_GROUPS        = YES
+PERL_PATH              = @PERL_EXECUTABLE@
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool   
+#---------------------------------------------------------------------------
+
+CLASS_DIAGRAMS         = YES
+HIDE_UNDOC_RELATIONS   = YES
+HAVE_DOT               = @HAVE_DOT@
+CLASS_GRAPH            = YES
+COLLABORATION_GRAPH    = YES
+TEMPLATE_RELATIONS     = YES
+INCLUDE_GRAPH          = YES
+INCLUDED_BY_GRAPH      = YES
+GRAPHICAL_HIERARCHY    = YES
+DOT_IMAGE_FORMAT       = png
+DOT_PATH               = @DOXYGEN_DOT_EXECUTABLE@
+DOTFILE_DIRS           = 
+GENERATE_LEGEND        = YES
+DOT_CLEANUP            = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to the search engine   
+#---------------------------------------------------------------------------
+
+SEARCHENGINE           = NO
diff --git a/doc/coding standards.txt b/doc/coding standards.txt
new file mode 100644
index 0000000..fc4a649
--- /dev/null
+++ b/doc/coding standards.txt	
@@ -0,0 +1,113 @@
+Coding Style
+------------
+
+CLucene follows a hybrid coding style. Because the code is a
+Java port, there is naturally some Java-like syntax.
+
+* Never use CL_NS_USE(x) in a header file (use CL_NS(x):: for each class); it defeats the purpose of namespaces.
+* Use CL_NS_USE(x) in .cpp files if there are more than a few usages of that namespace.
+
+
+Headers:
+* _headername.h headers are private, and will not be distributed. Don't include these files from public headers.
+* The shared library is not distributed, except for: SharedHeader.h and clucene-config.h
+* Keep #ifdef's in public headers to an absolute minimum.
+* Public headers should contain only classes that are exported (class CLUCENE_EXPORT classname).
+* All classes should have a destructor; the destructor should be virtual if
+  there is any chance of the class being subclassed.
+
+Documentation:
+Although CLucene documentation is not complete, it would be nice to see documentation created for new files.
+We use doxygen to create documentation. The two basic documentation formats are:
+
+/** documentation must have ** to be included */
+void function();
+void function2(); //< can also document functions retrospectively by adding <
+
+
+/** 
+* You can also document memory handling with the special @memory alias.
+* @memory you must delete data returned from this function using _CLDELETE 
+*/
+Object createObject();
+
+
+Cross platform specifics:
+* Use the macros provided in the shared project. This applies to both data types and functions.
+* static const int32_t x=1; should be written as LUCENE_STATIC_CONSTANT(int32_t, x=1), otherwise it is not portable.
+* Static objects should not be initialised in the class header ( class x{ static Object a; }; ). This does not work
+  everywhere. Instead, use a getter:
+  
+  class x{ 
+    static Object* a; 
+  public:
+    static Object* getA(); 
+    static void CLUCENE_LOCAL _shutdown();
+  }; 
+  
+  Then in the implementation code
+  
+  Object* x::a = NULL;
+  Object* x::getA(){
+    if ( a == NULL )
+        x::a = _CLNEW Object;
+    return a;
+  }
+  void x::_shutdown(){
+    _CLDELETE(a);
+  }
+  
+  In CLucene/StdHeader.cpp, add a call to x::_shutdown() to the _lucene_shutdown function.
+  
+* This is bad:
+
+    class x{
+        LUCENE_STATIC_CONSTANT(int32_t, x=1)
+        void func( int32_t value=x);
+    };
+  
+  This will fail on some platforms. It is better to give the parameter a sentinel
+  default such as int32_t value=-1 (pick a logical sentinel; it need not be -1)
+  and, in the implementation, substitute the static constant when the sentinel is
+  seen, as shown in the sketch below.
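+  A minimal sketch of that pattern (the constant is renamed DEFAULT_VALUE here
+  purely for illustration):
+
+    class x{
+        LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_VALUE=1);
+        void func( int32_t value=-1 );  //-1 is the sentinel
+    };
+
+    void x::func( int32_t value ){
+        if ( value == -1 )
+            value = DEFAULT_VALUE;  //default to the static constant here
+        //...
+    }
+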
+* Try to use the functions in util/Array.h instead of the Void Map/List functions. Void Map/List will be deprecated for public access.
+* Most compilers don't complain about this in normal mode, but we require the code to compile cleanly in pedantic mode. Some important points:
+  1. Initialise member variables in the same order as they are declared in the class:
+     class x{
+        int a;
+        int b;
+        x():
+            b(12),
+            a(11)   //THIS IS WRONG! a is declared first, so it is initialised first; list a before b.
+        {
+        }
+     };
+
+Good development tips
+---------------------
+When developing, use the available CLucene debugging tools:
+* _CND_DEBUG - condition debugging, an 'assert' type system (or configure with --enable-cnddebug)
+
+Good performance tips:
+----------------------
+CLucene has a lot of new changes to help improve performance.
+Some of them are still being tuned...
+
+MSVC profiling tutorial:
+http://webserver.tc.cornell.edu/services/edu/topics/Performance/Profiling/more.asp
+
+For GCC, see gprof;
+you can enable gprof support by configuring with ENABLE_GPROF.
+
+Developing
+----------
+When developing, please keep in mind cross-platform issues and also
+character set issues (unicode/ascii).
+
+Hint:
+To quickly check whether the code compiles,
+run this command from the root directory of CLucene.
+It will compile all the CLucene code monolithically.
+
+    % test-pedantic
+    
+This will do a quick compile and then run all the CLucene tests.
diff --git a/doc/doxygen.css.cmake b/doc/doxygen.css.cmake
new file mode 100644
index 0000000..b9fae69
--- /dev/null
+++ b/doc/doxygen.css.cmake
@@ -0,0 +1,163 @@
+H1 {
+        text-align: center;
+        font-family: Arial, Helvetica, sans-serif;
+}
+H2 {
+        font-family: Geneva, Arial, Helvetica, sans-serif;
+}
+CAPTION { font-weight: bold }
+DIV.qindex { width: 100%;
+             background-color: #eeeeff;
+             border: 4px solid #eeeeff;
+             text-align: center;
+             margin-bottom: 2px
+}
+A.qindex { text-decoration: none; font-weight: bold; }
+A.qindex:hover { text-decoration: none; background-color: #ddddff }
+A.qindexHL { text-decoration: none; font-weight: bold;
+             background-color: #6666cc;
+             color: #ffffff
+           }
+A.qindexHL:hover { text-decoration: none; background-color: #6666cc }
+A.qindexRef { text-decoration: none; font-weight: bold; }
+A.qindexRef:hover { text-decoration: none; background-color: #ddddff }
+A.qindexRefHL { text-decoration: none; font-weight: bold;
+             background-color: #6666cc;
+             color: #ffffff
+           }
+A.qindexRefHL:hover { text-decoration: none; background-color: #6666cc }
+A.el { text-decoration: none; font-weight: bold }
+A.elRef { font-weight: bold }
+A.code { text-decoration: none; font-weight: normal; color: #4444ee }
+A.codeRef { font-weight: normal; color: #4444ee }
+A:hover { text-decoration: none; background-color: #f2f2ff }
+DL.el { margin-left: -1cm }
+DIV.fragment {
+        width: 98%;
+        border: 1px solid #CCCCCC;
+        background-color: #f5f5f5;
+        padding-left: 4px;
+        margin: 4px;
+}
+DIV.ah { background-color: black; font-weight: bold; color: #ffffff; 
+margin-bottom: 3px; margin-top: 3px }
+TD.md { background-color: #f2f2ff; font-weight: bold; }
+TD.mdname1 { background-color: #f2f2ff; font-weight: bold; color: #602020; }
+TD.mdname { background-color: #f2f2ff; font-weight: bold; color: #602020; 
+width: 600px; }
+DIV.groupHeader { margin-left: 16px; margin-top: 12px; margin-bottom: 6px; 
+font-weight: bold }
+DIV.groupText { margin-left: 16px; font-style: italic; font-size: smaller }
+BODY {
+        background: white;
+        color: black;
+        margin-right: 20px;
+        margin-left: 20px;
+}
+TD.indexkey { 
+   background-color: #eeeeff; 
+   font-weight: bold; 
+   padding-right  : 10px; 
+   padding-top    : 2px; 
+   padding-left   : 10px; 
+   padding-bottom : 2px; 
+   margin-left    : 0px; 
+   margin-right   : 0px; 
+   margin-top     : 2px; 
+   margin-bottom  : 2px  
+}
+TD.indexvalue { 
+   background-color: #eeeeff; 
+   font-style: italic; 
+   padding-right  : 10px; 
+   padding-top    : 2px; 
+   padding-left   : 10px; 
+   padding-bottom : 2px; 
+   margin-left    : 0px; 
+   margin-right   : 0px; 
+   margin-top     : 2px; 
+   margin-bottom  : 2px  
+}
+TR.memlist {
+   background-color: #f0f0f0; 
+}
+P.formulaDsp { text-align: center; }
+IMG.formulaDsp { }
+IMG.formulaInl { vertical-align: middle; }
+SPAN.keyword       { color: #008000 }
+SPAN.keywordtype   { color: #604020 }
+SPAN.keywordflow   { color: #e08000 }
+SPAN.comment       { color: #800000 }
+SPAN.preprocessor  { color: #806020 }
+SPAN.stringliteral { color: #002080 }
+SPAN.charliteral   { color: #008080 }
+.mdTable {
+        border: 1px solid #868686;
+        background-color: #f2f2ff;
+}
+.mdRow {
+        padding: 8px 20px;
+}
+.mdescLeft {
+        font-size: smaller;
+        font-family: Arial, Helvetica, sans-serif;
+        background-color: #FAFAFA;
+        padding-left: 8px;
+        border-top: 1px none #E0E0E0;
+        border-right: 1px none #E0E0E0;
+        border-bottom: 1px none #E0E0E0;
+        border-left: 1px none #E0E0E0;
+        margin: 0px;
+}
+.mdescRight {
+        font-size: smaller;
+        font-family: Arial, Helvetica, sans-serif;
+        font-style: italic;
+        background-color: #FAFAFA;
+        padding-left: 4px;
+        border-top: 1px none #E0E0E0;
+        border-right: 1px none #E0E0E0;
+        border-bottom: 1px none #E0E0E0;
+        border-left: 1px none #E0E0E0;
+        margin: 0px;
+        padding-bottom: 0px;
+        padding-right: 8px;
+}
+.memItemLeft {
+        padding: 1px 0px 0px 8px;
+        margin: 4px;
+        border-top-width: 1px;
+        border-right-width: 1px;
+        border-bottom-width: 1px;
+        border-left-width: 1px;
+        border-top-style: solid;
+        border-top-color: #E0E0E0;
+        border-right-color: #E0E0E0;
+        border-bottom-color: #E0E0E0;
+        border-left-color: #E0E0E0;
+        border-right-style: none;
+        border-bottom-style: none;
+        border-left-style: none;
+        background-color: #FAFAFA;
+        font-family: Geneva, Arial, Helvetica, sans-serif;
+        font-size: 12px;
+}
+.memItemRight {
+        padding: 1px 0px 0px 8px;
+        margin: 4px;
+        border-top-width: 1px;
+        border-right-width: 1px;
+        border-bottom-width: 1px;
+        border-left-width: 1px;
+        border-top-style: solid;
+        border-top-color: #E0E0E0;
+        border-right-color: #E0E0E0;
+        border-bottom-color: #E0E0E0;
+        border-left-color: #E0E0E0;
+        border-right-style: none;
+        border-bottom-style: none;
+        border-left-style: none;
+        background-color: #FAFAFA;
+        font-family: Geneva, Arial, Helvetica, sans-serif;
+        font-size: 13px;
+}
diff --git a/doc/helpfooter.htm.cmake b/doc/helpfooter.htm.cmake
new file mode 100644
index 0000000..d12f5bb
--- /dev/null
+++ b/doc/helpfooter.htm.cmake
@@ -0,0 +1,4 @@
+<HR>
+<p><a href=http://clucene.sourceforge.net><i>clucene.sourceforge.net</i></a></p>
+</BODY>
+</HTML>
\ No newline at end of file
diff --git a/doc/helpheader.htm.cmake b/doc/helpheader.htm.cmake
new file mode 100644
index 0000000..f75de0e
--- /dev/null
+++ b/doc/helpheader.htm.cmake
@@ -0,0 +1,24 @@
+<HTML>
+<HEAD>
+<TITLE>
+CLucene API Documentation (Version @CPACK_PACKAGE_VERSION@)
+</TITLE>
+
+<LINK HREF="doxygen.css" REL="stylesheet" TYPE="text/css">
+<META NAME="keywords" CONTENT="CLucene API Documentation (Version @CPACK_PACKAGE_VERSION@)">
+
+<SCRIPT type="text/javascript">
+function windowTitle()
+{
+    parent.document.title="CLucene API Documentation (Version @CPACK_PACKAGE_VERSION@)";
+}
+</SCRIPT>
+
+</HEAD>
+
+<BODY BGCOLOR="white" onload="windowTitle();">
+
+<P align=center><B>CLucene - a full-featured, c++ search engine</b><BR>
+API Documentation
+</P>
+<hr>
diff --git a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
new file mode 100644
index 0000000..13294de
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
@@ -0,0 +1,69 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "CLucene/_ApiHeader.h"
+
+#include "LanguageBasedAnalyzer.h"
+#include "CLucene/analysis/Analyzers.h"
+#include "CLucene/analysis/cjk/CJKAnalyzer.h"
+#include "CLucene/analysis/jieba/ChineseTokenizer.h"
+#include "CLucene/analysis/standard/StandardFilter.h"
+#include "CLucene/analysis/standard/StandardTokenizer.h"
+#include "CLucene/snowball/SnowballFilter.h"
+
+CL_NS_USE(util)
+CL_NS_USE2(analysis, cjk)
+CL_NS_USE2(analysis, jieba)
+CL_NS_USE2(analysis, standard)
+CL_NS_USE2(analysis, snowball)
+
+CL_NS_DEF(analysis)
+
+LanguageBasedAnalyzer::LanguageBasedAnalyzer(const TCHAR *language, bool stem) {
+    if (language == NULL)
+        _tcsncpy(lang, LUCENE_BLANK_STRING, 100);
+    else
+        _tcsncpy(lang, language, 100);
+    this->stem = stem;
+}
+LanguageBasedAnalyzer::~LanguageBasedAnalyzer() {
+}
+void LanguageBasedAnalyzer::setLanguage(const TCHAR *language) {
+    _tcsncpy(lang, language, 100);
+}
+void LanguageBasedAnalyzer::setStem(bool stem) {
+    this->stem = stem;
+}
+TokenStream *LanguageBasedAnalyzer::tokenStream(const TCHAR *fieldName, Reader *reader) {
+    TokenStream *ret = NULL;
+    if (_tcscmp(lang, _T("cjk")) == 0) {
+        ret = _CLNEW CL_NS2(analysis, cjk)::CJKTokenizer(reader);
+    } else if (_tcscmp(lang, _T("chinese")) == 0) {
+        ret = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader);
+    } else {
+        BufferedReader *bufferedReader = reader->__asBufferedReader();
+        if (bufferedReader == NULL)
+            ret = _CLNEW StandardTokenizer(_CLNEW FilteredBufferedReader(reader, false), true);
+        else
+            ret = _CLNEW StandardTokenizer(bufferedReader);
+
+        ret = _CLNEW StandardFilter(ret, true);
+
+        if (stem)
+            ret = _CLNEW SnowballFilter(ret, lang, true);//todo: should check whether snowball supports the language
+
+        if (stem)                                         //hmm... this could be configured seperately from stem
+            ret = _CLNEW ISOLatin1AccentFilter(ret, true);//todo: this should really only be applied to latin languages...
+
+        //lower case after the latin1 filter
+        ret = _CLNEW LowerCaseFilter(ret, true);
+    }
+    //todo: could add a stop filter based on the language - need to fix the stoplist loader first
+
+    return ret;
+}
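+//Illustrative usage sketch (not compiled): "chinese" selects the jieba-based
+//ChineseTokenizer above; construction of the reader is elided here.
+//
+//    LanguageBasedAnalyzer analyzer(_T("chinese"), false);
+//    TokenStream *ts = analyzer.tokenStream(_T("content"), reader);
+//    Token t;
+//    while (ts->next(&t) != NULL) {
+//        //consume the token
+//    }
+//    _CLDELETE(ts);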
+
+CL_NS_END
diff --git a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.h b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.h
new file mode 100644
index 0000000..596c86b
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.h
@@ -0,0 +1,26 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_analysis_languagebasedanalyzer_
+#define _lucene_analysis_languagebasedanalyzer_
+
+#include "CLucene/analysis/AnalysisHeader.h"
+
+CL_NS_DEF(analysis)
+
+class CLUCENE_CONTRIBS_EXPORT LanguageBasedAnalyzer: public CL_NS(analysis)::Analyzer{
+	TCHAR lang[100];
+	bool stem;
+public:
+	LanguageBasedAnalyzer(const TCHAR* language=NULL, bool stem=true);
+	~LanguageBasedAnalyzer();
+	void setLanguage(const TCHAR* language);
+	void setStem(bool stem);
+	TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
+  };
+
+CL_NS_END
+#endif
diff --git a/src/contribs-lib/CLucene/analysis/PorterStemmer.cpp b/src/contribs-lib/CLucene/analysis/PorterStemmer.cpp
new file mode 100644
index 0000000..d5ec3a6
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/PorterStemmer.cpp
@@ -0,0 +1,313 @@
+/* This is the Porter stemming algorithm, originally written by Martin Porter.
+   It may be regarded as canonical, in that it follows the
+   algorithm presented in
+
+   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+   no. 3, pp 130-137,
+
+   See also http://www.tartarus.org/~martin/PorterStemmer
+
+   Modified by "Hemant Muthiyan"
+   email: hemant_muthiyan@yahoo.co.in
+
+*/
+
+#include "CLucene/_ApiHeader.h"
+#include "PorterStemmer.h"
+
+CL_NS_DEF(analysis)
+
+	bool PorterStemmer::cons(size_t i) {
+		switch (b[i]) {
+			case 'a': case 'e': case 'i': case 'o': case 'u':
+			return false;
+			case 'y':
+			return (i==k0) ? true : !cons(i-1);
+			default:
+			return true;
+		}
+	}
+
+   int32_t PorterStemmer::m() {
+    int32_t n = 0;
+    size_t i = k0;
+    while(true) {
+      if (i > j)
+        return n;
+      if (! cons(i))
+        break;
+      i++;
+    }
+    i++;
+    while(true) {
+      while(true) {
+        if (i > j)
+          return n;
+        if (cons(i))
+          break;
+        i++;
+      }
+      i++;
+      n++;
+      while(true) {
+        if (i > j)
+          return n;
+        if (! cons(i))
+          break;
+        i++;
+      }
+      i++;
+    }
+  }
+
+   bool PorterStemmer::vowelinstem() {
+    for (size_t i = k0; i <= j; i++)
+      if (! cons(i))
+        return true;
+    return false;
+  }
+
+  /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
+   bool PorterStemmer::doublec(size_t j) {
+    if (j < k0+1)
+      return false;
+    if (b[j] != b[j-1])
+      return false;
+    return cons(j);
+  }
+
+  /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
+     and also if the second c is not w,x or y. this is used when trying to
+     restore an e at the end of a short word. e.g.
+
+          cav(e), lov(e), hop(e), crim(e), but
+          snow, box, tray.
+
+  */
+   bool PorterStemmer::cvc(size_t i) {
+    if (i < k0+2 || !cons(i) || cons(i-1) || !cons(i-2))
+      return false;
+    else {
+      int32_t ch = b[i];
+      if (ch == 'w' || ch == 'x' || ch == 'y') return false;
+    }
+    return true;
+  }
+
+  bool PorterStemmer::ends(const TCHAR *s) {
+    size_t l = _tcslen(s);
+    // check the length first: k, l and k0 are unsigned, so the old
+    // "k-l+1 < k0" test could never catch a suffix longer than the word
+    if (l + k0 > k + 1)
+      return false;
+    size_t o = k-l+1;
+    for (size_t i = 0; i < l; i++)
+      if (b[o+i] != s[i])
+        return false;
+    j = (l > k) ? 0 : k-l;
+    return true;
+  }
+
+  void PorterStemmer::setto(const TCHAR *s) {
+    size_t l = _tcslen(s);
+    size_t o = j+1;
+    for (size_t i = 0; i < l; i++)
+      b[o+i] = s[i];
+    k = j+l;
+    dirty = true;
+  }
+
+  void PorterStemmer::r(const TCHAR *s) { 
+	  if (m() > 0) setto(s); 
+  }
+
+  void PorterStemmer::step1() {
+    if (b[k] == _T('s')) {
+      if (ends(_T("sses"))) k -= 2;
+      else if (ends(_T("ies"))) setto(_T("i"));
+      else if (b[k-1] != _T('s')) k--;
+    }
+    if (ends(_T("eed"))) {
+      if (m() > 0)
+        k--;
+    }
+    else if ((ends(_T("ed")) || ends(_T("ing"))) && vowelinstem()) {
+      k = j;
+      if (ends(_T("at"))) setto(_T("ate"));
+      else if (ends(_T("bl"))) setto(_T("ble"));
+      else if (ends(_T("iz"))) setto(_T("ize"));
+      else if (doublec(k)) {
+        int32_t ch = b[k--];
+        if (ch == _T('l') || ch == _T('s') || ch == _T('z'))
+          k++;
+      }
+      else if (m() == 1 && cvc(k))
+        setto(_T("e"));
+    }
+  }
+
+  void PorterStemmer::step2() {
+    if (ends(_T("y")) && vowelinstem()) {
+      b[k] = 'i';
+      dirty = true;
+    }
+  }
+
+  void PorterStemmer::step3() {
+    if (k == k0) return; /* For Bug 1 */
+    switch (b[k-1]) {
+    case 'a':
+      if (ends(_T("ational"))) { r(_T("ate")); break; }
+      if (ends(_T("tional"))) { r(_T("tion")); break; }
+      break;
+    case 'c':
+      if (ends(_T("enci"))) { r(_T("ence")); break; }
+      if (ends(_T("anci"))) { r(_T("ance")); break; }
+      break;
+    case 'e':
+      if (ends(_T("izer"))) { r(_T("ize")); break; }
+      break;
+    case 'l':
+      if (ends(_T("bli"))) { r(_T("ble")); break; }
+      if (ends(_T("alli"))) { r(_T("al")); break; }
+      if (ends(_T("entli"))) { r(_T("ent")); break; }
+      if (ends(_T("eli"))) { r(_T("e")); break; }
+      if (ends(_T("ousli"))) { r(_T("ous")); break; }
+      break;
+    case 'o':
+      if (ends(_T("ization"))) { r(_T("ize")); break; }
+      if (ends(_T("ation"))) { r(_T("ate")); break; }
+      if (ends(_T("ator"))) { r(_T("ate")); break; }
+      break;
+    case 's':
+      if (ends(_T("alism"))) { r(_T("al")); break; }
+      if (ends(_T("iveness"))) { r(_T("ive")); break; }
+      if (ends(_T("fulness"))) { r(_T("ful")); break; }
+      if (ends(_T("ousness"))) { r(_T("ous")); break; }
+      break;
+    case 't':
+      if (ends(_T("aliti"))) { r(_T("al")); break; }
+      if (ends(_T("iviti"))) { r(_T("ive")); break; }
+      if (ends(_T("biliti"))) { r(_T("ble")); break; }
+      break;
+    case 'g':
+      if (ends(_T("logi"))) { r(_T("log")); break; }
+    }
+  }
+
+  void PorterStemmer::step4() {
+    switch (b[k]) {
+    case 'e':
+      if (ends(_T("icate"))) { r(_T("ic")); break; }
+      if (ends(_T("ative"))) { r(LUCENE_BLANK_STRING); break; }
+      if (ends(_T("alize"))) { r(_T("al")); break; }
+      break;
+    case 'i':
+      if (ends(_T("iciti"))) { r(_T("ic")); break; }
+      break;
+    case 'l':
+      if (ends(_T("ical"))) { r(_T("ic")); break; }
+      if (ends(_T("ful"))) { r(LUCENE_BLANK_STRING); break; }
+      break;
+    case 's':
+      if (ends(_T("ness"))) { r(LUCENE_BLANK_STRING); break; }
+      break;
+    }
+  }
+
+  void PorterStemmer::step5() {
+    if (k == k0) return; /* for Bug 1 */
+    switch (b[k-1]) {
+    case 'a':
+      if (ends(_T("al"))) break;
+      return;
+    case 'c':
+      if (ends(_T("ance"))) break;
+      if (ends(_T("ence"))) break;
+      return;
+    case 'e':
+      if (ends(_T("er"))) break; return;
+    case 'i':
+      if (ends(_T("ic"))) break; return;
+    case 'l':
+      if (ends(_T("able"))) break;
+      if (ends(_T("ible"))) break; return;
+    case 'n':
+      if (ends(_T("ant"))) break;
+      if (ends(_T("ement"))) break;
+      if (ends(_T("ment"))) break;
+      /* element etc. not stripped before the m */
+      if (ends(_T("ent"))) break;
+      return;
+    case 'o':
+      if (ends(_T("ion")) && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
+      /* j >= 0 fixes Bug 2 */
+      if (ends(_T("ou"))) break;
+      return;
+      /* takes care of -ous */
+    case 's':
+      if (ends(_T("ism"))) break;
+      return;
+    case 't':
+      if (ends(_T("ate"))) break;
+      if (ends(_T("iti"))) break;
+      return;
+    case 'u':
+      if (ends(_T("ous"))) break;
+      return;
+    case 'v':
+      if (ends(_T("ive"))) break;
+      return;
+    case 'z':
+      if (ends(_T("ize"))) break;
+      return;
+    default:
+      return;
+    }
+    if (m() > 1)
+      k = j;
+  }
+
+  void PorterStemmer::step6() {
+    j = k;
+    if (b[k] == 'e') {
+      int32_t a = m();
+      if (a > 1 || a == 1 && !cvc(k-1))
+        k--;
+    }
+    if (b[k] == 'l' && doublec(k) && m() > 1)
+      k--;
+  }
+
+
+ 	PorterStemmer::PorterStemmer(TCHAR *Text) {
+     b = Text;
+     i = _tcslen(b);
+ 	dirty = false;
+   }
+
+   PorterStemmer::~PorterStemmer(){
+ 		b = NULL;
+ 	}
+
+
+   int32_t PorterStemmer::getResultLength() { return i; }
+
+	 bool PorterStemmer::stem() {
+    //i = strlen(b);
+		 k = i -1;
+    k0 = 0;
+    if (k > k0+1) {
+      step1(); step2(); step3(); step4(); step5(); step6();
+    }
+    // Also, a word is considered dirty if we lopped off letters
+    // Thanks to Ifigenia Vairelles for pointing this out.
+    if (i != k+1)
+      dirty = true;
+    i = k+1;
+    return dirty;
+  }
+
+  const TCHAR* PorterStemmer::getResultBuffer() { 
+   return b; 
+  }
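+  // Illustrative usage sketch (not compiled): the stemmer works in place on
+  // the caller's writable buffer, and the result is not NUL-terminated at the
+  // stemmed length, so getResultLength() must be consulted; consume() is a
+  // placeholder for whatever the caller does with the stem.
+  //
+  //   TCHAR word[] = _T("meetings");
+  //   PorterStemmer stemmer(word);
+  //   if (stemmer.stem())
+  //       consume(stemmer.getResultBuffer(), stemmer.getResultLength());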
+
+CL_NS_END
diff --git a/src/contribs-lib/CLucene/analysis/PorterStemmer.h b/src/contribs-lib/CLucene/analysis/PorterStemmer.h
new file mode 100644
index 0000000..35ff817
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/PorterStemmer.h
@@ -0,0 +1,151 @@
+/* This is the Porter stemming algorithm, originally written by Martin Porter.
+   It may be regarded as canonical, in that it follows the
+   algorithm presented in
+
+   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+   no. 3, pp 130-137,
+
+   See also http://www.tartarus.org/~martin/PorterStemmer
+
+   Modified by "Hemant Muthiyan"
+   email: hemant_muthiyan@yahoo.co.in
+    
+   The Porter stemmer should be regarded as "frozen", that is, strictly defined,
+   and not amenable to further modification. As a stemmer, it is slightly inferior 
+   to the Snowball English or Porter2 stemmer, which derives from it, and which is 
+   subjected to occasional improvements. For practical work, therefore, the new 
+   Snowball stemmer is recommended. The Porter stemmer is appropriate to IR 
+   research work involving stemming where the experiments need to be exactly 
+   repeatable. 
+
+*/
+#ifndef _lucene_analysis_PorterStemmer_
+#define _lucene_analysis_PorterStemmer_
+
+CL_NS_DEF(analysis)
+
+class CLUCENE_CONTRIBS_EXPORT PorterStemmer
+{
+private:
+	TCHAR *b;
+    size_t i,    /* offset into b */
+    j, k, k0;
+	bool dirty;
+    //private static final int32_t EXTRA = 1;
+
+  /* cons(i) is true <=> b[i] is a consonant. */
+
+private:
+	bool cons(size_t i);
+
+  /* m() measures the number of consonant sequences between k0 and j. if c is
+     a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
+     presence,
+
+          <c><v>       gives 0
+          <c>vc<v>     gives 1
+          <c>vcvc<v>   gives 2
+          <c>vcvcvc<v> gives 3
+          ....
+  */
+
+   int32_t m();
+
+  /* vowelinstem() is true <=> k0,...j contains a vowel */
+
+   bool vowelinstem();
+
+  /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
+   bool doublec(size_t j);
+
+  /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
+     and also if the second c is not w,x or y. this is used when trying to
+     restore an e at the end of a short word. e.g.
+
+          cav(e), lov(e), hop(e), crim(e), but
+          snow, box, tray.
+
+  */
+   bool cvc(size_t i);
+
+  bool ends(const TCHAR *s);
+
+  /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
+     k. */
+
+  void setto(const TCHAR *s);
+
+  /* r(s) is used further down. */
+
+  void r(const TCHAR *s);
+
+  /* step1() gets rid of plurals and -ed or -ing. e.g.
+
+           caresses  ->  caress
+           ponies    ->  poni
+           ties      ->  ti
+           caress    ->  caress
+           cats      ->  cat
+
+           feed      ->  feed
+           agreed    ->  agree
+           disabled  ->  disable
+
+           matting   ->  mat
+           mating    ->  mate
+           meeting   ->  meet
+           milling   ->  mill
+           messing   ->  mess
+
+           meetings  ->  meet
+
+  */
+
+  void step1();
+
+  /* step2() turns terminal y to i when there is another vowel in the stem. */
+
+  void step2();
+
+  /* step3() maps double suffices to single ones. so -ization ( = -ize plus
+     -ation) maps to -ize etc. note that the string before the suffix must give
+     m() > 0. */
+
+  void step3();
+
+  /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
+
+  void step4();
+
+  /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
+
+  void step5();
+
+  /* step6() removes a final -e if m() > 1. */
+
+  void step6();
+
+ public:
+
+ 	PorterStemmer(TCHAR *Text);
+   ~PorterStemmer();
+
+
+   /**
+    * Returns the length of the word resulting from the stemming process.
+    */
+   int32_t getResultLength();
+
+	 bool stem();
+
+  /**
+   * Returns a reference to a character buffer containing the results of
+   * the stemming process.  You also need to consult getResultLength()
+   * to determine the length of the result.
+   */
+  const TCHAR* getResultBuffer();
+
+};
+CL_NS_END
+
+#endif
diff --git a/src/contribs-lib/CLucene/analysis/cjk/CJKAnalyzer.cpp b/src/contribs-lib/CLucene/analysis/cjk/CJKAnalyzer.cpp
new file mode 100644
index 0000000..b19ed77
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/cjk/CJKAnalyzer.cpp
@@ -0,0 +1,190 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "CLucene/_ApiHeader.h"
+#include "CJKAnalyzer.h"
+#include "CLucene/util/CLStreams.h"
+
+CL_NS_DEF2(analysis,cjk)
+CL_NS_USE(analysis)
+CL_NS_USE(util)
+
+
+const TCHAR* CJKTokenizer::tokenTypeSingle = _T("single");
+const TCHAR* CJKTokenizer::tokenTypeDouble = _T("double");
+
+CJKTokenizer::CJKTokenizer(Reader* in):
+	Tokenizer(in)
+{
+	tokenType = Token::getDefaultType();
+	offset = 0;
+	bufferIndex = 0;
+	dataLen = 0;
+	preIsTokened = false;
+	ignoreSurrogates = true;
+}
+
+CL_NS(analysis)::Token* CJKTokenizer::next(Token* token){
+    /** how many characters have been stored in the buffer */
+    int32_t length = 0;
+
+    /** the position used to create Token */
+    int32_t start = offset;
+
+    while (true) {
+        /** current character */
+        clunichar c;
+	int charlen = 1;
+
+        offset++;
+
+        if (bufferIndex >= dataLen) {
+            dataLen = input->read((const void**)&ioBuffer, 1, LUCENE_IO_BUFFER_SIZE);
+            bufferIndex = 0;
+        }
+
+        if (dataLen == -1) {
+            if (length > 0) {
+                if (preIsTokened == true) {
+                    length = 0;
+                    preIsTokened = false;
+                }
+
+                break;
+            } else {
+                return NULL;
+            }
+        } else {
+            //get current character
+            c = ioBuffer[bufferIndex++];
+        }
+
+		//to support surrogates, we'll need to convert the incoming utf16 into
+		//ucs4(c variable). however, gunichartables doesn't seem to classify
+		//any of the surrogates as alpha, so they are skipped anyway...
+		//so for now we just convert to ucs4 so that we don't corrupt the input.
+		if ( c >= 0xd800 && c <= 0xdfff ){
+			clunichar c2 = ioBuffer[bufferIndex];
+			if ( c2 >= 0xdc00 && c2 <= 0xdfff ){
+				bufferIndex++;
+				offset++;
+				charlen=2;
+
+				c = (((c & 0x03ffL) << 10) | ((c2 & 0x03ffL) <<  0)) + 0x00010000L;
+			}
+		}
+
+        //if the current character is ASCII or Extend ASCII
+        if ((c <= 0xFF) //is BASIC_LATIN
+            || (c>=0xFF00 && c<=0xFFEF) //ascii >0x74 cast to unsigned...
+           ) {
+            if (c >= 0xFF00) {
+				//todo: test this... only happens on platforms where char is signed, i think...
+                /** convert  HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
+                c -= 0xFEE0;
+            }
+
+            // if the current character is a letter or "_" "+" "#"
+			if (_istalnum(c) || ((c == '_') || (c == '+') || (c == '#')) ) {
+                if (length == 0) {
+                    // "javaC1C2C3C4linux" <br>
+                    //      ^--: the current character begin to token the ASCII
+                    // letter
+                    start = offset - 1;
+                } else if (tokenType == tokenTypeDouble) {
+                    // "javaC1C2C3C4linux" <br>
+                    //              ^--: the previous non-ASCII
+                    // : the current character
+                    offset-=charlen;
+                    bufferIndex-=charlen;
+                    tokenType = tokenTypeSingle;
+
+                    if (preIsTokened == true) {
+                        // only one non-ASCII character has been stored
+                        length = 0;
+                        preIsTokened = false;
+
+                        break;
+                    } else {
+                        break;
+                    }
+                }
+
+                // store the LowerCase(c) in the buffer
+                buffer[length++] = _totlower((TCHAR)c);
+				tokenType = tokenTypeSingle;
+
+                // break the procedure if buffer overflowed!
+                if (length == LUCENE_MAX_WORD_LEN) {
+                    break;
+                }
+            } else if (length > 0) {
+                if (preIsTokened == true) {
+                    length = 0;
+                    preIsTokened = false;
+                } else {
+                    break;
+                }
+            }
+        } else {
+            // non-ASCII letter, eg."C1C2C3C4"
+			if ( _istalpha(c) || (!ignoreSurrogates && c>=0x10000) ) {
+                if (length == 0) {
+                    start = offset - 1;
+                    
+					if ( c < 0x00010000L )
+						buffer[length++] = (TCHAR)c;
+					else{
+						clunichar ucs4 = c - 0x00010000L;
+						buffer[length++] = (TCHAR)((ucs4 >> 10) & 0x3ff) | 0xd800;
+						buffer[length++] = (TCHAR)((ucs4 >>  0) & 0x3ff) | 0xdc00;
+					}
+
+                    tokenType = tokenTypeDouble;
+                } else {
+                    if (tokenType == tokenTypeSingle) {
+                        offset-=charlen;
+                        bufferIndex-=charlen;
+
+                        //return the previous ASCII characters
+                        break;
+                    } else {
+						if ( c < 0x00010000L )
+							buffer[length++] = (TCHAR)c;
+						else{
+							clunichar ucs4 = c - 0x00010000L;
+							buffer[length++] = (TCHAR)((ucs4 >> 10) & 0x3ff) | 0xd800;
+							buffer[length++] = (TCHAR)((ucs4 >>  0) & 0x3ff) | 0xdc00;
+						}
+						tokenType = tokenTypeDouble;
+
+                        if (length >= 2) {
+                            offset-=charlen;
+                            bufferIndex-=charlen;
+                            preIsTokened = true;
+
+                            break;
+                        }
+                    }
+                }
+            } else if (length > 0) {
+                if (preIsTokened == true) {
+                    // empty the buffer
+                    length = 0;
+                    preIsTokened = false;
+                } else {
+                    break;
+                }
+            }
+        }
+    }
+
+	buffer[length]='\0';
+	token->set(buffer,start, start+length, tokenType);
+	return token;
+}
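+// Illustrative usage sketch (not compiled): ASCII runs come out as single
+// lowercased tokens and CJK runs as overlapping bigrams, e.g. "javaC1C2C3"
+// yields "java", "C1C2", "C2C3" (C1..C3 standing for CJK characters).
+//
+//   CL_NS(util)::StringReader reader(_T("..."));
+//   CJKTokenizer tokenizer(&reader);
+//   Token t;
+//   while (tokenizer.next(&t) != NULL) {
+//       //consume the token
+//   }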
+
+CL_NS_END2
diff --git a/src/contribs-lib/CLucene/analysis/cjk/CJKAnalyzer.h b/src/contribs-lib/CLucene/analysis/cjk/CJKAnalyzer.h
new file mode 100644
index 0000000..978ad81
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/cjk/CJKAnalyzer.h
@@ -0,0 +1,94 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_analysis_cjk_cjkanalyzer_
+#define _lucene_analysis_cjk_cjkanalyzer_
+
+#include "CLucene/analysis/AnalysisHeader.h"
+
+CL_NS_DEF2(analysis,cjk)
+
+/**
+ * CJKTokenizer was modified from StopTokenizer, which does a decent job for
+ * most European languages. It tokenizes double-byte characters differently:
+ * a token is returned for every two characters, with overlapping matches.<br>
+ * Example: "java C1C2C3C4" will be segmented into: "java" "C1C2" "C2C3" "C3C4";
+ * a filter is also needed to remove the zero-length token ""<br>
+ * digits, '+' and '#' are tokenized as letters<br>
+ * for more info on Asian language (Chinese, Japanese, Korean) text segmentation,
+ * please search <a
+ * href="http://www.google.com/search?q=word+chinese+segment">google</a>
+ *
+ * @author Che, Dong
+ */
+class CLUCENE_CONTRIBS_EXPORT CJKTokenizer: public CL_NS(analysis)::Tokenizer {
+private:
+	/** word offset, used to indicate which character in the input is being parsed */
+    int32_t offset;
+
+    /** the index used only for ioBuffer */
+    int32_t bufferIndex;
+
+    /** data length */
+    int32_t dataLen;
+
+    /**
+     * character buffer, store the characters which are used to compose <br>
+     * the returned Token
+     */
+    TCHAR buffer[LUCENE_MAX_WORD_LEN];
+
+    /**
+     * I/O buffer, used to store the content of the input(one of the <br>
+     * members of Tokenizer)
+     */
+    const TCHAR* ioBuffer;
+
+    /** word type: single=>ASCII  double=>non-ASCII word=>default */
+    const TCHAR* tokenType;
+
+	static const TCHAR* tokenTypeSingle;
+	static const TCHAR* tokenTypeDouble;
+
+    /**
+     * tag: previous character is a cached double-byte character  "C1C2C3C4"
+     * ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
+     * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
+     */
+    bool preIsTokened;
+
+
+	bool ignoreSurrogates;
+
+public:
+    /**
+     * Construct a token stream processing the given input.
+     *
+     * @param in I/O reader
+     */
+	CJKTokenizer(CL_NS(util)::Reader* in);
+
+	/**
+     * Returns the next token in the stream, or null at EOS.
+     * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
+     * for detail.
+     *
+     * @return Token
+     *
+     * @throws java.io.IOException - an IOException is thrown when a read error <br>
+     *         happens in the InputStream
+     *
+     */
+	CL_NS(analysis)::Token* next(CL_NS(analysis)::Token* token);
+
+	bool getIgnoreSurrogates(){ return ignoreSurrogates; };
+	void setIgnoreSurrogates(bool ignoreSurrogates){ this->ignoreSurrogates = ignoreSurrogates; };
+};
+
+
+
+CL_NS_END2
+#endif
diff --git a/src/contribs-lib/CLucene/analysis/de/GermanAnalyzer.cpp b/src/contribs-lib/CLucene/analysis/de/GermanAnalyzer.cpp
new file mode 100644
index 0000000..b2fe172
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/de/GermanAnalyzer.cpp
@@ -0,0 +1,149 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2010 Ben van Klinken and the CLucene Team
+* 
+* Distributable under the terms of either the Apache License (Version 2.0) or 
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "CLucene/_ApiHeader.h"
+#include "CLucene/util/CLStreams.h"
+#include "CLucene/analysis/Analyzers.h"
+#include "CLucene/analysis/standard/StandardTokenizer.h"
+#include "CLucene/analysis/standard/StandardFilter.h"
+#include "CLucene/util/StringBuffer.h"
+#include "GermanAnalyzer.h"
+#include "GermanStemmer.h"
+#include "GermanStemFilter.h"
+
+CL_NS_USE(analysis)
+CL_NS_USE2(analysis,de)
+CL_NS_USE2(analysis,standard)
+
+  const TCHAR GermanAnalyzer_DASZ[] = { 0x64, 0x61, 0xdf, 0x00 }; // "daß", NUL-terminated
+  const TCHAR GermanAnalyzer_FUER[] = { 0x66, 0xfc, 0x72, 0x00 }; // "für", NUL-terminated
+  const TCHAR* GermanAnalyzer_GERMAN_STOP_WORDS[] = {
+    _T("einer"), _T("eine"), _T("eines"), _T("einem"), _T("einen"),
+    _T("der"), _T("die"), _T("das"), _T("dass"), GermanAnalyzer_DASZ,
+    _T("du"), _T("er"), _T("sie"), _T("es"),
+    _T("was"), _T("wer"), _T("wie"), _T("wir"),
+    _T("und"), _T("oder"), _T("ohne"), _T("mit"),
+    _T("am"), _T("im"),_T("in"), _T("aus"), _T("auf"),
+    _T("ist"), _T("sein"), _T("war"), _T("wird"),
+    _T("ihr"), _T("ihre"), _T("ihres"),
+    _T("als"), GermanAnalyzer_FUER, _T("von"), _T("mit"),
+    _T("dich"), _T("dir"), _T("mich"), _T("mir"),
+    _T("mein"), _T("sein"), _T("kein"),
+    _T("durch"), _T("wegen"), _T("wird")
+  };
+
+  CL_NS(util)::ConstValueArray<const TCHAR*> GermanAnalyzer::GERMAN_STOP_WORDS( GermanAnalyzer_GERMAN_STOP_WORDS, 48 );
+
+  class GermanAnalyzer::SavedStreams : public TokenStream {
+  public:
+      StandardTokenizer* tokenStream;
+      TokenStream* filteredTokenStream;
+
+      SavedStreams():tokenStream(NULL), filteredTokenStream(NULL)
+      {
+      }
+
+      void close(){}
+      Token* next(Token* token) {return NULL;}
+  };
+
+  GermanAnalyzer::GermanAnalyzer() {
+    exclusionSet = NULL;
+    stopSet = _CLNEW CLTCSetList;
+    StopFilter::fillStopTable(stopSet, GERMAN_STOP_WORDS.values);
+  }
+
+  GermanAnalyzer::GermanAnalyzer(const TCHAR** stopwords) {
+    exclusionSet = NULL;
+    stopSet = _CLNEW CLTCSetList;
+    StopFilter::fillStopTable(stopSet, stopwords);
+  }
+
+  GermanAnalyzer::GermanAnalyzer(CL_NS(analysis)::CLTCSetList* stopwords) {
+    exclusionSet = NULL;
+    stopSet = stopwords;
+  }
+
+  GermanAnalyzer::GermanAnalyzer(const char* stopwordsFile, const char* enc) {
+    exclusionSet = NULL;
+    stopSet = WordlistLoader::getWordSet(stopwordsFile, enc);
+  }
+
+  GermanAnalyzer::GermanAnalyzer(CL_NS(util)::Reader* stopwordsReader, const bool deleteReader) {
+    exclusionSet = NULL;
+    stopSet = WordlistLoader::getWordSet(stopwordsReader, NULL, deleteReader);
+  }
+
+  GermanAnalyzer::~GermanAnalyzer() {
+    _CLLDELETE(stopSet);
+    _CLLDELETE(exclusionSet);
+  }
+
+  void GermanAnalyzer::setStemExclusionTable(const TCHAR** exclusionlist) {
+    if (exclusionSet != NULL) {
+      exclusionSet->clear();
+    } else {
+      exclusionSet = _CLNEW CLTCSetList;
+    }
+
+    CL_NS(analysis)::StopFilter::fillStopTable(exclusionSet, exclusionlist);
+  }
+
+  void GermanAnalyzer::setStemExclusionTable(CL_NS(analysis)::CLTCSetList* exclusionlist) {
+    if (exclusionSet != exclusionlist) {
+      _CLLDELETE(exclusionSet);
+      exclusionSet = exclusionlist;
+    }
+  }
+
+  void GermanAnalyzer::setStemExclusionTable(const char* exclusionlistFile, const char* enc) {
+    exclusionSet = WordlistLoader::getWordSet(exclusionlistFile, enc, exclusionSet);
+  }
+
+  void GermanAnalyzer::setStemExclusionTable(CL_NS(util)::Reader* exclusionlistReader, const bool deleteReader) {
+    exclusionSet = WordlistLoader::getWordSet(exclusionlistReader, exclusionSet, deleteReader);
+  }
+
+  TokenStream* GermanAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) {
+    TokenStream* result;
+    CL_NS(util)::BufferedReader* bufferedReader = reader->__asBufferedReader();
+
+    if ( bufferedReader == NULL )
+      result = _CLNEW StandardTokenizer( _CLNEW CL_NS(util)::FilteredBufferedReader(reader, false), true );
+    else
+      result = _CLNEW StandardTokenizer(bufferedReader);
+
+    result = _CLNEW StandardFilter(result, true);
+    result = _CLNEW LowerCaseFilter(result, true);
+    result = _CLNEW StopFilter(result, true, stopSet);
+    result = _CLNEW GermanStemFilter(result, true, exclusionSet);
+
+    return result;
+  }
+
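+  // reusableTokenStream() caches the whole tokenizer/filter chain via
+  // get/setPreviousTokenStream(); later calls only reset the tokenizer on the
+  // new reader instead of rebuilding the four-filter chain.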
+  TokenStream* GermanAnalyzer::reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader)
+  {
+    SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream());
+
+    if (streams == NULL) {
+      streams = _CLNEW SavedStreams();
+      CL_NS(util)::BufferedReader* bufferedReader = reader->__asBufferedReader();
+
+      if ( bufferedReader == NULL )
+        streams->tokenStream = _CLNEW StandardTokenizer( _CLNEW CL_NS(util)::FilteredBufferedReader(reader, false), true );
+      else
+        streams->tokenStream = _CLNEW StandardTokenizer(bufferedReader);
+
+      streams->filteredTokenStream = _CLNEW StandardFilter(streams->tokenStream, true);
+      streams->filteredTokenStream = _CLNEW LowerCaseFilter(streams->filteredTokenStream, true);
+      streams->filteredTokenStream = _CLNEW StopFilter(streams->filteredTokenStream, true, stopSet);
+      streams->filteredTokenStream = _CLNEW GermanStemFilter(streams->filteredTokenStream, true, exclusionSet);
+      setPreviousTokenStream(streams);
+    } else
+      streams->tokenStream->reset(reader);
+
+    return streams->filteredTokenStream;
+  }
diff --git a/src/contribs-lib/CLucene/analysis/de/GermanAnalyzer.h b/src/contribs-lib/CLucene/analysis/de/GermanAnalyzer.h
new file mode 100644
index 0000000..9c02073
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/de/GermanAnalyzer.h
@@ -0,0 +1,108 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2010 Ben van Klinken and the CLucene Team
+* 
+* Distributable under the terms of either the Apache License (Version 2.0) or 
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_analysis_de_GermanAnalyzer
+#define _lucene_analysis_de_GermanAnalyzer
+
+CL_NS_DEF2(analysis,de)
+
+/**
+ * Analyzer for the German language. Supports an external list of stopwords (words that
+ * will not be indexed at all) and an external list of exclusions (words that will
+ * not be stemmed, but indexed).
+ * A default set of stopwords is used unless an alternative list is specified; the
+ * exclusion list is empty by default.
+ *
+ * 
+ * @version $Id: GermanAnalyzer.java 564236 2007-08-09 15:21:19Z gsingers $
+ */
+class CLUCENE_CONTRIBS_EXPORT GermanAnalyzer : public CL_NS(analysis)::Analyzer {
+public:
+
+  /**
+   * List of typical german stopwords.
+   */
+  static CL_NS(util)::ConstValueArray<const TCHAR*> GERMAN_STOP_WORDS;
+
+private:
+
+  class SavedStreams;
+
+  /**
+   * Contains the stopwords used with the StopFilter.
+   */
+  CL_NS(analysis)::CLTCSetList* stopSet;
+
+  /**
+   * Contains words that should be indexed but not stemmed.
+   */
+  CL_NS(analysis)::CLTCSetList* exclusionSet;
+
+public:
+
+  /**
+   * Builds an analyzer with the default stop words
+   * (<code>GERMAN_STOP_WORDS</code>).
+   */
+  GermanAnalyzer();
+
+  /**
+   * Builds an analyzer with the given stop words.
+   */
+  GermanAnalyzer(const TCHAR** stopWords);
+
+  /**
+   * Builds an analyzer with the given stop words.
+   */
+  GermanAnalyzer(CL_NS(analysis)::CLTCSetList* stopwords);
+
+  /**
+   * Builds an analyzer with the given stop words.
+   */
+  GermanAnalyzer(const char* stopwordsFile, const char* enc = NULL);
+
+  /**
+   * Builds an analyzer with the given stop words.
+   */
+  GermanAnalyzer(CL_NS(util)::Reader* stopwordsReader, const bool deleteReader = false);
+
+  /**
+   */
+  virtual ~GermanAnalyzer();
+
+  /**
+   * Builds an exclusionlist from an array of Strings.
+   */
+  void setStemExclusionTable(const TCHAR** exclusionlist);
+
+  /**
+   * Builds an exclusionlist from a Hashtable.
+   */
+  void setStemExclusionTable(CL_NS(analysis)::CLTCSetList* exclusionlist);
+
+  /**
+   * Builds an exclusionlist from the words contained in the given file.
+   */
+  void setStemExclusionTable(const char* exclusionlistFile, const char* enc = NULL);
+
+  /**
+   * Builds an exclusionlist from the words contained in the given file.
+   */
+  void setStemExclusionTable(CL_NS(util)::Reader* exclusionlistReader, const bool deleteReader = false);
+
+  /**
+   * Creates a TokenStream which tokenizes all the text in the provided Reader.
+   *
+   * @return A TokenStream built from a StandardTokenizer filtered with
+   *         StandardFilter, LowerCaseFilter, StopFilter, GermanStemFilter
+   */
+  virtual TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
+
+  virtual TokenStream* reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
+};
+
+CL_NS_END2
+#endif
diff --git a/src/contribs-lib/CLucene/analysis/de/GermanStemFilter.cpp b/src/contribs-lib/CLucene/analysis/de/GermanStemFilter.cpp
new file mode 100644
index 0000000..87f7fcb
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/de/GermanStemFilter.cpp
@@ -0,0 +1,60 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2010 Ben van Klinken and the CLucene Team
+* 
+* Distributable under the terms of either the Apache License (Version 2.0) or 
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "CLucene/_ApiHeader.h"
+#include "CLucene/analysis/AnalysisHeader.h"
+#include "CLucene/util/StringBuffer.h"
+#include "GermanStemmer.h"
+#include "GermanStemFilter.h"
+
+CL_NS_USE(analysis)
+CL_NS_USE2(analysis,de)
+
+    GermanStemFilter::GermanStemFilter(TokenStream* in, bool deleteTS) :
+      TokenFilter(in, deleteTS)
+    {
+      stemmer = _CLNEW GermanStemmer();
+      exclusionSet = NULL;
+    }
+
+    GermanStemFilter::GermanStemFilter(TokenStream* in, bool deleteTS, CLTCSetList* exclusionSet) :
+      TokenFilter(in, deleteTS)
+    {
+      stemmer = _CLNEW GermanStemmer();
+      this->exclusionSet = exclusionSet;
+    }
+
+    Token* GermanStemFilter::next(Token* t) {
+      if (input->next(t) == NULL) {
+        return NULL;
+      } else if (exclusionSet != NULL && exclusionSet->find(t->termBuffer<TCHAR>()) != exclusionSet->end()) { // Check the exclusion table
+        return t;
+      } else {
+        TCHAR* s = stemmer->stem(t->termBuffer<TCHAR>(), t->termLength<TCHAR>());
+        // If not stemmed, don't waste time creating a new token
+        if (_tcscmp(s, t->termBuffer<TCHAR>()) != 0) {
+          t->setText(s, _tcslen(s));
+        }
+        return t;
+      }
+    }
+
+    void GermanStemFilter::setStemmer(GermanStemmer* stemmer) {
+      if (stemmer != NULL && this->stemmer != stemmer) {
+        _CLLDELETE(this->stemmer);
+        this->stemmer = stemmer;
+      }
+    }
+
+    /**
+     * Set an alternative exclusion list for this filter.
+     */
+    void GermanStemFilter::setExclusionSet(CLTCSetList* exclusionSet) {
+      if (this->exclusionSet != exclusionSet) {
+        _CLLDELETE(this->exclusionSet); // delete the old set, not the incoming one
+        this->exclusionSet = exclusionSet;
+      }
+    }
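+
+    // Illustrative usage sketch (not compiled): the filter is normally created
+    // by GermanAnalyzer::tokenStream(), but it can wrap any TokenStream:
+    //
+    //   TokenStream *stemmed = _CLNEW GermanStemFilter(tokenizer, true);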
diff --git a/src/contribs-lib/CLucene/analysis/de/GermanStemFilter.h b/src/contribs-lib/CLucene/analysis/de/GermanStemFilter.h
new file mode 100644
index 0000000..ab53668
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/de/GermanStemFilter.h
@@ -0,0 +1,54 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2010 Ben van Klinken and the CLucene Team
+* 
+* Distributable under the terms of either the Apache License (Version 2.0) or 
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_analysis_de_GermanStemFilter
+#define _lucene_analysis_de_GermanStemFilter
+
+CL_NS_DEF2(analysis,de)
+
+/**
+ * A filter that stems German words. It supports a table of words that should
+ * not be stemmed at all. The stemmer used can be changed at runtime after the
+ * filter object is created (as long as it is a GermanStemmer).
+ */
+class CLUCENE_CONTRIBS_EXPORT GermanStemFilter : public CL_NS(analysis)::TokenFilter
+{
+private:
+
+    /**
+     * The actual token in the input stream.
+     */
+    CL_NS(analysis)::Token* token;
+    GermanStemmer* stemmer;
+    CL_NS(analysis)::CLTCSetList* exclusionSet;
+
+public:
+
+    GermanStemFilter(TokenStream* in, bool deleteTS = false);
+
+    /**
+     * Builds a GermanStemFilter that uses an exclusiontable.
+     */
+    GermanStemFilter(TokenStream* in, bool deleteTS, CL_NS(analysis)::CLTCSetList* exclusionSet);
+
+    /**
+     * @return  Returns the next token in the stream, or null at EOS
+     */
+    virtual Token* next(Token* t);
+
+    /**
+     * Set an alternative/custom GermanStemmer for this filter.
+     */
+    void setStemmer(GermanStemmer* stemmer);
+
+    /**
+     * Set an alternative exclusion list for this filter.
+     */
+   void setExclusionSet(CL_NS(analysis)::CLTCSetList* exclusionSet);
+};
+
+CL_NS_END2
+#endif
diff --git a/src/contribs-lib/CLucene/analysis/de/GermanStemmer.cpp b/src/contribs-lib/CLucene/analysis/de/GermanStemmer.cpp
new file mode 100644
index 0000000..e31d6f8
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/de/GermanStemmer.cpp
@@ -0,0 +1,213 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2010 Ben van Klinken and the CLucene Team
+* 
+* Distributable under the terms of either the Apache License (Version 2.0) or 
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include "CLucene/_ApiHeader.h"
+#include "CLucene/util/StringBuffer.h"
+#include "GermanStemmer.h"
+
+CL_NS_USE(util)
+CL_NS_USE2(analysis,de)
+
+    GermanStemmer::GermanStemmer() :
+      sb() {
+    }
+
+    TCHAR* GermanStemmer::stem(const TCHAR* term, size_t length) {
+      if (length < 0) {
+        length = _tcslen(term);
+      }
+
+      // Reset the StringBuffer.
+      sb.clear();
+      sb.append(term, length);
+
+      if (!isStemmable(sb.getBuffer(), sb.length()))
+        return sb.giveBuffer();
+
+      // Stemming starts here...
+      substitute(sb);
+      strip(sb);
+      optimize(sb);
+      resubstitute(sb);
+      removeParticleDenotion(sb);
+
+      return sb.giveBuffer();
+    }
+
+    bool GermanStemmer::isStemmable(const TCHAR* term, size_t length) const {
+      if (length < 0) {
+        length = _tcslen(term);
+      }
+      for (size_t c = 0; c < length; c++) {
+        if (_istalpha(term[c]) == 0)
+          return false;
+      }
+      return true;
+    }
+
+    void GermanStemmer::strip(StringBuffer& buffer)
+    {
+      bool doMore = true;
+      while ( doMore && buffer.length() > 3 ) {
+        if ( ( buffer.length() + substCount > 5 ) &&
+          buffer.substringEquals( buffer.length() - 2, buffer.length(), _T("nd"), 2 ) )
+        {
+          buffer.deleteChars( buffer.length() - 2, buffer.length() );
+        }
+        else if ( ( buffer.length() + substCount > 4 ) &&
+          buffer.substringEquals( buffer.length() - 2, buffer.length(), _T("em"), 2 ) ) {
+            buffer.deleteChars( buffer.length() - 2, buffer.length() );
+        }
+        else if ( ( buffer.length() + substCount > 4 ) &&
+          buffer.substringEquals( buffer.length() - 2, buffer.length(), _T("er"), 2 ) ) {
+            buffer.deleteChars( buffer.length() - 2, buffer.length() );
+        }
+        else if ( buffer.charAt( buffer.length() - 1 ) == _T('e') ) {
+          buffer.deleteCharAt( buffer.length() - 1 );
+        }
+        else if ( buffer.charAt( buffer.length() - 1 ) == _T('s') ) {
+          buffer.deleteCharAt( buffer.length() - 1 );
+        }
+        else if ( buffer.charAt( buffer.length() - 1 ) == _T('n') ) {
+          buffer.deleteCharAt( buffer.length() - 1 );
+        }
+        // "t" occurs only as suffix of verbs.
+        else if ( buffer.charAt( buffer.length() - 1 ) == _T('t') ) {
+          buffer.deleteCharAt( buffer.length() - 1 );
+        }
+        else {
+          doMore = false;
+        }
+      }
+    }
+
+    void GermanStemmer::optimize(StringBuffer& buffer) {
+      // Additional step for female plurals of professions and inhabitants.
+      if ( buffer.length() > 5 && buffer.substringEquals( buffer.length() - 5, buffer.length(), _T("erin*"), 5 ) ) {
+        buffer.deleteCharAt( buffer.length() -1 );
+        strip( buffer );
+      }
+      // Additional step for irregular plural nouns like "Matrizen -> Matrix".
+      if ( buffer.charAt( buffer.length() - 1 ) == ( _T('z') ) ) {
+        buffer.setCharAt( buffer.length() - 1, _T('x') );
+      }
+    }
+
+    void GermanStemmer::removeParticleDenotion(StringBuffer& buffer) {
+      if ( buffer.length() > 4 ) {
+        for ( size_t c = 0; c < buffer.length() - 3; c++ ) {
+          if ( buffer.substringEquals( c, c + 4, _T("gege"), 4 ) ) {
+            buffer.deleteChars( c, c + 2 );
+            return;
+          }
+        }
+      }
+    }
+
+    void GermanStemmer::substitute(StringBuffer& buffer) {
+      substCount = 0;
+
+      for ( size_t i = 0; i < buffer.length(); i++ ) {
+#ifdef _UCS2
+        TCHAR c = buffer.charAt(i);
+#else
+        unsigned char c = buffer.charAt(i);
+#endif
+        // Replace the second char of a pair of equal characters with an asterisk.
+        if ( i > 0 && c == buffer.charAt ( i - 1 )  ) {
+          buffer.setCharAt( i, _T('*') );
+        }
+        // Substitute Umlauts.
+        else if ( c  == 0xe4 ) {
+          buffer.setCharAt( i, _T('a') );
+        }
+        else if ( c == 0xf6 ) {
+          buffer.setCharAt( i, _T('o') );
+        }
+        else if ( c == 0xfc ) {
+          buffer.setCharAt( i, _T('u') );
+        }
+        // Fix bug so that 'ß' at the end of a word is replaced.
+        else if ( c == 0xdf ) {
+            buffer.setCharAt( i, _T('s') );
+            buffer.insert( i + 1, _T('s') );
+            substCount++;
+        }
+        // Make sure at least one character remains after the current one.
+        if ( i < buffer.length() - 1 ) {
+          // Mask several common character combinations with a single token character.
+          if ( ( i < buffer.length() - 2 ) && c == _T('s') &&
+            buffer.charAt( i + 1 ) == _T('c') && buffer.charAt( i + 2 ) == _T('h') )
+          {
+            buffer.setCharAt( i, _T('$') );
+            buffer.deleteChars( i + 1, i + 3 );
+            substCount += 2;
+          }
+          else if ( c == _T('c') && buffer.charAt( i + 1 ) == _T('h') ) {
+            buffer.setCharAt( i, 0xa7 ); // section sign in UTF-16
+            buffer.deleteCharAt( i + 1 );
+            substCount++;
+          }
+          else if ( c == _T('e') && buffer.charAt( i + 1 ) == _T('i') ) {
+            buffer.setCharAt( i, _T('%') );
+            buffer.deleteCharAt( i + 1 );
+            substCount++;
+          }
+          else if ( c == _T('i') && buffer.charAt( i + 1 ) == _T('e') ) {
+            buffer.setCharAt( i, _T('&') );
+            buffer.deleteCharAt( i + 1 );
+            substCount++;
+          }
+          else if ( c == _T('i') && buffer.charAt( i + 1 ) == _T('g') ) {
+            buffer.setCharAt( i, _T('#') );
+            buffer.deleteCharAt( i + 1 );
+            substCount++;
+          }
+          else if ( c == _T('s') && buffer.charAt( i + 1 ) == _T('t') ) {
+            buffer.setCharAt( i, _T('!') );
+            buffer.deleteCharAt( i + 1 );
+            substCount++;
+          }
+        }
+      }
+    }
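+    // Illustrative round trip (with the corrected substCount += 2 above):
+    //   substitute("schein")  -> "$%n"    ('sch' -> '$', 'ei' -> '%')
+    //   resubstitute("$%n")   -> "schein"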
+
+    void GermanStemmer::resubstitute(StringBuffer& buffer) {
+      for ( size_t i = 0; i < buffer.length(); i++ ) {
+#ifdef _UCS2
+        TCHAR c = buffer.charAt(i);
+#else
+        unsigned char c = buffer.charAt(i);
+#endif
+        if ( c == _T('*') ) {
+          buffer.setCharAt( i, buffer.charAt( i - 1 ) );
+        }
+        else if ( c == _T('$') ) {
+          buffer.setCharAt( i, _T('s') );
+          buffer.insert( i + 1, _T("ch"), 2 );
+        }
+        else if ( c == 0xa7 ) { // section sign '§' (U+00A7)
+          buffer.setCharAt( i, _T('c') );
+          buffer.insert( i + 1, _T('h') );
+        }
+        else if ( c == _T('%') ) {
+          buffer.setCharAt( i, _T('e') );
+          buffer.insert( i + 1, _T('i') );
+        }
+        else if ( c == _T('&') ) {
+          buffer.setCharAt( i, _T('i') );
+          buffer.insert( i + 1, _T('e') );
+        }
+        else if ( c == _T('#') ) {
+          buffer.setCharAt( i, _T('i') );
+          buffer.insert( i + 1, _T('g') );
+        }
+        else if ( c == _T('!') ) {
+          buffer.setCharAt( i, _T('s') );
+          buffer.insert( i + 1, _T('t') );
+        }
+      }
+    }
diff --git a/src/contribs-lib/CLucene/analysis/de/GermanStemmer.h b/src/contribs-lib/CLucene/analysis/de/GermanStemmer.h
new file mode 100644
index 0000000..5743e94
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/de/GermanStemmer.h
@@ -0,0 +1,98 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2010 Ben van Klinken and the CLucene Team
+* 
+* Distributable under the terms of either the Apache License (Version 2.0) or 
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_analysis_de_GermanStemmer
+#define _lucene_analysis_de_GermanStemmer
+
+CL_CLASS_DEF(util,StringBuffer)
+
+CL_NS_DEF2(analysis,de)
+
+/**
+ * A stemmer for German words. The algorithm is based on the report
+ * "A Fast and Simple Stemming Algorithm for German Words" by J&ouml;rg
+ * Caumanns (joerg.caumanns at isst.fhg.de).
+ */
+class CLUCENE_CONTRIBS_EXPORT GermanStemmer
+{
+private:
+
+    /**
+     * Buffer for the terms while stemming them.
+     */
+    CL_NS(util)::StringBuffer sb;
+
+    /**
+     * Number of characters that are removed by <tt>substitute()</tt> while stemming.
+     */
+    int substCount;
+
+public:
+
+    /**
+     * Creates a new GermanStemmer.
+     */
+    GermanStemmer();
+
+    /**
+     * Stems the given term to a unique <tt>discriminator</tt>.
+     *
+     * @param term  The term that should be stemmed.
+     * @return      Discriminator for <tt>term</tt>
+     */
+    TCHAR* stem(const TCHAR* term, size_t length = -1);
+
+private:
+
+    /**
+     * Checks if a term could be stemmed.
+     *
+     * @return  true if, and only if, the given term consists only of letters.
+     */
+    bool isStemmable(const TCHAR* term, size_t length = -1) const;
+
+    /**
+     * Performs suffix stripping (stemming) on the current term. The stripping is
+     * reduced to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and
+     * "nd", from which all regular suffixes are built. The simplification causes
+     * some overstemming and many more irregular stems, but still provides unique
+     * discriminators in most of those cases.
+     * The algorithm is context free, except for the length restrictions.
+     */
+     void strip(CL_NS(util)::StringBuffer& buffer);
+
+    /**
+     * Performs some optimizations on the term. These optimizations are
+     * contextual.
+     */
+    void optimize(CL_NS(util)::StringBuffer& buffer);
+
+    /**
+     * Removes a particle denotation ("ge") from a term.
+     */
+    void removeParticleDenotion(CL_NS(util)::StringBuffer& buffer);
+
+    /**
+     * Do some substitutions for the term to reduce overstemming:
+     *
+     * - Substitute Umlauts with their corresponding vowel: äöü -> aou,
+     *   "ß" is substituted by "ss"
+     * - Substitute a second char of a pair of equal characters with
+     *   an asterisk: ?? -> ?*
+     * - Substitute some common character combinations with a token:
+     *   sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
+     */
+    void substitute(CL_NS(util)::StringBuffer& buffer);
+
+    /**
+     * Undoes the changes made by substitute(), i.e. the masked character pairs
+     * and character combinations. Umlauts remain as their corresponding vowel,
+     * just as "ß" remains as "ss".
+     */
+    void resubstitute(CL_NS(util)::StringBuffer& buffer);
+};
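+
+// A minimal usage sketch (illustrative; ownership of the returned buffer is
+// assumed to follow the usual CLucene conventions):
+//
+//   GermanStemmer stemmer;
+//   TCHAR* discriminator = stemmer.stem(_T("Lehrerinnen"));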
+
+CL_NS_END2
+#endif
diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
new file mode 100644
index 0000000..02c50aa
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
@@ -0,0 +1,48 @@
+#include "CLucene/_ApiHeader.h"
+#include "ChineseTokenizer.h"
+#include "CLucene/util/CLStreams.h"
+#include <filesystem>
+#include <cstdlib>
+#include <vector>
+namespace fs = std::filesystem;
+
+CL_NS_DEF2(analysis,jieba)
+CL_NS_USE(analysis)
+CL_NS_USE(util)
+
+std::string get_dict_path() {
+    if(const char* env_p = std::getenv("DICT_PATH")) {
+        return env_p;
+    }
+    return "";
+}
+
+static std::unique_ptr<cppjieba::Jieba> g_jieba = std::make_unique<cppjieba::Jieba>(
+        get_dict_path() + "dict/jieba.dict.utf8",
+        get_dict_path() + "dict/hmm_model.utf8",
+        get_dict_path() + "dict/user.dict.utf8",
+        get_dict_path() + "dict/idf.utf8",
+        get_dict_path() + "dict/stop_words.utf8");
+
+CL_NS(analysis)::Token* ChineseTokenizer::next(lucene::analysis::Token* token) {
+    // try to read all words
+    if (dataLen == 0) {
+        auto bufferLen = input->read((const void **) &ioBuffer, 1, 0);
+        if (bufferLen == -1) {
+            dataLen = 0;
+            return NULL;
+        }
+        // use a zero-initialized heap buffer instead of a variable-length
+        // array (non-standard C++); the extra byte guarantees NUL termination
+        std::vector<char> tmp_buffer(4 * bufferLen + 1, 0);
+        lucene_wcsntoutf8(tmp_buffer.data(), ioBuffer, bufferLen, 4 * bufferLen);
+        g_jieba->Cut(tmp_buffer.data(), tokens_text, true);
+        dataLen = tokens_text.size();
+    }
+    if (bufferIndex < dataLen) {
+        auto token_text = tokens_text[bufferIndex];
+        bufferIndex++;
+        lucene_utf8towcs(buffer, token_text.c_str(), LUCENE_MAX_WORD_LEN);
+        auto length = _tcslen(buffer);
+        token->set(buffer, 0, length);
+        return token;
+    }
+    return NULL;
+}
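+
+// Note: next() reads the whole input on the first call, segments it once with
+// Jieba (mix mode, HMM enabled), and then replays the resulting words one
+// Token per call until the list is exhausted.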
+CL_NS_END2
\ No newline at end of file
diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
new file mode 100644
index 0000000..cecdd17
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
@@ -0,0 +1,54 @@
+#ifndef _lucene_analysis_jiebatokenizer_
+#define _lucene_analysis_jiebatokenizer_
+
+#include <CLucene.h>
+
+#include <memory>
+#include "Jieba.hpp"
+
+#include "CLucene/analysis/AnalysisHeader.h"
+
+CL_NS_DEF2(analysis,jieba)
+
+class ChineseTokenizer : public lucene::analysis::Tokenizer {
+private:
+    /** word offset, used to indicate which character in the input is being parsed */
+    int32_t offset{};
+
+    /** the index used only for ioBuffer */
+    int32_t bufferIndex{};
+
+    /** data length */
+    int32_t dataLen{};
+
+    /**
+     * character buffer, stores the characters which are used to compose
+     * the returned Token
+     */
+    TCHAR buffer[LUCENE_MAX_WORD_LEN]{};
+
+    /**
+     * I/O buffer, used to store the content of the input (one of the
+     * members of Tokenizer)
+     */
+    const TCHAR* ioBuffer{};
+    std::vector<std::string> tokens_text;
+    std::vector<std::unique_ptr<Token>> tokens;
+
+public:
+    // Constructor
+    explicit ChineseTokenizer(lucene::util::Reader *reader) : Tokenizer(reader) {
+        buffer[0] = 0;
+    }
+
+    // Destructor
+    ~ChineseTokenizer() override {}
+
+    // Override the next method to tokenize Chinese text using Jieba
+    lucene::analysis::Token* next(lucene::analysis::Token* token) override;
+};
+
+CL_NS_END2
+#endif
\ No newline at end of file
diff --git a/src/contribs-lib/CLucene/analysis/jieba/DictTrie.hpp b/src/contribs-lib/CLucene/analysis/jieba/DictTrie.hpp
new file mode 100644
index 0000000..1d2ad3f
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/jieba/DictTrie.hpp
@@ -0,0 +1,286 @@
+#ifndef CPPJIEBA_DICT_TRIE_HPP
+#define CPPJIEBA_DICT_TRIE_HPP
+
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <string>
+#include <cstring>
+#include <cstdlib>
+#include <stdint.h>
+#include <cmath>
+#include <limits>
+#include "StringUtil.hpp"
+#include "Logging.hpp"
+#include "Unicode.hpp"
+#include "Trie.hpp"
+
+namespace cppjieba {
+
+using namespace limonp;
+
+const double MIN_DOUBLE = -3.14e+100;
+const double MAX_DOUBLE = 3.14e+100;
+const size_t DICT_COLUMN_NUM = 3;
+const char* const UNKNOWN_TAG = "";
+
+class DictTrie {
+ public:
+  enum UserWordWeightOption {
+    WordWeightMin,
+    WordWeightMedian,
+    WordWeightMax,
+  }; // enum UserWordWeightOption
+
+  DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
+    Init(dict_path, user_dict_paths, user_word_weight_opt);
+  }
+
+  ~DictTrie() {
+    delete trie_;
+  }
+
+  bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
+    DictUnit node_info;
+    if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
+      return false;
+    }
+    active_node_infos_.push_back(node_info);
+    trie_->InsertNode(node_info.word, &active_node_infos_.back());
+    return true;
+  }
+
+  bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
+    DictUnit node_info;
+    double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ;
+    if (!MakeNodeInfo(node_info, word, weight , tag)) {
+      return false;
+    }
+    active_node_infos_.push_back(node_info);
+    trie_->InsertNode(node_info.word, &active_node_infos_.back());
+    return true;
+  }
+
+  bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
+    DictUnit node_info;
+    if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
+      return false;
+    }
+    trie_->DeleteNode(node_info.word, &node_info);
+    return true;
+  }
+  
+  const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
+    return trie_->Find(begin, end);
+  }
+
+  void Find(RuneStrArray::const_iterator begin, 
+        RuneStrArray::const_iterator end, 
+        vector<struct Dag>&res,
+        size_t max_word_len = MAX_WORD_LENGTH) const {
+    trie_->Find(begin, end, res, max_word_len);
+  }
+
+  bool Find(const string& word)
+  {
+    RuneStrArray runes;
+    if (!DecodeRunesInString(word, runes))
+    {
+      XLOG(ERROR) << "Decode failed.";
+    }
+    return Find(runes.begin(), runes.end()) != NULL;
+  }
+
+  bool IsUserDictSingleChineseWord(const Rune& word) const {
+    return IsIn(user_dict_single_chinese_word_, word);
+  }
+
+  double GetMinWeight() const {
+    return min_weight_;
+  }
+
+  void InserUserDictNode(const string& line) {
+    vector<string> buf;
+    DictUnit node_info;
+    Split(line, buf, " ");
+    if (buf.size() == 1) {
+      MakeNodeInfo(node_info,
+            buf[0],
+            user_word_default_weight_,
+            UNKNOWN_TAG);
+    } else if (buf.size() == 2) {
+      MakeNodeInfo(node_info,
+            buf[0],
+            user_word_default_weight_,
+            buf[1]);
+    } else if (buf.size() == 3) {
+      int freq = atoi(buf[1].c_str());
+      assert(freq_sum_ > 0.0);
+      double weight = log(1.0 * freq / freq_sum_);
+      MakeNodeInfo(node_info, buf[0], weight, buf[2]);
+    }
+    static_node_infos_.push_back(node_info);
+    if (node_info.word.size() == 1) {
+      user_dict_single_chinese_word_.insert(node_info.word[0]);
+    }
+  }
+  
+  void LoadUserDict(const vector<string>& buf) {
+    for (size_t i = 0; i < buf.size(); i++) {
+      InserUserDictNode(buf[i]);
+    }
+  }
+
+  void LoadUserDict(const set<string>& buf) {
+    std::set<string>::const_iterator iter;
+    for (iter = buf.begin(); iter != buf.end(); iter++){
+      InserUserDictNode(*iter);
+    }
+  }
+
+  void LoadUserDict(const string& filePaths) {
+    vector<string> files = limonp::Split(filePaths, "|;");
+    size_t lineno = 0;
+    for (size_t i = 0; i < files.size(); i++) {
+      ifstream ifs(files[i].c_str());
+      XCHECK(ifs.is_open()) << "open " << files[i] << " failed"; 
+      string line;
+      
+      for (; getline(ifs, line); lineno++) {
+        if (line.size() == 0) {
+          continue;
+        }
+        InserUserDictNode(line);
+      }
+    }
+  }
+
+
+ private:
+  void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
+    LoadDict(dict_path);
+    freq_sum_ = CalcFreqSum(static_node_infos_);
+    CalculateWeight(static_node_infos_, freq_sum_);
+    SetStaticWordWeights(user_word_weight_opt);
+
+    if (user_dict_paths.size()) {
+      LoadUserDict(user_dict_paths);
+    }
+    Shrink(static_node_infos_);
+    CreateTrie(static_node_infos_);
+  }
+  
+  void CreateTrie(const vector<DictUnit>& dictUnits) {
+    assert(dictUnits.size());
+    vector<Unicode> words;
+    vector<const DictUnit*> valuePointers;
+    for (size_t i = 0 ; i < dictUnits.size(); i ++) {
+      words.push_back(dictUnits[i].word);
+      valuePointers.push_back(&dictUnits[i]);
+    }
+
+    trie_ = new Trie(words, valuePointers);
+  }
+
+  bool MakeNodeInfo(DictUnit& node_info,
+        const string& word, 
+        double weight, 
+        const string& tag) {
+    if (!DecodeRunesInString(word, node_info.word)) {
+      XLOG(ERROR) << "Decode " << word << " failed.";
+      return false;
+    }
+    node_info.weight = weight;
+    node_info.tag = tag;
+    return true;
+  }
+
+  void LoadDict(const string& filePath) {
+    ifstream ifs(filePath.c_str());
+    XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
+    string line;
+    vector<string> buf;
+
+    DictUnit node_info;
+    for (size_t lineno = 0; getline(ifs, line); lineno++) {
+      Split(line, buf, " ");
+      XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
+      MakeNodeInfo(node_info, 
+            buf[0], 
+            atof(buf[1].c_str()), 
+            buf[2]);
+      static_node_infos_.push_back(node_info);
+    }
+  }
+
+  static bool WeightCompare(const DictUnit& lhs, const DictUnit& rhs) {
+    return lhs.weight < rhs.weight;
+  }
+
+  void SetStaticWordWeights(UserWordWeightOption option) {
+    XCHECK(!static_node_infos_.empty());
+    vector<DictUnit> x = static_node_infos_;
+    sort(x.begin(), x.end(), WeightCompare);
+    min_weight_ = x[0].weight;
+    max_weight_ = x[x.size() - 1].weight;
+    median_weight_ = x[x.size() / 2].weight;
+    switch (option) {
+     case WordWeightMin:
+       user_word_default_weight_ = min_weight_;
+       break;
+     case WordWeightMedian:
+       user_word_default_weight_ = median_weight_;
+       break;
+     default:
+       user_word_default_weight_ = max_weight_;
+       break;
+    }
+  }
+
+  double CalcFreqSum(const vector<DictUnit>& node_infos) const {
+    double sum = 0.0;
+    for (size_t i = 0; i < node_infos.size(); i++) {
+      sum += node_infos[i].weight;
+    }
+    return sum;
+  }
+
+  void CalculateWeight(vector<DictUnit>& node_infos, double sum) const {
+    assert(sum > 0.0);
+    for (size_t i = 0; i < node_infos.size(); i++) {
+      DictUnit& node_info = node_infos[i];
+      assert(node_info.weight > 0.0);
+      node_info.weight = log(double(node_info.weight)/sum);
+    }
+  }
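+  // Weights start out as raw frequencies and are converted here to
+  // log-probabilities: weight = log(freq / freq_sum). Illustrative arithmetic:
+  // freq = 2 with freq_sum = 100 gives log(0.02) ~= -3.912.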
+
+  void Shrink(vector<DictUnit>& units) const {
+    vector<DictUnit>(units.begin(), units.end()).swap(units);
+  }
+
+  vector<DictUnit> static_node_infos_;
+  deque<DictUnit> active_node_infos_; // must not be vector
+  Trie * trie_;
+
+  double freq_sum_;
+  double min_weight_;
+  double max_weight_;
+  double median_weight_;
+  double user_word_default_weight_;
+  unordered_set<Rune> user_dict_single_chinese_word_;
+};
+}
+
+#endif
diff --git a/src/contribs-lib/CLucene/analysis/jieba/FullSegment.hpp b/src/contribs-lib/CLucene/analysis/jieba/FullSegment.hpp
new file mode 100644
index 0000000..8ae715f
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/jieba/FullSegment.hpp
@@ -0,0 +1,93 @@
+#ifndef CPPJIEBA_FULLSEGMENT_H
+#define CPPJIEBA_FULLSEGMENT_H
+
+#include <algorithm>
+#include <set>
+#include <cassert>
+#include "Logging.hpp"
+#include "DictTrie.hpp"
+#include "SegmentBase.hpp"
+#include "Unicode.hpp"
+
+namespace cppjieba {
+class FullSegment: public SegmentBase {
+ public:
+  FullSegment(const string& dictPath) {
+    dictTrie_ = new DictTrie(dictPath);
+    isNeedDestroy_ = true;
+  }
+  FullSegment(const DictTrie* dictTrie)
+    : dictTrie_(dictTrie), isNeedDestroy_(false) {
+    assert(dictTrie_);
+  }
+  ~FullSegment() {
+    if (isNeedDestroy_) {
+      delete dictTrie_;
+    }
+  }
+  void Cut(const string& sentence, 
+        vector<string>& words) const {
+    vector<Word> tmp;
+    Cut(sentence, tmp);
+    GetStringsFromWords(tmp, words);
+  }
+  void Cut(const string& sentence, 
+        vector<Word>& words) const {
+    PreFilter pre_filter(symbols_, sentence);
+    PreFilter::Range range;
+    vector<WordRange> wrs;
+    wrs.reserve(sentence.size()/2);
+    while (pre_filter.HasNext()) {
+      range = pre_filter.Next();
+      Cut(range.begin, range.end, wrs);
+    }
+    words.clear();
+    words.reserve(wrs.size());
+    GetWordsFromWordRanges(sentence, wrs, words);
+  }
+  void Cut(RuneStrArray::const_iterator begin, 
+        RuneStrArray::const_iterator end, 
+        vector<WordRange>& res) const {
+    // result of searching in trie tree
+    LocalVector<pair<size_t, const DictUnit*> > tRes;
+
+    // max index of res's words
+    size_t maxIdx = 0;
+
+    // always equals to (uItr - begin)
+    size_t uIdx = 0;
+
+    // tmp variables
+    size_t wordLen = 0;
+    assert(dictTrie_);
+    vector<struct Dag> dags;
+    dictTrie_->Find(begin, end, dags);
+    for (size_t i = 0; i < dags.size(); i++) {
+      for (size_t j = 0; j < dags[i].nexts.size(); j++) {
+        size_t nextoffset = dags[i].nexts[j].first;
+        assert(nextoffset < dags.size());
+        const DictUnit* du = dags[i].nexts[j].second;
+        if (du == NULL) {
+          if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) {
+            WordRange wr(begin + i, begin + nextoffset);
+            res.push_back(wr);
+          }
+        } else {
+          wordLen = du->word.size();
+          if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
+            WordRange wr(begin + i, begin + nextoffset);
+            res.push_back(wr);
+          }
+        }
+        maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;
+      }
+      uIdx++;
+    }
+  }
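+  // Full mode emits every dictionary word of length >= 2 found in the DAG; a
+  // single rune is emitted only when it has no other candidate and is not
+  // already covered by a longer word (tracked via maxIdx).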
+ private:
+  const DictTrie* dictTrie_;
+  bool isNeedDestroy_;
+};
+}
+
+#endif
diff --git a/src/contribs-lib/CLucene/analysis/jieba/HMMModel.hpp b/src/contribs-lib/CLucene/analysis/jieba/HMMModel.hpp
new file mode 100644
index 0000000..e0d4feb
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/jieba/HMMModel.hpp
@@ -0,0 +1,129 @@
+#ifndef CPPJIEBA_HMMMODEL_H
+#define CPPJIEBA_HMMMODEL_H
+
+#include "StringUtil.hpp"
+#include "Trie.hpp"
+
+namespace cppjieba {
+
+using namespace limonp;
+typedef unordered_map<Rune, double> EmitProbMap;
+
+struct HMMModel {
+  /*
+   * STATUS:
+   * 0: HMMModel::B, 1: HMMModel::E, 2: HMMModel::M, 3:HMMModel::S
+   * */
+  enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
+
+  HMMModel(const string& modelPath) {
+    memset(startProb, 0, sizeof(startProb));
+    memset(transProb, 0, sizeof(transProb));
+    statMap[0] = 'B';
+    statMap[1] = 'E';
+    statMap[2] = 'M';
+    statMap[3] = 'S';
+    emitProbVec.push_back(&emitProbB);
+    emitProbVec.push_back(&emitProbE);
+    emitProbVec.push_back(&emitProbM);
+    emitProbVec.push_back(&emitProbS);
+    LoadModel(modelPath);
+  }
+  ~HMMModel() {
+  }
+  void LoadModel(const string& filePath) {
+    ifstream ifile(filePath.c_str());
+    XCHECK(ifile.is_open()) << "open " << filePath << " failed";
+    string line;
+    vector<string> tmp;
+    vector<string> tmp2;
+    //Load startProb
+    XCHECK(GetLine(ifile, line));
+    Split(line, tmp, " ");
+    XCHECK(tmp.size() == STATUS_SUM);
+    for (size_t j = 0; j < tmp.size(); j++) {
+      startProb[j] = atof(tmp[j].c_str());
+    }
+
+    //Load transProb
+    for (size_t i = 0; i < STATUS_SUM; i++) {
+      XCHECK(GetLine(ifile, line));
+      Split(line, tmp, " ");
+      XCHECK(tmp.size() == STATUS_SUM);
+      for (size_t j = 0; j < STATUS_SUM; j++) {
+        transProb[i][j] = atof(tmp[j].c_str());
+      }
+    }
+
+    //Load emitProbB
+    XCHECK(GetLine(ifile, line));
+    XCHECK(LoadEmitProb(line, emitProbB));
+
+    //Load emitProbE
+    XCHECK(GetLine(ifile, line));
+    XCHECK(LoadEmitProb(line, emitProbE));
+
+    //Load emitProbM
+    XCHECK(GetLine(ifile, line));
+    XCHECK(LoadEmitProb(line, emitProbM));
+
+    //Load emitProbS
+    XCHECK(GetLine(ifile, line));
+    XCHECK(LoadEmitProb(line, emitProbS));
+  }
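+  // Expected model file layout, as read above (blank lines and lines starting
+  // with '#' are skipped by GetLine()):
+  //   line 1    : STATUS_SUM start log-probabilities, space separated
+  //   lines 2-5 : the STATUS_SUM x STATUS_SUM transition matrix, one row per line
+  //   lines 6-9 : emission maps for B, E, M, S as "rune:logprob,rune:logprob,..."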
+  double GetEmitProb(const EmitProbMap* ptMp, Rune key, 
+        double defVal)const {
+    EmitProbMap::const_iterator cit = ptMp->find(key);
+    if (cit == ptMp->end()) {
+      return defVal;
+    }
+    return cit->second;
+  }
+  bool GetLine(ifstream& ifile, string& line) {
+    while (getline(ifile, line)) {
+      Trim(line);
+      if (line.empty()) {
+        continue;
+      }
+      if (StartsWith(line, "#")) {
+        continue;
+      }
+      return true;
+    }
+    return false;
+  }
+  bool LoadEmitProb(const string& line, EmitProbMap& mp) {
+    if (line.empty()) {
+      return false;
+    }
+    vector<string> tmp, tmp2;
+    Unicode unicode;
+    Split(line, tmp, ",");
+    for (size_t i = 0; i < tmp.size(); i++) {
+      Split(tmp[i], tmp2, ":");
+      if (2 != tmp2.size()) {
+        XLOG(ERROR) << "emitProb illegal.";
+        return false;
+      }
+      if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) {
+        XLOG(ERROR) << "TransCode failed.";
+        return false;
+      }
+      mp[unicode[0]] = atof(tmp2[1].c_str());
+    }
+    return true;
+  }
+
+  char statMap[STATUS_SUM];
+  double startProb[STATUS_SUM];
+  double transProb[STATUS_SUM][STATUS_SUM];
+  EmitProbMap emitProbB;
+  EmitProbMap emitProbE;
+  EmitProbMap emitProbM;
+  EmitProbMap emitProbS;
+  vector<EmitProbMap* > emitProbVec;
+}; // struct HMMModel
+
+} // namespace cppjieba
+
+#endif
diff --git a/src/contribs-lib/CLucene/analysis/jieba/HMMSegment.hpp b/src/contribs-lib/CLucene/analysis/jieba/HMMSegment.hpp
new file mode 100644
index 0000000..d515c04
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/jieba/HMMSegment.hpp
@@ -0,0 +1,190 @@
+#ifndef CPPJIBEA_HMMSEGMENT_H
+#define CPPJIBEA_HMMSEGMENT_H
+
+#include <iostream>
+#include <fstream>
+#include <memory.h>
+#include <cassert>
+#include "HMMModel.hpp"
+#include "SegmentBase.hpp"
+
+namespace cppjieba {
+class HMMSegment: public SegmentBase {
+ public:
+  HMMSegment(const string& filePath)
+  : model_(new HMMModel(filePath)), isNeedDestroy_(true) {
+  }
+  HMMSegment(const HMMModel* model) 
+  : model_(model), isNeedDestroy_(false) {
+  }
+  ~HMMSegment() {
+    if (isNeedDestroy_) {
+      delete model_;
+    }
+  }
+
+  void Cut(const string& sentence, 
+        vector<string>& words) const {
+    vector<Word> tmp;
+    Cut(sentence, tmp);
+    GetStringsFromWords(tmp, words);
+  }
+  void Cut(const string& sentence, 
+        vector<Word>& words) const {
+    PreFilter pre_filter(symbols_, sentence);
+    PreFilter::Range range;
+    vector<WordRange> wrs;
+    wrs.reserve(sentence.size()/2);
+    while (pre_filter.HasNext()) {
+      range = pre_filter.Next();
+      Cut(range.begin, range.end, wrs);
+    }
+    words.clear();
+    words.reserve(wrs.size());
+    GetWordsFromWordRanges(sentence, wrs, words);
+  }
+  void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
+    RuneStrArray::const_iterator left = begin;
+    RuneStrArray::const_iterator right = begin;
+    while (right != end) {
+      if (right->rune < 0x80) {
+        if (left != right) {
+          InternalCut(left, right, res);
+        }
+        left = right;
+        do {
+          right = SequentialLetterRule(left, end);
+          if (right != left) {
+            break;
+          }
+          right = NumbersRule(left, end);
+          if (right != left) {
+            break;
+          }
+          right ++;
+        } while (false);
+        WordRange wr(left, right - 1);
+        res.push_back(wr);
+        left = right;
+      } else {
+        right++;
+      }
+    }
+    if (left != right) {
+      InternalCut(left, right, res);
+    }
+  }
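+  // ASCII runs (letter/alphanumeric sequences and digit/dot sequences) are
+  // grouped by the two rules below and emitted directly; only the non-ASCII
+  // stretches in between go through the Viterbi decoder in InternalCut().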
+ private:
+  // sequential letters rule
+  RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
+    Rune x = begin->rune;
+    if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
+      begin ++;
+    } else {
+      return begin;
+    }
+    while (begin != end) {
+      x = begin->rune;
+      if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
+        begin ++;
+      } else {
+        break;
+      }
+    }
+    return begin;
+  }
+  // consecutive digits (and decimal points) rule
+  RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
+    Rune x = begin->rune;
+    if ('0' <= x && x <= '9') {
+      begin ++;
+    } else {
+      return begin;
+    }
+    while (begin != end) {
+      x = begin->rune;
+      if ( ('0' <= x && x <= '9') || x == '.') {
+        begin++;
+      } else {
+        break;
+      }
+    }
+    return begin;
+  }
+  void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
+    vector<size_t> status;
+    Viterbi(begin, end, status);
+
+    RuneStrArray::const_iterator left = begin;
+    RuneStrArray::const_iterator right;
+    for (size_t i = 0; i < status.size(); i++) {
+      if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i])
+        right = begin + i + 1;
+        WordRange wr(left, right - 1);
+        res.push_back(wr);
+        left = right;
+      }
+    }
+  }
+
+  void Viterbi(RuneStrArray::const_iterator begin, 
+        RuneStrArray::const_iterator end, 
+        vector<size_t>& status) const {
+    size_t Y = HMMModel::STATUS_SUM;
+    size_t X = end - begin;
+
+    size_t XYSize = X * Y;
+    size_t now, old, stat;
+    double tmp, endE, endS;
+
+    vector<int> path(XYSize);
+    vector<double> weight(XYSize);
+
+    //start
+    for (size_t y = 0; y < Y; y++) {
+      weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], begin->rune, MIN_DOUBLE);
+      path[0 + y * X] = -1;
+    }
+
+    double emitProb;
+
+    for (size_t x = 1; x < X; x++) {
+      for (size_t y = 0; y < Y; y++) {
+        now = x + y*X;
+        weight[now] = MIN_DOUBLE;
+        path[now] = HMMModel::E; // fallback if no predecessor improves on MIN_DOUBLE
+        emitProb = model_->GetEmitProb(model_->emitProbVec[y], (begin+x)->rune, MIN_DOUBLE);
+        for (size_t preY = 0; preY < Y; preY++) {
+          old = x - 1 + preY * X;
+          tmp = weight[old] + model_->transProb[preY][y] + emitProb;
+          if (tmp > weight[now]) {
+            weight[now] = tmp;
+            path[now] = preY;
+          }
+        }
+      }
+    }
+
+    endE = weight[X-1+HMMModel::E*X];
+    endS = weight[X-1+HMMModel::S*X];
+    stat = 0;
+    if (endE >= endS) {
+      stat = HMMModel::E;
+    } else {
+      stat = HMMModel::S;
+    }
+
+    status.resize(X);
+    for (int x = X -1 ; x >= 0; x--) {
+      status[x] = stat;
+      stat = path[x + stat*X];
+    }
+  }
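+  // Log-space Viterbi over the four states: weight[x + y*X] is the best score
+  // of any state path ending in state y at rune x, i.e.
+  //   weight[x][y] = max over preY of (weight[x-1][preY] + transProb[preY][y]) + emitProb(y, rune[x]),
+  // and word boundaries are recovered by backtracking from the better of the
+  // final E and S states.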
+
+  const HMMModel* model_;
+  bool isNeedDestroy_;
+}; // class HMMSegment
+
+} // namespace cppjieba
+
+#endif
diff --git a/src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp b/src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp
new file mode 100644
index 0000000..8475404
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/jieba/Jieba.hpp
@@ -0,0 +1,134 @@
+#ifndef CPPJIEAB_JIEBA_H
+#define CPPJIEAB_JIEBA_H
+
+#include "QuerySegment.hpp"
+#include "KeywordExtractor.hpp"
+
+namespace cppjieba {
+
+class Jieba {
+ public:
+  Jieba(const string& dict_path, 
+        const string& model_path,
+        const string& user_dict_path, 
+        const string& idfPath, 
+        const string& stopWordPath) 
+    : dict_trie_(dict_path, user_dict_path),
+      model_(model_path),
+      mp_seg_(&dict_trie_),
+      hmm_seg_(&model_),
+      mix_seg_(&dict_trie_, &model_),
+      full_seg_(&dict_trie_),
+      query_seg_(&dict_trie_, &model_),
+      extractor(&dict_trie_, &model_, idfPath, stopWordPath) {
+  }
+  ~Jieba() {
+  }
+
+  struct LocWord {
+    string word;
+    size_t begin;
+    size_t end;
+  }; // struct LocWord
+
+  void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
+    mix_seg_.Cut(sentence, words, hmm);
+  }
+  void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
+    mix_seg_.Cut(sentence, words, hmm);
+  }
+  void CutAll(const string& sentence, vector<string>& words) const {
+    full_seg_.Cut(sentence, words);
+  }
+  void CutAll(const string& sentence, vector<Word>& words) const {
+    full_seg_.Cut(sentence, words);
+  }
+  void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
+    query_seg_.Cut(sentence, words, hmm);
+  }
+  void CutForSearch(const string& sentence, vector<Word>& words, bool hmm = true) const {
+    query_seg_.Cut(sentence, words, hmm);
+  }
+  void CutHMM(const string& sentence, vector<string>& words) const {
+    hmm_seg_.Cut(sentence, words);
+  }
+  void CutHMM(const string& sentence, vector<Word>& words) const {
+    hmm_seg_.Cut(sentence, words);
+  }
+  void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
+    mp_seg_.Cut(sentence, words, max_word_len);
+  }
+  void CutSmall(const string& sentence, vector<Word>& words, size_t max_word_len) const {
+    mp_seg_.Cut(sentence, words, max_word_len);
+  }
+  
+  void Tag(const string& sentence, vector<pair<string, string> >& words) const {
+    mix_seg_.Tag(sentence, words);
+  }
+  string LookupTag(const string &str) const {
+    return mix_seg_.LookupTag(str);
+  }
+  bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
+    return dict_trie_.InsertUserWord(word, tag);
+  }
+
+  bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
+    return dict_trie_.InsertUserWord(word,freq, tag);
+  }
+
+  bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
+    return dict_trie_.DeleteUserWord(word, tag);
+  }
+  
+  bool Find(const string& word)
+  {
+    return dict_trie_.Find(word);
+  }
+
+  void ResetSeparators(const string& s) {
+    //TODO
+    mp_seg_.ResetSeparators(s);
+    hmm_seg_.ResetSeparators(s);
+    mix_seg_.ResetSeparators(s);
+    full_seg_.ResetSeparators(s);
+    query_seg_.ResetSeparators(s);
+  }
+
+  const DictTrie* GetDictTrie() const {
+    return &dict_trie_;
+  } 
+  
+  const HMMModel* GetHMMModel() const {
+    return &model_;
+  }
+
+  void LoadUserDict(const vector<string>& buf)  {
+    dict_trie_.LoadUserDict(buf);
+  }
+
+  void LoadUserDict(const set<string>& buf)  {
+    dict_trie_.LoadUserDict(buf);
+  }
+
+  void LoadUserDict(const string& path)  {
+    dict_trie_.LoadUserDict(path);
+  }
+
+ private:
+  DictTrie dict_trie_;
+  HMMModel model_;
+  
+  // They share the same dict trie and model
+  MPSegment mp_seg_;
+  HMMSegment hmm_seg_;
+  MixSegment mix_seg_;
+  FullSegment full_seg_;
+  QuerySegment query_seg_;
+
+ public:
+  KeywordExtractor extractor;
+}; // class Jieba
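+
+// Minimal usage sketch (illustrative; the dictionary paths are assumptions):
+//
+//   cppjieba::Jieba jieba("dict/jieba.dict.utf8", "dict/hmm_model.utf8",
+//                         "dict/user.dict.utf8", "dict/idf.utf8",
+//                         "dict/stop_words.utf8");
+//   std::vector<std::string> words;
+//   jieba.Cut("some UTF-8 text", words, true); // mix segmentation with HMM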
+
+} // namespace cppjieba
+
+#endif // CPPJIEAB_JIEBA_H
diff --git a/src/contribs-lib/CLucene/analysis/jieba/KeywordExtractor.hpp b/src/contribs-lib/CLucene/analysis/jieba/KeywordExtractor.hpp
new file mode 100644
index 0000000..319ce0a
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/jieba/KeywordExtractor.hpp
@@ -0,0 +1,153 @@
+#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
+#define CPPJIEBA_KEYWORD_EXTRACTOR_H
+
+#include <cmath>
+#include <set>
+#include "MixSegment.hpp"
+
+namespace cppjieba {
+
+using namespace limonp;
+using namespace std;
+
+/*utf8*/
+class KeywordExtractor {
+ public:
+  struct Word {
+    string word;
+    vector<size_t> offsets;
+    double weight;
+  }; // struct Word
+
+  KeywordExtractor(const string& dictPath, 
+        const string& hmmFilePath, 
+        const string& idfPath, 
+        const string& stopWordPath, 
+        const string& userDict = "") 
+    : segment_(dictPath, hmmFilePath, userDict) {
+    LoadIdfDict(idfPath);
+    LoadStopWordDict(stopWordPath);
+  }
+  KeywordExtractor(const DictTrie* dictTrie, 
+        const HMMModel* model,
+        const string& idfPath, 
+        const string& stopWordPath) 
+    : segment_(dictTrie, model) {
+    LoadIdfDict(idfPath);
+    LoadStopWordDict(stopWordPath);
+  }
+  ~KeywordExtractor() {
+  }
+
+  void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
+    vector<Word> topWords;
+    Extract(sentence, topWords, topN);
+    for (size_t i = 0; i < topWords.size(); i++) {
+      keywords.push_back(topWords[i].word);
+    }
+  }
+
+  void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
+    vector<Word> topWords;
+    Extract(sentence, topWords, topN);
+    for (size_t i = 0; i < topWords.size(); i++) {
+      keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
+    }
+  }
+
+  void Extract(const string& sentence, vector<Word>& keywords, size_t topN) const {
+    vector<string> words;
+    segment_.Cut(sentence, words);
+
+    map<string, Word> wordmap;
+    size_t offset = 0;
+    for (size_t i = 0; i < words.size(); ++i) {
+      size_t t = offset;
+      offset += words[i].size();
+      if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
+        continue;
+      }
+      wordmap[words[i]].offsets.push_back(t);
+      wordmap[words[i]].weight += 1.0;
+    }
+    if (offset != sentence.size()) {
+      XLOG(ERROR) << "words illegal";
+      return;
+    }
+
+    keywords.clear();
+    keywords.reserve(wordmap.size());
+    for (map<string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
+      unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
+      if (cit != idfMap_.end()) {
+        itr->second.weight *= cit->second;
+      } else {
+        itr->second.weight *= idfAverage_;
+      }
+      itr->second.word = itr->first;
+      keywords.push_back(itr->second);
+    }
+    topN = min(topN, keywords.size());
+    partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
+    keywords.resize(topN);
+  }
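+  // Keyword weighting is TF-IDF: each surviving word's weight starts as its
+  // frequency in the sentence and is multiplied by its IDF (idfAverage_ for
+  // words missing from the IDF dictionary); partial_sort then keeps the top N.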
+ private:
+  void LoadIdfDict(const string& idfPath) {
+    ifstream ifs(idfPath.c_str());
+    XCHECK(ifs.is_open()) << "open " << idfPath << " failed";
+    string line;
+    vector<string> buf;
+    double idf = 0.0;
+    double idfSum = 0.0;
+    size_t lineno = 0;
+    for (; getline(ifs, line); lineno++) {
+      buf.clear();
+      if (line.empty()) {
+        XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
+        continue;
+      }
+      Split(line, buf, " ");
+      if (buf.size() != 2) {
+        XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. skipped.";
+        continue;
+      }
+      idf = atof(buf[1].c_str());
+      idfMap_[buf[0]] = idf;
+      idfSum += idf;
+
+    }
+
+    assert(lineno);
+    idfAverage_ = idfSum / lineno;
+    assert(idfAverage_ > 0.0);
+  }
+  void LoadStopWordDict(const string& filePath) {
+    ifstream ifs(filePath.c_str());
+    XCHECK(ifs.is_open()) << "open " << filePath << " failed";
+    string line;
+    while (getline(ifs, line)) {
+      stopWords_.insert(line);
+    }
+    assert(stopWords_.size());
+  }
+
+  static bool Compare(const Word& lhs, const Word& rhs) {
+    return lhs.weight > rhs.weight;
+  }
+
+  MixSegment segment_;
+  unordered_map<string, double> idfMap_;
+  double idfAverage_;
+
+  unordered_set<string> stopWords_;
+}; // class KeywordExtractor
+
+inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) {
+  return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}"; 
+}
+
+} // namespace cppjieba
+
+#endif
+
+
diff --git a/src/contribs-lib/CLucene/analysis/jieba/LocalVector.hpp b/src/contribs-lib/CLucene/analysis/jieba/LocalVector.hpp
new file mode 100644
index 0000000..808378c
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/jieba/LocalVector.hpp
@@ -0,0 +1,139 @@
+#ifndef LIMONP_LOCAL_VECTOR_HPP
+#define LIMONP_LOCAL_VECTOR_HPP
+
+#include <iostream>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+
+namespace limonp {
+    using namespace std;
+    /*
+     * LocalVector<T>: T must be a primitive type (char, int, size_t); if T is
+     * a struct or class, LocalVector<T> may be dangerous.
+     * LocalVector<T> is simple and not well-tested.
+     */
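+    /*
+     * Implementation note: the first LOCAL_VECTOR_BUFFER_SIZE elements live in
+     * an inline buffer; growth falls back to malloc and copies with memcpy,
+     * which is why T must be a trivially copyable type.
+     */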
+    const size_t LOCAL_VECTOR_BUFFER_SIZE = 16;
+    template <class T>
+    class LocalVector {
+    public:
+        typedef const T* const_iterator ;
+        typedef T value_type;
+        typedef size_t size_type;
+    private:
+        T buffer_[LOCAL_VECTOR_BUFFER_SIZE];
+        T * ptr_;
+        size_t size_;
+        size_t capacity_;
+    public:
+        LocalVector() {
+            init_();
+        };
+        LocalVector(const LocalVector<T>& vec) {
+            init_();
+            *this = vec;
+        }
+        LocalVector(const_iterator  begin, const_iterator end) { // TODO: make it faster
+            init_();
+            while(begin != end) {
+                push_back(*begin++);
+            }
+        }
+        LocalVector(size_t size, const T& t) { // TODO: make it faster
+            init_();
+            while(size--) {
+                push_back(t);
+            }
+        }
+        ~LocalVector() {
+            if(ptr_ != buffer_) {
+                free(ptr_);
+            }
+        };
+    public:
+        LocalVector<T>& operator = (const LocalVector<T>& vec) {
+            clear();
+            size_ = vec.size();
+            capacity_ = vec.capacity();
+            if(vec.buffer_ == vec.ptr_) {
+                memcpy(static_cast<void*>(buffer_), vec.buffer_, sizeof(T) * size_);
+                ptr_ = buffer_;
+            } else {
+                ptr_ = (T*) malloc(vec.capacity() * sizeof(T));
+                assert(ptr_);
+                memcpy(static_cast<void*>(ptr_), vec.ptr_, vec.size() * sizeof(T));
+            }
+            return *this;
+        }
+    private:
+        void init_() {
+            ptr_ = buffer_;
+            size_ = 0;
+            capacity_ = LOCAL_VECTOR_BUFFER_SIZE;
+        }
+    public:
+        T& operator [] (size_t i) {
+            return ptr_[i];
+        }
+        const T& operator [] (size_t i) const {
+            return ptr_[i];
+        }
+        void push_back(const T& t) {
+            if(size_ == capacity_) {
+                assert(capacity_);
+                reserve(capacity_ * 2);
+            }
+            ptr_[size_ ++ ] = t;
+        }
+        void reserve(size_t size) {
+            if(size <= capacity_) {
+                return;
+            }
+            T * next =  (T*)malloc(sizeof(T) * size);
+            assert(next);
+            T * old = ptr_;
+            ptr_ = next;
+            memcpy(static_cast<void*>(ptr_), old, sizeof(T) * capacity_);
+            capacity_ = size;
+            if(old != buffer_) {
+                free(old);
+            }
+        }
+        bool empty() const {
+            return 0 == size();
+        }
+        size_t size() const {
+            return size_;
+        }
+        size_t capacity() const {
+            return capacity_;
+        }
+        const_iterator begin() const {
+            return ptr_;
+        }
+        const_iterator end() const {
+            return ptr_ + size_;
+        }
+        void clear() {
+            if(ptr_ != buffer_) {
+                free(ptr_);
+            }
+            init_();
+        }
+    };
+
+    template <class T>
+    ostream & operator << (ostream& os, const LocalVector<T>& vec) {
+        if(vec.empty()) {
+            return os << "[]";
+        }
+        os<<"[\""<<vec[0];
+        for(size_t i = 1; i < vec.size(); i++) {
+            os<<"\", \""<<vec[i];
+        }
+        os<<"\"]";
+        return os;
+    }
+
+}
+
+#endif
\ No newline at end of file
diff --git a/src/contribs-lib/CLucene/analysis/jieba/Logging.hpp b/src/contribs-lib/CLucene/analysis/jieba/Logging.hpp
new file mode 100644
index 0000000..77540ce
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/jieba/Logging.hpp
@@ -0,0 +1,90 @@
+#ifndef LIMONP_LOGGING_HPP
+#define LIMONP_LOGGING_HPP
+
+#include <sstream>
+#include <iostream>
+#include <cassert>
+#include <cstdlib>
+#include <ctime>
+
+#ifdef XLOG
+#error "XLOG has been defined already"
+#endif // XLOG
+#ifdef XCHECK
+#error "XCHECK has been defined already"
+#endif // XCHECK
+
+#define XLOG(level) limonp::Logger(limonp::LL_##level, __FILE__, __LINE__).Stream() 
+#define XCHECK(exp) if(!(exp)) XLOG(FATAL) << "exp: ["#exp << "] false. "
+
+namespace limonp {
+
+    enum {
+        LL_DEBUG = 0,
+        LL_INFO = 1,
+        LL_WARNING = 2,
+        LL_ERROR = 3,
+        LL_FATAL = 4,
+    }; // enum
+
+    static const char * LOG_LEVEL_ARRAY[] = {"DEBUG","INFO","WARN","ERROR","FATAL"};
+    static const char * LOG_TIME_FORMAT = "%Y-%m-%d %H:%M:%S";
+
+    class Logger {
+    public:
+        Logger(size_t level, const char* filename, int lineno)
+            : level_(level) {
+#ifdef LOGGING_LEVEL
+            if (level_ < LOGGING_LEVEL) {
+                return;
+            }
+#endif
+            assert(level_ < sizeof(LOG_LEVEL_ARRAY)/sizeof(*LOG_LEVEL_ARRAY)); // must be a valid index
+    
+            char buf[32];
+    
+            time_t timeNow;
+            time(&timeNow);
+
+            struct tm tmNow;
+
+#if defined(_WIN32) || defined(_WIN64)
+            errno_t e = localtime_s(&tmNow, &timeNow);
+            assert(e == 0);
+#else
+            struct tm * tm_tmp = localtime_r(&timeNow, &tmNow);
+            assert(tm_tmp != nullptr);
+#endif
+
+            strftime(buf, sizeof(buf), LOG_TIME_FORMAT, &tmNow);
+
+            stream_ << buf
+                    << " " << filename
+                    << ":" << lineno
+                    << " " << LOG_LEVEL_ARRAY[level_]
+                    << " ";
+        }
+        ~Logger() {
+#ifdef LOGGING_LEVEL
+            if (level_ < LOGGING_LEVEL) {
+                return;
+            }
+#endif
+            std::cerr << stream_.str() << std::endl;
+            if (level_ == LL_FATAL) {
+                abort();
+            }
+        }
+
+        std::ostream& Stream() {
+            return stream_;
+        }
+
+    private:
+        std::ostringstream stream_;
+        size_t level_;
+    }; // class Logger
+
+} // namespace limonp
+
+#endif // LIMONP_LOGGING_HPP
\ No newline at end of file
diff --git a/src/contribs-lib/CLucene/analysis/jieba/MPSegment.hpp b/src/contribs-lib/CLucene/analysis/jieba/MPSegment.hpp
new file mode 100644
index 0000000..988c1f2
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/jieba/MPSegment.hpp
@@ -0,0 +1,137 @@
+#ifndef CPPJIEBA_MPSEGMENT_H
+#define CPPJIEBA_MPSEGMENT_H
+
+#include <algorithm>
+#include <set>
+#include <cassert>
+#include "Logging.hpp"
+#include "DictTrie.hpp"
+#include "SegmentTagged.hpp"
+#include "PosTagger.hpp"
+
+namespace cppjieba {
+
+class MPSegment: public SegmentTagged {
+ public:
+  MPSegment(const string& dictPath, const string& userDictPath = "")
+    : dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) {
+  }
+  MPSegment(const DictTrie* dictTrie)
+    : dictTrie_(dictTrie), isNeedDestroy_(false) {
+    assert(dictTrie_);
+  }
+  ~MPSegment() {
+    if (isNeedDestroy_) {
+      delete dictTrie_;
+    }
+  }
+
+  void Cut(const string& sentence, vector<string>& words) const {
+    Cut(sentence, words, MAX_WORD_LENGTH);
+  }
+
+  void Cut(const string& sentence,
+        vector<string>& words,
+        size_t max_word_len) const {
+    vector<Word> tmp;
+    Cut(sentence, tmp, max_word_len);
+    GetStringsFromWords(tmp, words);
+  }
+  void Cut(const string& sentence, 
+        vector<Word>& words, 
+        size_t max_word_len = MAX_WORD_LENGTH) const {
+    PreFilter pre_filter(symbols_, sentence);
+    PreFilter::Range range;
+    vector<WordRange> wrs;
+    wrs.reserve(sentence.size()/2);
+    while (pre_filter.HasNext()) {
+      range = pre_filter.Next();
+      Cut(range.begin, range.end, wrs, max_word_len);
+    }
+    words.clear();
+    words.reserve(wrs.size());
+    GetWordsFromWordRanges(sentence, wrs, words);
+  }
+  void Cut(RuneStrArray::const_iterator begin,
+           RuneStrArray::const_iterator end,
+           vector<WordRange>& words,
+           size_t max_word_len = MAX_WORD_LENGTH) const {
+    vector<Dag> dags;
+    dictTrie_->Find(begin, 
+          end, 
+          dags,
+          max_word_len);
+    CalcDP(dags);
+    CutByDag(begin, end, dags, words);
+  }
+
+  const DictTrie* GetDictTrie() const {
+    return dictTrie_;
+  }
+
+  bool Tag(const string& src, vector<pair<string, string> >& res) const {
+    return tagger_.Tag(src, res, *this);
+  }
+
+  bool IsUserDictSingleChineseWord(const Rune& value) const {
+    return dictTrie_->IsUserDictSingleChineseWord(value);
+  }
+ private:
+  void CalcDP(vector<Dag>& dags) const {
+    size_t nextPos;
+    const DictUnit* p;
+    double val;
+
+    for (vector<Dag>::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) {
+      rit->pInfo = NULL;
+      rit->weight = MIN_DOUBLE;
+      assert(!rit->nexts.empty());
+      for (LocalVector<pair<size_t, const DictUnit*> >::const_iterator it = rit->nexts.begin(); it != rit->nexts.end(); it++) {
+        nextPos = it->first;
+        p = it->second;
+        val = 0.0;
+        if (nextPos + 1 < dags.size()) {
+          val += dags[nextPos + 1].weight;
+        }
+
+        if (p) {
+          val += p->weight;
+        } else {
+          val += dictTrie_->GetMinWeight();
+        }
+        if (val > rit->weight) {
+          rit->pInfo = p;
+          rit->weight = val;
+        }
+      }
+    }
+  }
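+  // CalcDP runs right to left: dags[i].weight becomes the best total
+  // log-probability of any segmentation of the suffix starting at rune i, and
+  // dags[i].pInfo the word chosen at i (NULL means a single rune); CutByDag
+  // then replays those choices left to right.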
+  void CutByDag(RuneStrArray::const_iterator begin, 
+        RuneStrArray::const_iterator end, 
+        const vector<Dag>& dags, 
+        vector<WordRange>& words) const {
+    size_t i = 0;
+    while (i < dags.size()) {
+      const DictUnit* p = dags[i].pInfo;
+      if (p) {
+        assert(p->word.size() >= 1);
+        WordRange wr(begin + i, begin + i + p->word.size() - 1);
+        words.push_back(wr);
+        i += p->word.size();
+      } else { //single chinese word
+        WordRange wr(begin + i, begin + i);
+        words.push_back(wr);
+        i++;
+      }
+    }
+  }
+
+  const DictTrie* dictTrie_;
+  bool isNeedDestroy_;
+  PosTagger tagger_;
+
+}; // class MPSegment
+
+} // namespace cppjieba
+
+#endif
diff --git a/src/contribs-lib/CLucene/analysis/jieba/MixSegment.hpp b/src/contribs-lib/CLucene/analysis/jieba/MixSegment.hpp
new file mode 100644
index 0000000..95084da
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/jieba/MixSegment.hpp
@@ -0,0 +1,109 @@
+#ifndef CPPJIEBA_MIXSEGMENT_H
+#define CPPJIEBA_MIXSEGMENT_H
+
+#include <cassert>
+#include "MPSegment.hpp"
+#include "HMMSegment.hpp"
+#include "StringUtil.hpp"
+#include "PosTagger.hpp"
+
+namespace cppjieba {
+class MixSegment: public SegmentTagged {
+ public:
+  MixSegment(const string& mpSegDict, const string& hmmSegDict, 
+        const string& userDict = "") 
+    : mpSeg_(mpSegDict, userDict), 
+      hmmSeg_(hmmSegDict) {
+  }
+  MixSegment(const DictTrie* dictTrie, const HMMModel* model) 
+    : mpSeg_(dictTrie), hmmSeg_(model) {
+  }
+  ~MixSegment() {
+  }
+
+  void Cut(const string& sentence, vector<string>& words) const {
+    Cut(sentence, words, true);
+  }
+  void Cut(const string& sentence, vector<string>& words, bool hmm) const {
+    vector<Word> tmp;
+    Cut(sentence, tmp, hmm);
+    GetStringsFromWords(tmp, words);
+  }
+  void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
+    PreFilter pre_filter(symbols_, sentence);
+    PreFilter::Range range;
+    vector<WordRange> wrs;
+    wrs.reserve(sentence.size() / 2);
+    while (pre_filter.HasNext()) {
+      range = pre_filter.Next();
+      Cut(range.begin, range.end, wrs, hmm);
+    }
+    words.clear();
+    words.reserve(wrs.size());
+    GetWordsFromWordRanges(sentence, wrs, words);
+  }
+
+  void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
+    if (!hmm) {
+      mpSeg_.Cut(begin, end, res);
+      return;
+    }
+    vector<WordRange> words;
+    assert(end >= begin);
+    words.reserve(end - begin);
+    mpSeg_.Cut(begin, end, words);
+
+    vector<WordRange> hmmRes;
+    hmmRes.reserve(end - begin);
+    for (size_t i = 0; i < words.size(); i++) {
+      // if MP produced a real word (or a user-dict single rune), keep it as-is
+      if (words[i].left != words[i].right || (words[i].left == words[i].right && mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) {
+        res.push_back(words[i]);
+        continue;
+      }
+
+      // otherwise collect the run of single runes that are not in the user dict
+      size_t j = i;
+      while (j < words.size() && words[j].left == words[j].right && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
+        j++;
+      }
+
+      // Cut the sequence with hmm
+      assert(j - 1 >= i);
+      // TODO
+      hmmSeg_.Cut(words[i].left, words[j - 1].left + 1, hmmRes);
+      //put hmm result to result
+      for (size_t k = 0; k < hmmRes.size(); k++) {
+        res.push_back(hmmRes[k]);
+      }
+
+      //clear tmp vars
+      hmmRes.clear();
+
+      //let i jump over this piece
+      i = j - 1;
+    }
+  }
+
+  const DictTrie* GetDictTrie() const {
+    return mpSeg_.GetDictTrie();
+  }
+
+  bool Tag(const string& src, vector<pair<string, string> >& res) const {
+    return tagger_.Tag(src, res, *this);
+  }
+
+  string LookupTag(const string &str) const {
+    return tagger_.LookupTag(str, *this);
+  }
+
+ private:
+  MPSegment mpSeg_;
+  HMMSegment hmmSeg_;
+  PosTagger tagger_;
+
+}; // class MixSegment
+
+} // namespace cppjieba
+
+#endif
diff --git a/src/contribs-lib/CLucene/analysis/jieba/PosTagger.hpp b/src/contribs-lib/CLucene/analysis/jieba/PosTagger.hpp
new file mode 100644
index 0000000..b1d97a4
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/jieba/PosTagger.hpp
@@ -0,0 +1,77 @@
+#ifndef CPPJIEBA_POS_TAGGING_H
+#define CPPJIEBA_POS_TAGGING_H
+
+#include "StringUtil.hpp"
+#include "SegmentTagged.hpp"
+#include "DictTrie.hpp"
+
+namespace cppjieba {
+using namespace limonp;
+
+static const char* const POS_M = "m";
+static const char* const POS_ENG = "eng";
+static const char* const POS_X = "x";
+
+class PosTagger {
+ public:
+  PosTagger() {
+  }
+  ~PosTagger() {
+  }
+
+  bool Tag(const string& src, vector<pair<string, string> >& res, const SegmentTagged& segment) const {
+    vector<string> CutRes;
+    segment.Cut(src, CutRes);
+
+    for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
+      res.push_back(make_pair(*itr, LookupTag(*itr, segment)));
+    }
+    return !res.empty();
+  }
+
+  string LookupTag(const string &str, const SegmentTagged& segment) const {
+    const DictUnit *tmp = NULL;
+    RuneStrArray runes;
+    const DictTrie * dict = segment.GetDictTrie();
+    assert(dict != NULL);
+    if (!DecodeRunesInString(str, runes)) {
+      XLOG(ERROR) << "Decode failed.";
+      return POS_X;
+    }
+    tmp = dict->Find(runes.begin(), runes.end());
+    if (tmp == NULL || tmp->tag.empty()) {
+      return SpecialRule(runes);
+    } else {
+      return tmp->tag;
+    }
+  }
+
+ private:
+  const char* SpecialRule(const RuneStrArray& unicode) const {
+    size_t m = 0;
+    size_t eng = 0;
+    for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
+      if (unicode[i].rune < 0x80) {
+        eng ++;
+        if ('0' <= unicode[i].rune && unicode[i].rune <= '9') {
+          m++;
+        }
+      }
+    }
+    // ascii char is not found
+    if (eng == 0) {
+      return POS_X;
+    }
+    // all the ascii is number char
+    if (m == eng) {
+      return POS_M;
+    }
+    // the ascii chars contain english letter
+    return POS_ENG;
+  }
+
+}; // class PosTagger
+
+} // namespace cppjieba
+
+#endif
diff --git a/src/contribs-lib/CLucene/analysis/jieba/PreFilter.hpp b/src/contribs-lib/CLucene/analysis/jieba/PreFilter.hpp
new file mode 100644
index 0000000..2c10835
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/jieba/PreFilter.hpp
@@ -0,0 +1,54 @@
+#ifndef CPPJIEBA_PRE_FILTER_H
+#define CPPJIEBA_PRE_FILTER_H
+
+#include "Trie.hpp"
+#include "Logging.hpp"
+
+namespace cppjieba {
+
+class PreFilter {
+ public:
+  //TODO use WordRange instead of Range
+  struct Range {
+    RuneStrArray::const_iterator begin;
+    RuneStrArray::const_iterator end;
+  }; // struct Range
+
+  PreFilter(const unordered_set<Rune>& symbols, 
+        const string& sentence)
+    : symbols_(symbols) {
+    if (!DecodeRunesInString(sentence, sentence_)) {
+      XLOG(ERROR) << "decode failed. "; 
+    }
+    cursor_ = sentence_.begin();
+  }
+  ~PreFilter() {
+  }
+  bool HasNext() const {
+    return cursor_ != sentence_.end();
+  }
+  Range Next() {
+    Range range;
+    range.begin = cursor_;
+    while (cursor_ != sentence_.end()) {
+      if (IsIn(symbols_, cursor_->rune)) {
+        if (range.begin == cursor_) {
+          cursor_ ++;
+        }
+        range.end = cursor_;
+        return range;
+      }
+      cursor_ ++;
+    }
+    range.end = sentence_.end();
+    return range;
+  }
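+  // Next() yields maximal runs of non-separator runes; a separator at the
+  // start of a range is returned as its own single-rune range, so separators
+  // survive as range boundaries instead of being dropped here.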
+ private:
+  RuneStrArray::const_iterator cursor_;
+  RuneStrArray sentence_;
+  const unordered_set<Rune>& symbols_;
+}; // class PreFilter
+
+} // namespace cppjieba
+
+#endif // CPPJIEBA_PRE_FILTER_H
diff --git a/src/contribs-lib/CLucene/analysis/jieba/QuerySegment.hpp b/src/contribs-lib/CLucene/analysis/jieba/QuerySegment.hpp
new file mode 100644
index 0000000..b6a5f75
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/jieba/QuerySegment.hpp
@@ -0,0 +1,89 @@
+#ifndef CPPJIEBA_QUERYSEGMENT_H
+#define CPPJIEBA_QUERYSEGMENT_H
+
+#include <algorithm>
+#include <set>
+#include <cassert>
+#include "Logging.hpp"
+#include "DictTrie.hpp"
+#include "SegmentBase.hpp"
+#include "FullSegment.hpp"
+#include "MixSegment.hpp"
+#include "Unicode.hpp"
+
+namespace cppjieba {
+class QuerySegment: public SegmentBase {
+ public:
+  QuerySegment(const string& dict, const string& model, const string& userDict = "")
+    : mixSeg_(dict, model, userDict),
+      trie_(mixSeg_.GetDictTrie()) {
+  }
+  QuerySegment(const DictTrie* dictTrie, const HMMModel* model)
+    : mixSeg_(dictTrie, model), trie_(dictTrie) {
+  }
+  ~QuerySegment() {
+  }
+
+  void Cut(const string& sentence, vector<string>& words) const {
+    Cut(sentence, words, true);
+  }
+  void Cut(const string& sentence, vector<string>& words, bool hmm) const {
+    vector<Word> tmp;
+    Cut(sentence, tmp, hmm);
+    GetStringsFromWords(tmp, words);
+  }
+  void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
+    PreFilter pre_filter(symbols_, sentence);
+    PreFilter::Range range;
+    vector<WordRange> wrs;
+    wrs.reserve(sentence.size()/2);
+    while (pre_filter.HasNext()) {
+      range = pre_filter.Next();
+      Cut(range.begin, range.end, wrs, hmm);
+    }
+    words.clear();
+    words.reserve(wrs.size());
+    GetWordsFromWordRanges(sentence, wrs, words);
+  }
+  void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
+    // run MixSegment::Cut first
+    vector<WordRange> mixRes;
+    mixSeg_.Cut(begin, end, mixRes, hmm);
+
+    vector<WordRange> fullRes;
+    for (vector<WordRange>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
+      if (mixResItr->Length() > 2) {
+        for (size_t i = 0; i + 1 < mixResItr->Length(); i++) {
+          WordRange wr(mixResItr->left + i, mixResItr->left + i + 1);
+          if (trie_->Find(wr.left, wr.right + 1) != NULL) {
+            res.push_back(wr);
+          }
+        }
+      }
+      if (mixResItr->Length() > 3) {
+        for (size_t i = 0; i + 2 < mixResItr->Length(); i++) {
+          WordRange wr(mixResItr->left + i, mixResItr->left + i + 2);
+          if (trie_->Find(wr.left, wr.right + 1) != NULL) {
+            res.push_back(wr);
+          }
+        }
+      }
+      res.push_back(*mixResItr);
+    }
+  }
+ private:
+  bool IsAllAscii(const Unicode& s) const {
+    for (size_t i = 0; i < s.size(); i++) {
+      if (s[i] >= 0x80) {
+        return false;
+      }
+    }
+    return true;
+  }
+  MixSegment mixSeg_;
+  const DictTrie* trie_;
+}; // QuerySegment
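+
+// Illustrative usage (not part of the original sources): on top of
+// MixSegment's result, Cut() also emits every 2-rune and 3-rune sub-word of
+// longer segments that is present in the dictionary, which suits query-time
+// tokenization. A sketch, assuming these dictionary paths exist:
+//
+//   QuerySegment seg("dict/jieba.dict.utf8", "dict/hmm_model.utf8");
+//   vector<string> words;
+//   seg.Cut("some sentence", words);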
+
+} // namespace cppjieba
+
+#endif
diff --git a/src/contribs-lib/CLucene/analysis/jieba/SegmentBase.hpp b/src/contribs-lib/CLucene/analysis/jieba/SegmentBase.hpp
new file mode 100644
index 0000000..8608ecf
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/jieba/SegmentBase.hpp
@@ -0,0 +1,46 @@
+#ifndef CPPJIEBA_SEGMENTBASE_H
+#define CPPJIEBA_SEGMENTBASE_H
+
+#include "Logging.hpp"
+#include "PreFilter.hpp"
+#include <cassert>
+
+
+namespace cppjieba {
+
+const char* const SPECIAL_SEPARATORS = " \t\n\xEF\xBC\x8C\xE3\x80\x82";
+
+using namespace limonp;
+
+class SegmentBase {
+ public:
+  SegmentBase() {
+    XCHECK(ResetSeparators(SPECIAL_SEPARATORS));
+  }
+  virtual ~SegmentBase() {
+  }
+
+  virtual void Cut(const string& sentence, vector<string>& words) const = 0;
+
+  bool ResetSeparators(const string& s) {
+    symbols_.clear();
+    RuneStrArray runes;
+    if (!DecodeRunesInString(s, runes)) {
+      XLOG(ERROR) << "decode " << s << " failed";
+      return false;
+    }
+    for (size_t i = 0; i < runes.size(); i++) {
+      if (!symbols_.insert(runes[i].rune).second) {
+        XLOG(ERROR) << s.substr(runes[i].offset, runes[i].len) << " already exists";
+        return false;
+      }
+    }
+    return true;
+  }
+ protected:
+  unordered_set<Rune> symbols_;
+}; // class SegmentBase
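+
+// Note: SPECIAL_SEPARATORS above is space/tab/newline plus the UTF-8 bytes of
+// the fullwidth comma (U+FF0C) and the ideographic full stop (U+3002). A
+// sketch of overriding them on any segmenter derived from SegmentBase:
+//
+//   seg.ResetSeparators(" \t\n,.");  // returns false if a rune repeats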
+
+} // namespace cppjieba
+
+#endif
diff --git a/src/contribs-lib/CLucene/analysis/jieba/SegmentTagged.hpp b/src/contribs-lib/CLucene/analysis/jieba/SegmentTagged.hpp
new file mode 100644
index 0000000..4d99a31
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/jieba/SegmentTagged.hpp
@@ -0,0 +1,23 @@
+#ifndef CPPJIEBA_SEGMENTTAGGED_H
+#define CPPJIEBA_SEGMENTTAGGED_H
+
+#include "SegmentBase.hpp"
+
+namespace cppjieba {
+
+class SegmentTagged : public SegmentBase {
+ public:
+  SegmentTagged() {
+  }
+  virtual ~SegmentTagged() {
+  }
+
+  virtual bool Tag(const string& src, vector<pair<string, string> >& res) const = 0;
+
+  virtual const DictTrie* GetDictTrie() const = 0;
+
+}; // class SegmentTagged
+
+} // namespace cppjieba
+
+#endif
diff --git a/src/contribs-lib/CLucene/analysis/jieba/StdExtension.hpp b/src/contribs-lib/CLucene/analysis/jieba/StdExtension.hpp
new file mode 100644
index 0000000..a5278c5
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/jieba/StdExtension.hpp
@@ -0,0 +1,157 @@
+#ifndef LIMONP_STD_EXTEMSION_HPP
+#define LIMONP_STD_EXTEMSION_HPP
+
+#include <map>
+
+#ifdef __APPLE__
+#include <unordered_map>
+#include <unordered_set>
+#elif(__cplusplus >= 201103L)
+#include <unordered_map>
+#include <unordered_set>
+#elif defined _MSC_VER
+#include <unordered_map>
+#include <unordered_set>
+#else
+#include <tr1/unordered_map>
+#include <tr1/unordered_set>
+namespace std {
+    using std::tr1::unordered_map;
+    using std::tr1::unordered_set;
+}
+
+#endif
+
+#include <set>
+#include <string>
+#include <vector>
+#include <deque>
+#include <fstream>
+#include <sstream>
+
+namespace std {
+
+    template<typename T>
+    ostream& operator << (ostream& os, const vector<T>& v) {
+        if(v.empty()) {
+            return os << "[]";
+        }
+        os<<"["<<v[0];
+        for(size_t i = 1; i < v.size(); i++) {
+            os<<", "<<v[i];
+        }
+        os<<"]";
+        return os;
+    }
+
+    template<>
+    inline ostream& operator << (ostream& os, const vector<string>& v) {
+        if(v.empty()) {
+            return os << "[]";
+        }
+        os<<"[\""<<v[0];
+        for(size_t i = 1; i < v.size(); i++) {
+            os<<"\", \""<<v[i];
+        }
+        os<<"\"]";
+        return os;
+    }
+
+    template<typename T>
+    ostream& operator << (ostream& os, const deque<T>& dq) {
+        if(dq.empty()) {
+            return os << "[]";
+        }
+        os<<"[\""<<dq[0];
+        for(size_t i = 1; i < dq.size(); i++) {
+            os<<"\", \""<<dq[i];
+        }
+        os<<"\"]";
+        return os;
+    }
+
+
+    template<class T1, class T2>
+    ostream& operator << (ostream& os, const pair<T1, T2>& pr) {
+        os << pr.first << ":" << pr.second ;
+        return os;
+    }
+
+
+    template<class T>
+    string& operator << (string& str, const T& obj) {
+        stringstream ss;
+        ss << obj; // calls an ostream& operator << (ostream&, const T&)
+        return str = ss.str();
+    }
+
+    template<class T1, class T2>
+    ostream& operator << (ostream& os, const map<T1, T2>& mp) {
+        if(mp.empty()) {
+            os<<"{}";
+            return os;
+        }
+        os<<'{';
+        typename map<T1, T2>::const_iterator it = mp.begin();
+        os<<*it;
+        it++;
+        while(it != mp.end()) {
+            os<<", "<<*it;
+            it++;
+        }
+        os<<'}';
+        return os;
+    }
+    template<class T1, class T2>
+    ostream& operator << (ostream& os, const std::unordered_map<T1, T2>& mp) {
+        if(mp.empty()) {
+            return os << "{}";
+        }
+        os<<'{';
+        typename std::unordered_map<T1, T2>::const_iterator it = mp.begin();
+        os<<*it;
+        it++;
+        while(it != mp.end()) {
+            os<<", "<<*it++;
+        }
+        return os<<'}';
+    }
+
+    template<class T>
+    ostream& operator << (ostream& os, const set<T>& st) {
+        if(st.empty()) {
+            os << "{}";
+            return os;
+        }
+        os<<'{';
+        typename set<T>::const_iterator it = st.begin();
+        os<<*it;
+        it++;
+        while(it != st.end()) {
+            os<<", "<<*it;
+            it++;
+        }
+        os<<'}';
+        return os;
+    }
+
+    template<class KeyType, class ContainType>
+    bool IsIn(const ContainType& contain, const KeyType& key) {
+        return contain.end() != contain.find(key);
+    }
+
+    template<class T>
+    basic_string<T> & operator << (basic_string<T> & s, ifstream & ifs) {
+        return s.assign((istreambuf_iterator<T>(ifs)), istreambuf_iterator<T>());
+    }
+
+    template<class T>
+    ofstream & operator << (ofstream & ofs, const basic_string<T>& s) {
+        ostreambuf_iterator<T> itr (ofs);
+        copy(s.begin(), s.end(), itr);
+        return ofs;
+    }
+
+} // namespace std
+
+#endif
\ No newline at end of file
diff --git a/src/contribs-lib/CLucene/analysis/jieba/StringUtil.hpp b/src/contribs-lib/CLucene/analysis/jieba/StringUtil.hpp
new file mode 100644
index 0000000..6079ba4
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/jieba/StringUtil.hpp
@@ -0,0 +1,380 @@
+/************************************
+* file enc : ascii
+* author   : wuyanyi09@gmail.com
+************************************/
+#ifndef LIMONP_STR_FUNCTS_H
+#define LIMONP_STR_FUNCTS_H
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <cctype>
+#include <map>
+#include <cassert>
+#include <ctime>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <memory.h>
+#include <functional>
+#include <locale>
+#include <sstream>
+#include <sys/types.h>
+#include <iterator>
+#include <algorithm>
+#include "StdExtension.hpp"
+
+namespace limonp {
+   using namespace std;
+   inline string StringFormat(const char* fmt, ...) {
+       int size = 256;
+       std::string str;
+       va_list ap;
+       while (1) {
+           str.resize(size);
+           va_start(ap, fmt);
+           int n = vsnprintf(&str[0], size, fmt, ap);
+           va_end(ap);
+           if (n > -1 && n < size) {
+               str.resize(n);
+               return str;
+           }
+           if (n > -1)
+               size = n + 1;
+           else
+               size *= 2;
+       }
+       return str;
+   }
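+
+   // Illustrative usage (not part of the original sources):
+   //   string s = StringFormat("%s-%d", "row", 42);  // yields "row-42"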
+
+   template<class T>
+   void Join(T begin, T end, string& res, const string& connector) {
+       if(begin == end) {
+           return;
+       }
+       stringstream ss;
+       ss<<*begin;
+       begin++;
+       while(begin != end) {
+           ss << connector << *begin;
+           begin ++;
+       }
+       res = ss.str();
+   }
+
+   template<class T>
+   string Join(T begin, T end, const string& connector) {
+       string res;
+       Join(begin ,end, res, connector);
+       return res;
+   }
+
+   inline string& Upper(string& str) {
+       transform(str.begin(), str.end(), str.begin(), (int (*)(int))toupper);
+       return str;
+   }
+
+   inline string& Lower(string& str) {
+       transform(str.begin(), str.end(), str.begin(), (int (*)(int))tolower);
+       return str;
+   }
+
+   inline bool IsSpace(unsigned c) {
+       // passing an out-of-range int to isspace is undefined behavior and can crash, so the value is masked to a byte first.
+       return c > 0xff ? false : std::isspace(c & 0xff) != 0;
+   }
+
+   inline std::string& LTrim(std::string &s) {
+       s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](const char & c) { return !IsSpace(c); }));
+       return s;
+   }
+
+   inline std::string& RTrim(std::string &s) {
+       s.erase(std::find_if(s.rbegin(), s.rend(), [](const char & c) { return !IsSpace(c); }).base(), s.end());
+       return s;
+   }
+
+   inline std::string& Trim(std::string &s) {
+       return LTrim(RTrim(s));
+   }
+
+   inline std::string& LTrim(std::string & s, char x) {
+       s.erase(s.begin(), std::find_if(s.begin(), s.end(), [&x](const char & c) { return c != x; }));
+       return s;
+   }
+
+   inline std::string& RTrim(std::string & s, char x) {
+       s.erase(std::find_if(s.rbegin(), s.rend(), [&x](const char & c) { return c != x; }).base(), s.end());
+       return s;
+   }
+
+   inline std::string& Trim(std::string &s, char x) {
+       return LTrim(RTrim(s, x), x);
+   }
+
+   inline void Split(const string& src, vector<string>& res, const string& pattern, size_t maxsplit = string::npos) {
+       res.clear();
+       size_t Start = 0;
+       size_t end = 0;
+       string sub;
+       while(Start < src.size()) {
+           end = src.find_first_of(pattern, Start);
+           if(string::npos == end || res.size() >= maxsplit) {
+               sub = src.substr(Start);
+               res.push_back(sub);
+               return;
+           }
+           sub = src.substr(Start, end - Start);
+           res.push_back(sub);
+           Start = end + 1;
+       }
+       return;
+   }
+
+   inline vector<string> Split(const string& src, const string& pattern, size_t maxsplit = string::npos) {
+       vector<string> res;
+       Split(src, res, pattern, maxsplit);
+       return res;
+   }
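+
+   // Illustrative note (not part of the original sources): the pattern is a
+   // set of single-character delimiters (find_first_of), not a substring:
+   //   vector<string> parts = Split("a,b;c", ",;");  // {"a", "b", "c"}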
+
+   inline bool StartsWith(const string& str, const string& prefix) {
+       if(prefix.length() > str.length()) {
+           return false;
+       }
+       return 0 == str.compare(0, prefix.length(), prefix);
+   }
+
+   inline bool EndsWith(const string& str, const string& suffix) {
+       if(suffix.length() > str.length()) {
+           return false;
+       }
+       return 0 == str.compare(str.length() -  suffix.length(), suffix.length(), suffix);
+   }
+
+   inline bool IsInStr(const string& str, char ch) {
+       return str.find(ch) != string::npos;
+   }
+
+   inline uint16_t TwocharToUint16(char high, char low) {
+       return (((uint16_t(high) & 0x00ff ) << 8) | (uint16_t(low) & 0x00ff));
+   }
+
+   template <class Uint16Container>
+   bool Utf8ToUnicode(const char * const str, size_t len, Uint16Container& vec) {
+       if(!str) {
+           return false;
+       }
+       char ch1, ch2;
+       uint16_t tmp;
+       vec.clear();
+       for(size_t i = 0; i < len;) {
+           if(!(str[i] & 0x80)) { // 0xxxxxxx
+               vec.push_back(str[i]);
+               i++;
+           } else if ((uint8_t)str[i] <= 0xdf && i + 1 < len) { // 110xxxxx
+               ch1 = (str[i] >> 2) & 0x07;
+               ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 );
+               tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
+               vec.push_back(tmp);
+               i += 2;
+           } else if ((uint8_t)str[i] <= 0xef && i + 2 < len) { // 1110xxxx
+               ch1 = ((uint8_t)str[i] << 4) | ((str[i+1] >> 2) & 0x0f );
+               ch2 = (((uint8_t)str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f);
+               tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
+               vec.push_back(tmp);
+               i += 3;
+           } else {
+               return false;
+           }
+       }
+       return true;
+   }
+
+   template <class Uint16Container>
+   bool Utf8ToUnicode(const string& str, Uint16Container& vec) {
+       return Utf8ToUnicode(str.c_str(), str.size(), vec);
+   }
+
+   template <class Uint32Container>
+   bool Utf8ToUnicode32(const string& str, Uint32Container& vec) {
+       uint32_t tmp;
+       vec.clear();
+       for(size_t i = 0; i < str.size();) {
+           if(!(str[i] & 0x80)) { // 0xxxxxxx
+               // 7bit, total 7bit
+               tmp = (uint8_t)(str[i]) & 0x7f;
+               i++;
+           } else if ((uint8_t)str[i] <= 0xdf && i + 1 < str.size()) { // 110xxxxx
+               // 5bit, total 5bit
+               tmp = (uint8_t)(str[i]) & 0x1f;
+
+               // 6bit, total 11bit
+               tmp <<= 6;
+               tmp |= (uint8_t)(str[i+1]) & 0x3f;
+               i += 2;
+           } else if ((uint8_t)str[i] <= 0xef && i + 2 < str.size()) { // 1110xxxx
+               // 4bit, total 4bit
+               tmp = (uint8_t)(str[i]) & 0x0f;
+
+               // 6bit, total 10bit
+               tmp <<= 6;
+               tmp |= (uint8_t)(str[i+1]) & 0x3f;
+
+               // 6bit, total 16bit
+               tmp <<= 6;
+               tmp |= (uint8_t)(str[i+2]) & 0x3f;
+
+               i += 3;
+           } else if ((uint8_t)str[i] <= 0xf7 && i + 3 < str.size()) { // 11110xxx
+               // 3bit, total 3bit
+               tmp = (uint8_t)(str[i]) & 0x07;
+
+               // 6bit, total 9bit
+               tmp <<= 6;
+               tmp |= (uint8_t)(str[i+1]) & 0x3f;
+
+               // 6bit, total 15bit
+               tmp <<= 6;
+               tmp |= (uint8_t)(str[i+2]) & 0x3f;
+
+               // 6bit, total 21bit
+               tmp <<= 6;
+               tmp |= (uint8_t)(str[i+3]) & 0x3f;
+
+               i += 4;
+           } else {
+               return false;
+           }
+           vec.push_back(tmp);
+       }
+       return true;
+   }
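+
+   // Illustrative usage (not part of the original sources):
+   //   vector<uint32_t> runes;
+   //   Utf8ToUnicode32(string("\xE4\xB8\xAD"), runes);  // U+4E2D -> {0x4E2D}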
+
+   template <class Uint32ContainerConIter>
+   void Unicode32ToUtf8(Uint32ContainerConIter begin, Uint32ContainerConIter end, string& res) {
+       res.clear();
+       uint32_t ui;
+       while(begin != end) {
+           ui = *begin;
+           if(ui <= 0x7f) {
+               res += char(ui);
+           } else if(ui <= 0x7ff) {
+               res += char(((ui >> 6) & 0x1f) | 0xc0);
+               res += char((ui & 0x3f) | 0x80);
+           } else if(ui <= 0xffff) {
+               res += char(((ui >> 12) & 0x0f) | 0xe0);
+               res += char(((ui >> 6) & 0x3f) | 0x80);
+               res += char((ui & 0x3f) | 0x80);
+           } else {
+               res += char(((ui >> 18) & 0x03) | 0xf0);
+               res += char(((ui >> 12) & 0x3f) | 0x80);
+               res += char(((ui >> 6) & 0x3f) | 0x80);
+               res += char((ui & 0x3f) | 0x80);
+           }
+           begin ++;
+       }
+   }
+
+   template <class Uint16ContainerConIter>
+   void UnicodeToUtf8(Uint16ContainerConIter begin, Uint16ContainerConIter end, string& res) {
+       res.clear();
+       uint16_t ui;
+       while(begin != end) {
+           ui = *begin;
+           if(ui <= 0x7f) {
+               res += char(ui);
+           } else if(ui <= 0x7ff) {
+               res += char(((ui>>6) & 0x1f) | 0xc0);
+               res += char((ui & 0x3f) | 0x80);
+           } else {
+               res += char(((ui >> 12) & 0x0f )| 0xe0);
+               res += char(((ui>>6) & 0x3f )| 0x80 );
+               res += char((ui & 0x3f) | 0x80);
+           }
+           begin ++;
+       }
+   }
+
+
+   template <class Uint16Container>
+   bool GBKTrans(const char* const str, size_t len, Uint16Container& vec) {
+       vec.clear();
+       if(!str) {
+           return true;
+       }
+       size_t i = 0;
+       while(i < len) {
+           if(0 == (str[i] & 0x80)) {
+               vec.push_back(uint16_t(str[i]));
+               i++;
+           } else {
+               if(i + 1 < len) { //&& (str[i+1] & 0x80))
+                   uint16_t tmp = (((uint16_t(str[i]) & 0x00ff ) << 8) | (uint16_t(str[i+1]) & 0x00ff));
+                   vec.push_back(tmp);
+                   i += 2;
+               } else {
+                   return false;
+               }
+           }
+       }
+       return true;
+   }
+
+   template <class Uint16Container>
+   bool GBKTrans(const string& str, Uint16Container& vec) {
+       return GBKTrans(str.c_str(), str.size(), vec);
+   }
+
+   template <class Uint16ContainerConIter>
+   void GBKTrans(Uint16ContainerConIter begin, Uint16ContainerConIter end, string& res) {
+       res.clear();
+       //pair<char, char> pa;
+       char first, second;
+       while(begin != end) {
+           //pa = uint16ToChar2(*begin);
+           first = ((*begin)>>8) & 0x00ff;
+           second = (*begin) & 0x00ff;
+           if(first & 0x80) {
+               res += first;
+               res += second;
+           } else {
+               res += second;
+           }
+           begin++;
+       }
+   }
+
+   /*
+    * format example: "%Y-%m-%d %H:%M:%S"
+    */
+   inline void GetTime(const string& format, string&  timeStr) {
+       time_t timeNow;
+       time(&timeNow);
+
+       struct tm tmNow;
+
+#if defined(_WIN32) || defined(_WIN64)
+       errno_t e = localtime_s(&tmNow, &timeNow);
+       assert(e == 0);
+#else
+       struct tm * tm_tmp = localtime_r(&timeNow, &tmNow);
+       assert(tm_tmp != nullptr);
+#endif
+
+       timeStr.resize(64);
+
+       size_t len = strftime(&timeStr[0], timeStr.size(), format.c_str(), &tmNow);
+
+       timeStr.resize(len);
+   }
+
+   inline string PathJoin(const string& path1, const string& path2) {
+       if(EndsWith(path1, "/")) {
+           return path1 + path2;
+       }
+       return path1 + "/" + path2;
+   }
+
+} // namespace limonp
+#endif
\ No newline at end of file
diff --git a/src/contribs-lib/CLucene/analysis/jieba/TextRankExtractor.hpp b/src/contribs-lib/CLucene/analysis/jieba/TextRankExtractor.hpp
new file mode 100644
index 0000000..292d0a8
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/jieba/TextRankExtractor.hpp
@@ -0,0 +1,190 @@
+#ifndef CPPJIEBA_TEXTRANK_EXTRACTOR_H
+#define CPPJIEBA_TEXTRANK_EXTRACTOR_H
+
+#include <cmath>
+#include "Jieba.hpp"
+
+namespace cppjieba {
+  using namespace limonp;
+  using namespace std;
+
+  class TextRankExtractor {
+  public:
+    typedef struct _Word {
+      string word;
+      vector<size_t> offsets;
+      double weight;
+    } Word; // struct Word
+  private:
+    typedef std::map<string,Word> WordMap;
+  
+    class WordGraph{
+    private:
+      typedef double Score;
+      typedef string Node;
+      typedef std::set<Node> NodeSet;
+
+      typedef std::map<Node,double> Edges;
+      typedef std::map<Node,Edges> Graph;
+      //typedef std::unordered_map<Node,double> Edges;
+      //typedef std::unordered_map<Node,Edges> Graph;
+
+      double d;
+      Graph graph;
+      NodeSet nodeSet;
+    public:
+      WordGraph(): d(0.85) {}
+      WordGraph(double in_d): d(in_d) {}
+
+      void addEdge(Node start,Node end,double weight){
+        nodeSet.insert(start);
+        nodeSet.insert(end);
+        graph[start][end]+=weight;
+        graph[end][start]+=weight;
+      }
+
+      void rank(WordMap &ws,size_t rankTime=10){
+        WordMap outSum;
+        Score wsdef, min_rank, max_rank;
+
+        if (graph.empty())
+          return;
+
+        wsdef = 1.0 / graph.size();
+
+        for(Graph::iterator edges=graph.begin();edges!=graph.end();++edges){
+          // edges->first: start node; edge->first: end node; edge->second: weight
+          ws[edges->first].word=edges->first;
+          ws[edges->first].weight=wsdef;
+          outSum[edges->first].weight=0;
+          for(Edges::iterator edge=edges->second.begin();edge!=edges->second.end();++edge){
+            outSum[edges->first].weight+=edge->second;
+          }
+        }
+        //sort(nodeSet.begin(),nodeSet.end()); is sorting needed here?
+        for( size_t i=0; i<rankTime; i++ ){
+          for(NodeSet::iterator node = nodeSet.begin(); node != nodeSet.end(); node++ ){
+            double s = 0;
+            for( Edges::iterator edge= graph[*node].begin(); edge != graph[*node].end(); edge++ )
+              // edge->first: end node; edge->second: weight
+              s += edge->second / outSum[edge->first].weight * ws[edge->first].weight;
+            ws[*node].weight = (1 - d) + d * s;
+          }
+        }
+
+        min_rank=max_rank=ws.begin()->second.weight;
+        for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){
+          if( i->second.weight < min_rank ){
+            min_rank = i->second.weight;
+          }
+          if( i->second.weight > max_rank ){
+            max_rank = i->second.weight;
+          }
+        }
+        for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){
+          ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0);
+        }
+      }
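+
+      // The loop above is the standard TextRank update,
+      //   WS(Vi) = (1 - d) + d * sum over neighbors Vj of
+      //            (w(Vj,Vi) / outSum(Vj)) * WS(Vj),
+      // followed by a min/max rescaling of the final weights.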
+    };
+
+  public:
+    TextRankExtractor(const string& dictPath,
+          const string& hmmFilePath,
+          const string& stopWordPath,
+          const string& userDict = "")
+      : segment_(dictPath, hmmFilePath, userDict) {
+      LoadStopWordDict(stopWordPath);
+    }
+    TextRankExtractor(const DictTrie* dictTrie,
+          const HMMModel* model,
+          const string& stopWordPath)
+      : segment_(dictTrie, model) {
+      LoadStopWordDict(stopWordPath);
+    }
+    TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
+        LoadStopWordDict(stopWordPath);
+    }
+    ~TextRankExtractor() {
+    }
+
+    void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
+      vector<Word> topWords;
+      Extract(sentence, topWords, topN);
+      for (size_t i = 0; i < topWords.size(); i++) {
+        keywords.push_back(topWords[i].word);
+      }
+    }
+
+    void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
+      vector<Word> topWords;
+      Extract(sentence, topWords, topN);
+      for (size_t i = 0; i < topWords.size(); i++) {
+        keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
+      }
+    }
+
+    void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span=5,size_t rankTime=10) const {
+      vector<string> words;
+      segment_.Cut(sentence, words);
+
+      TextRankExtractor::WordGraph graph;
+      WordMap wordmap;
+      size_t offset = 0;
+
+      for(size_t i=0; i < words.size(); i++){
+        size_t t = offset;
+        offset += words[i].size();
+        if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
+          continue;
+        }
+        for(size_t j=i+1,skip=0;j<i+span+skip && j<words.size();j++){
+          if (IsSingleWord(words[j]) || stopWords_.find(words[j]) != stopWords_.end()) {
+            skip++;
+            continue;
+          }
+          graph.addEdge(words[i],words[j],1);
+        }
+        wordmap[words[i]].offsets.push_back(t);
+      }
+      if (offset != sentence.size()) {
+        XLOG(ERROR) << "words illegal";
+        return;
+      }
+
+      graph.rank(wordmap,rankTime);
+      
+      keywords.clear();
+      keywords.reserve(wordmap.size());
+      for (WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
+        keywords.push_back(itr->second);
+      }
+      
+      topN = min(topN, keywords.size());
+      partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
+      keywords.resize(topN);
+    }
+  private:
+    void LoadStopWordDict(const string& filePath) {
+      ifstream ifs(filePath.c_str());
+      XCHECK(ifs.is_open()) << "open " << filePath << " failed";
+      string line;
+      while (getline(ifs, line)) {
+        stopWords_.insert(line);
+      }
+      assert(stopWords_.size());
+    }
+
+    static bool Compare(const Word &x,const Word &y){
+      return x.weight > y.weight;
+    }
+
+    MixSegment segment_;
+    unordered_set<string> stopWords_;
+  }; // class TextRankExtractor
+  
+  inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) {
+    return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}"; 
+  }
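+
+  // Illustrative usage (not part of the original sources), assuming the
+  // dictionary, HMM model and stop-word files exist at these paths:
+  //
+  //   TextRankExtractor extractor("dict/jieba.dict.utf8",
+  //                               "dict/hmm_model.utf8",
+  //                               "dict/stop_words.utf8");
+  //   vector<pair<string, double> > keywords;
+  //   extractor.Extract("...text...", keywords, 5);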
+} // namespace cppjieba
+
+#endif
+
+
diff --git a/src/contribs-lib/CLucene/analysis/jieba/Trie.hpp b/src/contribs-lib/CLucene/analysis/jieba/Trie.hpp
new file mode 100644
index 0000000..2ed2a82
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/jieba/Trie.hpp
@@ -0,0 +1,200 @@
+#ifndef CPPJIEBA_TRIE_HPP
+#define CPPJIEBA_TRIE_HPP
+
+#include <vector>
+#include <queue>
+#include "StdExtension.hpp"
+#include "Unicode.hpp"
+
+namespace cppjieba {
+
+using namespace std;
+
+const size_t MAX_WORD_LENGTH = 512;
+
+struct DictUnit {
+  Unicode word;
+  double weight;
+  string tag;
+}; // struct DictUnit
+
+// for debugging
+// inline ostream & operator << (ostream& os, const DictUnit& unit) {
+//   string s;
+//   s << unit.word;
+//   return os << StringFormat("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
+// }
+
+struct Dag {
+  RuneStr runestr;
+  // [offset, nexts.first]
+  limonp::LocalVector<pair<size_t, const DictUnit*> > nexts;
+  const DictUnit * pInfo;
+  double weight;
+  size_t nextPos; // TODO
+  Dag():runestr(), pInfo(NULL), weight(0.0), nextPos(0) {
+  }
+}; // struct Dag
+
+typedef Rune TrieKey;
+
+class TrieNode {
+ public :
+  TrieNode(): next(NULL), ptValue(NULL) {
+  }
+ public:
+  typedef unordered_map<TrieKey, TrieNode*> NextMap;
+  NextMap *next;
+  const DictUnit *ptValue;
+};
+
+class Trie {
+ public:
+  Trie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers)
+   : root_(new TrieNode) {
+    CreateTrie(keys, valuePointers);
+  }
+  ~Trie() {
+    DeleteNode(root_);
+  }
+
+  const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
+    if (begin == end) {
+      return NULL;
+    }
+
+    const TrieNode* ptNode = root_;
+    TrieNode::NextMap::const_iterator citer;
+    for (RuneStrArray::const_iterator it = begin; it != end; it++) {
+      if (NULL == ptNode->next) {
+        return NULL;
+      }
+      citer = ptNode->next->find(it->rune);
+      if (ptNode->next->end() == citer) {
+        return NULL;
+      }
+      ptNode = citer->second;
+    }
+    return ptNode->ptValue;
+  }
+
+  void Find(RuneStrArray::const_iterator begin, 
+        RuneStrArray::const_iterator end, 
+        vector<struct Dag>&res, 
+        size_t max_word_len = MAX_WORD_LENGTH) const {
+    assert(root_ != NULL);
+    res.resize(end - begin);
+
+    const TrieNode *ptNode = NULL;
+    TrieNode::NextMap::const_iterator citer;
+    for (size_t i = 0; i < size_t(end - begin); i++) {
+      res[i].runestr = *(begin + i);
+
+      if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(res[i].runestr.rune))) {
+        ptNode = citer->second;
+      } else {
+        ptNode = NULL;
+      }
+      if (ptNode != NULL) {
+        res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
+      } else {
+        res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, static_cast<const DictUnit*>(NULL)));
+      }
+
+      for (size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len; j++) {
+        if (ptNode == NULL || ptNode->next == NULL) {
+          break;
+        }
+        citer = ptNode->next->find((begin + j)->rune);
+        if (ptNode->next->end() == citer) {
+          break;
+        }
+        ptNode = citer->second;
+        if (NULL != ptNode->ptValue) {
+          res[i].nexts.push_back(pair<size_t, const DictUnit*>(j, ptNode->ptValue));
+        }
+      }
+    }
+  }
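+
+  // The overload above builds, for each start position i, the list of end
+  // positions of every dictionary word beginning at i (res[i].nexts); this is
+  // the DAG that the segmenters walk. E.g. with "AB" and "ABC" both in the
+  // dictionary, res[0].nexts holds entries ending at positions 1 and 2 in
+  // addition to the always-present single-rune entry.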
+
+  void InsertNode(const Unicode& key, const DictUnit* ptValue) {
+    if (key.begin() == key.end()) {
+      return;
+    }
+
+    TrieNode::NextMap::const_iterator kmIter;
+    TrieNode *ptNode = root_;
+    for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) {
+      if (NULL == ptNode->next) {
+        ptNode->next = new TrieNode::NextMap;
+      }
+      kmIter = ptNode->next->find(*citer);
+      if (ptNode->next->end() == kmIter) {
+        TrieNode *nextNode = new TrieNode;
+
+        ptNode->next->insert(make_pair(*citer, nextNode));
+        ptNode = nextNode;
+      } else {
+        ptNode = kmIter->second;
+      }
+    }
+    assert(ptNode != NULL);
+    ptNode->ptValue = ptValue;
+  }
+  void DeleteNode(const Unicode& key, const DictUnit* ptValue) {
+      if (key.begin() == key.end()) {
+        return;
+      }
+      // iterator into a node's child map
+      TrieNode::NextMap::const_iterator kmIter;
+      // walk down from the root
+      TrieNode *ptNode = root_;
+      for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) {
+        // this node has no children
+        if (NULL == ptNode->next) {
+          return;
+        }
+        kmIter = ptNode->next->find(*citer);
+        // the rune is not present; nothing to delete
+        if (ptNode->next->end() == kmIter) {
+          break;
+        }
+        // save the child, detach it from the map, then free its whole subtree
+        // (erase() invalidates kmIter, so take the pointer first)
+        TrieNode *child = kmIter->second;
+        ptNode->next->erase(*citer);
+        DeleteNode(child);
+        break;
+      }
+      return;
+ }
+ private:
+  void CreateTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) {
+    if (valuePointers.empty() || keys.empty()) {
+      return;
+    }
+    assert(keys.size() == valuePointers.size());
+
+    for (size_t i = 0; i < keys.size(); i++) {
+      InsertNode(keys[i], valuePointers[i]);
+    }
+  }
+
+  void DeleteNode(TrieNode* node) {
+    if (NULL == node) {
+      return;
+    }
+    if (NULL != node->next) {
+      for (TrieNode::NextMap::iterator it = node->next->begin(); it != node->next->end(); ++it) {
+        DeleteNode(it->second);
+      }
+      delete node->next;
+    }
+    delete node;
+  }
+
+  TrieNode* root_;
+}; // class Trie
+} // namespace cppjieba
+
+#endif // CPPJIEBA_TRIE_HPP
diff --git a/src/contribs-lib/CLucene/analysis/jieba/Unicode.hpp b/src/contribs-lib/CLucene/analysis/jieba/Unicode.hpp
new file mode 100644
index 0000000..28dbd23
--- /dev/null
+++ b/src/contribs-lib/CLucene/analysis/jieba/Unicode.hpp
@@ -0,0 +1,227 @@
+#ifndef CPPJIEBA_UNICODE_H
+#define CPPJIEBA_UNICODE_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string>
+#include <vector>
+#include <ostream>
+#include "LocalVector.hpp"
+
+namespace cppjieba {
+
+using std::string;
+using std::vector;
+
+typedef uint32_t Rune;
+
+struct Word {
+  string word;
+  uint32_t offset;
+  uint32_t unicode_offset;
+  uint32_t unicode_length;
+  Word(const string& w, uint32_t o)
+   : word(w), offset(o) {
+  }
+  Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
+          : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) {
+  }
+}; // struct Word
+
+inline std::ostream& operator << (std::ostream& os, const Word& w) {
+  return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}";
+}
+
+struct RuneStr {
+  Rune rune;
+  uint32_t offset;
+  uint32_t len;
+  uint32_t unicode_offset;
+  uint32_t unicode_length;
+  RuneStr(): rune(0), offset(0), len(0), unicode_offset(0), unicode_length(0) {
+  }
+  RuneStr(Rune r, uint32_t o, uint32_t l)
+    : rune(r), offset(o), len(l), unicode_offset(0), unicode_length(0) {
+  }
+  RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length)
+          : rune(r), offset(o), len(l), unicode_offset(unicode_offset), unicode_length(unicode_length) {
+  }
+}; // struct RuneStr
+
+inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
+  return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset << ", \"len\": " << r.len << "}";
+}
+
+typedef limonp::LocalVector<Rune> Unicode;
+typedef limonp::LocalVector<struct RuneStr> RuneStrArray;
+
+// [left, right]
+struct WordRange {
+  RuneStrArray::const_iterator left;
+  RuneStrArray::const_iterator right;
+  WordRange(RuneStrArray::const_iterator l, RuneStrArray::const_iterator r)
+   : left(l), right(r) {
+  }
+  size_t Length() const {
+    return right - left + 1;
+  }
+  bool IsAllAscii() const {
+    for (RuneStrArray::const_iterator iter = left; iter <= right; ++iter) {
+      if (iter->rune >= 0x80) {
+        return false;
+      }
+    }
+    return true;
+  }
+}; // struct WordRange
+
+struct RuneStrLite {
+  uint32_t rune;
+  uint32_t len;
+  RuneStrLite(): rune(0), len(0) {
+  }
+  RuneStrLite(uint32_t r, uint32_t l): rune(r), len(l) {
+  }
+}; // struct RuneStrLite
+
+inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
+  RuneStrLite rp(0, 0);
+  if (str == NULL || len == 0) {
+    return rp;
+  }
+  if (!(str[0] & 0x80)) { // 0xxxxxxx
+    // 7bit, total 7bit
+    rp.rune = (uint8_t)(str[0]) & 0x7f;
+    rp.len = 1;
+  } else if ((uint8_t)str[0] <= 0xdf && 1 < len) {
+    // 110xxxxx
+    // 5bit, total 5bit
+    rp.rune = (uint8_t)(str[0]) & 0x1f;
+
+    // 6bit, total 11bit
+    rp.rune <<= 6;
+    rp.rune |= (uint8_t)(str[1]) & 0x3f;
+    rp.len = 2;
+  } else if ((uint8_t)str[0] <= 0xef && 2 < len) { // 1110xxxx
+    // 4bit, total 4bit
+    rp.rune = (uint8_t)(str[0]) & 0x0f;
+
+    // 6bit, total 10bit
+    rp.rune <<= 6;
+    rp.rune |= (uint8_t)(str[1]) & 0x3f;
+
+    // 6bit, total 16bit
+    rp.rune <<= 6;
+    rp.rune |= (uint8_t)(str[2]) & 0x3f;
+
+    rp.len = 3;
+  } else if ((uint8_t)str[0] <= 0xf7 && 3 < len) { // 11110xxx
+    // 3bit, total 3bit
+    rp.rune = (uint8_t)(str[0]) & 0x07;
+
+    // 6bit, total 9bit
+    rp.rune <<= 6;
+    rp.rune |= (uint8_t)(str[1]) & 0x3f;
+
+    // 6bit, total 15bit
+    rp.rune <<= 6;
+    rp.rune |= (uint8_t)(str[2]) & 0x3f;
+
+    // 6bit, total 21bit
+    rp.rune <<= 6;
+    rp.rune |= (uint8_t)(str[3]) & 0x3f;
+
+    rp.len = 4;
+  } else {
+    rp.rune = 0;
+    rp.len = 0;
+  }
+  return rp;
+}
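+
+// Illustrative usage (not part of the original sources):
+//   RuneStrLite r = DecodeRuneInString("\xE4\xB8\xAD", 3);
+//   // r.rune == 0x4E2D, r.len == 3; a malformed lead byte yields len == 0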
+
+inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
+  runes.clear();
+  runes.reserve(len / 2);
+  for (uint32_t i = 0, j = 0; i < len;) {
+    RuneStrLite rp = DecodeRuneInString(s + i, len - i);
+    if (rp.len == 0) {
+      runes.clear();
+      return false;
+    }
+    RuneStr x(rp.rune, i, rp.len, j, 1);
+    runes.push_back(x);
+    i += rp.len;
+    ++j;
+  }
+  return true;
+}
+
+inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
+  return DecodeRunesInString(s.c_str(), s.size(), runes);
+}
+
+inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) {
+  unicode.clear();
+  RuneStrArray runes;
+  if (!DecodeRunesInString(s, len, runes)) {
+    return false;
+  }
+  unicode.reserve(runes.size());
+  for (size_t i = 0; i < runes.size(); i++) {
+    unicode.push_back(runes[i].rune);
+  }
+  return true;
+}
+
+inline bool IsSingleWord(const string& str) {
+  RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size());
+  return rp.len == str.size();
+}
+
+inline bool DecodeRunesInString(const string& s, Unicode& unicode) {
+  return DecodeRunesInString(s.c_str(), s.size(), unicode);
+}
+
+inline Unicode DecodeRunesInString(const string& s) {
+  Unicode result;
+  DecodeRunesInString(s, result);
+  return result;
+}
+
+
+// [left, right]
+inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
+  assert(right->offset >= left->offset);
+  uint32_t len = right->offset - left->offset + right->len;
+  uint32_t unicode_length = right->unicode_offset - left->unicode_offset + right->unicode_length;
+  return Word(s.substr(left->offset, len), left->offset, left->unicode_offset, unicode_length);
+}
+
+inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
+  assert(right->offset >= left->offset);
+  uint32_t len = right->offset - left->offset + right->len;
+  return s.substr(left->offset, len);
+}
+
+inline void GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs, vector<Word>& words) {
+  for (size_t i = 0; i < wrs.size(); i++) {
+    words.push_back(GetWordFromRunes(s, wrs[i].left, wrs[i].right));
+  }
+}
+
+inline vector<Word> GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs) {
+  vector<Word> result;
+  GetWordsFromWordRanges(s, wrs, result);
+  return result;
+}
+
+inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs) {
+  strs.resize(words.size());
+  for (size_t i = 0; i < words.size(); ++i) {
+    strs[i] = words[i].word;
+  }
+}
+
+} // namespace cppjieba
+
+#endif // CPPJIEBA_UNICODE_H
diff --git a/src/contribs-lib/CLucene/highlighter/Encoder.cpp b/src/contribs-lib/CLucene/highlighter/Encoder.cpp
new file mode 100644
index 0000000..9541c35
--- /dev/null
+++ b/src/contribs-lib/CLucene/highlighter/Encoder.cpp
@@ -0,0 +1,19 @@
+/**
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CLucene/_ApiHeader.h"
+#include "Encoder.h"
+
diff --git a/src/contribs-lib/CLucene/highlighter/Encoder.h b/src/contribs-lib/CLucene/highlighter/Encoder.h
new file mode 100644
index 0000000..b91ab29
--- /dev/null
+++ b/src/contribs-lib/CLucene/highlighter/Encoder.h
@@ -0,0 +1,56 @@
+/**
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef _lucene_search_highlight_encoder_
+#define _lucene_search_highlight_encoder_
+
+
+CL_NS_DEF2(search,highlight)
+
+/**
+ * Encodes original text. The Encoder works with the Formatter to generate the output.
+ *
+ */
+class CLUCENE_CONTRIBS_EXPORT Encoder:LUCENE_BASE
+{
+public:
+	/** Virtual destructor */
+	virtual ~Encoder(){
+	}
+
+	/**
+	 * @param originalText The section of text being output
+	 */
+	virtual TCHAR* encodeText(TCHAR* originalText) = 0;
+};
+
+/**
+ * Simple {@link Encoder} implementation that does not modify the output
+ * @author Nicko Cadell
+ *
+ */
+class DefaultEncoder: public Encoder
+{
+public:
+	TCHAR* encodeText(TCHAR* originalText)
+	{
+		return STRDUP_TtoT(originalText);
+	}
+};
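+
+/**
+ * Note: encodeText returns a newly allocated string (see STRDUP_TtoT above),
+ * so callers free the result; the Highlighter does this via _CLDELETE_CARRAY.
+ * A custom Encoder would typically escape HTML entities here before the
+ * Formatter wraps matched terms.
+ */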
+
+
+CL_NS_END2
+
+#endif
diff --git a/src/contribs-lib/CLucene/highlighter/Formatter.cpp b/src/contribs-lib/CLucene/highlighter/Formatter.cpp
new file mode 100644
index 0000000..88b8a0e
--- /dev/null
+++ b/src/contribs-lib/CLucene/highlighter/Formatter.cpp
@@ -0,0 +1,18 @@
+/**
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CLucene/_ApiHeader.h"
+#include "Formatter.h"
diff --git a/src/contribs-lib/CLucene/highlighter/Formatter.h b/src/contribs-lib/CLucene/highlighter/Formatter.h
new file mode 100644
index 0000000..39c3315
--- /dev/null
+++ b/src/contribs-lib/CLucene/highlighter/Formatter.h
@@ -0,0 +1,49 @@
+/**
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _lucene_search_highlight_formatter_
+#define _lucene_search_highlight_formatter_
+
+CL_NS_DEF2(search,highlight)
+class TokenGroup;
+
+/**
+ * Processes terms found in the original text, typically by applying some form 
+ * of mark-up to highlight terms in HTML search results pages.
+ *
+ */
+class CLUCENE_CONTRIBS_EXPORT Formatter:LUCENE_BASE
+{
+public:
+
+	/** Virtual destructor */
+	virtual ~Formatter(){
+	}
+
+  /**
+   * @param originalTermText The section of text being considered for markup
+   * @param tokenGroup contains one or several overlapping Tokens along with
+   * their scores and positions.
+   */
+  virtual TCHAR* highlightTerm(const TCHAR* originalTermText, const TokenGroup* tokenGroup) = 0;
+};
+
+CL_NS_END2
+
+#endif
+
+
+
diff --git a/src/contribs-lib/CLucene/highlighter/Fragmenter.cpp b/src/contribs-lib/CLucene/highlighter/Fragmenter.cpp
new file mode 100644
index 0000000..5d174b4
--- /dev/null
+++ b/src/contribs-lib/CLucene/highlighter/Fragmenter.cpp
@@ -0,0 +1,19 @@
+/**
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CLucene/_ApiHeader.h"
+#include "Fragmenter.h"
+
diff --git a/src/contribs-lib/CLucene/highlighter/Fragmenter.h b/src/contribs-lib/CLucene/highlighter/Fragmenter.h
new file mode 100644
index 0000000..5a1b8c4
--- /dev/null
+++ b/src/contribs-lib/CLucene/highlighter/Fragmenter.h
@@ -0,0 +1,52 @@
+/**
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _lucene_search_highlight_fragmenter_
+#define _lucene_search_highlight_fragmenter_
+
+
+CL_CLASS_DEF(analysis, Token)
+
+CL_NS_DEF2(search,highlight)
+
+/**
+ * Implements the policy for breaking text into multiple fragments for consideration
+ * by the {@link Highlighter} class. A sophisticated implementation may do this on the basis
+ * of detecting end of sentences in the text. 
+ */
+class CLUCENE_CONTRIBS_EXPORT Fragmenter:LUCENE_BASE
+{
+public:
+	/** Virtual destructor */
+	virtual ~Fragmenter(){
+	}
+
+	/**
+	 * Initializes the Fragmenter
+	 * @param originalText
+	 */
+	virtual void start(const TCHAR* originalText) = 0;
+
+	/**
+	 * Test to see if this token from the stream should be held in a new TextFragment
+	 * @param nextToken
+	 */
+	virtual bool isNewFragment(const CL_NS(analysis)::Token * nextToken) = 0;
+};
+
+CL_NS_END2
+
+#endif
diff --git a/src/contribs-lib/CLucene/highlighter/HighlightScorer.h b/src/contribs-lib/CLucene/highlighter/HighlightScorer.h
new file mode 100644
index 0000000..ed73998
--- /dev/null
+++ b/src/contribs-lib/CLucene/highlighter/HighlightScorer.h
@@ -0,0 +1,63 @@
+/**
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _lucene_search_highlight_highlighterscorer_
+#define _lucene_search_highlight_highlighterscorer_
+
+
+CL_CLASS_DEF(analysis, Token)
+//#include "TextFragment.h"
+
+CL_NS_DEF2(search,highlight)
+class TextFragment;
+
+/**
+ * Adds to the score for a fragment based on its tokens
+ */
+class CLUCENE_CONTRIBS_EXPORT HighlightScorer:LUCENE_BASE
+{
+public:
+	virtual ~HighlightScorer(){
+	}
+
+	/**
+	 * called when a new fragment is started for consideration
+	 * @param newFragment
+	 */
+	virtual void startFragment(TextFragment* newFragment) = 0;
+
+	/**
+	 * Called for each token in the current fragment
+	 * @param token The token to be scored
+	 * @return a score which is passed to the Highlighter class to influence the mark-up of the text
+	 * (this return value is NOT used to score the fragment)
+	 */
+	virtual float_t getTokenScore(CL_NS(analysis)::Token* token) = 0;
+	
+
+	/**
+	 * Called when the highlighter has no more tokens for the current fragment - the scorer returns
+	 * the weighting it has derived for the most recent fragment, typically based on the tokens
+	 * passed to getTokenScore(). 
+	 *
+	 */	
+	virtual float_t getFragmentScore() = 0;
+};
+
+CL_NS_END2
+#endif
+
+
diff --git a/src/contribs-lib/CLucene/highlighter/Highlighter.cpp b/src/contribs-lib/CLucene/highlighter/Highlighter.cpp
new file mode 100644
index 0000000..5a8401c
--- /dev/null
+++ b/src/contribs-lib/CLucene/highlighter/Highlighter.cpp
@@ -0,0 +1,525 @@
+/**
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CLucene/_ApiHeader.h"
+#include "Highlighter.h"
+#include "TokenGroup.h"
+#include "Encoder.h"
+#include "Scorer.h"
+#include "Formatter.h"
+#include "HighlightScorer.h"
+#include "Fragmenter.h"
+#include "TextFragment.h"
+#include "SimpleFragmenter.h"
+#include "SimpleHTMLFormatter.h"
+#include "CLucene/analysis/AnalysisHeader.h"
+#include "CLucene/util/PriorityQueue.h"
+#include "CLucene/util/StringBuffer.h"
+#include "CLucene/util/CLStreams.h"
+
+CL_NS_DEF2(search,highlight)
+CL_NS_USE(analysis)
+CL_NS_USE(util)
+
+	class FragmentQueue : public CL_NS(util)::PriorityQueue<TextFragment*, CL_NS(util)::Deletor::Object<TextFragment> >
+	{
+	public:
+		FragmentQueue(int32_t size)
+		{
+			initialize(size, true);
+		}
+
+	protected:
+		bool lessThan(TextFragment * fragA, TextFragment * fragB)
+		{
+			if (fragA->getScore() == fragB->getScore())
+				return fragA->getFragNum() > fragB->getFragNum();
+			else
+				return fragA->getScore() < fragB->getScore();
+		}
+	};
+
+
+	Highlighter::Highlighter(HighlightScorer * fragmentScorer):
+		delete_textFragmenter(true),
+		delete_fragmentScorer(false),
+		delete_formatter(true),
+		delete_encoder(true)
+	{
+		maxDocBytesToAnalyze = DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
+		
+		_textFragmenter = _CLNEW SimpleFragmenter();
+		_fragmentScorer = fragmentScorer;
+		_formatter = _CLNEW SimpleHTMLFormatter();
+		_encoder = _CLNEW DefaultEncoder();
+	}
+
+	Highlighter::Highlighter(Formatter * formatter, HighlightScorer * fragmentScorer):
+		delete_textFragmenter(true),
+		delete_fragmentScorer(false),
+		delete_formatter(false),
+		delete_encoder(true)
+	{
+		maxDocBytesToAnalyze = DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
+		
+		_textFragmenter = _CLNEW SimpleFragmenter();
+		_fragmentScorer = fragmentScorer;
+		_formatter = formatter;
+		_encoder = _CLNEW DefaultEncoder();
+	}
+
+	Highlighter::Highlighter(Formatter * formatter, Encoder* encoder, HighlightScorer * fragmentScorer):
+		delete_textFragmenter(true),
+		delete_fragmentScorer(false),
+		delete_formatter(false),
+		delete_encoder(false)
+	{
+		maxDocBytesToAnalyze = DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;
+		_textFragmenter = _CLNEW SimpleFragmenter();
+		_fragmentScorer = fragmentScorer;
+		_formatter = formatter;
+		_encoder = encoder;
+	}
+
+	Highlighter::~Highlighter()
+	{
+		if ( delete_textFragmenter )
+			_CLDELETE ( _textFragmenter );
+
+		if ( delete_fragmentScorer )
+			_CLDELETE(_fragmentScorer);
+
+		if( delete_formatter )
+			_CLDELETE(_formatter);
+
+		if ( delete_encoder )
+			_CLDELETE(_encoder);
+	}
+
+	TCHAR* Highlighter::getBestFragment(TokenStream * tokenStream, const TCHAR* text)
+	{
+		TCHAR** results = getBestFragments(tokenStream,text, 1);
+		TCHAR* result = 0;
+
+		if (results[0] != NULL )
+			result = stringDuplicate(results[0]);
+
+		_CLDELETE_CARRAY_ALL(results);
+
+		return result;
+	}
+
+	/**
+  	* Highlights chosen terms in a text, extracting the most relevant section.
+  	* This is a convenience method that calls
+  	* {@link #getBestFragment(TokenStream, const TCHAR*)}
+  	*
+  	* @param analyzer   the analyzer that will be used to split <code>text</code>
+  	* into chunks
+  	* @param text text to highlight terms in
+  	* @param fieldName Name of field used to influence analyzer's tokenization policy
+  	*
+  	* @return highlighted text fragment or NULL if no terms found
+  	*/
+  	TCHAR* Highlighter::getBestFragment(Analyzer* analyzer, const TCHAR* fieldName, const TCHAR* text)
+  	{
+  	    TokenStream* tokenStream = analyzer->tokenStream(fieldName, _CLNEW StringReader(text));
+  	    return getBestFragment(tokenStream, text);
+  	}
+
+	TCHAR** Highlighter::getBestFragments(
+		TokenStream * tokenStream,	
+		const TCHAR* text,
+		int32_t maxNumFragments)
+	{
+		maxNumFragments = cl_max((int32_t)1, maxNumFragments); //sanity check
+		
+		StringBuffer buffer;
+		TextFragment** frags = getBestTextFragments(&buffer,tokenStream,text, true,maxNumFragments);
+
+		//Get text
+		std::vector<TCHAR*> fragTexts;
+		for (uint32_t i=0; frags[i]!=NULL; i++)
+		{
+			TextFragment* f = frags[i];
+			if ((f != NULL) && (f->getScore() > 0))
+			{
+				 fragTexts.push_back(f->toString(&buffer));
+			}
+			_CLDELETE(f);
+		}
+
+		_CLDELETE_ARRAY(frags);
+
+        size_t l = fragTexts.size();
+		TCHAR** ret = _CL_NEWARRAY(TCHAR*,l+1);
+		for ( size_t j=0;j<l;j++ )
+		    ret[j] = fragTexts[j];
+		ret[l] = NULL;
+
+		return ret;
+	}
+
+	TCHAR* Highlighter::getBestFragments(
+		TokenStream * tokenStream,	
+		const TCHAR* text,
+		int32_t maxNumFragments,
+		const TCHAR* separator)
+	{
+		TCHAR** sections = getBestFragments(tokenStream,text, maxNumFragments);
+		StringBuffer result;
+
+		for (int32_t i = 0; sections[i]!=NULL; i++)
+		{
+			if (i > 0)
+			{
+				result.append(separator);
+			}
+			result.append(sections[i]);
+		}
+
+		_CLDELETE_CARRAY_ALL(sections);
+		return result.toString();
+	}
+
+	TextFragment** Highlighter::getBestTextFragments(
+		StringBuffer* writeTo,
+		TokenStream * tokenStream,	
+		const TCHAR* text,
+		bool mergeContiguousFragments,
+		int32_t maxNumFragments)
+	{
+		CLArrayList<TextFragment*> docFrags(false);
+		TextFragment* currentFrag = _CLNEW TextFragment(writeTo->length(), docFrags.size());
+		_fragmentScorer->startFragment(currentFrag);
+		docFrags.push_back(currentFrag);
+
+		FragmentQueue fragQueue(maxNumFragments);
+
+		try
+		{
+			int32_t startOffset;
+			int32_t endOffset;
+			int32_t lastEndOffset = 0;
+			_textFragmenter->start(text);
+			TCHAR substringBuffer[LUCENE_MAX_WORD_LEN];
+
+			TokenGroup* tokenGroup=_CLNEW TokenGroup();
+
+			TCHAR buffer[LUCENE_MAX_FIELD_LEN+1];
+			Token token;
+			while ( tokenStream->next(&token) )
+			{
+				if((tokenGroup->getNumTokens()>0)&&(tokenGroup->isDistinct(&token))){
+					//the current token is distinct from previous tokens -
+					// markup the cached token group info
+					 startOffset = tokenGroup->getStartOffset();
+					 endOffset = tokenGroup->getEndOffset();
+
+					 _tcsncpy(substringBuffer,text+startOffset,endOffset-startOffset);
+					 substringBuffer[endOffset-startOffset]=_T('\0');
+
+					 TCHAR* encoded = _encoder->encodeText(substringBuffer);
+					 TCHAR* markedUpText=_formatter->highlightTerm(encoded, tokenGroup);
+					 _CLDELETE_CARRAY(encoded);
+
+					 //store any whitespace etc from between this and last group
+					 if (startOffset > lastEndOffset){
+						 int len = startOffset-lastEndOffset;
+						 if ( len > LUCENE_MAX_FIELD_LEN )
+							 len = LUCENE_MAX_FIELD_LEN;
+						 _tcsncpy(buffer,text+lastEndOffset,len);
+						 buffer[len]=_T('\0');
+
+						 TCHAR* encoded = _encoder->encodeText(buffer);
+						 writeTo->append(encoded);
+						 _CLDELETE_CARRAY(encoded);
+					 }
+					 writeTo->append(markedUpText);
+					 lastEndOffset=endOffset;
+					 tokenGroup->clear();
+					 _CLDELETE_CARRAY(markedUpText);
+
+					//check if current token marks the start of a new fragment
+					if (_textFragmenter->isNewFragment(&token))
+					{
+						currentFrag->setScore(_fragmentScorer->getFragmentScore());
+						//record stats for a new fragment
+						currentFrag->setTextEndPos( writeTo->length() );
+						currentFrag =_CLNEW TextFragment(writeTo->length(), docFrags.size());
+						_fragmentScorer->startFragment(currentFrag);
+						docFrags.push_back(currentFrag);
+					}
+				}
+
+				// does query contain current token?
+				float_t score=_fragmentScorer->getTokenScore(&token);			
+				//TCHAR* highlightedTerm = _formatter->highlightTerm(&substringBuffer, token->termText(), score, startOffset);
+				//newText->append(highlightedTerm);
+				//_CLDELETE_CARRAY(highlightedTerm);
+				//_CLDELETE(token);
+
+				tokenGroup->addToken(&token, score); // reuse the score computed above
+
+				if(lastEndOffset>maxDocBytesToAnalyze)
+				{
+					break;
+				}
+			}
+			currentFrag->setScore(_fragmentScorer->getFragmentScore());
+
+			if(tokenGroup->getNumTokens()>0)
+  	        {
+  	            //flush the accumulated text (same code as in above loop)
+  	            startOffset = tokenGroup->getStartOffset();
+  	            endOffset = tokenGroup->getEndOffset();
+
+				int subLen = endOffset-startOffset;
+				if ( subLen >= LUCENE_MAX_WORD_LEN ) // guard the fixed-size buffer
+					subLen = LUCENE_MAX_WORD_LEN-1;
+				_tcsncpy(substringBuffer,text+startOffset,subLen);
+				substringBuffer[subLen]=_T('\0');
+
+				TCHAR* encoded = _encoder->encodeText(substringBuffer);
+				TCHAR* markedUpText=_formatter->highlightTerm(encoded, tokenGroup);
+				_CLDELETE_CARRAY(encoded);
+
+  	            //store any whitespace etc from between this and last group
+				if (startOffset > lastEndOffset){
+					int len = startOffset-lastEndOffset;
+					if ( len > LUCENE_MAX_FIELD_LEN )
+						len = LUCENE_MAX_FIELD_LEN;
+					_tcsncpy(buffer,text+lastEndOffset,len);
+					buffer[len]=_T('\0');
+
+					TCHAR* encoded = _encoder->encodeText(buffer);
+  					writeTo->append(encoded);
+					_CLDELETE_CARRAY(encoded);
+				}
+  	            writeTo->append(markedUpText);
+  	            lastEndOffset=endOffset;
+
+				_CLDELETE_CARRAY(markedUpText);
+  	        }
+
+			// append text after end of last token
+			//if (lastEndOffset < (int32_t)_tcslen(text))
+			//newText->append(text+lastEndOffset);
+
+			currentFrag->setTextEndPos(writeTo->length());
+
+			//sort the most relevant sections of the text
+			while (docFrags.size() > 0) {
+			//for (TextFragmentList::iterator i = docFrags.begin(); i != docFrags.end(); i++)
+			//{
+				currentFrag = (TextFragment*) docFrags[0];
+				docFrags.remove(0);
+
+				//If you are running with a version of Lucene before 11th Sept 03
+				// you do not have PriorityQueue.insert() - so uncomment the code below					
+
+				/*if (currentFrag->getScore() >= minScore)
+				{
+					fragQueue.put(currentFrag);
+					if (fragQueue.size() > maxNumFragments)
+					{ // if hit queue overfull
+						_CLLDELETE(fragQueue.pop()); // remove lowest in hit queue
+						minScore = ((TextFragment *) fragQueue.top())->getScore(); // reset minScore
+					}
+
+
+				} else {
+					_CLDELETE(currentFrag);
+				}*/
+
+				//The above code caused a problem as a result of Christoph Goller's 11th Sept 03
+				//fix to PriorityQueue. The correct method to use here is the new "insert" method
+				// USE ABOVE CODE IF THIS DOES NOT COMPILE!
+				if ( !fragQueue.insert(currentFrag) )
+					_CLDELETE(currentFrag);
+
+				//todo: check this
+			}
+
+			//return the most relevant fragments
+			int32_t fragsLen = fragQueue.size();
+			TextFragment** frags = _CL_NEWARRAY(TextFragment*,fragsLen+1);
+			for ( int32_t i=0;i<fragsLen;i++ )
+				frags[i] = fragQueue.pop();
+			frags[fragsLen]=NULL;
+
+			//merge any contiguous fragments to improve readability
+  	        if(mergeContiguousFragments)
+  	        {
+  	            _mergeContiguousFragments(frags,fragsLen);
+  	            CLArrayList<TextFragment*> fragTexts;
+  	            for (int32_t i = 0; i < fragsLen; i++)
+  	            {
+					TextFragment* tf = frags[i];
+  	                if ((tf != NULL) && (tf->getScore() > 0))
+  						fragTexts.push_back(tf);
+  	                else
+						_CLDELETE(tf);
+  	            }
+				_CLDELETE_ARRAY(frags);
+				frags = _CL_NEWARRAY(TextFragment*,fragTexts.size()+1);
+				fragTexts.toArray_nullTerminated(frags);
+  	        }
+
+			_CLDELETE(tokenGroup);
+			//_CLDELETE(newText);
+			return frags;
+
+		}
+		_CLFINALLY(
+			if (tokenStream)
+			{
+				try
+				{
+					tokenStream->close();
+				}
+				catch (...)
+				{
+				}
+			}
+		)
+	}
+
+
+	void Highlighter::_mergeContiguousFragments(TextFragment** frag, int32_t fragsLen)
+	{
+		bool mergingStillBeingDone;
+		if ( frag[0] != NULL )
+			do
+			{
+				mergingStillBeingDone = false; //initialise loop control flag
+				//for each fragment, scan other frags looking for contiguous blocks
+				for (int32_t i=0; i<fragsLen; i++)
+				{
+					if (frag[i] == NULL)
+					{
+						continue;
+					}
+					//merge any contiguous blocks 
+					for (int32_t x=0; x<fragsLen; x++)
+					{
+					   if ( x==i )
+					      continue; //bug 1072183. don't try and merge with self
+
+						if (frag[x] == NULL)
+							continue;
+						if (frag[i] == NULL)
+							break;
+
+						TextFragment * frag1 = NULL;
+						TextFragment * frag2 = NULL;
+						int32_t frag1Num = 0;
+						int32_t frag2Num = 0;
+						int32_t bestScoringFragNum;
+						int32_t worstScoringFragNum;
+						//if blocks are contiguous....
+						if (frag[i]->follows(frag[x]))
+						{
+							frag1 = frag[x];
+							frag1Num = x;
+							frag2 = frag[i];
+							frag2Num = i;
+						}
+						else if (frag[x]->follows(frag[i]))
+						{
+							frag1 = frag[i];
+							frag1Num = i;
+							frag2 = frag[x];
+							frag2Num = x;
+						}
+						//merging required..
+						if (frag1 != NULL)
+						{
+							if (frag1->getScore() > frag2->getScore())
+							{
+								bestScoringFragNum = frag1Num;
+								worstScoringFragNum = frag2Num;
+							}
+							else
+							{
+								bestScoringFragNum = frag2Num;
+								worstScoringFragNum = frag1Num;
+							}
+							frag1->merge(frag2);
+							frag[worstScoringFragNum]= NULL;
+							mergingStillBeingDone = true;
+							frag[bestScoringFragNum]=frag1;
+							_CLDELETE(frag2);
+						}
+					}
+				}
+			}
+			while (mergingStillBeingDone);
+	}
+
+	int32_t Highlighter::getMaxDocBytesToAnalyze()
+	{
+		return maxDocBytesToAnalyze;
+	}
+
+	void Highlighter::setMaxDocBytesToAnalyze(int32_t byteCount)
+	{
+		maxDocBytesToAnalyze = byteCount;
+	}
+
+	Fragmenter * Highlighter::getTextFragmenter()
+	{
+		return _textFragmenter;
+	}
+
+	void Highlighter::setTextFragmenter(Fragmenter * fragmenter)
+	{
+		if ( delete_textFragmenter ){
+			_CLDELETE(_textFragmenter);
+			delete_textFragmenter = false;
+		}
+		_textFragmenter = fragmenter;
+	}
+
+	HighlightScorer * Highlighter::getFragmentScorer()
+	{
+		return _fragmentScorer;
+	}
+
+
+	void Highlighter::setFragmentScorer(HighlightScorer * scorer)
+	{
+		if ( delete_fragmentScorer ){
+			delete_fragmentScorer = false;
+			_CLDELETE(_fragmentScorer); // free the previously owned scorer, not the incoming argument
+		}
+		_fragmentScorer = scorer;
+	}
+
+	
+    Encoder* Highlighter::getEncoder()
+    {
+        return _encoder;
+    }
+    void Highlighter::setEncoder(Encoder* encoder)
+    {
+		if ( delete_encoder ){
+			_CLDELETE(_encoder); // free the previously owned encoder, not the incoming argument
+			delete_encoder = false;
+		}
+        this->_encoder = encoder;
+    }
+
+
+
+CL_NS_END2
diff --git a/src/contribs-lib/CLucene/highlighter/Highlighter.h b/src/contribs-lib/CLucene/highlighter/Highlighter.h
new file mode 100644
index 0000000..99f3f8e
--- /dev/null
+++ b/src/contribs-lib/CLucene/highlighter/Highlighter.h
@@ -0,0 +1,226 @@
+/**
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _lucene_search_highlight_highlighter_
+#define _lucene_search_highlight_highlighter_
+
+
+
+CL_CLASS_DEF(util, StringBuffer)
+//#include "CLucene/util/VoidList.h"
+CL_CLASS_DEF2(search,highlight,Formatter)
+CL_CLASS_DEF2(search,highlight,Encoder)
+CL_CLASS_DEF2(search,highlight,HighlightScorer)
+CL_CLASS_DEF2(search,highlight,Fragmenter)
+CL_CLASS_DEF2(search,highlight,TextFragment)
+CL_CLASS_DEF(analysis, TokenStream)
+CL_CLASS_DEF(analysis, Analyzer)
+
+//#include "HighlightScorer.h"
+//#include "SimpleFragmenter.h"
+//#include "TextFragment.h"
+
+CL_NS_DEF2(search,highlight)
+
+/**
+* Class used to mark up highlighted terms found in the best sections of a
+* text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
+* {@link Encoder} and tokenizers.
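+*
+* <p>A minimal usage sketch (the analyzer, query and field name below are
+* illustrative assumptions, not part of this API):</p>
+* <pre>
+*   Highlighter h(_CLNEW QueryScorer(query)); // h owns and frees the scorer
+*   TCHAR* frag = h.getBestFragment(&analyzer, _T("contents"), docText);
+*   if (frag != NULL){
+*       // ... use frag ...
+*       _CLDELETE_CARRAY(frag);
+*   }
+* </pre>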
+*/
+class CLUCENE_CONTRIBS_EXPORT Highlighter :LUCENE_BASE
+{
+private:
+	int32_t maxDocBytesToAnalyze;
+
+	Formatter * _formatter;
+	bool delete_formatter;
+	
+	Encoder* _encoder;
+	bool delete_encoder;
+
+	Fragmenter * _textFragmenter;
+	bool delete_textFragmenter;
+
+	HighlightScorer * _fragmentScorer;
+	bool delete_fragmentScorer;
+
+	/** Improves readability of a score-sorted list of TextFragments by merging any fragments 
+	 * that were contiguous in the original text into one larger fragment with the correct order.
+	 * This will leave a "null" in the array entry for the lesser scored fragment. 
+	 * 
+	 * @param frag An array of document fragments in descending score
+	 */
+	void _mergeContiguousFragments(TextFragment** frag, int32_t fragsLen);
+	
+public:
+	LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_MAX_DOC_BYTES_TO_ANALYZE=50*1024);
+
+	/**
+	 * Constructs a Highlighter object with the provided scorer. The HighlightScorer object is owned
	 * by the Highlighter object, and it will be freed in the destructor.
+	 */
+	Highlighter(HighlightScorer * fragmentScorer);
+
+	Highlighter(Formatter * formatter, HighlightScorer * fragmentScorer);
+
+	Highlighter(Formatter * formatter, Encoder* encoder, HighlightScorer * fragmentScorer);
+
+
+	/**
+	 * Destructor for Highlighter. It deletes the owned HighlightScorer, formatter and textFragmenter.
+	 */
+	~Highlighter();
+
+	/**
+	 * Highlights chosen terms in a text, extracting the most relevant section.
+	 * The document text is analysed in chunks to record hit statistics
+	 * across the document. After accumulating stats, the fragment with the highest score
+	 * is returned
+	 *
+	 * @param tokenStream   a stream of tokens identified in the text parameter, including offset information. 
+	 * This is typically produced by an analyzer re-parsing a document's 
+	 * text. Some work may be done on retrieving TokenStreams more efficiently
+	 * by adding support for storing original text position data in the Lucene
+	 * index but this support is not currently available (as of Lucene 1.4 rc2).  
+	 * @param text text to highlight terms in
+	 *
+	 * @return highlighted text fragment or null if no terms found
+	 */
+	TCHAR* getBestFragment(CL_NS(analysis)::TokenStream * tokenStream, const TCHAR* text);
+
+	/**
+	 * Highlights chosen terms in a text, extracting the most relevant section.
+	 * This is a convenience method that calls
+	 * {@link #getBestFragment(TokenStream, const TCHAR*)}
+	 *
+	 * @param analyzer   the analyzer that will be used to split <code>text</code>
+	 * into chunks  
+	 * @param text text to highlight terms in
+	 * @param fieldName Name of field used to influence analyzer's tokenization policy 
+	 *
+	 * @return highlighted text fragment or null if no terms found
+	 */
+	TCHAR* getBestFragment(CL_NS(analysis)::Analyzer* analyzer, const TCHAR* fieldName, const TCHAR* text);
+
+	/**
+	 * Highlights chosen terms in a text, extracting the most relevant sections.
+	 * This is a convenience method that calls
+	 * {@link #getBestFragments(TokenStream, const TCHAR*, int)}
+	 *
+	 * @param analyzer   the analyzer that will be used to split <code>text</code>
+	 * into chunks  
+	 * @param text        	text to highlight terms in
+	 * @param maxNumFragments  the maximum number of fragments.
+	 *
+	 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
+	 */
+	TCHAR** getBestFragments(
+		CL_NS(analysis)::Analyzer* analyzer,	
+		const TCHAR* text,
+		int32_t maxNumFragments);
+
+	/**
+	 * Highlights chosen terms in a text, extracting the most relevant sections.
+	 * The document text is analysed in chunks to record hit statistics
+	 * across the document. After accumulating stats, the fragments with the highest scores
+	 * are returned as an array of strings in order of score (contiguous fragments are merged into 
+	 * one in their original order to improve readability)
+	 *
+	 * @param text        	text to highlight terms in
+	 * @param maxNumFragments  the maximum number of fragments.
+	 *
+	 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
+	 */
+	 TCHAR** getBestFragments(
+		CL_NS(analysis)::TokenStream * tokenStream,	
+		const TCHAR* text,
+		int32_t maxNumFragments);
+
+	/**
+    * Low-level API to get the most relevant (formatted) sections of the document.
+    * This method has been made public to allow visibility of score information held in TextFragment objects.
+    * Thanks to Jason Calabrese for help in redefining the interface.
+    * @param writeTo buffer that receives the full marked-up text
+    * @param tokenStream
+    * @param text
+    * @param mergeContiguousFragments
+    * @param maxNumFragments
+    * @return NULL-terminated array of TextFragment*, owned by the caller
+    */
+	TextFragment** getBestTextFragments(
+		CL_NS(util)::StringBuffer* writeTo,
+		CL_NS(analysis)::TokenStream * tokenStream,	
+		const TCHAR* text,
+		bool mergeContiguousFragments,
+		int32_t maxNumFragments);
+
+	/**
+	 * Highlights terms in the text, extracting the most relevant sections
+	 * and concatenating the chosen fragments with a separator (typically "...").
+	 * The document text is analysed in chunks to record hit statistics
+	 * across the document. After accumulating stats, the fragments with the highest scores
+	 * are returned in order as "separator" delimited strings.
+	 *
+	 * @param text        text to highlight terms in
+	 * @param maxNumFragments  the maximum number of fragments.
+	 * @param separator  the separator used to intersperse the document fragments (typically "...")
+	 *
+	 * @return highlighted text
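+	 *
+	 * <p>A minimal sketch (names here are illustrative, not part of this API):</p>
+	 * <pre>
+	 *   TCHAR* snippet = highlighter.getBestFragments(tokenStream, text, 3, _T("..."));
+	 *   // the caller owns the returned string; free it with _CLDELETE_CARRAY(snippet)
+	 * </pre>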
+	 */
+	TCHAR* getBestFragments(
+		CL_NS(analysis)::TokenStream * tokenStream,	
+		const TCHAR* text,
+		int32_t maxNumFragments,
+		const TCHAR* separator);
+
+	/**
+	 * @return the maximum number of bytes to be tokenized per doc 
+	 */
+	int32_t getMaxDocBytesToAnalyze();
+
+	/**
+	 * @param byteCount the maximum number of bytes to be tokenized per doc
+	 * (This can improve performance with large documents)
+	 */
+	void setMaxDocBytesToAnalyze(int32_t byteCount);
+
+	/**
+	 * @return the Fragmenter used to split text into fragments
+	 */
+	Fragmenter * getTextFragmenter();
+
+	/**
+	 * @param fragmenter the Fragmenter to use; any previously owned fragmenter is freed
+	 */
+	void setTextFragmenter(Fragmenter * fragmenter);
+
+	/**
+	 * @return Object used to score each text fragment 
+	 */
+	HighlightScorer * getFragmentScorer();
+
+	/**
+	 * @param scorer the HighlightScorer used to score fragments; any previously owned scorer is freed
+	 */
+	void setFragmentScorer(HighlightScorer * scorer);
+	
+    Encoder* getEncoder();
+    void setEncoder(Encoder* encoder);
+};
+
+
+CL_NS_END2
+
+#endif
+
diff --git a/src/contribs-lib/CLucene/highlighter/QueryScorer.cpp b/src/contribs-lib/CLucene/highlighter/QueryScorer.cpp
new file mode 100644
index 0000000..6b8b116
--- /dev/null
+++ b/src/contribs-lib/CLucene/highlighter/QueryScorer.cpp
@@ -0,0 +1,118 @@
+/**
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CLucene/_ApiHeader.h"
+#include "QueryScorer.h"
+#include "WeightedTerm.h"
+#include "QueryTermExtractor.h"
+#include "CLucene/analysis/AnalysisHeader.h"
+
+CL_NS_DEF2(search,highlight)
+CL_NS_USE(index)
+CL_NS_USE(analysis)
+
+	QueryScorer::QueryScorer(const Query * query):
+		_termsToFind(false,true),
+		_uniqueTermsInFragment(true)
+	 {
+		 WeightedTerm** _weighted_terms = QueryTermExtractor::getTerms(query);
+		 initialize(_weighted_terms);
+		 _CLDELETE_ARRAY(_weighted_terms);
+	 }
+	 QueryScorer::~QueryScorer()
+	 {
+	 }
+
+/*	 QueryScorer(Query* query, CL_NS(index)::IndexReader* reader, const TCHAR* fieldName)
+	 {
+		 WeightedTerm** _weighted_terms = QueryTermExtractor.getIdfWeightedTerms(query, reader, fieldName);
+		 initialize(_weighted_terms);
+	 }*/
+
+
+	QueryScorer::QueryScorer(WeightedTerm** weightedTerms)
+	{
+		 initialize(weightedTerms);
+	}
+	
+	void QueryScorer::initialize(WeightedTerm** weightedTerms)
+	{
+		_currentTextFragment = NULL;
+		_totalScore = 0;
+		_maxTermWeight = 0;
+
+		// Copy external weighted terms
+		 int i=0;
+		 while ( weightedTerms[i] != NULL ){
+			const WeightedTerm* existingTerm=_termsToFind.get(weightedTerms[i]->getTerm());
+			if( (existingTerm==NULL) ||(existingTerm->getWeight()<weightedTerms[i]->getWeight()) )
+  	        {
+  				//if a term is defined more than once, always use the highest scoring weight
+				WeightedTerm* term = weightedTerms[i];
+				_termsToFind.put(term->getTerm(), term);
+
+				_maxTermWeight=cl_max(_maxTermWeight,weightedTerms[i]->getWeight());
+  	        }else
+				_CLDELETE(weightedTerms[i]);
+
+			i++;
+		 }
+	}
+
+	void QueryScorer::startFragment(TextFragment * newFragment)
+	{
+		_uniqueTermsInFragment.clear();
+		_currentTextFragment=newFragment;
+		_totalScore=0;
+		
+	}
+	
+	float_t QueryScorer::getTokenScore(Token * token)
+	{
+		const TCHAR* termText=token->termBuffer<TCHAR>();
+		
+		const WeightedTerm* queryTerm = _termsToFind.get(termText);
+		if(queryTerm==NULL)
+		{
+			//not a query term - return
+			return 0;
+		}
+		//found a query term - is it unique in this doc?
+		if(_uniqueTermsInFragment.find((TCHAR*)termText)==_uniqueTermsInFragment.end())
+		{
+			_totalScore+=queryTerm->getWeight();
+			TCHAR* owned_term = stringDuplicate(termText);
+			_uniqueTermsInFragment.insert(owned_term);
+		}
+		return queryTerm->getWeight();
+	}
+	
+	/**
+  	*
+  	* @return The highest weighted term (useful for passing to GradientFormatter to set
+  	* the top end of the coloring scale).
+  	*/
+	float_t QueryScorer::getMaxTermWeight()
+	{
+  		return _maxTermWeight;
+	}
+
+
+	float_t QueryScorer::getFragmentScore(){
+		return _totalScore;
+	}
+
+CL_NS_END2
diff --git a/src/contribs-lib/CLucene/highlighter/QueryScorer.h b/src/contribs-lib/CLucene/highlighter/QueryScorer.h
new file mode 100644
index 0000000..c89a6cf
--- /dev/null
+++ b/src/contribs-lib/CLucene/highlighter/QueryScorer.h
@@ -0,0 +1,114 @@
+/**
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _lucene_search_highlight_queryscorer_
+#define _lucene_search_highlight_queryscorer_
+
+
+CL_CLASS_DEF(index, IndexReader)
+CL_CLASS_DEF(search, Query)
+
+//#include "CLucene/search/SearchHeader.h"
+//#include "TextFragment.h"
+
+#include "HighlightScorer.h"
+
+CL_NS_DEF2(search,highlight)
+
+class WeightedTerm;
+class QueryTermExtractor;
+class TextFragment;
+
+/**
+ * {@link Scorer} implementation which scores text fragments by the number of unique query terms found.
+ * This class uses the {@link QueryTermExtractor} class to determine the query terms and
+ * their boosts to be used. 
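+ *
+ * <p>A minimal construction sketch (assumes a parsed <code>Query*</code> named
+ * <code>query</code>; rewriting the query first is recommended but optional):</p>
+ * <pre>
+ *   Highlighter highlighter(_CLNEW QueryScorer(query)); // highlighter takes ownership
+ * </pre>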
+ */
+//TODO: provide option to boost score of fragments near beginning of document 
+// based on fragment.getFragNum()
+class CLUCENE_CONTRIBS_EXPORT QueryScorer : public HighlightScorer
+{
+private:
+	TextFragment * _currentTextFragment;
+	CL_NS(util)::CLHashSet<TCHAR*,
+		CL_NS(util)::Compare::TChar,
+		CL_NS(util)::Deletor::tcArray> _uniqueTermsInFragment;
+	float_t _totalScore;
+	float_t _maxTermWeight;
+	CL_NS(util)::CLHashMap<const TCHAR*, const WeightedTerm *,
+		CL_NS(util)::Compare::TChar,
+		CL_NS(util)::Equals::TChar,
+		CL_NS(util)::Deletor::Dummy,
+		CL_NS(util)::Deletor::Object<const WeightedTerm> > _termsToFind;
+
+public:
+	/**
+	* 
+	* @param query a Lucene query (ideally rewritten using query.rewrite 
+	* before being passed to this class and the searcher)
+	*/
+	QueryScorer(const Query * query);
+
+	/**
+	* 
+	* @param query a Lucene query (ideally rewritten using query.rewrite 
+	* before being passed to this class and the searcher)
+	* @param reader used to compute IDF which can be used to a) score selected fragments better 
+	* b) use graded highlights, e.g. set font color intensity
+	* @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
+	*/
+	QueryScorer(const Query* query, CL_NS(index)::IndexReader* reader, const TCHAR* fieldName);
+
+	QueryScorer(WeightedTerm** weightedTerms);
+
+	~QueryScorer();
+
+	/* (non-Javadoc)
+	 * @see org.apache.lucene.search.highlight.FragmentScorer#startFragment(org.apache.lucene.search.highlight.TextFragment)
+	 */
+	void startFragment(TextFragment* newFragment);
+	
+	/* (non-Javadoc)
+	 * @see org.apache.lucene.search.highlight.FragmentScorer#scoreToken(org.apache.lucene.analysis.Token)
+	 */
+	float_t getTokenScore(CL_NS(analysis)::Token * token);
+	
+	/* (non-Javadoc)
+	 * @see org.apache.lucene.search.highlight.FragmentScorer#endFragment(org.apache.lucene.search.highlight.TextFragment)
+	 */
+	float_t getFragmentScore();
+
+	/* (non-Javadoc)
+	 * @see org.apache.lucene.search.highlight.FragmentScorer#allFragmentsProcessed()
+	 */
+	void allFragmentsProcessed();
+
+	/**
+	 * 
+	 * @return The highest weighted term (useful for passing to GradientFormatter to set
+	 * the top end of the coloring scale).
+	 */
+	float_t getMaxTermWeight();
+
+private:
+	void initialize(WeightedTerm** weightedTerms);
+
+};
+
+CL_NS_END2
+
+#endif
+
diff --git a/src/contribs-lib/CLucene/highlighter/QueryTermExtractor.cpp b/src/contribs-lib/CLucene/highlighter/QueryTermExtractor.cpp
new file mode 100644
index 0000000..d3fe0ee
--- /dev/null
+++ b/src/contribs-lib/CLucene/highlighter/QueryTermExtractor.cpp
@@ -0,0 +1,135 @@
+/**
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CLucene/_ApiHeader.h"
+#include "QueryTermExtractor.h"
+
+#include "CLucene/search/Query.h"
+#include "CLucene/search/BooleanQuery.h"
+#include "CLucene/search/TermQuery.h"
+#include "CLucene/search/PhraseQuery.h"
+#include "CLucene/index/IndexReader.h"
+#include "CLucene/index/Term.h"
+
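+// Pin these libm symbols to their GLIBC_2.2.5 versions so binaries built on
+// newer toolchains still load on systems with an older glibc.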
+__asm__(".symver log,log@GLIBC_2.2.5");
+__asm__(".symver pow,pow@GLIBC_2.2.5");
+__asm__(".symver logf,logf@GLIBC_2.2.5");
+__asm__(".symver powf,powf@GLIBC_2.2.5");
+
+CL_NS_DEF2(search,highlight)
+CL_NS_USE(index)
+
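+	// Note: getTerms returns a NULL-terminated array of WeightedTerm*. The caller
+	// frees the array itself; ownership of the individual terms passes to whoever
+	// stores them (see QueryScorer::initialize for an example).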
+	WeightedTerm** QueryTermExtractor::getTerms(const Query * query, bool prohibited, const TCHAR* fieldName) 
+	{
+		WeightedTermList terms(false);
+		getTerms(query,&terms,prohibited,fieldName);
+
+		// Return extracted terms
+		WeightedTerm** ret = _CL_NEWARRAY(WeightedTerm*,terms.size()+1);
+		terms.toArray_nullTerminated(ret);
+
+		return ret;
+	}
+
+	void QueryTermExtractor::getTerms(const Query * query, WeightedTermList * terms, bool prohibited, const TCHAR* fieldName) 
+	{
+		if (query->instanceOf( BooleanQuery::getClassName() ))
+        {
+			getTermsFromBooleanQuery((BooleanQuery *) query, terms, prohibited, fieldName);
+        }
+// FilteredQuery not implemented yet
+// 		else if (query->instanceOf( FilteredQuery::getClassName() ))
+// 			getTermsFromFilteredQuery((FilteredQuery *) query, terms);
+		else
+        {
+            TermSet nonWeightedTerms;
+            query->extractTerms(&nonWeightedTerms);
+            for (TermSet::iterator iter = nonWeightedTerms.begin(); iter != nonWeightedTerms.end(); iter++)
+            {
+                Term * term = (Term *)(*iter);
+                if ( fieldName == NULL || term->field() == fieldName )
+                    terms->insert(_CLNEW WeightedTerm(query->getBoost(), term->text()));
+                _CLLDECDELETE( term );
+            }
+        }
+	}
+
+	/**
+  	* Extracts all terms texts of a given Query into an array of WeightedTerms
+  	*
+  	* @param query      Query to extract term texts from
+  	* @param reader used to compute IDF which can be used to a) score selected fragments better
+  	* b) use graded highlights, e.g. changing the intensity of font color
+  	* @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
+  	* @return an array of the terms used in a query, plus their weights.
... 329006 lines suppressed ...


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org