You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/18 12:39:12 UTC

svn commit: r1172242 [1/9] - in /tika/trunk: tika-app/src/test/java/org/apache/tika/cli/ tika-core/src/main/java/org/apache/tika/detect/ tika-core/src/main/java/org/apache/tika/extractor/ tika-core/src/main/java/org/apache/tika/fork/ tika-core/src/main...

Author: jukka
Date: Sun Sep 18 10:39:08 2011
New Revision: 1172242

URL: http://svn.apache.org/viewvc?rev=1172242&view=rev
Log:
Add missing svn:eol-style settings

Modified:
    tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java   (contents, props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedResourceHandler.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderResource.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkProxy.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkResource.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/fork/InputStreamProxy.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/fork/InputStreamResource.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/fork/MemoryURLConnection.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandler.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandlerFactory.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/io/CloseShieldInputStream.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/io/ClosedInputStream.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/io/CountingInputStream.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOExceptionWithCause.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOUtils.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/io/LookaheadInputStream.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/io/NullInputStream.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/io/NullOutputStream.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/io/TemporaryFiles.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/io/TemporaryResources.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java   (contents, props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingHandler.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingWriter.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Message.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CryptoParser.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/EndDocumentShieldingContentHandler.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/Link.java   (props changed)
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java   (props changed)
    tika/trunk/tika-core/src/main/resources/org/apache/tika/language/tika.language.properties   (props changed)
    tika/trunk/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java   (props changed)
    tika/trunk/tika-core/src/test/java/org/apache/tika/fork/ForkTestParser.java   (props changed)
    tika/trunk/tika-core/src/test/java/org/apache/tika/io/LookaheadInputStreamTest.java   (props changed)
    tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java   (contents, props changed)
    tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java   (props changed)
    tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java   (props changed)
    tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java   (props changed)
    tika/trunk/tika-core/src/test/java/org/apache/tika/mime/PatternsTest.java   (props changed)
    tika/trunk/tika-core/src/test/java/org/apache/tika/parser/CompositeParserTest.java   (props changed)
    tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHM2XHTML.java   (contents, props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java   (contents, props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java   (contents, props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java   (contents, props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java   (contents, props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java   (contents, props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java   (contents, props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java   (contents, props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java   (contents, props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java   (contents, props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java   (contents, props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java   (contents, props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/assertion/ChmAssert.java   (contents, props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java   (contents, props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java   (contents, props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java   (contents, props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java   (contents, props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java   (contents, props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java   (contents, props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java   (contents, props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java   (contents, props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java   (contents, props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/MetadataFields.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java   (props changed)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java   (props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java   (props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java   (props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java   (contents, props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmDocumentInformation.java   (contents, props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java   (contents, props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java   (contents, props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmItsfHeader.java   (contents, props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java   (contents, props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java   (contents, props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java   (contents, props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java   (contents, props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestDirectoryListingEntry.java   (contents, props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestParameters.java   (contents, props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestPmgiHeader.java   (contents, props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java   (contents, props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java   (props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java   (props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java   (props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java   (props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java   (props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java   (props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java   (props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java   (props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java   (props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java   (props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WriteProtectedParserTest.java   (props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java   (props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java   (props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java   (props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java   (props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java   (props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java   (props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/video/FLVParserTest.java   (props changed)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java   (props changed)
    tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataResource.java   (props changed)
    tika/trunk/tika-server/src/main/java/org/apache/tika/server/PartExtractor.java   (props changed)
    tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaExceptionMapper.java   (props changed)
    tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java   (props changed)
    tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java   (props changed)
    tika/trunk/tika-server/src/main/java/org/apache/tika/server/UnpackerResource.java   (props changed)
    tika/trunk/tika-server/src/main/java/org/apache/tika/server/ZipOutput.java   (props changed)
    tika/trunk/tika-server/src/main/java/org/apache/tika/server/ZipUtils.java   (props changed)
    tika/trunk/tika-server/src/main/resources/commons-logging.properties   (props changed)
    tika/trunk/tika-server/src/main/resources/tikaserver-version.properties   (props changed)
    tika/trunk/tika-server/src/test/java/org/apache/tika/server/MetadataResourceTest.java   (props changed)
    tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java   (props changed)
    tika/trunk/tika-server/src/test/java/org/apache/tika/server/UnpackerResourceTest.java   (props changed)

Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1172242&r1=1172241&r2=1172242&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java (original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Sun Sep 18 10:39:08 2011
@@ -1,189 +1,189 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.cli;
-
-
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.IOException;
-import java.io.PrintStream;
-import java.net.URI;
-import junit.framework.Assert;
-import junit.framework.TestCase;
-
-/**
- * Tests the Tika's cli
- */
-public class TikaCLITest extends TestCase{
-    /* Test members */
-	private File profile = null;
-	private ByteArrayOutputStream outContent = null;
-	private PrintStream stdout = null;
-	private PrintStream reassign = null;
-	private URI testDataURI = new File("src/test/resources/test-data/").toURI();
-	private String resorcePrefix = testDataURI.toString();
-	
-	
-	public void setUp() throws Exception {
-		profile = new File("welsh.ngp");
-		outContent = new ByteArrayOutputStream();
-		stdout = System.out;
-		reassign = new PrintStream(outContent);
-		System.setOut(reassign);
-	}
-	
-    	
-	/**
-	 * Creates a welsh language profile
-	 * 
-	 * @throws Exception
-	 */
-	public void testCreateProfile() throws Exception {
-		String[] params = {"--create-profile=welsh", "-eUTF-8", resorcePrefix + "welsh_corpus.txt"};
-		TikaCLI.main(params);
-		Assert.assertTrue(profile.exists());
-	}
-	
-	/**
-	 * Tests --list-parser-detail option of the cli
-	 * 
-	 * @throws Exception
-	 */
-	public void testListParserDetail() throws Exception{
-		String[] params = {"--list-parser-detail"};
-		TikaCLI.main(params);
-		Assert.assertTrue(outContent.toString().contains("application/vnd.oasis.opendocument.text-web"));
-	}
-	
-	/**
-	 * Tests --list-parser option of the cli
-	 * 
-	 * @throws Exception
-	 */
-	public void testListParsers() throws Exception{
-		String[] params = {"--list-parser"};
-		TikaCLI.main(params);
-		//Assert was commented temporarily for finding the problem
-//		Assert.assertTrue(outContent != null && outContent.toString().contains("org.apache.tika.parser.iwork.IWorkPackageParser"));
-	}
-	
-	/**
-	 * Tests -x option of the cli
-	 * 
-	 * @throws Exception
-	 */
-	public void testXMLOutput() throws Exception{
-		String[] params = {"-x", resorcePrefix + "alice.cli.test"};
-		TikaCLI.main(params);
-		Assert.assertTrue(outContent.toString().contains("?xml version=\"1.0\" encoding=\"UTF-8\"?"));
-	}
-	
-	/**
-	 * Tests a -h option of the cli
-	 * 
-	 * @throws Exception
-	 */
-	public void testHTMLOutput() throws Exception{
-		String[] params = {"-h", resorcePrefix + "alice.cli.test"};
-		TikaCLI.main(params);
-		Assert.assertTrue(outContent.toString().contains("html xmlns=\"http://www.w3.org/1999/xhtml"));
-	}
-	
-	/**
-	 * Tests -t option of the cli
-	 * 
-	 * @throws Exception
-	 */
-	public void testTextOutput() throws Exception{
-		String[] params = {"-t", resorcePrefix + "alice.cli.test"};
-		TikaCLI.main(params);
-		Assert.assertTrue(outContent.toString().contains("finished off the cake"));
-	}
-	
-	/**
-	 * Tests -m option of the cli
-	 * @throws Exception
-	 */
-	public void testMetadataOutput() throws Exception{
-		String[] params = {"-m", resorcePrefix + "alice.cli.test"};
-		TikaCLI.main(params);
-		Assert.assertTrue(outContent.toString().contains("text/plain"));
-	}
-	
-	/**
-	 * Tests -l option of the cli
-	 * 
-	 * @throws Exception
-	 */
-	public void testLanguageOutput() throws Exception{
-		String[] params = {"-l", resorcePrefix + "alice.cli.test"};
-		TikaCLI.main(params);
-		Assert.assertTrue(outContent.toString().contains("en"));
-	}
-	
-	/**
-	 * Tests -d option of the cli
-	 * 
-	 * @throws Exception
-	 */
-	public void testDetectOutput() throws Exception{
-		String[] params = {"-d", resorcePrefix + "alice.cli.test"};
-		TikaCLI.main(params);
-		Assert.assertTrue(outContent.toString().contains("text/plain"));
-	}
-	
-	/**
-	 * Tests --list-met-models option of the cli
-	 * 
-	 * @throws Exception
-	 */
-	public void testListMetModels() throws Exception{
-		String[] params = {"--list-met-models", resorcePrefix + "alice.cli.test"};
-		TikaCLI.main(params);
-		Assert.assertTrue(outContent.toString().contains("text/plain"));
-	}
-	
-	/**
-	 * Tests --list-supported-types option of the cli
-	 * 
-	 * @throws Exception
-	 */
-	public void testListSupportedTypes() throws Exception{
-		String[] params = {"--list-supported-types", resorcePrefix + "alice.cli.test"};
-		TikaCLI.main(params);
-		Assert.assertTrue(outContent.toString().contains("supertype: application/octet-stream"));
-	}
-	
-	/**
-	 * Tears down the test. Returns the System.out
-	 */
-	public void tearDown() throws Exception {
-		if(profile != null && profile.exists())
-			profile.delete();
-		System.setOut(stdout);
-		closeStreams();
-	}
-	
-    private void closeStreams() throws IOException {
-        if(outContent != null)
-            outContent.close();
-        if(stdout != null)
-            stdout.close();
-        if(reassign != null)
-            reassign.close();
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.cli;
+
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.net.URI;
+import junit.framework.Assert;
+import junit.framework.TestCase;
+
+/**
+ * Tests the Tika's cli
+ */
+public class TikaCLITest extends TestCase{
+    /* Test members */
+	private File profile = null;
+	private ByteArrayOutputStream outContent = null;
+	private PrintStream stdout = null;
+	private PrintStream reassign = null;
+	private URI testDataURI = new File("src/test/resources/test-data/").toURI();
+	private String resorcePrefix = testDataURI.toString();
+	
+	
+	public void setUp() throws Exception {
+		profile = new File("welsh.ngp");
+		outContent = new ByteArrayOutputStream();
+		stdout = System.out;
+		reassign = new PrintStream(outContent);
+		System.setOut(reassign);
+	}
+	
+    	
+	/**
+	 * Creates a welsh language profile
+	 * 
+	 * @throws Exception
+	 */
+	public void testCreateProfile() throws Exception {
+		String[] params = {"--create-profile=welsh", "-eUTF-8", resorcePrefix + "welsh_corpus.txt"};
+		TikaCLI.main(params);
+		Assert.assertTrue(profile.exists());
+	}
+	
+	/**
+	 * Tests --list-parser-detail option of the cli
+	 * 
+	 * @throws Exception
+	 */
+	public void testListParserDetail() throws Exception{
+		String[] params = {"--list-parser-detail"};
+		TikaCLI.main(params);
+		Assert.assertTrue(outContent.toString().contains("application/vnd.oasis.opendocument.text-web"));
+	}
+	
+	/**
+	 * Tests --list-parser option of the cli
+	 * 
+	 * @throws Exception
+	 */
+	public void testListParsers() throws Exception{
+		String[] params = {"--list-parser"};
+		TikaCLI.main(params);
+		//Assert was commented temporarily for finding the problem
+//		Assert.assertTrue(outContent != null && outContent.toString().contains("org.apache.tika.parser.iwork.IWorkPackageParser"));
+	}
+	
+	/**
+	 * Tests -x option of the cli
+	 * 
+	 * @throws Exception
+	 */
+	public void testXMLOutput() throws Exception{
+		String[] params = {"-x", resorcePrefix + "alice.cli.test"};
+		TikaCLI.main(params);
+		Assert.assertTrue(outContent.toString().contains("?xml version=\"1.0\" encoding=\"UTF-8\"?"));
+	}
+	
+	/**
+	 * Tests a -h option of the cli
+	 * 
+	 * @throws Exception
+	 */
+	public void testHTMLOutput() throws Exception{
+		String[] params = {"-h", resorcePrefix + "alice.cli.test"};
+		TikaCLI.main(params);
+		Assert.assertTrue(outContent.toString().contains("html xmlns=\"http://www.w3.org/1999/xhtml"));
+	}
+	
+	/**
+	 * Tests -t option of the cli
+	 * 
+	 * @throws Exception
+	 */
+	public void testTextOutput() throws Exception{
+		String[] params = {"-t", resorcePrefix + "alice.cli.test"};
+		TikaCLI.main(params);
+		Assert.assertTrue(outContent.toString().contains("finished off the cake"));
+	}
+	
+	/**
+	 * Tests -m option of the cli
+	 * @throws Exception
+	 */
+	public void testMetadataOutput() throws Exception{
+		String[] params = {"-m", resorcePrefix + "alice.cli.test"};
+		TikaCLI.main(params);
+		Assert.assertTrue(outContent.toString().contains("text/plain"));
+	}
+	
+	/**
+	 * Tests -l option of the cli
+	 * 
+	 * @throws Exception
+	 */
+	public void testLanguageOutput() throws Exception{
+		String[] params = {"-l", resorcePrefix + "alice.cli.test"};
+		TikaCLI.main(params);
+		Assert.assertTrue(outContent.toString().contains("en"));
+	}
+	
+	/**
+	 * Tests -d option of the cli
+	 * 
+	 * @throws Exception
+	 */
+	public void testDetectOutput() throws Exception{
+		String[] params = {"-d", resorcePrefix + "alice.cli.test"};
+		TikaCLI.main(params);
+		Assert.assertTrue(outContent.toString().contains("text/plain"));
+	}
+	
+	/**
+	 * Tests --list-met-models option of the cli
+	 * 
+	 * @throws Exception
+	 */
+	public void testListMetModels() throws Exception{
+		String[] params = {"--list-met-models", resorcePrefix + "alice.cli.test"};
+		TikaCLI.main(params);
+		Assert.assertTrue(outContent.toString().contains("text/plain"));
+	}
+	
+	/**
+	 * Tests --list-supported-types option of the cli
+	 * 
+	 * @throws Exception
+	 */
+	public void testListSupportedTypes() throws Exception{
+		String[] params = {"--list-supported-types", resorcePrefix + "alice.cli.test"};
+		TikaCLI.main(params);
+		Assert.assertTrue(outContent.toString().contains("supertype: application/octet-stream"));
+	}
+	
+	/**
+	 * Tears down the test. Returns the System.out
+	 */
+	public void tearDown() throws Exception {
+		if(profile != null && profile.exists())
+			profile.delete();
+		System.setOut(stdout);
+		closeStreams();
+	}
+	
+    private void closeStreams() throws IOException {
+        if(outContent != null)
+            outContent.close();
+        if(stdout != null)
+            stdout.close();
+        if(reassign != null)
+            reassign.close();
+    }
+}

Propchange: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedResourceHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderResource.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkProxy.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkResource.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/InputStreamProxy.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/InputStreamResource.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/MemoryURLConnection.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandlerFactory.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/io/CloseShieldInputStream.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/io/ClosedInputStream.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/io/CountingInputStream.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOExceptionWithCause.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/io/LookaheadInputStream.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/io/NullInputStream.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/io/NullOutputStream.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/io/TemporaryFiles.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/io/TemporaryResources.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java?rev=1172242&r1=1172241&r2=1172242&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java Sun Sep 18 10:39:08 2011
@@ -1,767 +1,767 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.language;
-
-// JDK imports
-import java.io.BufferedInputStream;
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import org.apache.tika.exception.TikaException;
-
-/**
- * This class runs a ngram analysis over submitted text, results might be used
- * for automatic language identification.
- * 
- * The similarity calculation is at experimental level. You have been warned.
- * 
- * Methods are provided to build new NGramProfiles profiles.
- * 
- * @author Sami Siren
- * @author Jerome Charron - http://frutch.free.fr/
- */
-public class LanguageProfilerBuilder {
-
-    // public static final Log LOG =
-    // LogFactory.getLog(LanguageProfilerBuilder.class);
-
-    /** The minimum length allowed for a ngram. */
-    final static int ABSOLUTE_MIN_NGRAM_LENGTH = 3; /* was 1 */
-
-    /** The maximum length allowed for a ngram. */
-    final static int ABSOLUTE_MAX_NGRAM_LENGTH = 3; /* was 4 */
-
-    /** The default min length of ngram */
-    final static int DEFAULT_MIN_NGRAM_LENGTH = 3;
-
-    /** The default max length of ngram */
-    final static int DEFAULT_MAX_NGRAM_LENGTH = 3;
-
-    /** The ngram profile file extension */
-    final static String FILE_EXTENSION = "ngp";
-
-    /** The profile max size (number of ngrams of the same size) */
-    final static int MAX_SIZE = 1000;
-
-    /** separator char */
-    final static char SEPARATOR = '_';
-    /** The String form of the separator char */
-    private final static String SEP_CHARSEQ = new String(
-            new char[] { SEPARATOR });
-
-    /** The profile's name */
-    private String name = null;
-
-    /** The NGrams of this profile sorted on the number of occurrences */
-    private List<NGramEntry> sorted = null;
-
-    /** The min length of ngram */
-    private int minLength = DEFAULT_MIN_NGRAM_LENGTH;
-
-    /** The max length of ngram */
-    private int maxLength = DEFAULT_MAX_NGRAM_LENGTH;
-
-    /** The total number of ngrams occurences */
-    private int[] ngramcounts = null;
-
-    /** An index of the ngrams of the profile */
-    private Map<CharSequence, NGramEntry> ngrams = null;
-
-    /** A StringBuffer used during analysis */
-    private QuickStringBuffer word = new QuickStringBuffer();
-
-    /**
-     * Constructs a new ngram profile
-     * 
-     * @param name is the name of the profile
-     * @param minlen is the min length of ngram sequences
-     * @param maxlen is the max length of ngram sequences
-     */
-    public LanguageProfilerBuilder(String name, int minlen, int maxlen) {
-        // TODO: Compute the initial capacity using minlen and maxlen.
-        this.ngrams = new HashMap<CharSequence, NGramEntry>(4000);
-        this.minLength = minlen;
-        this.maxLength = maxlen;
-        this.name = name;
-    }
-  
-    /**
-     * Constructs a new ngram profile where minlen=3, maxlen=3
-     * 
-     * @param name is a name of profile, usually two length string
-     * @since Tika 1.0
-     */
-    public LanguageProfilerBuilder(String name) {
-        this.ngrams = new HashMap<CharSequence, NGramEntry>(4000);
-        this.minLength = ABSOLUTE_MIN_NGRAM_LENGTH;
-        this.maxLength = ABSOLUTE_MAX_NGRAM_LENGTH;
-        this.name = name;
-    }
-
-    /**
-     * @return Returns the name.
-     */
-    public String getName() {
-        return name;
-    }
-  
-    // This method was commented because it depends on org.apache.lucene.analysis.Token
-    // that is not a part of the Tika
-    // /**
-    // * Adds ngrams from a token to this profile
-    // *
-    // * @param t is the Token to be added
-    // */
-    // public void add(Token t) {
-    // add(new StringBuffer().append(SEPARATOR)
-    // .append(t.term())
-    // .append(SEPARATOR));
-    // }
-
-    /**
-     * Adds ngrams from a single word to this profile
-     * 
-     * @param word is the word to add
-     */
-    public void add(StringBuffer word) {
-        for (int i = minLength; (i <= maxLength) && (i < word.length()); i++) {
-            add(word, i);
-        }
-    }
-
-    /**
-     * Adds the last NGrams from the specified word.
-     */
-    private void add(QuickStringBuffer word) {
-        int wlen = word.length();
-        if (wlen >= minLength) {
-            int max = Math.min(maxLength, wlen);
-            for (int i = minLength; i <= max; i++) {
-                add(word.subSequence(wlen - i, wlen));
-            }
-        }
-    }
-
-    /**
-     * Adds ngrams from a single word in this profile
-     * 
-     * @param word is the word to add
-     * @param n is the ngram size
-     */
-    private void add(CharSequence cs) {
-
-        if (cs.equals(SEP_CHARSEQ)) {
-            return;
-        }
-        NGramEntry nge = ngrams.get(cs);
-        if (nge == null) {
-            nge = new NGramEntry(cs);
-            ngrams.put(cs, nge);
-        }
-        nge.inc();
-    }
-
-    /**
-     * Analyzes a piece of text
-     * 
-     * @param text
-     *            the text to be analyzed
-     */
-    public void analyze(StringBuilder text) {
-
-        if (ngrams != null) {
-            ngrams.clear();
-            sorted = null;
-            ngramcounts = null;
-        }
-
-        word.clear().append(SEPARATOR);
-        for (int i = 0; i < text.length(); i++) {
-            char c = Character.toLowerCase(text.charAt(i));
-
-            if (Character.isLetter(c)) {
-                add(word.append(c));
-            } else {
-                // found word boundary
-                if (word.length() > 1) {
-                    // we have a word!
-                    add(word.append(SEPARATOR));
-                    word.clear().append(SEPARATOR);
-                }
-            }
-        }
-
-        if (word.length() > 1) {
-            // we have a word!
-            add(word.append(SEPARATOR));
-        }
-        normalize();
-    }
-
-    /**
-     * @param word
-     * @param n sequence length
-     */
-    private void add(StringBuffer word, int n) {
-        for (int i = 0; i <= word.length() - n; i++) {
-            add(word.subSequence(i, i + n));
-        }
-    }
-    
-    /**
-     * Normalizes the profile (calculates the ngrams frequencies)
-     */
-    protected void normalize() {
-        NGramEntry e = null;
-        Iterator<NGramEntry> i = ngrams.values().iterator();
-
-        // Calculates ngram count if not already done
-        if (ngramcounts == null) {
-            ngramcounts = new int[maxLength + 1];
-            while (i.hasNext()) {
-                e = i.next();
-                ngramcounts[e.size()] += e.count;
-            }
-        }
-
-        i = ngrams.values().iterator();
-        while (i.hasNext()) {
-            e = i.next();
-            e.frequency = (float) e.count / (float) ngramcounts[e.size()];
-        }
-    }
-
-    /**
-     * Returns a sorted list of ngrams (sort done by 1. frequency 2. sequence)
-     * 
-     * @return sorted vector of ngrams
-     */
-    public List<NGramEntry> getSorted() {
-        // make sure sorting is done only once
-        if (sorted == null) {
-            sorted = new ArrayList<NGramEntry>(ngrams.values());
-            Collections.sort(sorted);
-
-            // trim at NGRAM_LENGTH entries
-            if (sorted.size() > MAX_SIZE) {
-                sorted = sorted.subList(0, MAX_SIZE);
-            }
-        }
-        return sorted;
-    }
-
-    // Inherited JavaDoc
-    public String toString() {
-
-        StringBuffer s = new StringBuffer().append("NGramProfile: ")
-                                           .append(name).append("\n");
-
-        Iterator<NGramEntry> i = getSorted().iterator();
-
-        while (i.hasNext()) {
-            NGramEntry entry = i.next();
-            s.append("[").append(entry.seq).append("/").append(entry.count)
-                         .append("/").append(entry.frequency).append("]\n");
-        }
-        return s.toString();
-    }
-
-    /**
-     * Calculates a score how well NGramProfiles match each other
-     * 
-     * @param another
-     *            ngram profile to compare against
-     * @return similarity 0=exact match
-     * @throws TikaException
-     *             if could not calculate a score
-     */
-    public float getSimilarity(LanguageProfilerBuilder another)
-            throws TikaException {
-
-        float sum = 0;
-
-        try {
-            Iterator<NGramEntry> i = another.getSorted().iterator();
-            while (i.hasNext()) {
-                NGramEntry other = i.next();
-                if (ngrams.containsKey(other.seq)) {
-                    sum += Math.abs((other.frequency - ngrams.get(other.seq).frequency)) / 2;
-                } else {
-                    sum += other.frequency;
-                }
-            }
-            i = getSorted().iterator();
-            while (i.hasNext()) {
-                NGramEntry other = i.next();
-                if (another.ngrams.containsKey(other.seq)) {
-                    sum += Math.abs((other.frequency - another.ngrams
-                            .get(other.seq).frequency)) / 2;
-                } else {
-                    sum += other.frequency;
-                }
-            }
-        } catch (Exception e) {
-            throw new TikaException("Could not calculate a score how well NGramProfiles match each other");
-        }
-        return sum;
-    }
-
-    /**
-     * Loads a ngram profile from an InputStream (assumes UTF-8 encoded content)
-     * 
-     * @param is the InputStream to read
-     */
-    public void load(InputStream is) throws IOException {
-
-        ngrams.clear();
-        ngramcounts = new int[maxLength + 1];
-        BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
-        String line = null;
-
-        while ((line = reader.readLine()) != null) {
-
-            // # starts a comment line
-            if (line.charAt(0) != '#') {
-                int spacepos = line.indexOf(' ');
-                String ngramsequence = line.substring(0, spacepos).trim();
-                int len = ngramsequence.length();
-                if ((len >= minLength) && (len <= maxLength)) {
-                    int ngramcount = Integer.parseInt(line.substring(spacepos + 1));
-                    NGramEntry en = new NGramEntry(ngramsequence, ngramcount);
-                    ngrams.put(en.getSeq(), en);
-                    ngramcounts[len] += ngramcount;
-                }
-            }
-        }
-        normalize();
-    }
-    
-    /**
-     * Creates a new Language profile from (preferably quite large - 5-10k of
-     * lines) text file
-     * 
-     * @param name to be given for the profile
-     * @param is a stream to be read
-     * @param encoding is the encoding of stream
-     * 
-     * @throws TikaException if could not create a language profile
-     *  
-     */
-    public static LanguageProfilerBuilder create(String name, InputStream is, String encoding) throws TikaException {
-
-        LanguageProfilerBuilder newProfile = new LanguageProfilerBuilder(name,
-                ABSOLUTE_MIN_NGRAM_LENGTH, ABSOLUTE_MAX_NGRAM_LENGTH);
-        BufferedInputStream bis = new BufferedInputStream(is);
-
-        byte buffer[] = new byte[4096];
-        StringBuilder text = new StringBuilder();
-        int len;
-
-        try {
-            while ((len = bis.read(buffer)) != -1) {
-                text.append(new String(buffer, 0, len, encoding));
-            }
-        } catch (IOException e) {
-            throw new TikaException("Could not create profile, " + e.getMessage());
-        }
-
-        newProfile.analyze(text);
-        return newProfile;
-    }
-
-    /**
-     * Writes NGramProfile content into OutputStream, content is outputted with
-     * UTF-8 encoding
-     * 
-     * @param os the Stream to output to
-     * 
-     * @throws IOException
-     */
-    public void save(OutputStream os) throws IOException {
-        os.write(("# NgramProfile generated at " + new Date() + 
-                  " for Apache Tika Language Identification\n").getBytes());
-
-        // And then each ngram
-
-        // First dispatch ngrams in many lists depending on their size
-        // (one list for each size, in order to store MAX_SIZE ngrams for each
-        // size of ngram)
-        List<NGramEntry> list = new ArrayList<NGramEntry>();
-        List<NGramEntry> sublist = new ArrayList<NGramEntry>();
-        NGramEntry[] entries = ngrams.values().toArray(
-                new NGramEntry[ngrams.size()]);
-        for (int i = minLength; i <= maxLength; i++) {
-            for (int j = 0; j < entries.length; j++) {
-                if (entries[j].getSeq().length() == i) {
-                    sublist.add(entries[j]);
-                }
-            }
-            Collections.sort(sublist);
-            if (sublist.size() > MAX_SIZE) {
-                sublist = sublist.subList(0, MAX_SIZE);
-            }
-            list.addAll(sublist);
-            sublist.clear();
-        }
-        for (int i = 0; i < list.size(); i++) {
-            NGramEntry e = list.get(i);
-            String line = e.toString() + " " + e.getCount() + "\n";
-            os.write(line.getBytes("UTF-8"));
-        }
-        os.flush();
-    }
-
-    /**
-     * main method used for testing only
-     * 
-     * @param args
-     */
-    public static void main(String args[]) {
-
-        // -create he sample_he.txt utf-8
-
-        String usage = "Usage: NGramProfile "
-                + "[-create profilename filename encoding] "
-                + "[-similarity file1 file2] "
-                + "[-score profile-name filename encoding]";
-        int command = 0;
-
-        final int CREATE = 1;
-        final int SIMILARITY = 2;
-        final int SCORE = 3;
-
-        String profilename = "";
-        String filename = "";
-        String filename2 = "";
-        String encoding = "";
-
-        if (args.length == 0) {
-            System.err.println(usage);
-            System.exit(-1);
-        }
-
-        for (int i = 0; i < args.length; i++) { // parse command line
-            if (args[i].equals("-create")) { // found -create option
-                command = CREATE;
-                profilename = args[++i];
-                filename = args[++i];
-                encoding = args[++i];
-            }
-
-            if (args[i].equals("-similarity")) { // found -similarity option
-                command = SIMILARITY;
-                filename = args[++i];
-                filename2 = args[++i];
-                encoding = args[++i];
-            }
-
-            if (args[i].equals("-score")) { // found -Score option
-                command = SCORE;
-                profilename = args[++i];
-                filename = args[++i];
-                encoding = args[++i];
-            }
-        }
-
-        try {
-
-            switch (command) {
-
-            case CREATE:
-
-                File f = new File(filename);
-                FileInputStream fis = new FileInputStream(f);
-                LanguageProfilerBuilder newProfile = LanguageProfilerBuilder
-                        .create(profilename, fis, encoding);
-                fis.close();
-                f = new File(profilename + "." + FILE_EXTENSION);
-                FileOutputStream fos = new FileOutputStream(f);
-                newProfile.save(fos);
-                System.out.println("new profile " + profilename + "."
-                        + FILE_EXTENSION + " was created.");
-                break;
-
-            case SIMILARITY:
-
-                f = new File(filename);
-                fis = new FileInputStream(f);
-                newProfile = LanguageProfilerBuilder.create(filename, fis,
-                        encoding);
-                newProfile.normalize();
-
-                f = new File(filename2);
-                fis = new FileInputStream(f);
-                LanguageProfilerBuilder newProfile2 = LanguageProfilerBuilder
-                        .create(filename2, fis, encoding);
-                newProfile2.normalize();
-                System.out.println("Similarity is "
-                        + newProfile.getSimilarity(newProfile2));
-                break;
-
-            case SCORE:
-                f = new File(filename);
-                fis = new FileInputStream(f);
-                newProfile = LanguageProfilerBuilder.create(filename, fis,
-                        encoding);
-
-                f = new File(profilename + "." + FILE_EXTENSION);
-                fis = new FileInputStream(f);
-                LanguageProfilerBuilder compare = new LanguageProfilerBuilder(
-                        profilename, DEFAULT_MIN_NGRAM_LENGTH,
-                        DEFAULT_MAX_NGRAM_LENGTH);
-                compare.load(fis);
-                System.out.println("Score is "
-                        + compare.getSimilarity(newProfile));
-                break;
-
-            }
-
-        } catch (Exception e) {
-            e.printStackTrace();
-            // throw new TikaException("");
-        }
-    }
-
-  
-    /**
-     * Inner class that describes a NGram
-     */
-    class NGramEntry implements Comparable<NGramEntry> {
-
-        /** The NGRamProfile this NGram is related to */
-        private LanguageProfilerBuilder profile = null;
-
-        /** The sequence of characters of the ngram */
-        CharSequence seq = null;
-
-        /** The number of occurences of this ngram in its profile */
-        private int count = 0;
-
-        /** The frequency of this ngram in its profile */
-        private float frequency = 0.0F;
-
-        /**
-         * Constructs a new NGramEntry
-         * 
-         * @param seq is the sequence of characters of the ngram
-         */
-        public NGramEntry(CharSequence seq) {
-            this.seq = seq;
-        }
-
-        /**
-         * Constructs a new NGramEntry
-         * 
-         * @param seq is the sequence of characters of the ngram
-         * @param count is the number of occurrences of this ngram
-         */
-        public NGramEntry(String seq, int count) {
-            this.seq = new StringBuffer(seq).subSequence(0, seq.length());
-            this.count = count;
-        }
-
-        /**
-         * Returns the number of occurrences of this ngram in its profile
-         * 
-         * @return the number of occurrences of this ngram in its profile
-         */
-        public int getCount() {
-            return count;
-        }
-
-        /**
-         * Returns the frequency of this ngram in its profile
-         * 
-         * @return the frequency of this ngram in its profile
-         */
-        public float getFrequency() {
-            return frequency;
-        }
-
-        /**
-         * Returns the sequence of characters of this ngram
-         * 
-         * @return the sequence of characters of this ngram
-         */
-        public CharSequence getSeq() {
-            return seq;
-        }
-
-        /**
-         * Returns the size of this ngram
-         * 
-         * @return the size of this ngram
-         */
-        public int size() {
-            return seq.length();
-        }
-
-        // Inherited JavaDoc
-        public int compareTo(NGramEntry ngram) {
-            int diff = Float.compare(ngram.getFrequency(), frequency);
-            if (diff != 0) {
-                return diff;
-            } else {
-                return (toString().compareTo(ngram.toString()));
-            }
-        }
-
-        /**
-         * Increments the number of occurrences of this ngram.
-         */
-        public void inc() {
-            count++;
-        }
-
-        /**
-         * Associated a profile to this ngram
-         * 
-         * @param profile
-         *            is the profile associated to this ngram
-         */
-        public void setProfile(LanguageProfilerBuilder profile) {
-            this.profile = profile;
-        }
-
-        /**
-         * Returns the profile associated to this ngram
-         * 
-         * @return the profile associated to this ngram
-         */
-        public LanguageProfilerBuilder getProfile() {
-            return profile;
-        }
-
-        // Inherited JavaDoc
-        public String toString() {
-            return seq.toString();
-        }
-
-        // Inherited JavaDoc
-        public int hashCode() {
-            return seq.hashCode();
-        }
-
-        // Inherited JavaDoc
-        public boolean equals(Object obj) {
-
-            NGramEntry ngram = null;
-            try {
-                ngram = (NGramEntry) obj;
-                return ngram.seq.equals(seq);
-            } catch (Exception e) {
-                return false;
-            }
-        }
-
-    }
-
-    private class QuickStringBuffer implements CharSequence {
-
-        private char value[];
-
-        private int count;
-
-        QuickStringBuffer() {
-            this(16);
-        }
-
-        QuickStringBuffer(char[] value) {
-            this.value = value;
-            count = value.length;
-        }
-
-        QuickStringBuffer(int length) {
-            value = new char[length];
-        }
-
-        QuickStringBuffer(String str) {
-            this(str.length() + 16);
-            append(str);
-        }
-
-        public int length() {
-            return count;
-        }
-
-        private void expandCapacity(int minimumCapacity) {
-            int newCapacity = (value.length + 1) * 2;
-            if (newCapacity < 0) {
-                newCapacity = Integer.MAX_VALUE;
-            } else if (minimumCapacity > newCapacity) {
-                newCapacity = minimumCapacity;
-            }
-
-            char newValue[] = new char[newCapacity];
-            System.arraycopy(value, 0, newValue, 0, count);
-            value = newValue;
-        }
-
-        QuickStringBuffer clear() {
-            count = 0;
-            return this;
-        }
-
-        public char charAt(int index) {
-            return value[index];
-        }
-
-        QuickStringBuffer append(String str) {
-            if (str == null) {
-                str = String.valueOf(str);
-            }
-
-            int len = str.length();
-            int newcount = count + len;
-            if (newcount > value.length) {
-                expandCapacity(newcount);
-            }
-            str.getChars(0, len, value, count);
-            count = newcount;
-            return this;
-        }
-
-        QuickStringBuffer append(char c) {
-            int newcount = count + 1;
-            if (newcount > value.length) {
-                expandCapacity(newcount);
-            }
-            value[count++] = c;
-            return this;
-        }
-
-        public CharSequence subSequence(int start, int end) {
-            return new String(value, start, end - start);
-        }
-
-        public String toString() {
-            return new String(this.value);
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.language;
+
+// JDK imports
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import org.apache.tika.exception.TikaException;
+
+/**
+ * This class runs a ngram analysis over submitted text, results might be used
+ * for automatic language identification.
+ * 
+ * The similarity calculation is at experimental level. You have been warned.
+ * 
+ * Methods are provided to build new NGramProfiles profiles.
+ * 
+ * @author Sami Siren
+ * @author Jerome Charron - http://frutch.free.fr/
+ */
+public class LanguageProfilerBuilder {
+
+    // public static final Log LOG =
+    // LogFactory.getLog(LanguageProfilerBuilder.class);
+
+    /** The minimum length allowed for an ngram. */
+    final static int ABSOLUTE_MIN_NGRAM_LENGTH = 3; /* was 1 */
+
+    /** The maximum length allowed for an ngram. */
+    final static int ABSOLUTE_MAX_NGRAM_LENGTH = 3; /* was 4 */
+
+    /** The default minimum ngram length, used to initialise {@link #minLength}. */
+    final static int DEFAULT_MIN_NGRAM_LENGTH = 3;
+
+    /** The default maximum ngram length, used to initialise {@link #maxLength}. */
+    final static int DEFAULT_MAX_NGRAM_LENGTH = 3;
+
+    /** The ngram profile file extension (without the leading dot). */
+    final static String FILE_EXTENSION = "ngp";
+
+    /** The profile max size (number of ngrams of the same size). */
+    final static int MAX_SIZE = 1000;
+
+    /** The character that marks the start and the end of a word. */
+    final static char SEPARATOR = '_';
+    /** The String form of the separator char. */
+    private final static String SEP_CHARSEQ = new String(
+            new char[] { SEPARATOR });
+
+    /** The profile's name. */
+    private String name = null;
+
+    /**
+     * Cached result of {@link #getSorted()}; <code>null</code> until the
+     * sorted list has been computed.
+     */
+    private List<NGramEntry> sorted = null;
+
+    /** The minimum ngram length collected by this profile. */
+    private int minLength = DEFAULT_MIN_NGRAM_LENGTH;
+
+    /** The maximum ngram length collected by this profile. */
+    private int maxLength = DEFAULT_MAX_NGRAM_LENGTH;
+
+    /**
+     * The total number of ngram occurrences, indexed by ngram length;
+     * <code>null</code> until the totals have been computed.
+     */
+    private int[] ngramcounts = null;
+
+    /** An index of the ngrams of the profile, keyed by character sequence. */
+    private Map<CharSequence, NGramEntry> ngrams = null;
+
+    /** The buffer holding the word currently being read during analysis. */
+    private QuickStringBuffer word = new QuickStringBuffer();
+
+    /**
+     * Creates an empty ngram profile with explicit ngram length bounds.
+     * 
+     * @param name the name of the profile
+     * @param minlen the minimum length of the ngram sequences to collect
+     * @param maxlen the maximum length of the ngram sequences to collect
+     */
+    public LanguageProfilerBuilder(String name, int minlen, int maxlen) {
+        this.name = name;
+        this.minLength = minlen;
+        this.maxLength = maxlen;
+        // TODO: Compute the initial capacity using minlen and maxlen.
+        this.ngrams = new HashMap<CharSequence, NGramEntry>(4000);
+    }
+  
+    /**
+     * Creates an empty ngram profile that uses the absolute length bounds
+     * (minlen=3, maxlen=3).
+     * 
+     * @param name the name of the profile, usually a two-letter string
+     * @since Tika 1.0
+     */
+    public LanguageProfilerBuilder(String name) {
+        this(name, ABSOLUTE_MIN_NGRAM_LENGTH, ABSOLUTE_MAX_NGRAM_LENGTH);
+    }
+
+    /**
+     * Returns the name given to this profile at construction time.
+     * 
+     * @return the profile name
+     */
+    public String getName() {
+        return this.name;
+    }
+  
+    // This method was commented because it depends on org.apache.lucene.analysis.Token
+    // that is not a part of the Tika
+    // /**
+    // * Adds ngrams from a token to this profile
+    // *
+    // * @param t is the Token to be added
+    // */
+    // public void add(Token t) {
+    // add(new StringBuffer().append(SEPARATOR)
+    // .append(t.term())
+    // .append(SEPARATOR));
+    // }
+
+    /**
+     * Adds ngrams from a single word to this profile
+     * 
+     * @param word is the word to add
+     */
+    public void add(StringBuffer word) {
+        for (int i = minLength; (i <= maxLength) && (i < word.length()); i++) {
+            add(word, i);
+        }
+    }
+
+    /**
+     * Adds the trailing ngrams of the specified word: one ngram per size
+     * between the configured minimum and maximum, each ending at the last
+     * character of the buffer.
+     */
+    private void add(QuickStringBuffer word) {
+        int end = word.length();
+        if (end < minLength) {
+            // not enough characters yet for even the shortest ngram
+            return;
+        }
+        int longest = Math.min(maxLength, end);
+        for (int n = minLength; n <= longest; n++) {
+            add(word.subSequence(end - n, end));
+        }
+    }
+
+    /**
+     * Counts one occurrence of the given ngram, creating its entry in the
+     * index on first sight. A sequence equal to the bare separator string is
+     * ignored.
+     * 
+     * @param cs is the ngram to count
+     */
+    private void add(CharSequence cs) {
+        if (!cs.equals(SEP_CHARSEQ)) {
+            NGramEntry entry = ngrams.get(cs);
+            if (entry == null) {
+                entry = new NGramEntry(cs);
+                ngrams.put(cs, entry);
+            }
+            entry.inc();
+        }
+    }
+
+    /**
+     * Analyzes a piece of text. Any previously collected ngrams are
+     * discarded; the text is lower-cased, cut into words at every non-letter
+     * character, and each word is wrapped in separator characters before its
+     * ngrams are counted. The profile is normalized at the end.
+     * 
+     * @param text
+     *            the text to be analyzed
+     */
+    public void analyze(StringBuilder text) {
+
+        // Start from a clean slate: forget earlier ngrams and cached results
+        if (ngrams != null) {
+            ngrams.clear();
+            sorted = null;
+            ngramcounts = null;
+        }
+
+        word.clear().append(SEPARATOR);
+        final int length = text.length();
+        for (int pos = 0; pos < length; pos++) {
+            char lower = Character.toLowerCase(text.charAt(pos));
+
+            if (Character.isLetter(lower)) {
+                add(word.append(lower));
+            } else if (word.length() > 1) {
+                // word boundary reached with a non-empty word in the buffer
+                add(word.append(SEPARATOR));
+                word.clear().append(SEPARATOR);
+            }
+        }
+
+        // flush the last word when the text does not end on a boundary
+        if (word.length() > 1) {
+            add(word.append(SEPARATOR));
+        }
+        normalize();
+    }
+
+    /**
+     * Adds every ngram of the given size contained in the word, sliding a
+     * window of that size from the first to the last possible position.
+     * 
+     * @param word the word to take the ngrams from
+     * @param n sequence length
+     */
+    private void add(StringBuffer word, int n) {
+        int lastStart = word.length() - n;
+        for (int start = 0; start <= lastStart; start++) {
+            add(word.subSequence(start, start + n));
+        }
+    }
+    
+    /**
+     * Normalizes the profile: the frequency of each ngram becomes its count
+     * divided by the total count of all ngrams of the same length.
+     */
+    protected void normalize() {
+        // Totals per ngram length are only computed when not yet available
+        if (ngramcounts == null) {
+            ngramcounts = new int[maxLength + 1];
+            for (NGramEntry entry : ngrams.values()) {
+                ngramcounts[entry.size()] += entry.count;
+            }
+        }
+
+        for (NGramEntry entry : ngrams.values()) {
+            entry.frequency = (float) entry.count
+                    / (float) ngramcounts[entry.size()];
+        }
+    }
+
+    /**
+     * Returns the ngrams of this profile in their natural order (see
+     * {@link NGramEntry#compareTo(NGramEntry)}), limited to the first
+     * MAX_SIZE entries. The list is computed once and then cached.
+     * 
+     * @return sorted list of ngrams
+     */
+    public List<NGramEntry> getSorted() {
+        if (sorted != null) {
+            // already computed
+            return sorted;
+        }
+
+        sorted = new ArrayList<NGramEntry>(ngrams.values());
+        Collections.sort(sorted);
+
+        // keep at most MAX_SIZE entries
+        if (sorted.size() > MAX_SIZE) {
+            sorted = sorted.subList(0, MAX_SIZE);
+        }
+        return sorted;
+    }
+
+    // Inherited JavaDoc
+    public String toString() {
+        // Header line followed by one "[sequence/count/frequency]" line
+        // per sorted ngram
+        StringBuilder out = new StringBuilder("NGramProfile: ");
+        out.append(name).append("\n");
+
+        for (NGramEntry entry : getSorted()) {
+            out.append("[").append(entry.seq)
+               .append("/").append(entry.count)
+               .append("/").append(entry.frequency)
+               .append("]\n");
+        }
+        return out.toString();
+    }
+
+    /**
+     * Calculates a score of how well two ngram profiles match each other.
+     * For every ngram in the sorted list of either profile, half of the
+     * absolute frequency difference is added when the other profile also
+     * knows the ngram, and the full frequency otherwise.
+     * 
+     * @param another
+     *            ngram profile to compare against
+     * @return similarity 0=exact match
+     * @throws TikaException
+     *             if could not calculate a score; the underlying failure is
+     *             attached as the cause
+     */
+    public float getSimilarity(LanguageProfilerBuilder another)
+            throws TikaException {
+
+        // A single accumulator is used for both passes so that the
+        // floating point summation order stays stable.
+        float sum = 0;
+
+        try {
+            // ngrams of the other profile, looked up in this one
+            for (NGramEntry other : another.getSorted()) {
+                NGramEntry mine = ngrams.get(other.seq);
+                if (mine != null) {
+                    sum += Math.abs(other.frequency - mine.frequency) / 2;
+                } else {
+                    sum += other.frequency;
+                }
+            }
+            // ngrams of this profile, looked up in the other one
+            for (NGramEntry mine : getSorted()) {
+                NGramEntry other = another.ngrams.get(mine.seq);
+                if (other != null) {
+                    sum += Math.abs(mine.frequency - other.frequency) / 2;
+                } else {
+                    sum += mine.frequency;
+                }
+            }
+        } catch (Exception e) {
+            // Keep the original failure as the cause instead of hiding it
+            throw new TikaException("Could not calculate a score how well NGramProfiles match each other", e);
+        }
+        return sum;
+    }
+
+    /**
+     * Loads a ngram profile from an InputStream (assumes UTF-8 encoded
+     * content). Each data line has the form <code>ngram count</code>; empty
+     * lines and lines starting with <code>#</code> are skipped, and ngrams
+     * whose length is outside the configured bounds are ignored. Previously
+     * collected ngrams are discarded. The stream is not closed by this
+     * method.
+     * 
+     * @param is the InputStream to read
+     * @throws IOException if the stream can not be read or a data line has
+     *         no space between the ngram and its count
+     */
+    public void load(InputStream is) throws IOException {
+
+        ngrams.clear();
+        // the cached sorted list would otherwise describe the old content
+        sorted = null;
+        ngramcounts = new int[maxLength + 1];
+        BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
+        String line = null;
+
+        while ((line = reader.readLine()) != null) {
+
+            // empty lines carry no data, # starts a comment line
+            if (line.length() == 0 || line.charAt(0) == '#') {
+                continue;
+            }
+
+            int spacepos = line.indexOf(' ');
+            if (spacepos < 0) {
+                throw new IOException("Malformed ngram profile line: " + line);
+            }
+
+            String ngramsequence = line.substring(0, spacepos).trim();
+            int len = ngramsequence.length();
+            if ((len >= minLength) && (len <= maxLength)) {
+                int ngramcount = Integer.parseInt(line.substring(spacepos + 1));
+                NGramEntry en = new NGramEntry(ngramsequence, ngramcount);
+                ngrams.put(en.getSeq(), en);
+                ngramcounts[len] += ngramcount;
+            }
+        }
+        normalize();
+    }
+    
+    /**
+     * Creates a new Language profile from (preferably quite large - 5-10k of
+     * lines) text file. The whole stream is read and decoded with the given
+     * encoding before it is analyzed. The stream is not closed by this
+     * method.
+     * 
+     * @param name to be given for the profile
+     * @param is a stream to be read
+     * @param encoding is the encoding of stream
+     * 
+     * @throws TikaException if could not create a language profile, for
+     *         example because the stream can not be read or the encoding is
+     *         not supported
+     *  
+     */
+    public static LanguageProfilerBuilder create(String name, InputStream is, String encoding) throws TikaException {
+
+        LanguageProfilerBuilder newProfile = new LanguageProfilerBuilder(name,
+                ABSOLUTE_MIN_NGRAM_LENGTH, ABSOLUTE_MAX_NGRAM_LENGTH);
+        StringBuilder text = new StringBuilder();
+
+        try {
+            // Decode through a Reader: decoding each byte chunk on its own
+            // would corrupt multi-byte characters that straddle two chunks.
+            BufferedReader reader = new BufferedReader(
+                    new InputStreamReader(is, encoding));
+            char buffer[] = new char[4096];
+            int len;
+            while ((len = reader.read(buffer)) != -1) {
+                text.append(buffer, 0, len);
+            }
+        } catch (IOException e) {
+            throw new TikaException("Could not create profile, " + e.getMessage(), e);
+        }
+
+        newProfile.analyze(text);
+        return newProfile;
+    }
+
+    /**
+     * Writes NGramProfile content into OutputStream, content is outputted with
+     * UTF-8 encoding. A comment header line is written first, followed by
+     * one <code>ngram count</code> line per entry. Entries are grouped by
+     * ngram size in increasing order; within a size they are sorted and
+     * limited to MAX_SIZE entries. The stream is flushed but not closed.
+     * 
+     * @param os the Stream to output to
+     * 
+     * @throws IOException if writing to the stream fails
+     */
+    public void save(OutputStream os) throws IOException {
+        // The header uses the same explicit encoding as the data lines
+        os.write(("# NgramProfile generated at " + new Date() + 
+                  " for Apache Tika Language Identification\n").getBytes("UTF-8"));
+
+        // And then each ngram, one group per ngram size
+        for (int size = minLength; size <= maxLength; size++) {
+            // A fresh list per size avoids stacking subList views on top of
+            // each other.
+            List<NGramEntry> sameSize = new ArrayList<NGramEntry>();
+            for (NGramEntry entry : ngrams.values()) {
+                if (entry.getSeq().length() == size) {
+                    sameSize.add(entry);
+                }
+            }
+            Collections.sort(sameSize);
+
+            // store at most MAX_SIZE ngrams for each size
+            int limit = Math.min(sameSize.size(), MAX_SIZE);
+            for (int i = 0; i < limit; i++) {
+                NGramEntry e = sameSize.get(i);
+                String line = e.toString() + " " + e.getCount() + "\n";
+                os.write(line.getBytes("UTF-8"));
+            }
+        }
+        os.flush();
+    }
+
+    /**
+     * main method used for testing only
+     * 
+     * @param args
+     */
+    public static void main(String args[]) {
+
+        // -create he sample_he.txt utf-8
+
+        String usage = "Usage: NGramProfile "
+                + "[-create profilename filename encoding] "
+                + "[-similarity file1 file2] "
+                + "[-score profile-name filename encoding]";
+        int command = 0;
+
+        final int CREATE = 1;
+        final int SIMILARITY = 2;
+        final int SCORE = 3;
+
+        String profilename = "";
+        String filename = "";
+        String filename2 = "";
+        String encoding = "";
+
+        if (args.length == 0) {
+            System.err.println(usage);
+            System.exit(-1);
+        }
+
+        for (int i = 0; i < args.length; i++) { // parse command line
+            if (args[i].equals("-create")) { // found -create option
+                command = CREATE;
+                profilename = args[++i];
+                filename = args[++i];
+                encoding = args[++i];
+            }
+
+            if (args[i].equals("-similarity")) { // found -similarity option
+                command = SIMILARITY;
+                filename = args[++i];
+                filename2 = args[++i];
+                encoding = args[++i];
+            }
+
+            if (args[i].equals("-score")) { // found -Score option
+                command = SCORE;
+                profilename = args[++i];
+                filename = args[++i];
+                encoding = args[++i];
+            }
+        }
+
+        try {
+
+            switch (command) {
+
+            case CREATE:
+
+                File f = new File(filename);
+                FileInputStream fis = new FileInputStream(f);
+                LanguageProfilerBuilder newProfile = LanguageProfilerBuilder
+                        .create(profilename, fis, encoding);
+                fis.close();
+                f = new File(profilename + "." + FILE_EXTENSION);
+                FileOutputStream fos = new FileOutputStream(f);
+                newProfile.save(fos);
+                System.out.println("new profile " + profilename + "."
+                        + FILE_EXTENSION + " was created.");
+                break;
+
+            case SIMILARITY:
+
+                f = new File(filename);
+                fis = new FileInputStream(f);
+                newProfile = LanguageProfilerBuilder.create(filename, fis,
+                        encoding);
+                newProfile.normalize();
+
+                f = new File(filename2);
+                fis = new FileInputStream(f);
+                LanguageProfilerBuilder newProfile2 = LanguageProfilerBuilder
+                        .create(filename2, fis, encoding);
+                newProfile2.normalize();
+                System.out.println("Similarity is "
+                        + newProfile.getSimilarity(newProfile2));
+                break;
+
+            case SCORE:
+                f = new File(filename);
+                fis = new FileInputStream(f);
+                newProfile = LanguageProfilerBuilder.create(filename, fis,
+                        encoding);
+
+                f = new File(profilename + "." + FILE_EXTENSION);
+                fis = new FileInputStream(f);
+                LanguageProfilerBuilder compare = new LanguageProfilerBuilder(
+                        profilename, DEFAULT_MIN_NGRAM_LENGTH,
+                        DEFAULT_MAX_NGRAM_LENGTH);
+                compare.load(fis);
+                System.out.println("Score is "
+                        + compare.getSimilarity(newProfile));
+                break;
+
+            }
+
+        } catch (Exception e) {
+            e.printStackTrace();
+            // throw new TikaException("");
+        }
+    }
+
+  
+    /**
+     * Inner class that describes a NGram: its character sequence, how many
+     * times it occurred and its frequency within the profile. Entries are
+     * ordered by decreasing frequency, then by character sequence.
+     */
+    class NGramEntry implements Comparable<NGramEntry> {
+
+        /** The NGRamProfile this NGram is related to */
+        private LanguageProfilerBuilder profile = null;
+
+        /** The sequence of characters of the ngram */
+        CharSequence seq = null;
+
+        /** The number of occurrences of this ngram in its profile */
+        private int count = 0;
+
+        /** The frequency of this ngram in its profile */
+        private float frequency = 0.0F;
+
+        /**
+         * Constructs a new NGramEntry with a count of zero
+         * 
+         * @param seq is the sequence of characters of the ngram
+         */
+        public NGramEntry(CharSequence seq) {
+            this.seq = seq;
+        }
+
+        /**
+         * Constructs a new NGramEntry
+         * 
+         * @param seq is the sequence of characters of the ngram
+         * @param count is the number of occurrences of this ngram
+         */
+        public NGramEntry(String seq, int count) {
+            this.seq = new StringBuffer(seq).subSequence(0, seq.length());
+            this.count = count;
+        }
+
+        /**
+         * Returns the number of occurrences of this ngram in its profile
+         * 
+         * @return the number of occurrences of this ngram in its profile
+         */
+        public int getCount() {
+            return count;
+        }
+
+        /**
+         * Returns the frequency of this ngram in its profile
+         * 
+         * @return the frequency of this ngram in its profile
+         */
+        public float getFrequency() {
+            return frequency;
+        }
+
+        /**
+         * Returns the sequence of characters of this ngram
+         * 
+         * @return the sequence of characters of this ngram
+         */
+        public CharSequence getSeq() {
+            return seq;
+        }
+
+        /**
+         * Returns the size of this ngram
+         * 
+         * @return the size of this ngram
+         */
+        public int size() {
+            return seq.length();
+        }
+
+        // Inherited JavaDoc
+        public int compareTo(NGramEntry ngram) {
+            // higher frequency first, ties broken by the character sequence
+            int diff = Float.compare(ngram.getFrequency(), frequency);
+            if (diff != 0) {
+                return diff;
+            } else {
+                return (toString().compareTo(ngram.toString()));
+            }
+        }
+
+        /**
+         * Increments the number of occurrences of this ngram.
+         */
+        public void inc() {
+            count++;
+        }
+
+        /**
+         * Associates a profile to this ngram
+         * 
+         * @param profile
+         *            is the profile associated to this ngram
+         */
+        public void setProfile(LanguageProfilerBuilder profile) {
+            this.profile = profile;
+        }
+
+        /**
+         * Returns the profile associated to this ngram
+         * 
+         * @return the profile associated to this ngram
+         */
+        public LanguageProfilerBuilder getProfile() {
+            return profile;
+        }
+
+        // Inherited JavaDoc
+        public String toString() {
+            return seq.toString();
+        }
+
+        // Inherited JavaDoc
+        public int hashCode() {
+            return seq.hashCode();
+        }
+
+        // Inherited JavaDoc
+        public boolean equals(Object obj) {
+            // Explicit type and null checks instead of relying on a caught
+            // ClassCastException or NullPointerException.
+            if (!(obj instanceof NGramEntry)) {
+                return false;
+            }
+            CharSequence otherSeq = ((NGramEntry) obj).seq;
+            return otherSeq != null && otherSeq.equals(seq);
+        }
+
+    }
+
+    /**
+     * A minimal growable character buffer. Clearing it only resets the
+     * logical length, so the backing array is reused from word to word.
+     */
+    private class QuickStringBuffer implements CharSequence {
+
+        /** Backing storage; only the first <code>count</code> chars are valid. */
+        private char value[];
+
+        /** The number of valid characters in <code>value</code>. */
+        private int count;
+
+        /** Creates an empty buffer with room for 16 characters. */
+        QuickStringBuffer() {
+            this(16);
+        }
+
+        /** Creates a buffer that uses the given array, fully filled, as storage. */
+        QuickStringBuffer(char[] value) {
+            this.value = value;
+            count = value.length;
+        }
+
+        /** Creates an empty buffer with the given initial capacity. */
+        QuickStringBuffer(int length) {
+            value = new char[length];
+        }
+
+        /** Creates a buffer holding the given string plus 16 spare characters. */
+        QuickStringBuffer(String str) {
+            this(str.length() + 16);
+            append(str);
+        }
+
+        public int length() {
+            return count;
+        }
+
+        /**
+         * Grows the backing array to roughly twice its size, or to the
+         * requested minimum when that is larger.
+         */
+        private void expandCapacity(int minimumCapacity) {
+            int newCapacity = (value.length + 1) * 2;
+            if (newCapacity < 0) {
+                // the doubling overflowed
+                newCapacity = Integer.MAX_VALUE;
+            } else if (minimumCapacity > newCapacity) {
+                newCapacity = minimumCapacity;
+            }
+
+            char newValue[] = new char[newCapacity];
+            System.arraycopy(value, 0, newValue, 0, count);
+            value = newValue;
+        }
+
+        /** Empties the buffer while keeping its capacity. */
+        QuickStringBuffer clear() {
+            count = 0;
+            return this;
+        }
+
+        public char charAt(int index) {
+            // Positions at or beyond count hold stale data from earlier use
+            if (index < 0 || index >= count) {
+                throw new IndexOutOfBoundsException("index " + index
+                        + ", length " + count);
+            }
+            return value[index];
+        }
+
+        /** Appends a string; <code>null</code> is appended as "null". */
+        QuickStringBuffer append(String str) {
+            if (str == null) {
+                str = String.valueOf(str);
+            }
+
+            int len = str.length();
+            int newcount = count + len;
+            if (newcount > value.length) {
+                expandCapacity(newcount);
+            }
+            str.getChars(0, len, value, count);
+            count = newcount;
+            return this;
+        }
+
+        /** Appends a single character. */
+        QuickStringBuffer append(char c) {
+            int newcount = count + 1;
+            if (newcount > value.length) {
+                expandCapacity(newcount);
+            }
+            value[count++] = c;
+            return this;
+        }
+
+        public CharSequence subSequence(int start, int end) {
+            // Bound by the logical length, not by the array capacity
+            if (start < 0 || end > count || start > end) {
+                throw new IndexOutOfBoundsException("start " + start
+                        + ", end " + end + ", length " + count);
+            }
+            return new String(value, start, end - start);
+        }
+
+        public String toString() {
+            // Only the first count characters belong to the sequence; the
+            // rest of the array is unused capacity or stale content.
+            return new String(value, 0, count);
+        }
+    }
+}

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingWriter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Message.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CryptoParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/EndDocumentShieldingContentHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/Link.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java
------------------------------------------------------------------------------
    svn:eol-style = native