You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/18 12:39:12 UTC
svn commit: r1172242 [1/9] - in /tika/trunk:
tika-app/src/test/java/org/apache/tika/cli/
tika-core/src/main/java/org/apache/tika/detect/
tika-core/src/main/java/org/apache/tika/extractor/
tika-core/src/main/java/org/apache/tika/fork/ tika-core/src/main...
Author: jukka
Date: Sun Sep 18 10:39:08 2011
New Revision: 1172242
URL: http://svn.apache.org/viewvc?rev=1172242&view=rev
Log:
Add missing svn:eol-style settings
Modified:
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java (contents, props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedResourceHandler.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderResource.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkProxy.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkResource.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/fork/InputStreamProxy.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/fork/InputStreamResource.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/fork/MemoryURLConnection.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandler.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandlerFactory.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/io/CloseShieldInputStream.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/io/ClosedInputStream.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/io/CountingInputStream.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOExceptionWithCause.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOUtils.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/io/LookaheadInputStream.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/io/NullInputStream.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/io/NullOutputStream.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/io/TemporaryFiles.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/io/TemporaryResources.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java (contents, props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingHandler.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingWriter.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Message.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CryptoParser.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/EndDocumentShieldingContentHandler.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/Link.java (props changed)
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java (props changed)
tika/trunk/tika-core/src/main/resources/org/apache/tika/language/tika.language.properties (props changed)
tika/trunk/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java (props changed)
tika/trunk/tika-core/src/test/java/org/apache/tika/fork/ForkTestParser.java (props changed)
tika/trunk/tika-core/src/test/java/org/apache/tika/io/LookaheadInputStreamTest.java (props changed)
tika/trunk/tika-core/src/test/java/org/apache/tika/language/LanguageProfilerBuilderTest.java (contents, props changed)
tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java (props changed)
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java (props changed)
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java (props changed)
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/PatternsTest.java (props changed)
tika/trunk/tika-core/src/test/java/org/apache/tika/parser/CompositeParserTest.java (props changed)
tika/trunk/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHM2XHTML.java (contents, props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java (contents, props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java (contents, props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java (contents, props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java (contents, props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java (contents, props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java (contents, props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java (contents, props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java (contents, props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java (contents, props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java (contents, props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java (contents, props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/assertion/ChmAssert.java (contents, props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java (contents, props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java (contents, props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java (contents, props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java (contents, props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java (contents, props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java (contents, props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java (contents, props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java (contents, props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java (contents, props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubContentParser.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/MetadataFields.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeDependantMetadataHandler.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/AttributeMetadataHandler.java (props changed)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java (props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java (props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmBlockInfo.java (contents, props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmDocumentInformation.java (contents, props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java (contents, props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java (contents, props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmItsfHeader.java (contents, props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmItspHeader.java (contents, props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxState.java (contents, props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxcControlData.java (contents, props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmLzxcResetTable.java (contents, props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestDirectoryListingEntry.java (contents, props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestParameters.java (contents, props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestPmgiHeader.java (contents, props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestPmglHeader.java (contents, props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java (props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java (props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java (props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java (props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java (props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java (props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java (props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java (props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java (props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java (props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WriteProtectedParserTest.java (props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java (props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java (props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java (props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java (props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/video/FLVParserTest.java (props changed)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java (props changed)
tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataResource.java (props changed)
tika/trunk/tika-server/src/main/java/org/apache/tika/server/PartExtractor.java (props changed)
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaExceptionMapper.java (props changed)
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java (props changed)
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java (props changed)
tika/trunk/tika-server/src/main/java/org/apache/tika/server/UnpackerResource.java (props changed)
tika/trunk/tika-server/src/main/java/org/apache/tika/server/ZipOutput.java (props changed)
tika/trunk/tika-server/src/main/java/org/apache/tika/server/ZipUtils.java (props changed)
tika/trunk/tika-server/src/main/resources/commons-logging.properties (props changed)
tika/trunk/tika-server/src/main/resources/tikaserver-version.properties (props changed)
tika/trunk/tika-server/src/test/java/org/apache/tika/server/MetadataResourceTest.java (props changed)
tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java (props changed)
tika/trunk/tika-server/src/test/java/org/apache/tika/server/UnpackerResourceTest.java (props changed)
Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1172242&r1=1172241&r2=1172242&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java (original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Sun Sep 18 10:39:08 2011
@@ -1,189 +1,189 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.cli;
-
-
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.IOException;
-import java.io.PrintStream;
-import java.net.URI;
-import junit.framework.Assert;
-import junit.framework.TestCase;
-
-/**
- * Tests the Tika's cli
- */
-public class TikaCLITest extends TestCase{
- /* Test members */
- private File profile = null;
- private ByteArrayOutputStream outContent = null;
- private PrintStream stdout = null;
- private PrintStream reassign = null;
- private URI testDataURI = new File("src/test/resources/test-data/").toURI();
- private String resorcePrefix = testDataURI.toString();
-
-
- public void setUp() throws Exception {
- profile = new File("welsh.ngp");
- outContent = new ByteArrayOutputStream();
- stdout = System.out;
- reassign = new PrintStream(outContent);
- System.setOut(reassign);
- }
-
-
- /**
- * Creates a welsh language profile
- *
- * @throws Exception
- */
- public void testCreateProfile() throws Exception {
- String[] params = {"--create-profile=welsh", "-eUTF-8", resorcePrefix + "welsh_corpus.txt"};
- TikaCLI.main(params);
- Assert.assertTrue(profile.exists());
- }
-
- /**
- * Tests --list-parser-detail option of the cli
- *
- * @throws Exception
- */
- public void testListParserDetail() throws Exception{
- String[] params = {"--list-parser-detail"};
- TikaCLI.main(params);
- Assert.assertTrue(outContent.toString().contains("application/vnd.oasis.opendocument.text-web"));
- }
-
- /**
- * Tests --list-parser option of the cli
- *
- * @throws Exception
- */
- public void testListParsers() throws Exception{
- String[] params = {"--list-parser"};
- TikaCLI.main(params);
- //Assert was commented temporarily for finding the problem
-// Assert.assertTrue(outContent != null && outContent.toString().contains("org.apache.tika.parser.iwork.IWorkPackageParser"));
- }
-
- /**
- * Tests -x option of the cli
- *
- * @throws Exception
- */
- public void testXMLOutput() throws Exception{
- String[] params = {"-x", resorcePrefix + "alice.cli.test"};
- TikaCLI.main(params);
- Assert.assertTrue(outContent.toString().contains("?xml version=\"1.0\" encoding=\"UTF-8\"?"));
- }
-
- /**
- * Tests a -h option of the cli
- *
- * @throws Exception
- */
- public void testHTMLOutput() throws Exception{
- String[] params = {"-h", resorcePrefix + "alice.cli.test"};
- TikaCLI.main(params);
- Assert.assertTrue(outContent.toString().contains("html xmlns=\"http://www.w3.org/1999/xhtml"));
- }
-
- /**
- * Tests -t option of the cli
- *
- * @throws Exception
- */
- public void testTextOutput() throws Exception{
- String[] params = {"-t", resorcePrefix + "alice.cli.test"};
- TikaCLI.main(params);
- Assert.assertTrue(outContent.toString().contains("finished off the cake"));
- }
-
- /**
- * Tests -m option of the cli
- * @throws Exception
- */
- public void testMetadataOutput() throws Exception{
- String[] params = {"-m", resorcePrefix + "alice.cli.test"};
- TikaCLI.main(params);
- Assert.assertTrue(outContent.toString().contains("text/plain"));
- }
-
- /**
- * Tests -l option of the cli
- *
- * @throws Exception
- */
- public void testLanguageOutput() throws Exception{
- String[] params = {"-l", resorcePrefix + "alice.cli.test"};
- TikaCLI.main(params);
- Assert.assertTrue(outContent.toString().contains("en"));
- }
-
- /**
- * Tests -d option of the cli
- *
- * @throws Exception
- */
- public void testDetectOutput() throws Exception{
- String[] params = {"-d", resorcePrefix + "alice.cli.test"};
- TikaCLI.main(params);
- Assert.assertTrue(outContent.toString().contains("text/plain"));
- }
-
- /**
- * Tests --list-met-models option of the cli
- *
- * @throws Exception
- */
- public void testListMetModels() throws Exception{
- String[] params = {"--list-met-models", resorcePrefix + "alice.cli.test"};
- TikaCLI.main(params);
- Assert.assertTrue(outContent.toString().contains("text/plain"));
- }
-
- /**
- * Tests --list-supported-types option of the cli
- *
- * @throws Exception
- */
- public void testListSupportedTypes() throws Exception{
- String[] params = {"--list-supported-types", resorcePrefix + "alice.cli.test"};
- TikaCLI.main(params);
- Assert.assertTrue(outContent.toString().contains("supertype: application/octet-stream"));
- }
-
- /**
- * Tears down the test. Returns the System.out
- */
- public void tearDown() throws Exception {
- if(profile != null && profile.exists())
- profile.delete();
- System.setOut(stdout);
- closeStreams();
- }
-
- private void closeStreams() throws IOException {
- if(outContent != null)
- outContent.close();
- if(stdout != null)
- stdout.close();
- if(reassign != null)
- reassign.close();
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.cli;
+
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.net.URI;
+import junit.framework.Assert;
+import junit.framework.TestCase;
+
+/**
+ * Tests the Tika's cli
+ */
+public class TikaCLITest extends TestCase{
+ /* Test members */
+ private File profile = null;
+ private ByteArrayOutputStream outContent = null;
+ private PrintStream stdout = null;
+ private PrintStream reassign = null;
+ private URI testDataURI = new File("src/test/resources/test-data/").toURI();
+ private String resorcePrefix = testDataURI.toString();
+
+
+ public void setUp() throws Exception {
+ profile = new File("welsh.ngp");
+ outContent = new ByteArrayOutputStream();
+ stdout = System.out;
+ reassign = new PrintStream(outContent);
+ System.setOut(reassign);
+ }
+
+
+ /**
+ * Creates a welsh language profile
+ *
+ * @throws Exception
+ */
+ public void testCreateProfile() throws Exception {
+ String[] params = {"--create-profile=welsh", "-eUTF-8", resorcePrefix + "welsh_corpus.txt"};
+ TikaCLI.main(params);
+ Assert.assertTrue(profile.exists());
+ }
+
+ /**
+ * Tests --list-parser-detail option of the cli
+ *
+ * @throws Exception
+ */
+ public void testListParserDetail() throws Exception{
+ String[] params = {"--list-parser-detail"};
+ TikaCLI.main(params);
+ Assert.assertTrue(outContent.toString().contains("application/vnd.oasis.opendocument.text-web"));
+ }
+
+ /**
+ * Tests --list-parser option of the cli
+ *
+ * @throws Exception
+ */
+ public void testListParsers() throws Exception{
+ String[] params = {"--list-parser"};
+ TikaCLI.main(params);
+ //Assert was commented temporarily for finding the problem
+// Assert.assertTrue(outContent != null && outContent.toString().contains("org.apache.tika.parser.iwork.IWorkPackageParser"));
+ }
+
+ /**
+ * Tests -x option of the cli
+ *
+ * @throws Exception
+ */
+ public void testXMLOutput() throws Exception{
+ String[] params = {"-x", resorcePrefix + "alice.cli.test"};
+ TikaCLI.main(params);
+ Assert.assertTrue(outContent.toString().contains("?xml version=\"1.0\" encoding=\"UTF-8\"?"));
+ }
+
+ /**
+ * Tests a -h option of the cli
+ *
+ * @throws Exception
+ */
+ public void testHTMLOutput() throws Exception{
+ String[] params = {"-h", resorcePrefix + "alice.cli.test"};
+ TikaCLI.main(params);
+ Assert.assertTrue(outContent.toString().contains("html xmlns=\"http://www.w3.org/1999/xhtml"));
+ }
+
+ /**
+ * Tests -t option of the cli
+ *
+ * @throws Exception
+ */
+ public void testTextOutput() throws Exception{
+ String[] params = {"-t", resorcePrefix + "alice.cli.test"};
+ TikaCLI.main(params);
+ Assert.assertTrue(outContent.toString().contains("finished off the cake"));
+ }
+
+ /**
+ * Tests -m option of the cli
+ * @throws Exception
+ */
+ public void testMetadataOutput() throws Exception{
+ String[] params = {"-m", resorcePrefix + "alice.cli.test"};
+ TikaCLI.main(params);
+ Assert.assertTrue(outContent.toString().contains("text/plain"));
+ }
+
+ /**
+ * Tests -l option of the cli
+ *
+ * @throws Exception
+ */
+ public void testLanguageOutput() throws Exception{
+ String[] params = {"-l", resorcePrefix + "alice.cli.test"};
+ TikaCLI.main(params);
+ Assert.assertTrue(outContent.toString().contains("en"));
+ }
+
+ /**
+ * Tests -d option of the cli
+ *
+ * @throws Exception
+ */
+ public void testDetectOutput() throws Exception{
+ String[] params = {"-d", resorcePrefix + "alice.cli.test"};
+ TikaCLI.main(params);
+ Assert.assertTrue(outContent.toString().contains("text/plain"));
+ }
+
+ /**
+ * Tests --list-met-models option of the cli
+ *
+ * @throws Exception
+ */
+ public void testListMetModels() throws Exception{
+ String[] params = {"--list-met-models", resorcePrefix + "alice.cli.test"};
+ TikaCLI.main(params);
+ Assert.assertTrue(outContent.toString().contains("text/plain"));
+ }
+
+ /**
+ * Tests --list-supported-types option of the cli
+ *
+ * @throws Exception
+ */
+ public void testListSupportedTypes() throws Exception{
+ String[] params = {"--list-supported-types", resorcePrefix + "alice.cli.test"};
+ TikaCLI.main(params);
+ Assert.assertTrue(outContent.toString().contains("supertype: application/octet-stream"));
+ }
+
+ /**
+ * Tears down the test. Returns the System.out
+ */
+ public void tearDown() throws Exception {
+ if(profile != null && profile.exists())
+ profile.delete();
+ System.setOut(stdout);
+ closeStreams();
+ }
+
+ private void closeStreams() throws IOException {
+ if(outContent != null)
+ outContent.close();
+ if(stdout != null)
+ stdout.close();
+ if(reassign != null)
+ reassign.close();
+ }
+}
Propchange: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/ContainerAwareDetector.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedResourceHandler.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderResource.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkProxy.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/ForkResource.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/InputStreamProxy.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/InputStreamResource.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/MemoryURLConnection.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandler.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandlerFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/io/CloseShieldInputStream.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/io/ClosedInputStream.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/io/CountingInputStream.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOExceptionWithCause.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/io/LookaheadInputStream.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/io/NullInputStream.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/io/NullOutputStream.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/io/TemporaryFiles.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/io/TemporaryResources.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageIdentifier.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfile.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java?rev=1172242&r1=1172241&r2=1172242&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java Sun Sep 18 10:39:08 2011
@@ -1,767 +1,767 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.language;
-
-// JDK imports
-import java.io.BufferedInputStream;
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import org.apache.tika.exception.TikaException;
-
-/**
- * This class runs a ngram analysis over submitted text, results might be used
- * for automatic language identification.
- *
- * The similarity calculation is at experimental level. You have been warned.
- *
- * Methods are provided to build new NGramProfiles profiles.
- *
- * @author Sami Siren
- * @author Jerome Charron - http://frutch.free.fr/
- */
-public class LanguageProfilerBuilder {
-
- // public static final Log LOG =
- // LogFactory.getLog(LanguageProfilerBuilder.class);
-
- /** The minimum length allowed for a ngram. */
- final static int ABSOLUTE_MIN_NGRAM_LENGTH = 3; /* was 1 */
-
- /** The maximum length allowed for a ngram. */
- final static int ABSOLUTE_MAX_NGRAM_LENGTH = 3; /* was 4 */
-
- /** The default min length of ngram */
- final static int DEFAULT_MIN_NGRAM_LENGTH = 3;
-
- /** The default max length of ngram */
- final static int DEFAULT_MAX_NGRAM_LENGTH = 3;
-
- /** The ngram profile file extension */
- final static String FILE_EXTENSION = "ngp";
-
- /** The profile max size (number of ngrams of the same size) */
- final static int MAX_SIZE = 1000;
-
- /** separator char */
- final static char SEPARATOR = '_';
- /** The String form of the separator char */
- private final static String SEP_CHARSEQ = new String(
- new char[] { SEPARATOR });
-
- /** The profile's name */
- private String name = null;
-
- /** The NGrams of this profile sorted on the number of occurrences */
- private List<NGramEntry> sorted = null;
-
- /** The min length of ngram */
- private int minLength = DEFAULT_MIN_NGRAM_LENGTH;
-
- /** The max length of ngram */
- private int maxLength = DEFAULT_MAX_NGRAM_LENGTH;
-
- /** The total number of ngrams occurences */
- private int[] ngramcounts = null;
-
- /** An index of the ngrams of the profile */
- private Map<CharSequence, NGramEntry> ngrams = null;
-
- /** A StringBuffer used during analysis */
- private QuickStringBuffer word = new QuickStringBuffer();
-
- /**
- * Constructs a new ngram profile
- *
- * @param name is the name of the profile
- * @param minlen is the min length of ngram sequences
- * @param maxlen is the max length of ngram sequences
- */
- public LanguageProfilerBuilder(String name, int minlen, int maxlen) {
- // TODO: Compute the initial capacity using minlen and maxlen.
- this.ngrams = new HashMap<CharSequence, NGramEntry>(4000);
- this.minLength = minlen;
- this.maxLength = maxlen;
- this.name = name;
- }
-
- /**
- * Constructs a new ngram profile where minlen=3, maxlen=3
- *
- * @param name is a name of profile, usually two length string
- * @since Tika 1.0
- */
- public LanguageProfilerBuilder(String name) {
- this.ngrams = new HashMap<CharSequence, NGramEntry>(4000);
- this.minLength = ABSOLUTE_MIN_NGRAM_LENGTH;
- this.maxLength = ABSOLUTE_MAX_NGRAM_LENGTH;
- this.name = name;
- }
-
- /**
- * @return Returns the name.
- */
- public String getName() {
- return name;
- }
-
- // This method was commented because it depends on org.apache.lucene.analysis.Token
- // that is not a part of the Tika
- // /**
- // * Adds ngrams from a token to this profile
- // *
- // * @param t is the Token to be added
- // */
- // public void add(Token t) {
- // add(new StringBuffer().append(SEPARATOR)
- // .append(t.term())
- // .append(SEPARATOR));
- // }
-
- /**
- * Adds ngrams from a single word to this profile
- *
- * @param word is the word to add
- */
- public void add(StringBuffer word) {
- for (int i = minLength; (i <= maxLength) && (i < word.length()); i++) {
- add(word, i);
- }
- }
-
- /**
- * Adds the last NGrams from the specified word.
- */
- private void add(QuickStringBuffer word) {
- int wlen = word.length();
- if (wlen >= minLength) {
- int max = Math.min(maxLength, wlen);
- for (int i = minLength; i <= max; i++) {
- add(word.subSequence(wlen - i, wlen));
- }
- }
- }
-
- /**
- * Adds ngrams from a single word in this profile
- *
- * @param word is the word to add
- * @param n is the ngram size
- */
- private void add(CharSequence cs) {
-
- if (cs.equals(SEP_CHARSEQ)) {
- return;
- }
- NGramEntry nge = ngrams.get(cs);
- if (nge == null) {
- nge = new NGramEntry(cs);
- ngrams.put(cs, nge);
- }
- nge.inc();
- }
-
- /**
- * Analyzes a piece of text
- *
- * @param text
- * the text to be analyzed
- */
- public void analyze(StringBuilder text) {
-
- if (ngrams != null) {
- ngrams.clear();
- sorted = null;
- ngramcounts = null;
- }
-
- word.clear().append(SEPARATOR);
- for (int i = 0; i < text.length(); i++) {
- char c = Character.toLowerCase(text.charAt(i));
-
- if (Character.isLetter(c)) {
- add(word.append(c));
- } else {
- // found word boundary
- if (word.length() > 1) {
- // we have a word!
- add(word.append(SEPARATOR));
- word.clear().append(SEPARATOR);
- }
- }
- }
-
- if (word.length() > 1) {
- // we have a word!
- add(word.append(SEPARATOR));
- }
- normalize();
- }
-
- /**
- * @param word
- * @param n sequence length
- */
- private void add(StringBuffer word, int n) {
- for (int i = 0; i <= word.length() - n; i++) {
- add(word.subSequence(i, i + n));
- }
- }
-
- /**
- * Normalizes the profile (calculates the ngrams frequencies)
- */
- protected void normalize() {
- NGramEntry e = null;
- Iterator<NGramEntry> i = ngrams.values().iterator();
-
- // Calculates ngram count if not already done
- if (ngramcounts == null) {
- ngramcounts = new int[maxLength + 1];
- while (i.hasNext()) {
- e = i.next();
- ngramcounts[e.size()] += e.count;
- }
- }
-
- i = ngrams.values().iterator();
- while (i.hasNext()) {
- e = i.next();
- e.frequency = (float) e.count / (float) ngramcounts[e.size()];
- }
- }
-
- /**
- * Returns a sorted list of ngrams (sort done by 1. frequency 2. sequence)
- *
- * @return sorted vector of ngrams
- */
- public List<NGramEntry> getSorted() {
- // make sure sorting is done only once
- if (sorted == null) {
- sorted = new ArrayList<NGramEntry>(ngrams.values());
- Collections.sort(sorted);
-
- // trim at NGRAM_LENGTH entries
- if (sorted.size() > MAX_SIZE) {
- sorted = sorted.subList(0, MAX_SIZE);
- }
- }
- return sorted;
- }
-
- // Inherited JavaDoc
- public String toString() {
-
- StringBuffer s = new StringBuffer().append("NGramProfile: ")
- .append(name).append("\n");
-
- Iterator<NGramEntry> i = getSorted().iterator();
-
- while (i.hasNext()) {
- NGramEntry entry = i.next();
- s.append("[").append(entry.seq).append("/").append(entry.count)
- .append("/").append(entry.frequency).append("]\n");
- }
- return s.toString();
- }
-
- /**
- * Calculates a score how well NGramProfiles match each other
- *
- * @param another
- * ngram profile to compare against
- * @return similarity 0=exact match
- * @throws TikaException
- * if could not calculate a score
- */
- public float getSimilarity(LanguageProfilerBuilder another)
- throws TikaException {
-
- float sum = 0;
-
- try {
- Iterator<NGramEntry> i = another.getSorted().iterator();
- while (i.hasNext()) {
- NGramEntry other = i.next();
- if (ngrams.containsKey(other.seq)) {
- sum += Math.abs((other.frequency - ngrams.get(other.seq).frequency)) / 2;
- } else {
- sum += other.frequency;
- }
- }
- i = getSorted().iterator();
- while (i.hasNext()) {
- NGramEntry other = i.next();
- if (another.ngrams.containsKey(other.seq)) {
- sum += Math.abs((other.frequency - another.ngrams
- .get(other.seq).frequency)) / 2;
- } else {
- sum += other.frequency;
- }
- }
- } catch (Exception e) {
- throw new TikaException("Could not calculate a score how well NGramProfiles match each other");
- }
- return sum;
- }
-
- /**
- * Loads a ngram profile from an InputStream (assumes UTF-8 encoded content)
- *
- * @param is the InputStream to read
- */
- public void load(InputStream is) throws IOException {
-
- ngrams.clear();
- ngramcounts = new int[maxLength + 1];
- BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
- String line = null;
-
- while ((line = reader.readLine()) != null) {
-
- // # starts a comment line
- if (line.charAt(0) != '#') {
- int spacepos = line.indexOf(' ');
- String ngramsequence = line.substring(0, spacepos).trim();
- int len = ngramsequence.length();
- if ((len >= minLength) && (len <= maxLength)) {
- int ngramcount = Integer.parseInt(line.substring(spacepos + 1));
- NGramEntry en = new NGramEntry(ngramsequence, ngramcount);
- ngrams.put(en.getSeq(), en);
- ngramcounts[len] += ngramcount;
- }
- }
- }
- normalize();
- }
-
- /**
- * Creates a new Language profile from (preferably quite large - 5-10k of
- * lines) text file
- *
- * @param name to be given for the profile
- * @param is a stream to be read
- * @param encoding is the encoding of stream
- *
- * @throws TikaException if could not create a language profile
- *
- */
- public static LanguageProfilerBuilder create(String name, InputStream is, String encoding) throws TikaException {
-
- LanguageProfilerBuilder newProfile = new LanguageProfilerBuilder(name,
- ABSOLUTE_MIN_NGRAM_LENGTH, ABSOLUTE_MAX_NGRAM_LENGTH);
- BufferedInputStream bis = new BufferedInputStream(is);
-
- byte buffer[] = new byte[4096];
- StringBuilder text = new StringBuilder();
- int len;
-
- try {
- while ((len = bis.read(buffer)) != -1) {
- text.append(new String(buffer, 0, len, encoding));
- }
- } catch (IOException e) {
- throw new TikaException("Could not create profile, " + e.getMessage());
- }
-
- newProfile.analyze(text);
- return newProfile;
- }
-
- /**
- * Writes NGramProfile content into OutputStream, content is outputted with
- * UTF-8 encoding
- *
- * @param os the Stream to output to
- *
- * @throws IOException
- */
- public void save(OutputStream os) throws IOException {
- os.write(("# NgramProfile generated at " + new Date() +
- " for Apache Tika Language Identification\n").getBytes());
-
- // And then each ngram
-
- // First dispatch ngrams in many lists depending on their size
- // (one list for each size, in order to store MAX_SIZE ngrams for each
- // size of ngram)
- List<NGramEntry> list = new ArrayList<NGramEntry>();
- List<NGramEntry> sublist = new ArrayList<NGramEntry>();
- NGramEntry[] entries = ngrams.values().toArray(
- new NGramEntry[ngrams.size()]);
- for (int i = minLength; i <= maxLength; i++) {
- for (int j = 0; j < entries.length; j++) {
- if (entries[j].getSeq().length() == i) {
- sublist.add(entries[j]);
- }
- }
- Collections.sort(sublist);
- if (sublist.size() > MAX_SIZE) {
- sublist = sublist.subList(0, MAX_SIZE);
- }
- list.addAll(sublist);
- sublist.clear();
- }
- for (int i = 0; i < list.size(); i++) {
- NGramEntry e = list.get(i);
- String line = e.toString() + " " + e.getCount() + "\n";
- os.write(line.getBytes("UTF-8"));
- }
- os.flush();
- }
-
- /**
- * main method used for testing only
- *
- * @param args
- */
- public static void main(String args[]) {
-
- // -create he sample_he.txt utf-8
-
- String usage = "Usage: NGramProfile "
- + "[-create profilename filename encoding] "
- + "[-similarity file1 file2] "
- + "[-score profile-name filename encoding]";
- int command = 0;
-
- final int CREATE = 1;
- final int SIMILARITY = 2;
- final int SCORE = 3;
-
- String profilename = "";
- String filename = "";
- String filename2 = "";
- String encoding = "";
-
- if (args.length == 0) {
- System.err.println(usage);
- System.exit(-1);
- }
-
- for (int i = 0; i < args.length; i++) { // parse command line
- if (args[i].equals("-create")) { // found -create option
- command = CREATE;
- profilename = args[++i];
- filename = args[++i];
- encoding = args[++i];
- }
-
- if (args[i].equals("-similarity")) { // found -similarity option
- command = SIMILARITY;
- filename = args[++i];
- filename2 = args[++i];
- encoding = args[++i];
- }
-
- if (args[i].equals("-score")) { // found -Score option
- command = SCORE;
- profilename = args[++i];
- filename = args[++i];
- encoding = args[++i];
- }
- }
-
- try {
-
- switch (command) {
-
- case CREATE:
-
- File f = new File(filename);
- FileInputStream fis = new FileInputStream(f);
- LanguageProfilerBuilder newProfile = LanguageProfilerBuilder
- .create(profilename, fis, encoding);
- fis.close();
- f = new File(profilename + "." + FILE_EXTENSION);
- FileOutputStream fos = new FileOutputStream(f);
- newProfile.save(fos);
- System.out.println("new profile " + profilename + "."
- + FILE_EXTENSION + " was created.");
- break;
-
- case SIMILARITY:
-
- f = new File(filename);
- fis = new FileInputStream(f);
- newProfile = LanguageProfilerBuilder.create(filename, fis,
- encoding);
- newProfile.normalize();
-
- f = new File(filename2);
- fis = new FileInputStream(f);
- LanguageProfilerBuilder newProfile2 = LanguageProfilerBuilder
- .create(filename2, fis, encoding);
- newProfile2.normalize();
- System.out.println("Similarity is "
- + newProfile.getSimilarity(newProfile2));
- break;
-
- case SCORE:
- f = new File(filename);
- fis = new FileInputStream(f);
- newProfile = LanguageProfilerBuilder.create(filename, fis,
- encoding);
-
- f = new File(profilename + "." + FILE_EXTENSION);
- fis = new FileInputStream(f);
- LanguageProfilerBuilder compare = new LanguageProfilerBuilder(
- profilename, DEFAULT_MIN_NGRAM_LENGTH,
- DEFAULT_MAX_NGRAM_LENGTH);
- compare.load(fis);
- System.out.println("Score is "
- + compare.getSimilarity(newProfile));
- break;
-
- }
-
- } catch (Exception e) {
- e.printStackTrace();
- // throw new TikaException("");
- }
- }
-
-
- /**
- * Inner class that describes a NGram
- */
- class NGramEntry implements Comparable<NGramEntry> {
-
- /** The NGRamProfile this NGram is related to */
- private LanguageProfilerBuilder profile = null;
-
- /** The sequence of characters of the ngram */
- CharSequence seq = null;
-
- /** The number of occurences of this ngram in its profile */
- private int count = 0;
-
- /** The frequency of this ngram in its profile */
- private float frequency = 0.0F;
-
- /**
- * Constructs a new NGramEntry
- *
- * @param seq is the sequence of characters of the ngram
- */
- public NGramEntry(CharSequence seq) {
- this.seq = seq;
- }
-
- /**
- * Constructs a new NGramEntry
- *
- * @param seq is the sequence of characters of the ngram
- * @param count is the number of occurrences of this ngram
- */
- public NGramEntry(String seq, int count) {
- this.seq = new StringBuffer(seq).subSequence(0, seq.length());
- this.count = count;
- }
-
- /**
- * Returns the number of occurrences of this ngram in its profile
- *
- * @return the number of occurrences of this ngram in its profile
- */
- public int getCount() {
- return count;
- }
-
- /**
- * Returns the frequency of this ngram in its profile
- *
- * @return the frequency of this ngram in its profile
- */
- public float getFrequency() {
- return frequency;
- }
-
- /**
- * Returns the sequence of characters of this ngram
- *
- * @return the sequence of characters of this ngram
- */
- public CharSequence getSeq() {
- return seq;
- }
-
- /**
- * Returns the size of this ngram
- *
- * @return the size of this ngram
- */
- public int size() {
- return seq.length();
- }
-
- // Inherited JavaDoc
- public int compareTo(NGramEntry ngram) {
- int diff = Float.compare(ngram.getFrequency(), frequency);
- if (diff != 0) {
- return diff;
- } else {
- return (toString().compareTo(ngram.toString()));
- }
- }
-
- /**
- * Increments the number of occurrences of this ngram.
- */
- public void inc() {
- count++;
- }
-
- /**
- * Associated a profile to this ngram
- *
- * @param profile
- * is the profile associated to this ngram
- */
- public void setProfile(LanguageProfilerBuilder profile) {
- this.profile = profile;
- }
-
- /**
- * Returns the profile associated to this ngram
- *
- * @return the profile associated to this ngram
- */
- public LanguageProfilerBuilder getProfile() {
- return profile;
- }
-
- // Inherited JavaDoc
- public String toString() {
- return seq.toString();
- }
-
- // Inherited JavaDoc
- public int hashCode() {
- return seq.hashCode();
- }
-
- // Inherited JavaDoc
- public boolean equals(Object obj) {
-
- NGramEntry ngram = null;
- try {
- ngram = (NGramEntry) obj;
- return ngram.seq.equals(seq);
- } catch (Exception e) {
- return false;
- }
- }
-
- }
-
- private class QuickStringBuffer implements CharSequence {
-
- private char value[];
-
- private int count;
-
- QuickStringBuffer() {
- this(16);
- }
-
- QuickStringBuffer(char[] value) {
- this.value = value;
- count = value.length;
- }
-
- QuickStringBuffer(int length) {
- value = new char[length];
- }
-
- QuickStringBuffer(String str) {
- this(str.length() + 16);
- append(str);
- }
-
- public int length() {
- return count;
- }
-
- private void expandCapacity(int minimumCapacity) {
- int newCapacity = (value.length + 1) * 2;
- if (newCapacity < 0) {
- newCapacity = Integer.MAX_VALUE;
- } else if (minimumCapacity > newCapacity) {
- newCapacity = minimumCapacity;
- }
-
- char newValue[] = new char[newCapacity];
- System.arraycopy(value, 0, newValue, 0, count);
- value = newValue;
- }
-
- QuickStringBuffer clear() {
- count = 0;
- return this;
- }
-
- public char charAt(int index) {
- return value[index];
- }
-
- QuickStringBuffer append(String str) {
- if (str == null) {
- str = String.valueOf(str);
- }
-
- int len = str.length();
- int newcount = count + len;
- if (newcount > value.length) {
- expandCapacity(newcount);
- }
- str.getChars(0, len, value, count);
- count = newcount;
- return this;
- }
-
- QuickStringBuffer append(char c) {
- int newcount = count + 1;
- if (newcount > value.length) {
- expandCapacity(newcount);
- }
- value[count++] = c;
- return this;
- }
-
- public CharSequence subSequence(int start, int end) {
- return new String(value, start, end - start);
- }
-
- public String toString() {
- return new String(this.value);
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.language;
+
+// JDK imports
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import org.apache.tika.exception.TikaException;
+
+/**
+ * This class runs a ngram analysis over submitted text, results might be used
+ * for automatic language identification.
+ *
+ * The similarity calculation is at experimental level. You have been warned.
+ *
+ * Methods are provided to build new NGramProfiles profiles.
+ *
+ * @author Sami Siren
+ * @author Jerome Charron - http://frutch.free.fr/
+ */
+public class LanguageProfilerBuilder {
+
+ // public static final Log LOG =
+ // LogFactory.getLog(LanguageProfilerBuilder.class);
+
+ /** The minimum length allowed for a ngram. */
+ final static int ABSOLUTE_MIN_NGRAM_LENGTH = 3; /* was 1 */
+
+ /** The maximum length allowed for a ngram. */
+ final static int ABSOLUTE_MAX_NGRAM_LENGTH = 3; /* was 4 */
+
+ /** The default min length of ngram */
+ final static int DEFAULT_MIN_NGRAM_LENGTH = 3;
+
+ /** The default max length of ngram */
+ final static int DEFAULT_MAX_NGRAM_LENGTH = 3;
+
+ /** The ngram profile file extension */
+ final static String FILE_EXTENSION = "ngp";
+
+ /** The profile max size (number of ngrams of the same size) */
+ final static int MAX_SIZE = 1000;
+
+ /** separator char */
+ final static char SEPARATOR = '_';
+ /** The String form of the separator char */
+ private final static String SEP_CHARSEQ = new String(
+ new char[] { SEPARATOR });
+
+ /** The profile's name */
+ private String name = null;
+
+ /** The NGrams of this profile sorted on the number of occurrences */
+ private List<NGramEntry> sorted = null;
+
+ /** The min length of ngram */
+ private int minLength = DEFAULT_MIN_NGRAM_LENGTH;
+
+ /** The max length of ngram */
+ private int maxLength = DEFAULT_MAX_NGRAM_LENGTH;
+
+ /** The total number of ngrams occurences */
+ private int[] ngramcounts = null;
+
+ /** An index of the ngrams of the profile */
+ private Map<CharSequence, NGramEntry> ngrams = null;
+
+ /** A StringBuffer used during analysis */
+ private QuickStringBuffer word = new QuickStringBuffer();
+
+ /**
+ * Constructs a new ngram profile
+ *
+ * @param name is the name of the profile
+ * @param minlen is the min length of ngram sequences
+ * @param maxlen is the max length of ngram sequences
+ */
+ public LanguageProfilerBuilder(String name, int minlen, int maxlen) {
+ // TODO: Compute the initial capacity using minlen and maxlen.
+ this.ngrams = new HashMap<CharSequence, NGramEntry>(4000);
+ this.minLength = minlen;
+ this.maxLength = maxlen;
+ this.name = name;
+ }
+
+ /**
+ * Constructs a new ngram profile where minlen=3, maxlen=3
+ *
+ * @param name is a name of profile, usually two length string
+ * @since Tika 1.0
+ */
+ public LanguageProfilerBuilder(String name) {
+ this.ngrams = new HashMap<CharSequence, NGramEntry>(4000);
+ this.minLength = ABSOLUTE_MIN_NGRAM_LENGTH;
+ this.maxLength = ABSOLUTE_MAX_NGRAM_LENGTH;
+ this.name = name;
+ }
+
+ /**
+ * @return Returns the name.
+ */
+ public String getName() {
+ return name;
+ }
+
+ // This method was commented because it depends on org.apache.lucene.analysis.Token
+ // that is not a part of the Tika
+ // /**
+ // * Adds ngrams from a token to this profile
+ // *
+ // * @param t is the Token to be added
+ // */
+ // public void add(Token t) {
+ // add(new StringBuffer().append(SEPARATOR)
+ // .append(t.term())
+ // .append(SEPARATOR));
+ // }
+
+ /**
+ * Adds ngrams from a single word to this profile
+ *
+ * @param word is the word to add
+ */
+ public void add(StringBuffer word) {
+ for (int i = minLength; (i <= maxLength) && (i < word.length()); i++) {
+ add(word, i);
+ }
+ }
+
+ /**
+ * Adds the last NGrams from the specified word.
+ */
+ private void add(QuickStringBuffer word) {
+ int wlen = word.length();
+ if (wlen >= minLength) {
+ int max = Math.min(maxLength, wlen);
+ for (int i = minLength; i <= max; i++) {
+ add(word.subSequence(wlen - i, wlen));
+ }
+ }
+ }
+
+ /**
+ * Adds ngrams from a single word in this profile
+ *
+ * @param word is the word to add
+ * @param n is the ngram size
+ */
+ private void add(CharSequence cs) {
+
+ if (cs.equals(SEP_CHARSEQ)) {
+ return;
+ }
+ NGramEntry nge = ngrams.get(cs);
+ if (nge == null) {
+ nge = new NGramEntry(cs);
+ ngrams.put(cs, nge);
+ }
+ nge.inc();
+ }
+
+ /**
+ * Analyzes a piece of text
+ *
+ * @param text
+ * the text to be analyzed
+ */
+ public void analyze(StringBuilder text) {
+
+ if (ngrams != null) {
+ ngrams.clear();
+ sorted = null;
+ ngramcounts = null;
+ }
+
+ word.clear().append(SEPARATOR);
+ for (int i = 0; i < text.length(); i++) {
+ char c = Character.toLowerCase(text.charAt(i));
+
+ if (Character.isLetter(c)) {
+ add(word.append(c));
+ } else {
+ // found word boundary
+ if (word.length() > 1) {
+ // we have a word!
+ add(word.append(SEPARATOR));
+ word.clear().append(SEPARATOR);
+ }
+ }
+ }
+
+ if (word.length() > 1) {
+ // we have a word!
+ add(word.append(SEPARATOR));
+ }
+ normalize();
+ }
+
+ /**
+ * @param word
+ * @param n sequence length
+ */
+ private void add(StringBuffer word, int n) {
+ for (int i = 0; i <= word.length() - n; i++) {
+ add(word.subSequence(i, i + n));
+ }
+ }
+
+ /**
+ * Normalizes the profile (calculates the ngrams frequencies)
+ */
+ protected void normalize() {
+ NGramEntry e = null;
+ Iterator<NGramEntry> i = ngrams.values().iterator();
+
+ // Calculates ngram count if not already done
+ if (ngramcounts == null) {
+ ngramcounts = new int[maxLength + 1];
+ while (i.hasNext()) {
+ e = i.next();
+ ngramcounts[e.size()] += e.count;
+ }
+ }
+
+ i = ngrams.values().iterator();
+ while (i.hasNext()) {
+ e = i.next();
+ e.frequency = (float) e.count / (float) ngramcounts[e.size()];
+ }
+ }
+
+ /**
+ * Returns a sorted list of ngrams (sort done by 1. frequency 2. sequence)
+ *
+ * @return sorted vector of ngrams
+ */
+ public List<NGramEntry> getSorted() {
+ // make sure sorting is done only once
+ if (sorted == null) {
+ sorted = new ArrayList<NGramEntry>(ngrams.values());
+ Collections.sort(sorted);
+
+ // trim at NGRAM_LENGTH entries
+ if (sorted.size() > MAX_SIZE) {
+ sorted = sorted.subList(0, MAX_SIZE);
+ }
+ }
+ return sorted;
+ }
+
+ // Inherited JavaDoc
+ public String toString() {
+
+ StringBuffer s = new StringBuffer().append("NGramProfile: ")
+ .append(name).append("\n");
+
+ Iterator<NGramEntry> i = getSorted().iterator();
+
+ while (i.hasNext()) {
+ NGramEntry entry = i.next();
+ s.append("[").append(entry.seq).append("/").append(entry.count)
+ .append("/").append(entry.frequency).append("]\n");
+ }
+ return s.toString();
+ }
+
+ /**
+ * Calculates a score how well NGramProfiles match each other
+ *
+ * @param another
+ * ngram profile to compare against
+ * @return similarity 0=exact match
+ * @throws TikaException
+ * if could not calculate a score
+ */
+ public float getSimilarity(LanguageProfilerBuilder another)
+ throws TikaException {
+
+ float sum = 0;
+
+ try {
+ Iterator<NGramEntry> i = another.getSorted().iterator();
+ while (i.hasNext()) {
+ NGramEntry other = i.next();
+ if (ngrams.containsKey(other.seq)) {
+ sum += Math.abs((other.frequency - ngrams.get(other.seq).frequency)) / 2;
+ } else {
+ sum += other.frequency;
+ }
+ }
+ i = getSorted().iterator();
+ while (i.hasNext()) {
+ NGramEntry other = i.next();
+ if (another.ngrams.containsKey(other.seq)) {
+ sum += Math.abs((other.frequency - another.ngrams
+ .get(other.seq).frequency)) / 2;
+ } else {
+ sum += other.frequency;
+ }
+ }
+ } catch (Exception e) {
+ throw new TikaException("Could not calculate a score how well NGramProfiles match each other");
+ }
+ return sum;
+ }
+
+ /**
+ * Loads a ngram profile from an InputStream (assumes UTF-8 encoded content)
+ *
+ * @param is the InputStream to read
+ */
+ public void load(InputStream is) throws IOException {
+
+ ngrams.clear();
+ ngramcounts = new int[maxLength + 1];
+ BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
+ String line = null;
+
+ while ((line = reader.readLine()) != null) {
+
+ // # starts a comment line
+ if (line.charAt(0) != '#') {
+ int spacepos = line.indexOf(' ');
+ String ngramsequence = line.substring(0, spacepos).trim();
+ int len = ngramsequence.length();
+ if ((len >= minLength) && (len <= maxLength)) {
+ int ngramcount = Integer.parseInt(line.substring(spacepos + 1));
+ NGramEntry en = new NGramEntry(ngramsequence, ngramcount);
+ ngrams.put(en.getSeq(), en);
+ ngramcounts[len] += ngramcount;
+ }
+ }
+ }
+ normalize();
+ }
+
+ /**
+ * Creates a new Language profile from (preferably quite large - 5-10k of
+ * lines) text file
+ *
+ * @param name to be given for the profile
+ * @param is a stream to be read
+ * @param encoding is the encoding of stream
+ *
+ * @throws TikaException if could not create a language profile
+ *
+ */
+ public static LanguageProfilerBuilder create(String name, InputStream is, String encoding) throws TikaException {
+
+ LanguageProfilerBuilder newProfile = new LanguageProfilerBuilder(name,
+ ABSOLUTE_MIN_NGRAM_LENGTH, ABSOLUTE_MAX_NGRAM_LENGTH);
+ BufferedInputStream bis = new BufferedInputStream(is);
+
+ byte buffer[] = new byte[4096];
+ StringBuilder text = new StringBuilder();
+ int len;
+
+ try {
+ while ((len = bis.read(buffer)) != -1) {
+ text.append(new String(buffer, 0, len, encoding));
+ }
+ } catch (IOException e) {
+ throw new TikaException("Could not create profile, " + e.getMessage());
+ }
+
+ newProfile.analyze(text);
+ return newProfile;
+ }
+
+ /**
+ * Writes NGramProfile content into OutputStream, content is outputted with
+ * UTF-8 encoding
+ *
+ * @param os the Stream to output to
+ *
+ * @throws IOException
+ */
+ public void save(OutputStream os) throws IOException {
+ os.write(("# NgramProfile generated at " + new Date() +
+ " for Apache Tika Language Identification\n").getBytes());
+
+ // And then each ngram
+
+ // First dispatch ngrams in many lists depending on their size
+ // (one list for each size, in order to store MAX_SIZE ngrams for each
+ // size of ngram)
+ List<NGramEntry> list = new ArrayList<NGramEntry>();
+ List<NGramEntry> sublist = new ArrayList<NGramEntry>();
+ NGramEntry[] entries = ngrams.values().toArray(
+ new NGramEntry[ngrams.size()]);
+ for (int i = minLength; i <= maxLength; i++) {
+ for (int j = 0; j < entries.length; j++) {
+ if (entries[j].getSeq().length() == i) {
+ sublist.add(entries[j]);
+ }
+ }
+ Collections.sort(sublist);
+ if (sublist.size() > MAX_SIZE) {
+ sublist = sublist.subList(0, MAX_SIZE);
+ }
+ list.addAll(sublist);
+ sublist.clear();
+ }
+ for (int i = 0; i < list.size(); i++) {
+ NGramEntry e = list.get(i);
+ String line = e.toString() + " " + e.getCount() + "\n";
+ os.write(line.getBytes("UTF-8"));
+ }
+ os.flush();
+ }
+
+ /**
+  * Command-line entry point, used for testing only.
+  * <p>
+  * Supported commands (when several are given, the last one wins):
+  * <ul>
+  * <li><code>-create profilename filename encoding</code>: builds a profile
+  * from the content of <code>filename</code> and saves it to
+  * <code>profilename + "." + FILE_EXTENSION</code></li>
+  * <li><code>-similarity file1 file2 encoding</code>: builds and normalizes
+  * one profile per file and prints their similarity</li>
+  * <li><code>-score profile-name filename encoding</code>: builds a profile
+  * from <code>filename</code>, loads the saved profile
+  * <code>profile-name + "." + FILE_EXTENSION</code> and prints the
+  * similarity between the two</li>
+  * </ul>
+  * If no command is recognized, or a command is missing one of its three
+  * operands, the usage message is printed to standard error and the JVM
+  * exits with status -1. Any exception raised while executing a command is
+  * reported with a stack trace.
+  *
+  * @param args the command line, as described above
+  */
+ public static void main(String args[]) {
+
+     // Example: -create he sample_he.txt utf-8
+
+     // Every command takes three operands; -similarity needs the encoding too.
+     String usage = "Usage: NGramProfile "
+             + "[-create profilename filename encoding] "
+             + "[-similarity file1 file2 encoding] "
+             + "[-score profile-name filename encoding]";
+     int command = 0;
+
+     final int CREATE = 1;
+     final int SIMILARITY = 2;
+     final int SCORE = 3;
+
+     String profilename = "";
+     String filename = "";
+     String filename2 = "";
+     String encoding = "";
+
+     for (int i = 0; i < args.length; i++) { // parse command line
+         int option = 0;
+         if (args[i].equals("-create")) {
+             option = CREATE;
+         } else if (args[i].equals("-similarity")) {
+             option = SIMILARITY;
+         } else if (args[i].equals("-score")) {
+             option = SCORE;
+         }
+         if (option == 0) {
+             continue; // not an option: ignored
+         }
+         if (i + 3 >= args.length) {
+             // Fewer than three operands follow the option: report usage
+             // instead of failing with ArrayIndexOutOfBoundsException.
+             command = 0;
+             break;
+         }
+
+         command = option;
+         String first = args[++i];
+         String second = args[++i];
+         encoding = args[++i];
+         if (option == SIMILARITY) {
+             filename = first;
+             filename2 = second;
+         } else {
+             profilename = first;
+             filename = second;
+         }
+     }
+
+     if (command == 0) {
+         System.err.println(usage);
+         System.exit(-1);
+     }
+
+     try {
+
+         switch (command) {
+
+         case CREATE: {
+             LanguageProfilerBuilder newProfile;
+             FileInputStream fis = new FileInputStream(new File(filename));
+             try {
+                 newProfile = LanguageProfilerBuilder.create(
+                         profilename, fis, encoding);
+             } finally {
+                 fis.close();
+             }
+
+             FileOutputStream fos = new FileOutputStream(
+                     new File(profilename + "." + FILE_EXTENSION));
+             try {
+                 newProfile.save(fos);
+             } finally {
+                 // Closing releases the file handle once the profile is written.
+                 fos.close();
+             }
+             System.out.println("new profile " + profilename + "."
+                     + FILE_EXTENSION + " was created.");
+             break;
+         }
+
+         case SIMILARITY: {
+             LanguageProfilerBuilder newProfile;
+             FileInputStream fis = new FileInputStream(new File(filename));
+             try {
+                 newProfile = LanguageProfilerBuilder.create(
+                         filename, fis, encoding);
+             } finally {
+                 fis.close();
+             }
+             newProfile.normalize();
+
+             LanguageProfilerBuilder newProfile2;
+             FileInputStream fis2 = new FileInputStream(new File(filename2));
+             try {
+                 newProfile2 = LanguageProfilerBuilder.create(
+                         filename2, fis2, encoding);
+             } finally {
+                 fis2.close();
+             }
+             newProfile2.normalize();
+
+             System.out.println("Similarity is "
+                     + newProfile.getSimilarity(newProfile2));
+             break;
+         }
+
+         case SCORE: {
+             LanguageProfilerBuilder newProfile;
+             FileInputStream fis = new FileInputStream(new File(filename));
+             try {
+                 newProfile = LanguageProfilerBuilder.create(
+                         filename, fis, encoding);
+             } finally {
+                 fis.close();
+             }
+
+             LanguageProfilerBuilder compare = new LanguageProfilerBuilder(
+                     profilename, DEFAULT_MIN_NGRAM_LENGTH,
+                     DEFAULT_MAX_NGRAM_LENGTH);
+             FileInputStream profileStream = new FileInputStream(
+                     new File(profilename + "." + FILE_EXTENSION));
+             try {
+                 compare.load(profileStream);
+             } finally {
+                 profileStream.close();
+             }
+             System.out.println("Score is "
+                     + compare.getSimilarity(newProfile));
+             break;
+         }
+
+         }
+
+     } catch (Exception e) {
+         // Test-only tool: report the failure and return normally.
+         e.printStackTrace();
+     }
+ }
+
+
+ /**
+  * Inner class that describes a NGram: a character sequence together with
+  * its occurrence count and frequency within a profile.
+  * <p>
+  * Two entries are equal when their character sequences have the same
+  * content; count, frequency and profile do not take part in
+  * {@link #equals(Object)} or {@link #hashCode()}. The natural ordering
+  * ({@link #compareTo(NGramEntry)}) is by descending frequency, then by
+  * ascending sequence text.
+  */
+ class NGramEntry implements Comparable<NGramEntry> {
+
+     /** The NGRamProfile this NGram is related to */
+     private LanguageProfilerBuilder profile = null;
+
+     /** The sequence of characters of the ngram */
+     CharSequence seq = null;
+
+     /** The number of occurrences of this ngram in its profile */
+     private int count = 0;
+
+     /**
+      * The frequency of this ngram in its profile. Never assigned inside
+      * this class; presumably maintained by the enclosing class.
+      */
+     private float frequency = 0.0F;
+
+     /**
+      * Constructs a new NGramEntry with a count of zero.
+      *
+      * @param seq is the sequence of characters of the ngram
+      */
+     public NGramEntry(CharSequence seq) {
+         this.seq = seq;
+     }
+
+     /**
+      * Constructs a new NGramEntry
+      *
+      * @param seq is the sequence of characters of the ngram
+      * @param count is the number of occurrences of this ngram
+      */
+     public NGramEntry(String seq, int count) {
+         // String is immutable, so it can be stored without a defensive copy.
+         this.seq = seq;
+         this.count = count;
+     }
+
+     /**
+      * Returns the number of occurrences of this ngram in its profile
+      *
+      * @return the number of occurrences of this ngram in its profile
+      */
+     public int getCount() {
+         return count;
+     }
+
+     /**
+      * Returns the frequency of this ngram in its profile
+      *
+      * @return the frequency of this ngram in its profile
+      */
+     public float getFrequency() {
+         return frequency;
+     }
+
+     /**
+      * Returns the sequence of characters of this ngram
+      *
+      * @return the sequence of characters of this ngram
+      */
+     public CharSequence getSeq() {
+         return seq;
+     }
+
+     /**
+      * Returns the size of this ngram
+      *
+      * @return the number of characters in this ngram's sequence
+      */
+     public int size() {
+         return seq.length();
+     }
+
+     /**
+      * Orders entries by descending frequency; entries with the same
+      * frequency are ordered by their sequence text.
+      *
+      * @param ngram the entry to compare this one with
+      * @return a negative, zero or positive value as this entry sorts
+      *         before, with or after <code>ngram</code>
+      */
+     public int compareTo(NGramEntry ngram) {
+         // Arguments are swapped on purpose: higher frequencies sort first.
+         int diff = Float.compare(ngram.getFrequency(), frequency);
+         if (diff != 0) {
+             return diff;
+         }
+         return toString().compareTo(ngram.toString());
+     }
+
+     /**
+      * Increments the number of occurrences of this ngram.
+      */
+     public void inc() {
+         count++;
+     }
+
+     /**
+      * Associates a profile to this ngram
+      *
+      * @param profile
+      *            is the profile associated to this ngram
+      */
+     public void setProfile(LanguageProfilerBuilder profile) {
+         this.profile = profile;
+     }
+
+     /**
+      * Returns the profile associated to this ngram
+      *
+      * @return the profile associated to this ngram
+      */
+     public LanguageProfilerBuilder getProfile() {
+         return profile;
+     }
+
+     /**
+      * Returns the text of this ngram.
+      *
+      * @return the character sequence of this ngram as a string
+      */
+     @Override
+     public String toString() {
+         return seq.toString();
+     }
+
+     /**
+      * Returns a hash code derived from the text of the sequence, so that it
+      * stays consistent with {@link #equals(Object)} whatever
+      * <code>CharSequence</code> implementation holds the characters.
+      *
+      * @return the hash code of the sequence text, or 0 without a sequence
+      */
+     @Override
+     public int hashCode() {
+         return (seq == null) ? 0 : seq.toString().hashCode();
+     }
+
+     /**
+      * Compares the text of the two sequences. <code>CharSequence</code>
+      * does not define <code>equals</code>, so the contents are compared as
+      * strings instead of delegating to the sequence objects.
+      *
+      * @param obj the object to compare this entry with
+      * @return <code>true</code> if <code>obj</code> is an NGramEntry whose
+      *         sequence has the same content as this one
+      */
+     @Override
+     public boolean equals(Object obj) {
+         if (this == obj) {
+             return true;
+         }
+         if (!(obj instanceof NGramEntry)) {
+             return false; // also covers null
+         }
+         NGramEntry other = (NGramEntry) obj;
+         if (seq == null || other.seq == null) {
+             return seq == other.seq;
+         }
+         return seq.toString().equals(other.seq.toString());
+     }
+
+ }
+
+ /**
+  * Minimal growable character buffer implementing {@link CharSequence}.
+  * <p>
+  * Only the first {@link #length()} characters of the backing array are
+  * part of the sequence; the rest is spare capacity. No synchronization is
+  * performed.
+  */
+ private class QuickStringBuffer implements CharSequence {
+
+     /** Backing storage; only the first <code>count</code> chars are valid. */
+     private char value[];
+
+     /** Number of characters currently held in the buffer. */
+     private int count;
+
+     /** Creates an empty buffer with an initial capacity of 16 characters. */
+     QuickStringBuffer() {
+         this(16);
+     }
+
+     /**
+      * Creates a buffer backed by the given array, without copying it; the
+      * whole array is considered content.
+      *
+      * @param value the characters to wrap
+      */
+     QuickStringBuffer(char[] value) {
+         this.value = value;
+         count = value.length;
+     }
+
+     /**
+      * Creates an empty buffer with the given initial capacity.
+      *
+      * @param length the initial capacity, in characters
+      */
+     QuickStringBuffer(int length) {
+         value = new char[length];
+     }
+
+     /**
+      * Creates a buffer holding the given string, with 16 characters of
+      * spare capacity.
+      *
+      * @param str the initial content
+      */
+     QuickStringBuffer(String str) {
+         this(str.length() + 16);
+         append(str);
+     }
+
+     /**
+      * @return the number of characters currently in the buffer
+      */
+     public int length() {
+         return count;
+     }
+
+     /**
+      * Grows the backing array to at least <code>minimumCapacity</code>,
+      * normally to <code>(capacity + 1) * 2</code>.
+      *
+      * @param minimumCapacity the smallest acceptable new capacity
+      */
+     private void expandCapacity(int minimumCapacity) {
+         int newCapacity = (value.length + 1) * 2;
+         if (newCapacity < 0) {
+             // The doubling overflowed int.
+             newCapacity = Integer.MAX_VALUE;
+         } else if (minimumCapacity > newCapacity) {
+             newCapacity = minimumCapacity;
+         }
+
+         char newValue[] = new char[newCapacity];
+         System.arraycopy(value, 0, newValue, 0, count);
+         value = newValue;
+     }
+
+     /**
+      * Empties the buffer; the backing array is kept for reuse.
+      *
+      * @return this buffer
+      */
+     QuickStringBuffer clear() {
+         count = 0;
+         return this;
+     }
+
+     /**
+      * @param index position of the requested character
+      * @return the character at <code>index</code>
+      * @throws StringIndexOutOfBoundsException if <code>index</code> is
+      *         negative or not less than {@link #length()}
+      */
+     public char charAt(int index) {
+         // Check against count, not value.length: characters beyond count
+         // are stale or unset and not part of the sequence.
+         if (index < 0 || index >= count) {
+             throw new StringIndexOutOfBoundsException(index);
+         }
+         return value[index];
+     }
+
+     /**
+      * Appends a string; <code>null</code> is appended as the four
+      * characters "null".
+      *
+      * @param str the string to append
+      * @return this buffer
+      */
+     QuickStringBuffer append(String str) {
+         if (str == null) {
+             str = String.valueOf(str);
+         }
+
+         int len = str.length();
+         int newcount = count + len;
+         if (newcount > value.length) {
+             expandCapacity(newcount);
+         }
+         str.getChars(0, len, value, count);
+         count = newcount;
+         return this;
+     }
+
+     /**
+      * Appends a single character.
+      *
+      * @param c the character to append
+      * @return this buffer
+      */
+     QuickStringBuffer append(char c) {
+         int newcount = count + 1;
+         if (newcount > value.length) {
+             expandCapacity(newcount);
+         }
+         value[count++] = c;
+         return this;
+     }
+
+     /**
+      * Returns a copy of part of the buffer as a String.
+      *
+      * @param start index of the first character, inclusive
+      * @param end index after the last character, exclusive
+      * @return the characters in <code>[start, end)</code>
+      * @throws StringIndexOutOfBoundsException if the range is not within
+      *         <code>0..length()</code> or <code>start &gt; end</code>
+      */
+     public CharSequence subSequence(int start, int end) {
+         if (start < 0 || end > count || start > end) {
+             throw new StringIndexOutOfBoundsException(
+                     "start " + start + ", end " + end + ", length " + count);
+         }
+         return new String(value, start, end - start);
+     }
+
+     /**
+      * @return the buffer content, exactly {@link #length()} characters long
+      */
+     public String toString() {
+         // Only the first count chars are content; converting the whole
+         // array would append spare capacity and stale characters.
+         return new String(value, 0, count);
+     }
+ }
+}
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/language/LanguageProfilerBuilder.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingHandler.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/language/ProfilingWriter.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Message.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CryptoParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/EndDocumentShieldingContentHandler.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/Link.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/LinkBuilder.java
------------------------------------------------------------------------------
svn:eol-style = native