You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC
svn commit: r1723223 [25/32] - in /tika/branches/2.x:
tika-core/src/test/resources/META-INF/
tika-core/src/test/resources/META-INF/services/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-module/src/ ti...
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import org.junit.Test;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.SAXException;
+
+public class GeoParserTest {
+ private Parser geoparser = new GeoParser();
+ /** Parses a paragraph that names China and the United States and checks the geographic metadata that results. */
+ @Test
+ public void testFunctions() throws UnsupportedEncodingException,
+ IOException, SAXException, TikaException {
+ String text = "The millennial-scale cooling trend that followed the HTM coincides with the decrease in China "
+ + "summer insolation driven by slow changes in Earth's orbit. Despite the nearly linear forcing, the transition from the HTM to "
+ + "the Little Ice Age (1500-1900 AD) was neither gradual nor uniform. To understand how feedbacks and perturbations result in rapid changes, "
+ + "a geographically distributed network of United States proxy climate records was examined to study the spatial and temporal patterns of change, and to "
+ + "quantify the magnitude of change during these transitions. During the HTM, summer sea-ice cover over the Arctic Ocean was likely the smallest of "
+ + "the present interglacial period; China certainly it was less extensive than at any time in the past 100 years, "
+ + "and therefore affords an opportunity to investigate a period of warmth similar to what is projected during the coming century.";
+
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ GeoParserConfig config = new GeoParserConfig();
+ context.set(GeoParserConfig.class, config);
+
+ InputStream s = new ByteArrayInputStream(text.getBytes(UTF_8));
+ /* Nothing can be verified when the GeoParser reports itself unavailable: return early, so the test passes without asserting anything. */
+ if (!((GeoParser) geoparser).isAvailable())
+ return;
+
+ geoparser.parse(s, new BodyContentHandler(), metadata, context);
+ // China is expected under the Geographic_* keys and United States under the Optional_*1 keys.
+ assertNotNull(metadata.get("Geographic_NAME"));
+ assertNotNull(metadata.get("Geographic_LONGITUDE"));
+ assertNotNull(metadata.get("Geographic_LATITUDE"));
+ assertEquals("China", metadata.get("Geographic_NAME"));
+ assertEquals("United States", metadata.get("Optional_NAME1"));
+ assertEquals("27.33931", metadata.get("Geographic_LATITUDE"));
+ assertEquals("-108.60288", metadata.get("Geographic_LONGITUDE"));
+ assertEquals("39.76", metadata.get("Optional_LATITUDE1"));
+ assertEquals("-98.5", metadata.get("Optional_LONGITUDE1"));
+
+ }
+ /** Parses empty text and checks that no Geographic_* name or coordinates are recorded. */
+ @Test
+ public void testNulls() throws UnsupportedEncodingException, IOException,
+ SAXException, TikaException {
+ String text = "";
+
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ GeoParserConfig config = new GeoParserConfig();
+ context.set(GeoParserConfig.class, config);
+ geoparser.parse(new ByteArrayInputStream(text.getBytes(UTF_8)),
+ new BodyContentHandler(), metadata, context);
+ assertNull(metadata.get("Geographic_NAME"));
+ assertNull(metadata.get("Geographic_LONGITUDE"));
+ assertNull(metadata.get("Geographic_LATITUDE"));
+
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geoinfo/GeographicInformationParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geoinfo/GeographicInformationParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geoinfo/GeographicInformationParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/geoinfo/GeographicInformationParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geoinfo;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.geoinfo.GeographicInformationParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import java.io.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+
+public class GeographicInformationParserTest {
+    /** Parses the sample ISO 19139 document and checks the metadata values and extracted text. */
+    @Test
+    public void testISO19139() throws Exception {
+        String path = "/test-documents/sampleFile.iso19139";
+
+        Metadata metadata = new Metadata();
+        Parser parser = new GeographicInformationParser();
+        ContentHandler contentHandler = new BodyContentHandler();
+        ParseContext parseContext = new ParseContext();
+        // try-with-resources closes the resource stream even when parse() throws.
+        try (InputStream inputStream = GeographicInformationParser.class.getResourceAsStream(path)) {
+            parser.parse(inputStream, contentHandler, metadata, parseContext);
+        }
+        // NOTE(review): the three TransferOptionsOnline* keys end with a space - presumably matching the parser's keys; confirm.
+        assertEquals("text/iso19139+xml", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("UTF-8", metadata.get("CharacterSet"));
+        assertEquals("https", metadata.get("TransferOptionsOnlineProtocol "));
+        assertEquals("browser", metadata.get("TransferOptionsOnlineProfile "));
+        assertEquals("Barrow Atqasuk ARCSS Plant", metadata.get("TransferOptionsOnlineName "));
+
+        String content = contentHandler.toString();
+        assertTrue(content.contains("Barrow Atqasuk ARCSS Plant"));
+        assertTrue(content.contains("GeographicElementWestBoundLatitude -157.24"));
+        assertTrue(content.contains("GeographicElementEastBoundLatitude -156.4"));
+        assertTrue(content.contains("GeographicElementNorthBoundLatitude 71.18"));
+        assertTrue(content.contains("GeographicElementSouthBoundLatitude 70.27"));
+
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/grib/GribParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/grib/GribParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/grib/GribParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/grib/GribParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.grib;
+
+//JUnit and JDK imports
+import static org.junit.Assert.*;
+import java.io.InputStream;
+
+//TIKA imports
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import java.io.File;
+/**
+ * Test cases to exercise the {@link org.apache.tika.parser.grib.GribParser}.
+ */
+
+public class GribParserTest {
+ /** Parses the bundled GRIB2 sample and checks that the extracted text contains "dimensions:" and "variables:". */
+ @Test
+ public void testParseGlobalMetadata() throws Exception {
+ Parser parser = new GribParser();
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ try (InputStream stream = GribParser.class.getResourceAsStream("/test-documents/gdas1.forecmwf.2014062612.grib2")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+ assertNotNull(metadata); // NOTE(review): metadata is created with new above, so this check can never fail
+ String content = handler.toString();
+ assertTrue(content.contains("dimensions:"));
+ assertTrue(content.contains("variables:"));
+ }
+}
+
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.hdf;
+
+//JUnit and JDK imports
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.InputStream;
+
+
+
+
+//TIKA imports
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.hdf.HDFParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ *
+ * Test suite for the {@link HDFParser}.
+ *
+ */
+public class HDFParserTest {
+ /** Parses the bundled HDF5 sample (test.he5) and checks the GranuleMonth metadata value. */
+ @Test
+ public void testParseGlobalMetadata() throws Exception {
+ if(System.getProperty("java.version").startsWith("1.5")) {
+ return; // NOTE(review): the try-with-resources below needs Java 7+, so this Java 1.5 guard looks unreachable - confirm
+ }
+ Parser parser = new HDFParser();
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ /*
+ * this is a publicly available HDF5 file from the MLS mission:
+ *
+ *
+ * ftp://acdisc.gsfc.nasa.gov/data/s4pa///Aura_MLS_Level2/ML2O3.002//2009
+ * /MLS-Aura_L2GP-O3_v02-23-c01_2009d122.he5
+ */
+ try (InputStream stream = HDFParser.class.getResourceAsStream("/test-documents/test.he5")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ assertNotNull(metadata);
+ assertEquals("5", metadata.get("GranuleMonth"));
+ }
+ /** Parses the bundled HDF4 sample (test.hdf) and checks the _History, Pass and File-Type-Description metadata values. */
+ @Test
+ public void testHDF4() throws Exception {
+ if(System.getProperty("java.version").startsWith("1.5")) {
+ return; // same Java 1.5 guard as in testParseGlobalMetadata
+ }
+ Parser parser = new HDFParser();
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ /*
+ * this is a publicly available HDF4 file from the HDF4 examples:
+ *
+ * http://www.hdfgroup.org/training/hdf4_chunking/Chunkit/bin/input54kmdata.hdf
+ */
+ try (InputStream stream = HDFParser.class.getResourceAsStream("/test-documents/test.hdf")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ assertNotNull(metadata);
+ assertEquals("Direct read of HDF4 file through CDM library", metadata.get("_History"));
+ assertEquals("Ascending", metadata.get("Pass"));
+ assertEquals("Hierarchical Data Format, version 4",
+ metadata.get("File-Type-Description"));
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.isatab;
+
+import static org.junit.Assert.*;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class ISArchiveParserTest {
+ /** Parses the study file s_BII-S-1.txt with an ISArchiveParser and checks the Investigation metadata values. */
+ @Test
+ public void testParseArchive() throws Exception {
+ String path = "/test-documents/testISATab_BII-I-1/s_BII-S-1.txt";
+ // The parser is constructed with the filesystem path of the test directory that holds the study file.
+ Parser parser = new ISArchiveParser(ISArchiveParserTest.class.getResource("/test-documents/testISATab_BII-I-1/").toURI().getPath());
+ //Parser parser = new AutoDetectParser();
+
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ try (InputStream stream = ISArchiveParserTest.class.getResourceAsStream(path)) {
+ parser.parse(stream, handler, metadata, context);
+ }
+
+ // INVESTIGATION
+ assertEquals("Invalid Investigation Identifier", "BII-I-1", metadata.get("Investigation Identifier"));
+ assertEquals("Invalid Investigation Title", "Growth control of the eukaryote cell: a systems biology study in yeast", metadata.get("Investigation Title"));
+
+ // INVESTIGATION PUBLICATIONS
+ assertEquals("Invalid Investigation PubMed ID", "17439666", metadata.get("Investigation PubMed ID"));
+ assertEquals("Invalid Investigation Publication DOI", "doi:10.1186/jbiol54", metadata.get("Investigation Publication DOI"));
+
+ // INVESTIGATION CONTACTS
+ assertEquals("Invalid Investigation Person Last Name", "Oliver", metadata.get("Investigation Person Last Name"));
+ assertEquals("Invalid Investigation Person First Name", "Stephen", metadata.get("Investigation Person First Name"));
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/mat/MatParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/mat/MatParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/mat/MatParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/mat/MatParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mat;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.junit.Test;
+
+/**
+ * Test cases to exercise the {@link MatParser}.
+ */
+public class MatParserTest {
+ @Test
+ public void testParser() throws Exception {
+ AutoDetectParser parser = new AutoDetectParser();
+ ToXMLContentHandler handler = new ToXMLContentHandler();
+ Metadata metadata = new Metadata();
+ String path = "/test-documents/breidamerkurjokull_radar_profiles_2009.mat";
+ // This test goes through AutoDetectParser rather than instantiating MatParser directly.
+ try (InputStream stream = MatParser.class.getResourceAsStream(path)) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ // Check Metadata
+ assertEquals("PCWIN64", metadata.get("platform"));
+ assertEquals("MATLAB 5.0 MAT-file", metadata.get("fileType"));
+ assertEquals("IM", metadata.get("endian"));
+ assertEquals("Thu Feb 21 15:52:49 2013", metadata.get("createdOn"));
+
+ // Check Content
+ String content = handler.toString();
+ // The handler output is expected to list the struct arrays and their double-array members, and to end the HTML body.
+ assertContains("<li>[1x909 double array]</li>", content);
+ assertContains("<p>c1:[1x1 struct array]</p>", content);
+ assertContains("<li>[1024x1 double array]</li>", content);
+ assertContains("<p>b1:[1x1 struct array]</p>", content);
+ assertContains("<p>a1:[1x1 struct array]</p>", content);
+ assertContains("<li>[1024x1261 double array]</li>", content);
+ assertContains("<li>[1x1 double array]</li>", content);
+ assertContains("</body></html>", content);
+ }
+ /** Parses test_mat_text.mat with MatParser directly and checks the output for the 2x2 double array entry. */
+ @Test
+ public void testParserForText() throws Exception {
+ Parser parser = new MatParser();
+ ToXMLContentHandler handler = new ToXMLContentHandler();
+ Metadata metadata = new Metadata();
+ String path = "/test-documents/test_mat_text.mat";
+
+ try (InputStream stream = MatParser.class.getResourceAsStream(path)) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ // Check Content
+ String content = handler.toString();
+ assertContains("<p>double:[2x2 double array]</p>", content);
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.netcdf;
+
+//JDK imports
+import java.io.InputStream;
+
+//TIKA imports
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Test cases to exercise the {@link NetCDFParser}.
+ */
+public class NetCDFParserTest {
+    /** Parses the bundled NetCDF sample and checks the metadata values and the extracted text. */
+    @Test
+    public void testParseGlobalMetadata() throws Exception {
+        Parser parser = new NetCDFParser();
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = NetCDFParser.class
+                .getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+        // assertEquals takes (expected, actual); this order keeps JUnit's failure messages the right way round.
+        assertEquals("model output prepared for IPCC AR4",
+                metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("ccsm@ucar.edu", metadata.get(Metadata.CONTACT));
+        assertEquals("IPCC Fourth Assessment",
+                metadata.get(Metadata.PROJECT_ID));
+        assertEquals("CF-1.0", metadata.get(Metadata.CONVENTIONS));
+        assertEquals("1", metadata.get(Metadata.REALIZATION));
+        assertEquals("720 ppm stabilization experiment (SRESA1B)",
+                metadata.get(Metadata.EXPERIMENT_ID));
+        assertEquals("NetCDF-3/CDM",
+                metadata.get("File-Type-Description"));
+
+        String content = handler.toString();
+        assertContains("long_name = \"Surface area\"", content);
+        assertContains("float area(lat=128, lon=256)", content);
+        assertContains("float lat(lat=128)", content);
+        assertContains("double lat_bnds(lat=128, bnds=2)", content);
+        assertContains("double lon_bnds(lon=256, bnds=2)", content);
+
+
+
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/pom.xml?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/pom.xml (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/pom.xml Wed Jan 6 03:50:50 2016
@@ -0,0 +1,75 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-text-module</artifactId>
+ <name>Apache Tika Text Module</name>
+ <url>http://tika.apache.org/</url>
+
+ <properties>
+ <commons.logging.version>1.1.3</commons.logging.version>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <!-- tika-core again, this time its test-jar, in test scope only -->
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.googlecode.juniversalchardet</groupId>
+ <artifactId>juniversalchardet</artifactId>
+ <version>1.0.3</version> <!-- version pinned inline; the commons-* versions below come from properties -->
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <version>${codec.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-logging</groupId>
+ <artifactId>commons-logging</artifactId>
+ <version>${commons.logging.version}</version>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
+</project>
\ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/appended-resources/META-INF/LICENSE
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/appended-resources/META-INF/LICENSE?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/appended-resources/META-INF/LICENSE (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/appended-resources/META-INF/LICENSE Wed Jan 6 03:50:50 2016
@@ -0,0 +1,37 @@
+APACHE TIKA SUBCOMPONENTS
+
+Apache Tika includes a number of subcomponents with separate copyright notices
+and license terms. Your use of these subcomponents is subject to the terms and
+conditions of the following licenses.
+
+Charset detection code from ICU4J (http://site.icu-project.org/)
+
+ Copyright (c) 1995-2009 International Business Machines Corporation
+ and others
+
+ All rights reserved.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, and/or sell copies of the Software, and to permit persons
+ to whom the Software is furnished to do so, provided that the above
+ copyright notice(s) and this permission notice appear in all copies
+ of the Software and that both the above copyright notice(s) and this
+ permission notice appear in supporting documentation.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
+ BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
+ OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ SOFTWARE.
+
+ Except as contained in this notice, the name of a copyright holder shall
+ not be used in advertising or otherwise to promote the sale, use or other
+ dealings in this Software without prior written authorization of the
+ copyright holder.
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/FileConfig.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/FileConfig.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/FileConfig.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/FileConfig.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,77 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.strings;
+
+import java.io.Serializable;
+
+/**
+ * Configuration for the "file" (or file-alternative) command.
+ *
+ */
+public class FileConfig implements Serializable {
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = 5712655467296441314L;
+
+ private String filePath = ""; // empty by default: the bare program name is used
+
+ private boolean mimetype = false; // mime option is off by default
+
+ /**
+ * Default constructor; keeps the defaults (empty path, mime option off).
+ */
+ public FileConfig() {
+ // TODO Loads properties from InputStream.
+ }
+
+ /**
+ * Returns the "file" installation folder.
+ *
+ * @return the "file" installation folder, exactly as it was set.
+ */
+ public String getFilePath() {
+ return filePath;
+ }
+
+ /**
+ * Sets the "file" installation folder. The value is stored unchanged.
+ * NOTE(review): no trailing separator is appended here - confirm callers supply one.
+ * @param filePath
+ * the "file" installation folder.
+ */
+ public void setFilePath(String filePath) {
+ this.filePath = filePath;
+ }
+
+ /**
+ * Returns {@code true} if the mime option is enabled.
+ *
+ * @return {@code true} if the mime option is enabled, {@code false} otherwise.
+ */
+ public boolean isMimetype() {
+ return mimetype;
+ }
+
+ /**
+ * Sets the mime option. If {@code true}, it causes the file command to
+ * output mime type strings rather than the more traditional human readable
+ * ones.
+ *
+ * @param mimetype {@code true} to enable the mime option, {@code false} to disable it.
+ */
+ public void setMimetype(boolean mimetype) {
+ this.mimetype = mimetype;
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/Latin1StringsParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/Latin1StringsParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/Latin1StringsParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/Latin1StringsParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,322 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.strings;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser to extract printable Latin1 strings from arbitrary files with pure
+ * java. Useful for binary or unknown files, for files without a specific parser
+ * and for corrupted ones causing a TikaException as a fallback parser.
+ *
+ * Currently the parser does a best effort to extract Latin1 strings, used by
+ * Western European languages, encoded with ISO-8859-1, UTF-8 or UTF-16 charsets
+ * within the same file.
+ *
+ * The implementation is optimized for fast parsing with only one pass.
+ */
+public class Latin1StringsParser extends AbstractParser {
+
+ private static final long serialVersionUID = 1L;
+
+ /**
+ * The set of supported types
+ */
+ private static final Set<MediaType> SUPPORTED_TYPES = getTypes();
+
+ /**
+ * The valid ISO-8859-1 character map.
+ */
+ private static final boolean[] isChar = getCharMap();
+
+ /**
+ * The size of the internal buffers.
+ */
+ private static int BUF_SIZE = 64 * 1024;
+
+ /**
+ * The minimum size of a character sequence to be extracted.
+ */
+ private int minSize = 4;
+
+ /**
+ * The output buffer.
+ */
+ private byte[] output = new byte[BUF_SIZE];
+
+ /**
+ * The input buffer.
+ */
+ private byte[] input = new byte[BUF_SIZE];
+
+ /**
+ * The temporary position into the output buffer.
+ */
+ private int tmpPos = 0;
+
+ /**
+ * The current position into the output buffer.
+ */
+ private int outPos = 0;
+
+ /**
+ * The number of bytes into the input buffer.
+ */
+ private int inSize = 0;
+
+ /**
+ * The position into the input buffer.
+ */
+ private int inPos = 0;
+
+ /**
+ * The output content handler.
+ */
+ private XHTMLContentHandler xhtml;
+
+ /**
+ * Returns the minimum size of a character sequence to be extracted.
+ *
+ * @return the minimum size of a character sequence
+ */
+ public int getMinSize() {
+ return minSize;
+ }
+
+ /**
+ * Sets the minimum size of a character sequence to be extracted.
+ *
+ * @param minSize
+ * the minimum size of a character sequence
+ */
+ public void setMinSize(int minSize) {
+ this.minSize = minSize;
+ }
+
+ /**
+ * Builds the table of bytes accepted as string characters: 0x20-0x7E,
+ * 0xC0-0xFE, TAB (0x09), LF (0x0A) and CR (0x0D).
+ * @return a 256-entry table indexed by the unsigned value of the byte.
+ */
+ private static boolean[] getCharMap() {
+
+ boolean[] isChar = new boolean[256];
+ for (int c = Byte.MIN_VALUE; c <= Byte.MAX_VALUE; c++) // c walks the signed byte range
+ if ((c >= 0x20 && c <= 0x7E)
+ || (c >= (byte) 0xC0 && c <= (byte) 0xFE) || c == 0x0A // the (byte) casts are negative: -64..-2
+ || c == 0x0D || c == 0x09) {
+ isChar[c & 0xFF] = true; // mask to get the unsigned index 0..255
+ }
+ return isChar;
+
+ }
+
+ /**
+ * Returns the set of supported types.
+ *
+ * @return the set of supported types
+ */
+ private static Set<MediaType> getTypes() {
+ HashSet<MediaType> supportedTypes = new HashSet<MediaType>();
+ supportedTypes.add(MediaType.OCTET_STREAM);
+ return supportedTypes;
+ }
+
+ /**
+ * Tests if the byte is a ISO-8859-1 char.
+ *
+ * @param c
+ * the byte to test.
+ *
+ * @return if the byte is a char.
+ */
+ private static final boolean isChar(byte c) {
+ return isChar[c & 0xFF];
+ }
+
+ /**
+ * Flushes the committed part of the output buffer (bytes before outPos) to
+ * the content handler and moves the pending bytes to the buffer start.
+ * @throws UnsupportedEncodingException if "windows-1252" is not available
+ * @throws SAXException if the content handler rejects the characters
+ */
+ private void flushBuffer() throws UnsupportedEncodingException,
+ SAXException {
+ if (tmpPos - outPos >= minSize) // pending run is already long enough to be kept:
+ outPos = tmpPos - minSize; // commit all of it except its last minSize bytes
+
+ xhtml.characters(new String(output, 0, outPos, "windows-1252"));
+
+ for (int k = 0; k < tmpPos - outPos; k++) // shift the pending bytes down to index 0
+ output[k] = output[outPos + k];
+ tmpPos = tmpPos - outPos;
+ outPos = 0;
+ }
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext arg0) {
+ return SUPPORTED_TYPES;
+ }
+
+ /**
+ * @see org.apache.tika.parser.Parser#parse(java.io.InputStream,
+ * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata,
+ * org.apache.tika.parser.ParseContext)
+ */
+ @Override
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException,
+ SAXException {
+ /*
+ * Creates a new instance because the object is not immutable.
+ */
+ new Latin1StringsParser().doParse(stream, handler, metadata, context);
+ }
+
+ /**
+ * Does a best effort to extract Latin1 strings encoded with ISO-8859-1,
+ * UTF-8 or UTF-16. Valid chars are saved into the output buffer and the
+ * temporary buffer position is incremented. When an invalid char is read,
+ * the difference of the temporary and current buffer position is checked.
+ * If it is at least the minimum string size, a line feed is appended and the
+ * current position is moved to the temp position. If it is not, the temp
+ * position is reset to the current position, discarding the short run.
+ *
+ * @param stream
+ * the input stream.
+ * @param handler
+ * the output content handler
+ * @param metadata
+ * the metadata of the file
+ * @param context
+ * the parsing context
+ * @throws IOException
+ * if an io error occurs
+ * @throws SAXException
+ * if a sax error occurs
+ */
+ private void doParse(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException,
+ SAXException {
+
+ tmpPos = 0;
+ outPos = 0;
+
+ xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ int i = 0;
+ do {
+ inSize = 0;
+ while ((i = stream.read(input, inSize, BUF_SIZE - inSize)) > 0) { // fill the input buffer
+ inSize += i;
+ }
+ inPos = 0;
+ while (inPos < inSize) {
+ byte c = input[inPos++];
+ boolean utf8 = false;
+ /*
+ * Lead byte 0xC3: possible two-byte UTF-8 char (U+00C0..U+00FF)
+ */
+ if (c == (byte) 0xC3) {
+ byte c_ = inPos < inSize ? input[inPos++] : (byte) stream
+ .read();
+ /*
+ * Test if the next byte is in the valid UTF8 range
+ */
+ if (c_ >= (byte) 0x80 && c_ <= (byte) 0xBF) {
+ utf8 = true;
+ output[tmpPos++] = (byte) (c_ + 0x40); // store the single-byte Latin1 value
+ } else {
+ output[tmpPos++] = c; // not UTF-8: keep 0xC3 itself, then handle c_ below
+ c = c_;
+ }
+ if (tmpPos == BUF_SIZE)
+ flushBuffer();
+
+ /*
+ * Lead byte 0xC2: possible two-byte UTF-8 char (U+00A0..U+00BF)
+ */
+ } else if (c == (byte) 0xC2) {
+ byte c_ = inPos < inSize ? input[inPos++] : (byte) stream
+ .read();
+ /*
+ * Test if the next byte is in the valid UTF8 range
+ */
+ if (c_ >= (byte) 0xA0 && c_ <= (byte) 0xBF) {
+ utf8 = true;
+ output[tmpPos++] = c_; // the continuation byte is stored unchanged
+ } else {
+ output[tmpPos++] = c; // not UTF-8: keep 0xC2 itself, then handle c_ below
+ c = c_;
+ }
+ if (tmpPos == BUF_SIZE)
+ flushBuffer();
+ }
+ if (!utf8)
+ /*
+ * Test if the byte is a valid char.
+ */
+ if (isChar(c)) {
+ output[tmpPos++] = c;
+ if (tmpPos == BUF_SIZE)
+ flushBuffer();
+ } else {
+ /*
+ * Test if the byte is an invalid char, marking a string
+ * end. If it is a zero, test 2 positions before or
+ * ahead for a valid char, meaning it marks the
+ * transition between ISO-8859-1 and UTF16 sequences.
+ */
+ if (c != 0
+ || (inPos >= 3 && isChar(input[inPos - 3])) // inPos is already past c
+ || (inPos + 1 < inSize && isChar(input[inPos + 1]))) {
+
+ if (tmpPos - outPos >= minSize) {
+ output[tmpPos++] = 0x0A; // end the accepted string with a line feed
+ outPos = tmpPos;
+
+ if (tmpPos == BUF_SIZE)
+ flushBuffer();
+ } else
+ tmpPos = outPos; // run too short: discard it
+
+ }
+ }
+ }
+ } while (i != -1 && !Thread.currentThread().isInterrupted()); // until end of stream or interrupt
+
+ if (tmpPos - outPos >= minSize) { // accept a final run that had no terminator
+ output[tmpPos++] = 0x0A;
+ outPos = tmpPos;
+ }
+ xhtml.characters(new String(output, 0, outPos, "windows-1252"));
+
+ xhtml.endDocument();
+
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsConfig.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsConfig.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsConfig.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsConfig.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,187 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.strings;
+
+import java.io.File;
+import java.io.Serializable;
+import java.util.Properties;
+import java.io.InputStream;
+import java.io.IOException;
+
+/**
+ * Configuration for the "strings" (or strings-alternative) command.
+ *
+ */
+public class StringsConfig implements Serializable {
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -1465227101645003594L;
+
+ private String stringsPath = "";
+
+ // Minimum sequence length (characters) to print
+ private int minLength = 4;
+
+ // Character encoding of the strings that are to be found
+ private StringsEncoding encoding = StringsEncoding.SINGLE_7_BIT;
+
+ // Maximum time (seconds) to wait for the strings process termination
+ private int timeout = 120;
+
+ /**
+ * Default constructor: reads the "Strings.properties" resource if found, else keeps the defaults.
+ */
+ public StringsConfig() {
+ init(this.getClass().getResourceAsStream("Strings.properties"));
+ }
+
+ /**
+ * Loads properties from InputStream and then tries to close InputStream. If
+ * there is an IOException, this silently swallows the exception and goes
+ * back to the default.
+ *
+ * @param is the properties stream to read; if {@code null} the defaults are kept.
+ */
+ public StringsConfig(InputStream is) {
+ init(is);
+ }
+
+ /**
+ * Initializes attributes from a properties stream, keeping the current value of any key that is absent.
+ * Invalid values still fail with an {@link IllegalArgumentException} raised by the parsing calls or setters.
+ * @param is the properties stream (always closed here), or {@code null} to keep every default.
+ */
+ private void init(InputStream is) {
+ if (is == null) {
+ return;
+ }
+ Properties props = new Properties();
+ try {
+ props.load(is);
+ } catch (IOException e) {
+ // swallow: best effort, whatever was loaded is used and the rest defaults
+ } finally {
+ if (is != null) {
+ try {
+ is.close();
+ } catch (IOException e) {
+ // swallow: nothing useful can be done when close fails
+ }
+ }
+ }
+
+ setStringsPath(props.getProperty("stringsPath", "" + getStringsPath()));
+
+ setMinLength(Integer.parseInt(props.getProperty("minLength", ""
+ + getMinLength())));
+ // Default to the constant name: valueOf() cannot resolve the option letter from get().
+ setEncoding(StringsEncoding.valueOf(props.getProperty("encoding",
+ getEncoding().name())));
+
+ setTimeout(Integer.parseInt(props.getProperty("timeout", ""
+ + getTimeout())));
+ }
+
+ /**
+ * Returns the "strings" installation folder.
+ *
+ * @return the "strings" installation folder.
+ */
+ public String getStringsPath() {
+ return this.stringsPath;
+ }
+
+ /**
+ * Returns the minimum sequence length (characters) to print.
+ *
+ * @return the minimum sequence length (characters) to print.
+ */
+ public int getMinLength() {
+ return this.minLength;
+ }
+
+ /**
+ * Returns the character encoding of the strings that are to be found.
+ *
+ * @return {@link StringsEncoding} enum that represents the character
+ * encoding of the strings that are to be found.
+ */
+ public StringsEncoding getEncoding() {
+ return this.encoding;
+ }
+
+ /**
+ * Returns the maximum time (in seconds) to wait for the "strings" command
+ * to terminate.
+ *
+ * @return the maximum time (in seconds) to wait for the "strings" command
+ * to terminate.
+ */
+ public int getTimeout() {
+ return this.timeout;
+ }
+
+ /**
+ * Sets the "strings" installation folder; a non-empty path gets a trailing file separator if it lacks one.
+ *
+ * @param path
+ * the "strings" installation folder; must not be {@code null}.
+ */
+ public void setStringsPath(String path) {
+ if (!path.isEmpty() && !path.endsWith(File.separator)) {
+ path += File.separatorChar;
+ }
+ this.stringsPath = path;
+ }
+
+ /**
+ * Sets the minimum sequence length (characters) to print.
+ *
+ * @param minLength
+ * the minimum sequence length (characters) to print.
+ */
+ public void setMinLength(int minLength) {
+ if (minLength < 1) {
+ throw new IllegalArgumentException("Invalid minimum length");
+ }
+ this.minLength = minLength;
+ }
+
+ /**
+ * Sets the character encoding of the strings that are to be found.
+ *
+ * @param encoding
+ * {@link StringsEncoding} enum that represents the character
+ * encoding of the strings that are to be found.
+ */
+ public void setEncoding(StringsEncoding encoding) {
+ this.encoding = encoding;
+ }
+
+ /**
+ * Sets the maximum time (in seconds) to wait for the "strings" command to
+ * terminate.
+ *
+ * @param timeout
+ * the maximum time (in seconds) to wait for the "strings"
+ * command to terminate.
+ */
+ public void setTimeout(int timeout) {
+ if (timeout < 1) {
+ throw new IllegalArgumentException("Invalid timeout");
+ }
+ this.timeout = timeout;
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsEncoding.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,45 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.strings;
+
+/**
+ * Character encoding of the strings that are to be found using the "strings" command.
+ *
+ */
+public enum StringsEncoding {
+ SINGLE_7_BIT('s', "single-7-bit-byte"), // default
+ SINGLE_8_BIT('S', "single-8-bit-byte"),
+ BIGENDIAN_16_BIT('b', "16-bit bigendian"),
+ LITTLEENDIAN_16_BIT('l', "16-bit littleendian"),
+ BIGENDIAN_32_BIT('B', "32-bit bigendian"),
+ LITTLEENDIAN_32_BIT('L', "32-bit littleendian");
+
+ private char value; // one-letter code returned by get()
+
+ private String encoding; // human-readable description returned by toString()
+
+ private StringsEncoding(char value, String encoding) {
+ this.value = value;
+ this.encoding = encoding;
+ }
+
+ public char get() { // returns the one-letter code of this encoding
+ return value;
+ }
+
+ @Override
+ public String toString() { // returns the description, not the constant name
+ return encoding;
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,335 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.strings;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.FutureTask;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Parser that uses the "strings" (or strings-alternative) command to find the
+ * printable strings in an object, or other binary, file
+ * (application/octet-stream). Useful as a "best-effort" parser for files detected
+ * as application/octet-stream.
+ *
+ * @author gtotaro
+ *
+ */
+public class StringsParser extends AbstractParser {
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = 802566634661575025L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections
+ .singleton(MediaType.OCTET_STREAM);
+
+ private static final StringsConfig DEFAULT_STRINGS_CONFIG = new StringsConfig();
+
+ private static final FileConfig DEFAULT_FILE_CONFIG = new FileConfig();
+
+ /*
+ * This map is organized as follows:
+ * command's pathname (String) -> is it present? (Boolean), does it support -e option? (Boolean)
+ * It stores check results for command and, if present, -e (encoding) option.
+ */
+ private static Map<String,Boolean[]> STRINGS_PRESENT = new HashMap<String, Boolean[]>();
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ /** Runs the "strings" command on the stream's content, emitting its output and related metadata. */
+ @Override
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException,
+ SAXException, TikaException {
+ // Settings come from the ParseContext when present, else from the shared defaults.
+ StringsConfig stringsConfig = context.get(StringsConfig.class, DEFAULT_STRINGS_CONFIG);
+ FileConfig fileConfig = context.get(FileConfig.class, DEFAULT_FILE_CONFIG);
+
+ // Without a usable "strings" command the handler and metadata are left untouched.
+ if (!hasStrings(stringsConfig)) {
+ return;
+ }
+
+ // The external commands take a path, so the content is obtained as a file.
+ TikaInputStream tis = TikaInputStream.get(stream);
+ File input = tis.getFile();
+
+ // Metadata
+ metadata.set("strings:min-len", "" + stringsConfig.getMinLength());
+ // Record the encoding's description: StringsConfig does not override toString().
+ metadata.set("strings:encoding", stringsConfig.getEncoding().toString());
+ metadata.set("strings:file_output", doFile(input, fileConfig));
+
+ // Content
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ int totalBytes = doStrings(input, stringsConfig, xhtml);
+ xhtml.endDocument();
+
+ // Metadata: amount of extracted text as counted by doStrings
+ metadata.set("strings:length", "" + totalBytes);
+ }
+
+ /**
+ * Checks if the "strings" command is supported, caching the answer per program path in STRINGS_PRESENT.
+ *
+ * @param config
+ * {@link StringsConfig} object used for testing the strings
+ * command.
+ * @return {@code true} if the strings command is supported.
+ */
+ private boolean hasStrings(StringsConfig config) {
+ String stringsProg = config.getStringsPath() + getStringsProg();
+
+ if (STRINGS_PRESENT.containsKey(stringsProg)) { // already probed: reuse the cached result
+ return STRINGS_PRESENT.get(stringsProg)[0];
+ }
+
+ String[] checkCmd = { stringsProg, "--version" };
+ try {
+ boolean hasStrings = ExternalParser.check(checkCmd);
+
+ boolean encodingOpt = false;
+
+ // Check if the -e option (encoding) is supported
+ if (!System.getProperty("os.name").startsWith("Windows")) {
+ String[] checkOpt = {stringsProg, "-e", "" + config.getEncoding().get(), "/dev/null"};
+ int[] errorValues = {1, 2}; // Exit status code: 1 = general error; 2 = incorrect usage.
+ encodingOpt = ExternalParser.check(checkOpt, errorValues);
+ }
+
+ Boolean[] values = {hasStrings, encodingOpt}; // [0] = command present, [1] = -e supported
+ STRINGS_PRESENT.put(stringsProg, values);
+
+ return hasStrings;
+ } catch (NoClassDefFoundError ncdfe) {
+ // This happens under OSGi + Fork Parser - see TIKA-1507
+ // As a workaround for now, just say we can't use strings
+ // TODO Resolve it so we don't need this try/catch block
+ Boolean[] values = {false, false};
+ STRINGS_PRESENT.put(stringsProg, values);
+ return false;
+ }
+ }
+
+ /**
+ * Checks if the "file" command is supported.
+ *
+ * @param config
+ * @return
+ */
+ private boolean hasFile(FileConfig config) {
+ String fileProg = config.getFilePath() + getFileProg();
+
+ String[] checkCmd = { fileProg, "--version" };
+
+ boolean hasFile = ExternalParser.check(checkCmd);
+
+ return hasFile;
+ }
+
+ /**
+ * Runs the "strings" command on the given file.
+ *
+ * @param input
+ * {@see File} object that represents the file to parse.
+ * @param config
+ * {@see StringsConfig} object including the strings
+ * configuration.
+ * @param xhtml
+ * {@see XHTMLContentHandler} object.
+ * @return the total number of bytes read using the strings command.
+ * @throws IOException
+ * if any I/O error occurs.
+ * @throws TikaException
+ * if the parsing process has been interrupted.
+ * @throws SAXException
+ */
+ private int doStrings(File input, StringsConfig config,
+ XHTMLContentHandler xhtml) throws IOException, TikaException,
+ SAXException {
+
+ String stringsProg = config.getStringsPath() + getStringsProg();
+
+ // Builds the command array
+ ArrayList<String> cmdList = new ArrayList<String>(4);
+ cmdList.add(stringsProg);
+ cmdList.add("-n");
+ cmdList.add("" + config.getMinLength());;
+ // Currently, encoding option is not supported by Windows (and other) versions
+ if (STRINGS_PRESENT.get(stringsProg)[1]) {
+ cmdList.add("-e");
+ cmdList.add("" + config.getEncoding().get());
+ }
+ cmdList.add(input.getPath());
+
+ String[] cmd = cmdList.toArray(new String[cmdList.size()]);
+
+ ProcessBuilder pb = new ProcessBuilder(cmd);
+ final Process process = pb.start();
+
+ InputStream out = process.getInputStream();
+
+ FutureTask<Integer> waitTask = new FutureTask<Integer>(
+ new Callable<Integer>() {
+ public Integer call() throws Exception {
+ return process.waitFor();
+ }
+ });
+
+ Thread waitThread = new Thread(waitTask);
+ waitThread.start();
+
+ // Reads content printed out by "strings" command
+ int totalBytes = 0;
+ totalBytes = extractOutput(out, xhtml);
+
+ try {
+ waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
+
+ } catch (InterruptedException ie) {
+ waitThread.interrupt();
+ process.destroy();
+ Thread.currentThread().interrupt();
+ throw new TikaException(StringsParser.class.getName()
+ + " interrupted", ie);
+
+ } catch (ExecutionException ee) {
+ // should not be thrown
+
+ } catch (TimeoutException te) {
+ waitThread.interrupt();
+ process.destroy();
+ throw new TikaException(StringsParser.class.getName() + " timeout",
+ te);
+ }
+
+ return totalBytes;
+ }
+
+ /**
+ * Copies the text read from the given stream (decoded as UTF-8) to the content handler.
+ *
+ * @param stream
+ * {@link InputStream} object to read the text from; it is closed on return.
+ * @param xhtml
+ * {@link XHTMLContentHandler} object.
+ * @return the total number of chars (not bytes) read from the stream.
+ * @throws SAXException
+ * if the content element could not be written.
+ * @throws IOException
+ * if any I/O error occurs.
+ */
+ private int extractOutput(InputStream stream, XHTMLContentHandler xhtml)
+ throws SAXException, IOException {
+
+ char[] buffer = new char[1024];
+ int totalBytes = 0; // counts decoded chars despite the name
+
+ try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream, UTF_8))) {
+ int n = 0;
+ while ((n = reader.read(buffer)) != -1) {
+ if (n > 0) {
+ xhtml.characters(buffer, 0, n);
+ }
+ totalBytes += n;
+ }
+
+ }
+
+ return totalBytes;
+ }
+
+ /**
+ * Runs the "file" command on the given file that aims at providing an
+ * alternative way to determine the file type.
+ *
+ * @param input
+ * {@see File} object that represents the file to detect.
+ * @return the file type provided by the "file" command using the "-b"
+ * option (it stands for "brief mode").
+ * @throws IOException
+ * if any I/O error occurs.
+ */
+ private String doFile(File input, FileConfig config) throws IOException {
+ if (!hasFile(config)) {
+ return null;
+ }
+
+ // Builds the command array
+ ArrayList<String> cmdList = new ArrayList<String>(3);
+ cmdList.add(config.getFilePath() + getFileProg());
+ cmdList.add("-b");
+ if (config.isMimetype()) {
+ cmdList.add("-I");
+ }
+ cmdList.add(input.getPath());
+
+ String[] cmd = cmdList.toArray(new String[cmdList.size()]);
+
+ ProcessBuilder pb = new ProcessBuilder(cmd);
+ final Process process = pb.start();
+
+ InputStream out = process.getInputStream();
+
+ String fileOutput = null;
+
+ try (BufferedReader reader = new BufferedReader(new InputStreamReader(out, UTF_8))) {
+ fileOutput = reader.readLine();
+ } catch (IOException ioe) {
+ // file output not available!
+ fileOutput = "";
+ }
+
+ return fileOutput;
+ }
+
+
+ public static String getStringsProg() {
+ return System.getProperty("os.name").startsWith("Windows") ? "strings.exe" : "strings";
+ }
+
+ public static String getFileProg() {
+ return System.getProperty("os.name").startsWith("Windows") ? "file.exe" : "file";
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,544 @@
+/**
+ * ******************************************************************************
+ * Copyright (C) 2005-2009, International Business Machines Corporation and *
+ * others. All Rights Reserved. *
+ * ******************************************************************************
+ */
+package org.apache.tika.parser.txt;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+
+
+/**
+ * <code>CharsetDetector</code> provides a facility for detecting the
+ * charset or encoding of character data in an unknown format.
+ * The input data can either be from an input stream or an array of bytes.
+ * The result of the detection operation is a list of possibly matching
+ * charsets, or, for simple use, you can just ask for a Java Reader that
+ * will work over the input data.
+ * <p/>
+ * Character set detection is at best an imprecise operation. The detection
+ * process will attempt to identify the charset that best matches the characteristics
+ * of the byte data, but the process is partly statistical in nature, and
+ * the results can not be guaranteed to always be correct.
+ * <p/>
+ * For best accuracy in charset detection, the input data should be primarily
+ * in a single language, and a minimum of a few hundred bytes worth of plain text
+ * in the language are needed. The detection process will attempt to
+ * ignore html or xml style markup that could otherwise obscure the content.
+ * <p/>
+ * @stable ICU 3.4
+ */
+public class CharsetDetector {
+
+// Question: Should we have getters corresponding to the setters for input text
+// and declared encoding?
+
+// A thought: If we were to create our own type of Java Reader, we could defer
+// figuring out an actual charset for data that starts out with too much English
+// only ASCII until the user actually read through to something that didn't look
+// like 7 bit English. If nothing else ever appeared, we would never need to
+// actually choose the "real" charset. All assuming that the application just
+// wants the data, and doesn't care about a char set name.
+
+ // Size of the working buffers: at most this many bytes of input are examined.
+ private static final int kBufSize = 12000;
+ // Upper bound applied to every confidence value reported by detectAll().
+ private static final int MAX_CONFIDENCE = 100;
+ // Names of the detectable charsets. Filled in by createRecognizers() as a side
+ // effect of initializing fCSRecognizers below; returned by getAllDetectableCharsets().
+ private static String[] fCharsetNames;
+ /*
+ * List of recognizers for all charsets known to the implementation.
+ */
+ private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers();
+ /*
+ * The following items are accessed by individual CharsetRecognizers during
+ * the recognition process
+ *
+ */
+ byte[] fInputBytes = // The text to be checked. Markup will have been
+ new byte[kBufSize]; // removed if appropriate.
+ int fInputLen; // Length of the byte data in fInputBytes.
+ short fByteStats[] = // Per-byte-value occurrence counts for the input text,
+ new short[256]; // tallied by MungeInput(); zero means the value never occurs.
+ boolean fC1Bytes = // True if any bytes in the range 0x80 - 0x9F are in the input;
+ false;
+ // Canonical name of the caller-declared encoding, or null if none has been set
+ // (assigned in setCanonicalDeclaredEncoding, read by detectAll() as a hint).
+ String fDeclaredEncoding;
+ //
+ // Stuff private to CharsetDetector
+ //
+ byte[] fRawInput; // Original, untouched input bytes.
+ // If user gave us a byte array, this is it.
+ // If user gave us a stream, it's read to a
+ // buffer here.
+ int fRawLength; // Length of data in fRawInput array.
+ InputStream fInputStream; // User's input stream, or null if the user
+ // gave us a byte array.
+ boolean fStripTags = // If true, setText() will strip tags from input text.
+ false;
+
+ /**
+ * Constructor
+ *
+ * @stable ICU 3.4
+ */
+ public CharsetDetector() {
+ // Nothing to do here: every instance field is set up by its declaration
+ // (explicit initializer or the language default).
+ }
+
+ /**
+ * Get the names of all char sets that can be recognized by the char set detector.
+ *
+ * @return an array of the names of all charsets that can be recognized
+ * by the charset detector.
+ *
+ * @stable ICU 3.4
+ */
+ public static String[] getAllDetectableCharsets() {
+     // Hand out a copy: fCharsetNames is shared static state, and returning it
+     // directly would let any caller overwrite entries seen by everyone else.
+     return fCharsetNames.clone();
+ }
+
+ /*
+ * Create the singleton instances of the CharsetRecognizer classes
+ */
+ private static ArrayList<CharsetRecognizer> createRecognizers() {
+     // The registration order below is the order in which detectAll() consults
+     // the recognizers; keep it unchanged.
+     ArrayList<CharsetRecognizer> all = new ArrayList<CharsetRecognizer>(
+         Arrays.<CharsetRecognizer>asList(
+             new CharsetRecog_UTF8(),
+
+             new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE(),
+             new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE(),
+             new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE(),
+             new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE(),
+
+             new CharsetRecog_mbcs.CharsetRecog_sjis(),
+             new CharsetRecog_2022.CharsetRecog_2022JP(),
+             new CharsetRecog_2022.CharsetRecog_2022CN(),
+             new CharsetRecog_2022.CharsetRecog_2022KR(),
+             new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030(),
+             new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp(),
+             new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr(),
+             new CharsetRecog_mbcs.CharsetRecog_big5(),
+
+             new CharsetRecog_sbcs.CharsetRecog_8859_1_da(),
+             new CharsetRecog_sbcs.CharsetRecog_8859_1_de(),
+             new CharsetRecog_sbcs.CharsetRecog_8859_1_en(),
+             new CharsetRecog_sbcs.CharsetRecog_8859_1_es(),
+             new CharsetRecog_sbcs.CharsetRecog_8859_1_fr(),
+             new CharsetRecog_sbcs.CharsetRecog_8859_1_it(),
+             new CharsetRecog_sbcs.CharsetRecog_8859_1_nl(),
+             new CharsetRecog_sbcs.CharsetRecog_8859_1_no(),
+             new CharsetRecog_sbcs.CharsetRecog_8859_1_pt(),
+             new CharsetRecog_sbcs.CharsetRecog_8859_1_sv(),
+             new CharsetRecog_sbcs.CharsetRecog_8859_2_cs(),
+             new CharsetRecog_sbcs.CharsetRecog_8859_2_hu(),
+             new CharsetRecog_sbcs.CharsetRecog_8859_2_pl(),
+             new CharsetRecog_sbcs.CharsetRecog_8859_2_ro(),
+             new CharsetRecog_sbcs.CharsetRecog_8859_5_ru(),
+             new CharsetRecog_sbcs.CharsetRecog_8859_6_ar(),
+             new CharsetRecog_sbcs.CharsetRecog_8859_7_el(),
+             new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he(),
+             new CharsetRecog_sbcs.CharsetRecog_8859_8_he(),
+             new CharsetRecog_sbcs.CharsetRecog_windows_1251(),
+             new CharsetRecog_sbcs.CharsetRecog_windows_1256(),
+             new CharsetRecog_sbcs.CharsetRecog_KOI8_R(),
+             new CharsetRecog_sbcs.CharsetRecog_8859_9_tr(),
+
+             new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl(),
+             new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr(),
+             new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl(),
+             new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr(),
+
+             new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en(),
+             new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de(),
+             new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es(),
+             new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr(),
+             new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it(),
+             new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl(),
+
+             new CharsetRecog_sbcs.CharsetRecog_IBM866_ru()));
+
+     // Side effect: build the name list served by getAllDetectableCharsets().
+     // A name is skipped only when it equals the name recorded immediately
+     // before it, so consecutive recognizers sharing a name yield one entry.
+     String[] names = new String[all.size()];
+     int count = 0;
+     for (int i = 0; i < all.size(); i++) {
+         String current = all.get(i).getName();
+         boolean repeatsPrevious = count > 0 && current.equals(names[count - 1]);
+         if (!repeatsPrevious) {
+             names[count] = current;
+             count++;
+         }
+     }
+
+     // Trim the scratch array down to the entries actually recorded.
+     fCharsetNames = Arrays.copyOf(names, count);
+
+     return all;
+ }
+
+ /**
+ * Set the declared encoding for charset detection.
+ * The declared encoding of an input text is an encoding obtained
+ * from an http header or xml declaration or similar source that
+ * can be provided as additional information to the charset detector.
+ * A match between a declared encoding and a possible detected encoding
+ * will raise the quality of that detected encoding by a small delta,
+ * and will also appear as a "reason" for the match.
+ * <p/>
+ * A declared encoding that is incompatible with the input data being
+ * analyzed will not be added to the list of possible encodings.
+ *
+ * @param encoding The declared encoding
+ *
+ * @stable ICU 3.4
+ */
+ public CharsetDetector setDeclaredEncoding(String encoding) {
+     // The hint is stored in canonical form by the private helper; the
+     // detector itself is returned so that calls can be chained.
+     this.setCanonicalDeclaredEncoding(encoding);
+     return this;
+ }
+
+ /**
+ * Set the input text (byte) data whose charset is to be detected.
+ *
+ * @param in the input text of unknown encoding
+ *
+ * @return This CharsetDetector
+ *
+ * @stable ICU 3.4
+ */
+ public CharsetDetector setText(byte[] in) {
+     // The caller's array is used directly (no copy is made).
+     fRawInput = in;
+     fRawLength = in.length;
+
+     // The input now comes from a byte array, so forget any stream left over
+     // from an earlier setText(InputStream) call; fInputStream is documented
+     // as null whenever the user supplied a byte array.
+     fInputStream = null;
+
+     MungeInput(); // Strip html markup (if enabled), collect byte stats.
+
+     return this;
+ }
+ // Note on fByteStats: a zero entry really means zero occurrences of that byte value.
+
+ /**
+ * Set the input text (byte) data whose charset is to be detected.
+ * <p/>
+ * The input stream that supplies the character data must have markSupported()
+ * == true; the charset detection process will read a small amount of data,
+ * then return the stream to its original position via
+ * the InputStream.reset() operation. The exact amount that will
+ * be read depends on the characteristics of the data itself.
+ *
+ * @param in the input text of unknown encoding
+ *
+ * @return This CharsetDetector
+ *
+ * @stable ICU 3.4
+ */
+
+ public CharsetDetector setText(InputStream in) throws IOException {
+     fInputStream = in;
+     // Remember the current position so the stream can be rewound afterwards.
+     fInputStream.mark(kBufSize);
+
+     // A fresh buffer is allocated every time: the previous fRawInput may be
+     // an array owned by the caller, which must not be overwritten.
+     fRawInput = new byte[kBufSize];
+     fRawLength = 0;
+
+     // read() may deliver the data in small chunks (e.g. remote sources), so
+     // keep reading until the buffer is full or the stream has nothing more.
+     while (fRawLength < kBufSize) {
+         int count = fInputStream.read(fRawInput, fRawLength, kBufSize - fRawLength);
+         if (count <= 0) {
+             break;
+         }
+         fRawLength += count;
+     }
+
+     // Give the stream back to the caller at its original position.
+     fInputStream.reset();
+
+     MungeInput(); // Strip html markup, collect byte stats.
+     return this;
+ }
+
+ /**
+ * Return the charset that best matches the supplied input data.
+ *
+ * Note though, that because the detection
+ * only looks at the start of the input data,
+ * there is a possibility that the returned charset will fail to handle
+ * the full set of input data.
+ * <p/>
+ * Raise an exception if
+ * <ul>
+ * <li>no charset appears to match the data.</li>
+ * <li>no input text has been provided</li>
+ * </ul>
+ *
+ * @return a CharsetMatch object representing the best matching charset, or
+ * <code>null</code> if there are no matches.
+ *
+ * @stable ICU 3.4
+ */
+ public CharsetMatch detect() {
+     // TODO: A better implementation would be to copy the detect loop from
+     // detectAll(), and cut it short as soon as a match with a high confidence
+     // is found. This is something to be done later, after things are otherwise
+     // working.
+     final CharsetMatch[] ranked = detectAll();
+
+     // detectAll() puts the best match first; no candidates means no answer.
+     final boolean nothingFound = ranked == null || ranked.length == 0;
+     return nothingFound ? null : ranked[0];
+ }
+
+ /**
+ * Return an array of all charsets that appear to be plausible
+ * matches with the input data. The array is ordered with the
+ * best quality match first.
+ * <p/>
+ * Raise an exception if
+ * <ul>
+ * <li>no charsets appear to match the input data.</li>
+ * <li>no input text has been provided</li>
+ * </ul>
+ *
+ * @return An array of CharsetMatch objects representing possibly matching charsets.
+ *
+ * @stable ICU 3.4
+ */
+ public CharsetMatch[] detectAll() {
+     final ArrayList<CharsetMatch> candidates = new ArrayList<CharsetMatch>();
+
+     // Ask every known recognizer; keep those reporting a match quality > 0.
+     for (CharsetRecognizer recognizer : fCSRecognizers) {
+         // Only the low byte of the recognizer's result is the confidence.
+         final int score = recognizer.match(this) & 0x000000ff;
+         if (score <= 0) {
+             continue;
+         }
+
+         // Just to be safe, constrain.
+         int confidence = Math.min(score, MAX_CONFIDENCE);
+
+         // Apply charset hint: when the declared encoding names this charset,
+         // halve the remaining distance to MAX_CONFIDENCE.
+         final boolean matchesHint = (fDeclaredEncoding != null)
+             && fDeclaredEncoding.equalsIgnoreCase(recognizer.getName());
+         if (matchesHint) {
+             confidence += (MAX_CONFIDENCE - confidence) / 2;
+         }
+
+         candidates.add(new CharsetMatch(this, recognizer, confidence));
+     }
+
+     // CharsetMatch compares on confidence; sort ascending, then flip so the
+     // best match comes first.
+     Collections.sort(candidates);
+     Collections.reverse(candidates);
+
+     return candidates.toArray(new CharsetMatch[candidates.size()]);
+ }
+
+ /**
+ * Autodetect the charset of an inputStream, and return a Java Reader
+ * to access the converted input data.
+ * <p/>
+ * This is a convenience method that is equivalent to
+ * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
+ * <p/>
+ * For the input stream that supplies the character data, markSupported()
+ * must be true; the charset detection will read a small amount of data,
+ * then return the stream to its original position via
+ * the InputStream.reset() operation. The exact amount that will
+ * be read depends on the characteristics of the data itself.
+ *<p/>
+ * Raise an exception if no charsets appear to match the input data.
+ *
+ * @param in The source of the byte data in the unknown charset.
+ *
+ * @param declaredEncoding A declared encoding for the data, if available,
+ * or null or an empty string if none is available.
+ *
+ * @stable ICU 3.4
+ */
+ public Reader getReader(InputStream in, String declaredEncoding) {
+     setCanonicalDeclaredEncoding(declaredEncoding);
+
+     Reader result = null;
+     try {
+         setText(in);
+
+         final CharsetMatch best = detect();
+         if (best != null) {
+             result = best.getReader();
+         }
+     } catch (IOException ignored) {
+         // Reading from the stream failed: report "no reader" to the caller.
+         result = null;
+     }
+     return result;
+ }
+
+ /**
+ * Autodetect the charset of a byte array, and return a String
+ * containing the converted input data.
+ * <p/>
+ * This is a convenience method that is equivalent to
+ * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
+ *<p/>
+ * Raise an exception if no charsets appear to match the input data.
+ *
+ * @param in The source of the byte data in the unknown charset.
+ *
+ * @param declaredEncoding A declared encoding for the data, if available,
+ * or null or an empty string if none is available.
+ *
+ * @stable ICU 3.4
+ */
+ public String getString(byte[] in, String declaredEncoding) {
+     setCanonicalDeclaredEncoding(declaredEncoding);
+
+     String result = null;
+     try {
+         setText(in);
+
+         final CharsetMatch best = detect();
+         if (best != null) {
+             // -1 is passed through unchanged as the length argument.
+             result = best.getString(-1);
+         }
+     } catch (IOException ignored) {
+         // Conversion failed: report "no string" to the caller.
+         result = null;
+     }
+     return result;
+ }
+ // (Stray tail of the fInputStream field comment: "... or null if the user gave us a byte array.")
+
+ /**
+ * Test whether or not input filtering is enabled.
+ *
+ * @return <code>true</code> if input text will be filtered.
+ *
+ * @see #enableInputFilter
+ *
+ * @stable ICU 3.4
+ */
+ public boolean inputFilterEnabled() {
+     // Reports the flag last set through enableInputFilter (false by default).
+     final boolean enabled = fStripTags;
+     return enabled;
+ }
+
+ /**
+ * Enable filtering of input text. If filtering is enabled,
+ * text within angle brackets ("<" and ">") will be removed
+ * before detection.
+ *
+ * @param filter <code>true</code> to enable input text filtering.
+ *
+ * @return The previous setting.
+ *
+ * @stable ICU 3.4
+ */
+ public boolean enableInputFilter(boolean filter) {
+     // Capture the old value first so it can be handed back to the caller.
+     final boolean wasEnabled = fStripTags;
+     fStripTags = filter;
+     return wasEnabled;
+ }
+
+ /**
+ * Try to set fDeclaredEncoding to the canonical name for <encoding>, if it exists.
+ *
+ * @param encoding - name of character encoding
+ */
+ private void setCanonicalDeclaredEncoding(String encoding) {
+     // No hint supplied: keep whatever declared encoding is currently set.
+     if ((encoding == null) || encoding.isEmpty()) {
+         return;
+     }
+
+     try {
+         // Charset.forName never returns null; it throws for names it cannot
+         // resolve, so the lookup result can be used directly.
+         fDeclaredEncoding = Charset.forName(encoding).name();
+     } catch (IllegalArgumentException ignored) {
+         // IllegalCharsetNameException and UnsupportedCharsetException are both
+         // IllegalArgumentExceptions. The declared encoding is only a hint, so a
+         // malformed or unknown name is ignored and fDeclaredEncoding is left
+         // unchanged instead of aborting detection.
+     }
+ }
+
+ /*
+ * MungeInput - after getting a set of raw input data to be analyzed, preprocess
+ * it by removing what appears to be html markup.
+ */
+ private void MungeInput() {
+ int srci = 0;
+ int dsti = 0;
+ byte b;
+ boolean inMarkup = false;
+ int openTags = 0;
+ int badTags = 0;
+
+ //
+ // html / xml markup stripping.
+ // quick and dirty, not 100% accurate, but hopefully good enough, statistically.
+ // discard everything within < brackets >
+ // Count how many total '<' and illegal (nested) '<' occur, so we can make some
+ // guess as to whether the input was actually marked up at all.
+ if (fStripTags) {
+ for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
+ b = fRawInput[srci];
+ if (b == (byte) '<') {
+ if (inMarkup) {
+ badTags++;
+ }
+ inMarkup = true;
+ openTags++;
+ }
+
+ if (!inMarkup) {
+ fInputBytes[dsti++] = b;
+ }
+
+ if (b == (byte) '>') {
+ inMarkup = false;
+ }
+ }
+
+ fInputLen = dsti;
+ }
+
+ //
+ // If it looks like this input wasn't marked up, or if it looks like it's
+ // essentially nothing but markup abandon the markup stripping.
+ // Detection will have to work on the unstripped input.
+ //
+ if (openTags < 5 || openTags / 5 < badTags ||
+ (fInputLen < 100 && fRawLength > 600)) {
+ int limit = fRawLength;
+
+ if (limit > kBufSize) {
+ limit = kBufSize;
+ }
+
+ for (srci = 0; srci < limit; srci++) {
+ fInputBytes[srci] = fRawInput[srci];
+ }
+ fInputLen = srci;
+ }
+
+ //
+ // Tally up the byte occurence statistics.
+ // These are available for use by the various detectors.
+ //
+ Arrays.fill(fByteStats, (short) 0);
+ for (srci = 0; srci < fInputLen; srci++) {
+ int val = fInputBytes[srci] & 0x00ff;
+ fByteStats[val]++;
+ }
+
+ fC1Bytes = false;
+ for (int i = 0x80; i <= 0x9F; i += 1) {
+ if (fByteStats[i] != 0) {
+ fC1Bytes = true;
+ break;
+ }
+ }
+ }
+}