You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by th...@apache.org on 2016/06/02 03:16:04 UTC
[05/12] tika git commit: merged upstream changes and resolved conflicts
merged upstream changes and resolved conflicts
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/e780d566
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/e780d566
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/e780d566
Branch: refs/heads/TIKA-1508
Commit: e780d56652d48dd0f50b4e62a58153e95f055022
Parents: 0d69ca7 bb46c0e
Author: Thamme Gowda <tg...@gmail.com>
Authored: Mon May 23 11:30:13 2016 -0700
Committer: Thamme Gowda <tg...@gmail.com>
Committed: Mon May 23 11:30:13 2016 -0700
----------------------------------------------------------------------
CHANGES.txt | 85 +-
pom.xml | 3 +-
tika-app/pom.xml | 7 +-
.../main/appended-resources/META-INF/LICENSE | 227 -
.../tika/cli/BatchCommandLineBuilder.java | 7 -
.../main/java/org/apache/tika/cli/TikaCLI.java | 50 +-
.../main/resources/tika-app-batch-config.xml | 10 +-
.../tika/cli/TikaCLIBatchCommandLineTest.java | 1 -
.../java/org/apache/tika/cli/TikaCLITest.java | 16 -
tika-batch/pom.xml | 4 +-
.../batch/builders/BatchProcessBuilder.java | 15 +-
.../builders/CommandLineParserBuilder.java | 16 +-
.../apache/tika/batch/fs/FSBatchProcessCLI.java | 4 +-
.../builders/BasicTikaFSConsumersBuilder.java | 51 +-
.../tika/batch/fs/default-tika-batch-config.xml | 50 +-
.../apache/tika/batch/fs/BatchProcessTest.java | 19 +-
.../tika/batch/fs/HandlerBuilderTest.java | 4 -
.../tika-batch-config-MockConsumersBuilder.xml | 2 +-
.../test/resources/tika-batch-config-broken.xml | 2 +-
.../tika-batch-config-test-suffix-override.xml | 112 +
.../test/resources/tika-batch-config-test.xml | 2 +-
tika-bundle/pom.xml | 6 +-
.../main/appended-resources/META-INF/LICENSE | 226 -
tika-core/pom.xml | 7 +-
.../java/org/apache/tika/config/TikaConfig.java | 26 +-
.../tika/config/TikaConfigSerializer.java | 4 +-
.../org/apache/tika/detect/NameDetector.java | 15 +-
.../tika/detect/ZeroSizeFileDetector.java | 45 +
.../java/org/apache/tika/fork/ForkClient.java | 10 +-
.../tika/language/LanguageIdentifier.java | 7 +-
.../apache/tika/language/LanguageProfile.java | 2 +
.../tika/language/LanguageProfilerBuilder.java | 9 +-
.../apache/tika/language/ProfilingHandler.java | 3 +-
.../apache/tika/language/ProfilingWriter.java | 2 +
.../language/detect/LanguageConfidence.java | 25 +
.../tika/language/detect/LanguageDetector.java | 239 +
.../tika/language/detect/LanguageHandler.java | 66 +
.../tika/language/detect/LanguageNames.java | 86 +
.../tika/language/detect/LanguageResult.java | 98 +
.../tika/language/detect/LanguageWriter.java | 78 +
.../org/apache/tika/language/package-info.java | 22 -
.../tika/metadata/TikaCoreProperties.java | 9 +
.../java/org/apache/tika/mime/MediaType.java | 3 +
.../org/apache/tika/mime/MediaTypeRegistry.java | 2 +
.../org/apache/tika/mime/MimeTypesReader.java | 20 +-
.../org/apache/tika/parser/NetworkParser.java | 4 +-
.../org/apache/tika/parser/ParseContext.java | 169 +-
.../org/apache/tika/parser/ParserDecorator.java | 35 +-
.../tika/parser/external/ExternalParser.java | 8 +-
.../external/ExternalParsersConfigReader.java | 11 +-
.../tika/sax/BasicContentHandlerFactory.java | 8 +
.../src/main/java/org/apache/tika/sax/Link.java | 4 +
.../java/org/apache/tika/sax/LinkBuilder.java | 6 +-
.../org/apache/tika/sax/LinkContentHandler.java | 18 +-
.../resources/org/apache/tika/language/be.ngp | 0
.../resources/org/apache/tika/language/ca.ngp | 0
.../resources/org/apache/tika/language/eo.ngp | 0
.../resources/org/apache/tika/language/gl.ngp | 0
.../resources/org/apache/tika/language/ro.ngp | 0
.../resources/org/apache/tika/language/sk.ngp | 0
.../resources/org/apache/tika/language/sl.ngp | 0
.../resources/org/apache/tika/language/uk.ngp | 0
.../org/apache/tika/mime/tika-mimetypes.xml | 93 +-
.../src/test/java/org/apache/tika/TikaTest.java | 59 +-
.../apache/tika/detect/NameDetectorTest.java | 10 +
.../tika/detect/ZeroSizeFileDetectorTest.java | 64 +
.../tika/language/LanguageIdentifierTest.java | 1 +
.../tika/language/LanguageProfileTest.java | 7 +-
.../language/LanguageProfilerBuilderTest.java | 1 +
.../tika/language/ProfilingWriterTest.java | 5 +-
.../tika/language/detect/LanguageNamesTest.java | 38 +
.../org/apache/tika/parser/mock/MockParser.java | 12 +-
.../apache/tika/sax/LinkContentHandlerTest.java | 36 +-
.../tika/language/langbuilder/welsh_corpus.txt | 5204 +++++++++---------
tika-example/pom.xml | 16 +-
.../java/org/apache/tika/example/Language.java | 32 +-
.../tika/example/LanguageDetectingParser.java | 16 +-
.../tika/example/LanguageDetectorExample.java | 33 +
.../tika/example/LanguageIdentifierExample.java | 27 -
.../org/apache/tika/example/MyFirstTika.java | 13 +-
.../org/apache/tika/example/ParsingExample.java | 14 +-
.../example/LanguageDetectorExampleTest.java | 39 +
.../example/LanguageIdentifierExampleTest.java | 37 -
tika-java7/pom.xml | 2 +-
tika-langdetect/pom.xml | 171 +
.../tika/langdetect/OptimaizeLangDetector.java | 196 +
.../tika/langdetect/TextLangDetector.java | 146 +
...apache.tika.language.detect.LanguageDetector | 16 +
.../tika/langdetect/LanguageDetectorTest.java | 92 +
.../langdetect/OptimaizeLangDetectorTest.java | 265 +
.../tika/langdetect/TextLangDetectorTest.java | 59 +
.../src/test/resources/log4j.properties | 24 +
.../apache/tika/langdetect/language-codes.txt | 186 +
.../tika/langdetect/language-tests/da.test | 108 +
.../tika/langdetect/language-tests/de.test | 104 +
.../tika/langdetect/language-tests/el.test | 109 +
.../tika/langdetect/language-tests/en.test | 105 +
.../tika/langdetect/language-tests/es.test | 107 +
.../tika/langdetect/language-tests/et.test | 17 +
.../tika/langdetect/language-tests/fi.test | 106 +
.../tika/langdetect/language-tests/fr.test | 105 +
.../tika/langdetect/language-tests/it.test | 109 +
.../tika/langdetect/language-tests/ja.test | 78 +
.../tika/langdetect/language-tests/lt.test | 32 +
.../tika/langdetect/language-tests/nl.test | 105 +
.../tika/langdetect/language-tests/pt.test | 105 +
.../tika/langdetect/language-tests/sv.test | 108 +
.../tika/langdetect/language-tests/th.test | 28 +
.../tika/langdetect/language-tests/zh.test | 57 +
.../org/apache/tika/langdetect/text-test.tsv | 18 +
.../org/apache/tika/langdetect/udhr-known.txt | 11 +
.../org/apache/tika/langdetect/udhr-unknown.txt | 4 +
tika-parent/pom.xml | 29 +-
tika-parsers/pom.xml | 53 +-
.../tika/parser/code/SourceCodeParser.java | 142 +-
.../tika/parser/epub/EpubContentParser.java | 33 +-
.../org/apache/tika/parser/epub/EpubParser.java | 8 +-
.../parser/executable/ExecutableParser.java | 2 +-
.../tika/parser/font/AdobeFontMetricParser.java | 16 +-
.../apache/tika/parser/font/TrueTypeParser.java | 4 +-
.../geoinfo/GeographicInformationParser.java | 30 +-
.../apache/tika/parser/html/HtmlHandler.java | 3 +
.../apache/tika/parser/image/ICNSParser.java | 117 +
.../org/apache/tika/parser/image/ICNSType.java | 170 +
.../parser/image/ImageMetadataExtractor.java | 45 +-
.../tika/parser/image/xmp/JempboxExtractor.java | 75 +-
.../tika/parser/isatab/ISArchiveParser.java | 62 +-
.../tika/parser/jdbc/AbstractDBParser.java | 13 +-
.../tika/parser/jdbc/JDBCTableReader.java | 68 +-
.../tika/parser/jdbc/SQLite3DBParser.java | 31 +-
.../apache/tika/parser/jdbc/SQLite3Parser.java | 6 +-
.../tika/parser/jdbc/SQLite3TableReader.java | 45 +-
.../apache/tika/parser/journal/TEIParser.java | 8 +-
.../tika/parser/mail/MailContentHandler.java | 110 +-
.../org/apache/tika/parser/mat/MatParser.java | 27 +-
.../tika/parser/microsoft/HSLFExtractor.java | 14 +
.../tika/parser/microsoft/OfficeParser.java | 3 +-
.../microsoft/POIFSContainerDetector.java | 21 +-
.../tika/parser/microsoft/WordExtractor.java | 11 +-
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 22 +-
.../ooxml/XSLFPowerPointExtractorDecorator.java | 58 +-
.../ooxml/XSSFExcelExtractorDecorator.java | 112 +-
.../microsoft/xml/AbstractXML2003Parser.java | 86 +
.../parser/microsoft/xml/HyperlinkHandler.java | 96 +
.../microsoft/xml/SpreadsheetMLParser.java | 161 +
.../tika/parser/microsoft/xml/WordMLParser.java | 229 +
.../parser/mp4/DirectFileReadDataSource.java | 34 +-
.../org/apache/tika/parser/mp4/MP4Parser.java | 379 +-
.../parser/ner/grobid/GrobidNERecogniser.java | 240 +
.../parser/ner/mitie/MITIENERecogniser.java | 160 +
.../tika/parser/ner/nltk/NLTKNERecogniser.java | 19 +-
.../apache/tika/parser/netcdf/NetCDFParser.java | 20 +-
.../parser/odf/OpenDocumentContentParser.java | 37 +-
.../tika/parser/odf/OpenDocumentParser.java | 62 +-
.../org/apache/tika/parser/pdf/PDF2XHTML.java | 244 +-
.../parser/pdf/PDFEncodedStringDecoder.java | 14 +-
.../org/apache/tika/parser/pdf/PDFParser.java | 143 +-
.../apache/tika/parser/pdf/PDFParserConfig.java | 67 +-
.../apache/tika/parser/pdf/XFAExtractor.java | 30 +-
.../tika/parser/pot/PooledTimeSeriesParser.java | 394 +-
.../services/org.apache.tika.parser.Parser | 5 +-
.../parser/ner/grobid/GrobidServer.properties | 17 +
.../apache/tika/parser/pdf/PDFParser.properties | 4 +-
.../org/apache/tika/mime/TestMimeTypes.java | 21 +-
.../parser/executable/ExecutableParserTest.java | 73 +-
.../GeographicInformationParserTest.java | 48 +-
.../apache/tika/parser/html/HtmlParserTest.java | 38 +
.../tika/parser/image/ICNSParserTest.java | 65 +
.../tika/parser/image/ImageParserTest.java | 5 +-
.../tika/parser/jdbc/SQLite3ParserTest.java | 106 +-
.../apache/tika/parser/jpeg/JpegParserTest.java | 21 +-
.../tika/parser/mail/RFC822ParserTest.java | 115 +
.../AbstractPOIContainerExtractionTest.java | 4 +-
.../tika/parser/microsoft/ExcelParserTest.java | 18 +-
.../microsoft/POIContainerExtractionTest.java | 35 +-
.../parser/microsoft/PowerPointParserTest.java | 2 +-
.../ooxml/OOXMLContainerExtractionTest.java | 23 +-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 24 +-
.../parser/microsoft/xml/XML2003ParserTest.java | 81 +
.../apache/tika/parser/mp4/MP4ParserTest.java | 12 +-
.../apache/tika/parser/odf/ODFParserTest.java | 10 +-
.../apache/tika/parser/pdf/PDFParserTest.java | 562 +-
.../resources/test-documents/testEXCEL2003.xml | 100 +
.../test-documents/testEXCEL_hyperlinks.xls | Bin 0 -> 29696 bytes
.../test-documents/testEXCEL_hyperlinks.xlsx | Bin 0 -> 10038 bytes
.../test/resources/test-documents/testHFA.hfa | Bin 0 -> 1024 bytes
.../test/resources/test-documents/testICNS.icns | Bin 0 -> 2472 bytes
.../test-documents/testICNS_basic.icns | Bin 0 -> 18199 bytes
.../resources/test-documents/testKeynoteNew.key | Bin 0 -> 274397 bytes
.../test/resources/test-documents/testMIF.mif | Bin 0 -> 10240 bytes
.../test-documents/testMP4_truncated.m4a | Bin 0 -> 74 bytes
.../testMSChart-govdocs-428996.ppt | Bin 0 -> 41472 bytes
.../testMSChart-govdocs-428996.pptx | Bin 0 -> 56224 bytes
.../testMSChart-govdocs-428996.xls | Bin 0 -> 35328 bytes
.../testMSChart-govdocs-428996.xlsx | Bin 0 -> 17112 bytes
.../test-documents/testNumbersNew.numbers | Bin 0 -> 179147 bytes
.../resources/test-documents/testODTNoMeta.odt | Bin 0 -> 5847 bytes
.../test-documents/testPDF_bad_page_303226.pdf | Bin 0 -> 138027 bytes
.../resources/test-documents/testPagesNew.pages | Bin 0 -> 237567 bytes
.../test-documents/testRFC822_date_utf8 | 8 +
.../resources/test-documents/testRFC822_eml | 33 +
.../resources/test-documents/testSqlite3b.db | Bin 27648 -> 27648 bytes
.../resources/test-documents/testWORD2003.xml | 2542 +++++++++
tika-serialization/pom.xml | 4 +-
tika-server/pom.xml | 45 +-
.../tika/server/resource/LanguageResource.java | 27 +-
.../tika/server/resource/MetadataResource.java | 9 +-
.../resource/RecursiveMetadataResource.java | 7 +-
.../tika/server/resource/TranslateResource.java | 22 +-
.../org/apache/tika/server/CXFTestBase.java | 26 +-
tika-translate/pom.xml | 9 +-
.../language/translate/AbstractTranslator.java | 32 +
.../language/translate/CachedTranslator.java | 20 +-
.../language/translate/ExternalTranslator.java | 13 +-
.../language/translate/GoogleTranslator.java | 20 +-
.../language/translate/Lingo24Translator.java | 20 +-
.../language/translate/MosesTranslator.java | 7 +-
.../language/translate/YandexTranslator.java | 175 +
.../translate/translator.yandex.properties | 24 +
.../translate/YandexTranslatorTest.java | 105 +
tika-xmp/pom.xml | 2 +-
221 files changed, 13467 insertions(+), 5115 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/e780d566/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
----------------------------------------------------------------------
diff --cc tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index caa916a,0e3acd9..896b51b
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@@ -35,12 -35,6 +37,8 @@@ import java.util.Map
import java.util.Set;
import java.util.concurrent.ExecutorService;
- import javax.imageio.spi.ServiceRegistry;
- import javax.xml.parsers.DocumentBuilder;
- import javax.xml.parsers.DocumentBuilderFactory;
- import javax.xml.parsers.ParserConfigurationException;
-
+import org.apache.tika.base.Configurable;
++
import org.apache.tika.concurrent.ConfigurableThreadPoolExecutor;
import org.apache.tika.concurrent.SimpleThreadPoolExecutor;
import org.apache.tika.detect.CompositeDetector;
http://git-wip-us.apache.org/repos/asf/tika/blob/e780d566/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
----------------------------------------------------------------------
diff --cc tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
index 20607d9,2521cc9..e58f5c8
--- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
@@@ -43,11 -53,22 +53,27 @@@ public class ParseContext implements Se
/** Map of objects in this context */
private final Map<String, Object> context = new HashMap<String, Object>();
+
+ /**
+ * Map of configurable arguments: parameter name to parameter value.
+ * Populated through {@link #setParam(String, String)} and handed out
+ * directly (not copied) by {@link #getParams()}.
+ */
+ private final Map<String, String> params = new HashMap<>();
+
+ private static final EntityResolver IGNORING_SAX_ENTITY_RESOLVER = new EntityResolver() {
+ public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException {
+ return new InputSource(new StringReader("")); // every entity resolves to empty content; publicId/systemId are never dereferenced
+ }
+ };
+
+ private static final XMLResolver IGNORING_STAX_ENTITY_RESOLVER =
+ new XMLResolver() {
+ @Override
+ public Object resolveEntity(String publicID, String systemID, String baseURI, String namespace) throws
+ XMLStreamException {
+ return ""; // every entity resolves to an empty string; the identifiers passed in are ignored
+ }
+ };
+
/**
* Adds the given value to the context as an implementation of the given
* interface.
@@@ -150,36 -196,111 +201,144 @@@
}
/**
+ * Stores a key=value parameter, replacing any value previously
+ * stored under the same key.
+ * @param key parameter name
+ * @param value value to associate with the parameter name
+ */
+ public void setParam(String key, String value){
+ this.params.put(key, value);
+ }
+
+ /**
+ * Gets the value associated with given parameter
+ * @param key parameter name
+ * @return the value stored for the given parameter name, or
+ * <code>null</code> if no value has been stored under that name
+ */
+ public String getParam(String key){
+ return this.params.get(key);
+ }
+
+ /**
+ * Gets all the params
+ * @return map of key values; this is the context's own map, not a
+ * copy, so modifications made through it change this context
+ */
+ public Map<String, String> getParams() {
+ return params;
+ }
+
+ /**
+ * Checks if parameter is available, i.e. whether the parameter map
+ * contains an entry for the given name.
+ * @param key parameter name
+ * @return true if parameter is available, false otherwise
+ */
+ public boolean hasParam(String key){
+ return params.containsKey(key);
+ }
++ /**
+ * Returns the DOM builder factory specified in this parsing context.
+ * If a factory is not explicitly specified, then a default factory
+ * instance is created and returned. The default factory instance is
+ * configured to be namespace-aware and to apply reasonable security
+ * features.
+ * <p>
+ * The default factory is non-validating and is built afresh on every
+ * call. Secure processing and the entity expansion limit are applied
+ * on a best-effort basis: if the underlying implementation rejects
+ * them, the factory is returned without them.
+ *
+ * @since Apache Tika 1.13
+ * @return DOM parser factory
+ */
+ private DocumentBuilderFactory getDocumentBuilderFactory() {
+ //borrowed from Apache POI
+ DocumentBuilderFactory documentBuilderFactory = get(DocumentBuilderFactory.class);
+ if (documentBuilderFactory != null) {
+ return documentBuilderFactory;
+ }
+ documentBuilderFactory = DocumentBuilderFactory.newInstance();
+ documentBuilderFactory.setNamespaceAware(true);
+ documentBuilderFactory.setValidating(false);
+ tryToSetSAXFeatureOnDOMFactory(documentBuilderFactory,
+ XMLConstants.FEATURE_SECURE_PROCESSING, true);
+ tryToSetXercesManager(documentBuilderFactory);
+ return documentBuilderFactory;
+ }
+
+ /**
+ * Returns the DOM builder specified in this parsing context.
+ * If a builder is not explicitly specified, then a builder
+ * instance is created and returned. The builder instance is
+ * configured to apply an {@link #IGNORING_SAX_ENTITY_RESOLVER},
+ * and it sets the ErrorHandler to <code>null</code>.
+ * <p>
+ * A builder found in the context is returned as is; the entity
+ * resolver and error handler are only set on a newly created one.
+ *
+ * @since Apache Tika 1.13
+ * @return DOM Builder
+ * @throws TikaException if the factory cannot create a builder
+ * (wraps the underlying ParserConfigurationException)
+ */
+ public DocumentBuilder getDocumentBuilder() throws TikaException {
+ DocumentBuilder documentBuilder = get(DocumentBuilder.class);
+ if (documentBuilder != null) {
+ return documentBuilder;
+ }
+ try {
+ DocumentBuilderFactory documentBuilderFactory = getDocumentBuilderFactory();
+ documentBuilder = documentBuilderFactory.newDocumentBuilder();
+ documentBuilder.setEntityResolver(IGNORING_SAX_ENTITY_RESOLVER);
+ documentBuilder.setErrorHandler(null);
+ return documentBuilder;
+ } catch (ParserConfigurationException e) {
+ throw new TikaException("XML parser not available", e);
+ }
+ }
+
+ /**
+ * Returns the StAX input factory specified in this parsing context.
+ * If a factory is not explicitly specified, then a default factory
+ * instance is created and returned. The default factory instance is
+ * configured to be namespace-aware and to apply reasonable security
+ * using the {@link #IGNORING_STAX_ENTITY_RESOLVER}.
+ * <p>
+ * Namespace awareness (on) and validation (off) are requested on a
+ * best-effort basis: a factory that does not support one of these
+ * properties keeps its own default for it. A factory found in the
+ * context is returned as is, without any of this configuration.
+ *
+ * @since Apache Tika 1.13
+ * @return StAX input factory
+ */
+ public XMLInputFactory getXMLInputFactory() {
+ XMLInputFactory factory = get(XMLInputFactory.class);
+ if (factory != null) {
+ return factory;
+ }
+ factory = XMLInputFactory.newFactory();
+
+ tryToSetStaxProperty(factory, XMLInputFactory.IS_NAMESPACE_AWARE, true);
+ tryToSetStaxProperty(factory, XMLInputFactory.IS_VALIDATING, false);
+
+ factory.setXMLResolver(IGNORING_STAX_ENTITY_RESOLVER);
+ return factory;
+ }
+
+ private static void tryToSetSAXFeatureOnDOMFactory(DocumentBuilderFactory dbf, String feature, boolean value) {
+ try {
+ dbf.setFeature(feature, value);
+ } catch (Exception|AbstractMethodError e) { // best effort: a factory that rejects the feature or does not implement setFeature is left unchanged
+ }
+ }
+
+ private static void tryToSetXercesManager(DocumentBuilderFactory dbf) {
+ // Try built-in JVM one first, standalone if not
+ for (String securityManagerClassName : new String[] {
+ "com.sun.org.apache.xerces.internal.util.SecurityManager",
+ "org.apache.xerces.util.SecurityManager"
+ }) {
+ try {
+ Object mgr = Class.forName(securityManagerClassName).newInstance();
+ Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE);
+ setLimit.invoke(mgr, 4096); // cap entity expansion at 4096 on the reflectively created manager
+ dbf.setAttribute("http://apache.org/xml/properties/security-manager", mgr);
+ // Stop once one can be setup without error
+ return;
+ } catch (Throwable t) { // this candidate class is missing or unusable: move on to the next one; if none works the factory is left without a manager
+ }
+ }
+ }
+
+ private void tryToSetStaxProperty(XMLInputFactory factory, String key, boolean value) {
+ try {
+ factory.setProperty(key, value);
+ } catch (IllegalArgumentException e) {
+ //swallow: best effort, a factory that rejects this property keeps its own setting
+ }
+ }
+
}