You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by th...@apache.org on 2016/06/02 03:16:04 UTC

[05/12] tika git commit: merged upstream changes and resolved conflicts

merged upstream changes and resolved conflicts


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/e780d566
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/e780d566
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/e780d566

Branch: refs/heads/TIKA-1508
Commit: e780d56652d48dd0f50b4e62a58153e95f055022
Parents: 0d69ca7 bb46c0e
Author: Thamme Gowda <tg...@gmail.com>
Authored: Mon May 23 11:30:13 2016 -0700
Committer: Thamme Gowda <tg...@gmail.com>
Committed: Mon May 23 11:30:13 2016 -0700

----------------------------------------------------------------------
 CHANGES.txt                                     |   85 +-
 pom.xml                                         |    3 +-
 tika-app/pom.xml                                |    7 +-
 .../main/appended-resources/META-INF/LICENSE    |  227 -
 .../tika/cli/BatchCommandLineBuilder.java       |    7 -
 .../main/java/org/apache/tika/cli/TikaCLI.java  |   50 +-
 .../main/resources/tika-app-batch-config.xml    |   10 +-
 .../tika/cli/TikaCLIBatchCommandLineTest.java   |    1 -
 .../java/org/apache/tika/cli/TikaCLITest.java   |   16 -
 tika-batch/pom.xml                              |    4 +-
 .../batch/builders/BatchProcessBuilder.java     |   15 +-
 .../builders/CommandLineParserBuilder.java      |   16 +-
 .../apache/tika/batch/fs/FSBatchProcessCLI.java |    4 +-
 .../builders/BasicTikaFSConsumersBuilder.java   |   51 +-
 .../tika/batch/fs/default-tika-batch-config.xml |   50 +-
 .../apache/tika/batch/fs/BatchProcessTest.java  |   19 +-
 .../tika/batch/fs/HandlerBuilderTest.java       |    4 -
 .../tika-batch-config-MockConsumersBuilder.xml  |    2 +-
 .../test/resources/tika-batch-config-broken.xml |    2 +-
 .../tika-batch-config-test-suffix-override.xml  |  112 +
 .../test/resources/tika-batch-config-test.xml   |    2 +-
 tika-bundle/pom.xml                             |    6 +-
 .../main/appended-resources/META-INF/LICENSE    |  226 -
 tika-core/pom.xml                               |    7 +-
 .../java/org/apache/tika/config/TikaConfig.java |   26 +-
 .../tika/config/TikaConfigSerializer.java       |    4 +-
 .../org/apache/tika/detect/NameDetector.java    |   15 +-
 .../tika/detect/ZeroSizeFileDetector.java       |   45 +
 .../java/org/apache/tika/fork/ForkClient.java   |   10 +-
 .../tika/language/LanguageIdentifier.java       |    7 +-
 .../apache/tika/language/LanguageProfile.java   |    2 +
 .../tika/language/LanguageProfilerBuilder.java  |    9 +-
 .../apache/tika/language/ProfilingHandler.java  |    3 +-
 .../apache/tika/language/ProfilingWriter.java   |    2 +
 .../language/detect/LanguageConfidence.java     |   25 +
 .../tika/language/detect/LanguageDetector.java  |  239 +
 .../tika/language/detect/LanguageHandler.java   |   66 +
 .../tika/language/detect/LanguageNames.java     |   86 +
 .../tika/language/detect/LanguageResult.java    |   98 +
 .../tika/language/detect/LanguageWriter.java    |   78 +
 .../org/apache/tika/language/package-info.java  |   22 -
 .../tika/metadata/TikaCoreProperties.java       |    9 +
 .../java/org/apache/tika/mime/MediaType.java    |    3 +
 .../org/apache/tika/mime/MediaTypeRegistry.java |    2 +
 .../org/apache/tika/mime/MimeTypesReader.java   |   20 +-
 .../org/apache/tika/parser/NetworkParser.java   |    4 +-
 .../org/apache/tika/parser/ParseContext.java    |  169 +-
 .../org/apache/tika/parser/ParserDecorator.java |   35 +-
 .../tika/parser/external/ExternalParser.java    |    8 +-
 .../external/ExternalParsersConfigReader.java   |   11 +-
 .../tika/sax/BasicContentHandlerFactory.java    |    8 +
 .../src/main/java/org/apache/tika/sax/Link.java |    4 +
 .../java/org/apache/tika/sax/LinkBuilder.java   |    6 +-
 .../org/apache/tika/sax/LinkContentHandler.java |   18 +-
 .../resources/org/apache/tika/language/be.ngp   |    0
 .../resources/org/apache/tika/language/ca.ngp   |    0
 .../resources/org/apache/tika/language/eo.ngp   |    0
 .../resources/org/apache/tika/language/gl.ngp   |    0
 .../resources/org/apache/tika/language/ro.ngp   |    0
 .../resources/org/apache/tika/language/sk.ngp   |    0
 .../resources/org/apache/tika/language/sl.ngp   |    0
 .../resources/org/apache/tika/language/uk.ngp   |    0
 .../org/apache/tika/mime/tika-mimetypes.xml     |   93 +-
 .../src/test/java/org/apache/tika/TikaTest.java |   59 +-
 .../apache/tika/detect/NameDetectorTest.java    |   10 +
 .../tika/detect/ZeroSizeFileDetectorTest.java   |   64 +
 .../tika/language/LanguageIdentifierTest.java   |    1 +
 .../tika/language/LanguageProfileTest.java      |    7 +-
 .../language/LanguageProfilerBuilderTest.java   |    1 +
 .../tika/language/ProfilingWriterTest.java      |    5 +-
 .../tika/language/detect/LanguageNamesTest.java |   38 +
 .../org/apache/tika/parser/mock/MockParser.java |   12 +-
 .../apache/tika/sax/LinkContentHandlerTest.java |   36 +-
 .../tika/language/langbuilder/welsh_corpus.txt  | 5204 +++++++++---------
 tika-example/pom.xml                            |   16 +-
 .../java/org/apache/tika/example/Language.java  |   32 +-
 .../tika/example/LanguageDetectingParser.java   |   16 +-
 .../tika/example/LanguageDetectorExample.java   |   33 +
 .../tika/example/LanguageIdentifierExample.java |   27 -
 .../org/apache/tika/example/MyFirstTika.java    |   13 +-
 .../org/apache/tika/example/ParsingExample.java |   14 +-
 .../example/LanguageDetectorExampleTest.java    |   39 +
 .../example/LanguageIdentifierExampleTest.java  |   37 -
 tika-java7/pom.xml                              |    2 +-
 tika-langdetect/pom.xml                         |  171 +
 .../tika/langdetect/OptimaizeLangDetector.java  |  196 +
 .../tika/langdetect/TextLangDetector.java       |  146 +
 ...apache.tika.language.detect.LanguageDetector |   16 +
 .../tika/langdetect/LanguageDetectorTest.java   |   92 +
 .../langdetect/OptimaizeLangDetectorTest.java   |  265 +
 .../tika/langdetect/TextLangDetectorTest.java   |   59 +
 .../src/test/resources/log4j.properties         |   24 +
 .../apache/tika/langdetect/language-codes.txt   |  186 +
 .../tika/langdetect/language-tests/da.test      |  108 +
 .../tika/langdetect/language-tests/de.test      |  104 +
 .../tika/langdetect/language-tests/el.test      |  109 +
 .../tika/langdetect/language-tests/en.test      |  105 +
 .../tika/langdetect/language-tests/es.test      |  107 +
 .../tika/langdetect/language-tests/et.test      |   17 +
 .../tika/langdetect/language-tests/fi.test      |  106 +
 .../tika/langdetect/language-tests/fr.test      |  105 +
 .../tika/langdetect/language-tests/it.test      |  109 +
 .../tika/langdetect/language-tests/ja.test      |   78 +
 .../tika/langdetect/language-tests/lt.test      |   32 +
 .../tika/langdetect/language-tests/nl.test      |  105 +
 .../tika/langdetect/language-tests/pt.test      |  105 +
 .../tika/langdetect/language-tests/sv.test      |  108 +
 .../tika/langdetect/language-tests/th.test      |   28 +
 .../tika/langdetect/language-tests/zh.test      |   57 +
 .../org/apache/tika/langdetect/text-test.tsv    |   18 +
 .../org/apache/tika/langdetect/udhr-known.txt   |   11 +
 .../org/apache/tika/langdetect/udhr-unknown.txt |    4 +
 tika-parent/pom.xml                             |   29 +-
 tika-parsers/pom.xml                            |   53 +-
 .../tika/parser/code/SourceCodeParser.java      |  142 +-
 .../tika/parser/epub/EpubContentParser.java     |   33 +-
 .../org/apache/tika/parser/epub/EpubParser.java |    8 +-
 .../parser/executable/ExecutableParser.java     |    2 +-
 .../tika/parser/font/AdobeFontMetricParser.java |   16 +-
 .../apache/tika/parser/font/TrueTypeParser.java |    4 +-
 .../geoinfo/GeographicInformationParser.java    |   30 +-
 .../apache/tika/parser/html/HtmlHandler.java    |    3 +
 .../apache/tika/parser/image/ICNSParser.java    |  117 +
 .../org/apache/tika/parser/image/ICNSType.java  |  170 +
 .../parser/image/ImageMetadataExtractor.java    |   45 +-
 .../tika/parser/image/xmp/JempboxExtractor.java |   75 +-
 .../tika/parser/isatab/ISArchiveParser.java     |   62 +-
 .../tika/parser/jdbc/AbstractDBParser.java      |   13 +-
 .../tika/parser/jdbc/JDBCTableReader.java       |   68 +-
 .../tika/parser/jdbc/SQLite3DBParser.java       |   31 +-
 .../apache/tika/parser/jdbc/SQLite3Parser.java  |    6 +-
 .../tika/parser/jdbc/SQLite3TableReader.java    |   45 +-
 .../apache/tika/parser/journal/TEIParser.java   |    8 +-
 .../tika/parser/mail/MailContentHandler.java    |  110 +-
 .../org/apache/tika/parser/mat/MatParser.java   |   27 +-
 .../tika/parser/microsoft/HSLFExtractor.java    |   14 +
 .../tika/parser/microsoft/OfficeParser.java     |    3 +-
 .../microsoft/POIFSContainerDetector.java       |   21 +-
 .../tika/parser/microsoft/WordExtractor.java    |   11 +-
 .../microsoft/ooxml/AbstractOOXMLExtractor.java |   22 +-
 .../ooxml/XSLFPowerPointExtractorDecorator.java |   58 +-
 .../ooxml/XSSFExcelExtractorDecorator.java      |  112 +-
 .../microsoft/xml/AbstractXML2003Parser.java    |   86 +
 .../parser/microsoft/xml/HyperlinkHandler.java  |   96 +
 .../microsoft/xml/SpreadsheetMLParser.java      |  161 +
 .../tika/parser/microsoft/xml/WordMLParser.java |  229 +
 .../parser/mp4/DirectFileReadDataSource.java    |   34 +-
 .../org/apache/tika/parser/mp4/MP4Parser.java   |  379 +-
 .../parser/ner/grobid/GrobidNERecogniser.java   |  240 +
 .../parser/ner/mitie/MITIENERecogniser.java     |  160 +
 .../tika/parser/ner/nltk/NLTKNERecogniser.java  |   19 +-
 .../apache/tika/parser/netcdf/NetCDFParser.java |   20 +-
 .../parser/odf/OpenDocumentContentParser.java   |   37 +-
 .../tika/parser/odf/OpenDocumentParser.java     |   62 +-
 .../org/apache/tika/parser/pdf/PDF2XHTML.java   |  244 +-
 .../parser/pdf/PDFEncodedStringDecoder.java     |   14 +-
 .../org/apache/tika/parser/pdf/PDFParser.java   |  143 +-
 .../apache/tika/parser/pdf/PDFParserConfig.java |   67 +-
 .../apache/tika/parser/pdf/XFAExtractor.java    |   30 +-
 .../tika/parser/pot/PooledTimeSeriesParser.java |  394 +-
 .../services/org.apache.tika.parser.Parser      |    5 +-
 .../parser/ner/grobid/GrobidServer.properties   |   17 +
 .../apache/tika/parser/pdf/PDFParser.properties |    4 +-
 .../org/apache/tika/mime/TestMimeTypes.java     |   21 +-
 .../parser/executable/ExecutableParserTest.java |   73 +-
 .../GeographicInformationParserTest.java        |   48 +-
 .../apache/tika/parser/html/HtmlParserTest.java |   38 +
 .../tika/parser/image/ICNSParserTest.java       |   65 +
 .../tika/parser/image/ImageParserTest.java      |    5 +-
 .../tika/parser/jdbc/SQLite3ParserTest.java     |  106 +-
 .../apache/tika/parser/jpeg/JpegParserTest.java |   21 +-
 .../tika/parser/mail/RFC822ParserTest.java      |  115 +
 .../AbstractPOIContainerExtractionTest.java     |    4 +-
 .../tika/parser/microsoft/ExcelParserTest.java  |   18 +-
 .../microsoft/POIContainerExtractionTest.java   |   35 +-
 .../parser/microsoft/PowerPointParserTest.java  |    2 +-
 .../ooxml/OOXMLContainerExtractionTest.java     |   23 +-
 .../parser/microsoft/ooxml/OOXMLParserTest.java |   24 +-
 .../parser/microsoft/xml/XML2003ParserTest.java |   81 +
 .../apache/tika/parser/mp4/MP4ParserTest.java   |   12 +-
 .../apache/tika/parser/odf/ODFParserTest.java   |   10 +-
 .../apache/tika/parser/pdf/PDFParserTest.java   |  562 +-
 .../resources/test-documents/testEXCEL2003.xml  |  100 +
 .../test-documents/testEXCEL_hyperlinks.xls     |  Bin 0 -> 29696 bytes
 .../test-documents/testEXCEL_hyperlinks.xlsx    |  Bin 0 -> 10038 bytes
 .../test/resources/test-documents/testHFA.hfa   |  Bin 0 -> 1024 bytes
 .../test/resources/test-documents/testICNS.icns |  Bin 0 -> 2472 bytes
 .../test-documents/testICNS_basic.icns          |  Bin 0 -> 18199 bytes
 .../resources/test-documents/testKeynoteNew.key |  Bin 0 -> 274397 bytes
 .../test/resources/test-documents/testMIF.mif   |  Bin 0 -> 10240 bytes
 .../test-documents/testMP4_truncated.m4a        |  Bin 0 -> 74 bytes
 .../testMSChart-govdocs-428996.ppt              |  Bin 0 -> 41472 bytes
 .../testMSChart-govdocs-428996.pptx             |  Bin 0 -> 56224 bytes
 .../testMSChart-govdocs-428996.xls              |  Bin 0 -> 35328 bytes
 .../testMSChart-govdocs-428996.xlsx             |  Bin 0 -> 17112 bytes
 .../test-documents/testNumbersNew.numbers       |  Bin 0 -> 179147 bytes
 .../resources/test-documents/testODTNoMeta.odt  |  Bin 0 -> 5847 bytes
 .../test-documents/testPDF_bad_page_303226.pdf  |  Bin 0 -> 138027 bytes
 .../resources/test-documents/testPagesNew.pages |  Bin 0 -> 237567 bytes
 .../test-documents/testRFC822_date_utf8         |    8 +
 .../resources/test-documents/testRFC822_eml     |   33 +
 .../resources/test-documents/testSqlite3b.db    |  Bin 27648 -> 27648 bytes
 .../resources/test-documents/testWORD2003.xml   | 2542 +++++++++
 tika-serialization/pom.xml                      |    4 +-
 tika-server/pom.xml                             |   45 +-
 .../tika/server/resource/LanguageResource.java  |   27 +-
 .../tika/server/resource/MetadataResource.java  |    9 +-
 .../resource/RecursiveMetadataResource.java     |    7 +-
 .../tika/server/resource/TranslateResource.java |   22 +-
 .../org/apache/tika/server/CXFTestBase.java     |   26 +-
 tika-translate/pom.xml                          |    9 +-
 .../language/translate/AbstractTranslator.java  |   32 +
 .../language/translate/CachedTranslator.java    |   20 +-
 .../language/translate/ExternalTranslator.java  |   13 +-
 .../language/translate/GoogleTranslator.java    |   20 +-
 .../language/translate/Lingo24Translator.java   |   20 +-
 .../language/translate/MosesTranslator.java     |    7 +-
 .../language/translate/YandexTranslator.java    |  175 +
 .../translate/translator.yandex.properties      |   24 +
 .../translate/YandexTranslatorTest.java         |  105 +
 tika-xmp/pom.xml                                |    2 +-
 221 files changed, 13467 insertions(+), 5115 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/e780d566/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
----------------------------------------------------------------------
diff --cc tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index caa916a,0e3acd9..896b51b
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@@ -35,12 -35,6 +37,8 @@@ import java.util.Map
  import java.util.Set;
  import java.util.concurrent.ExecutorService;
  
- import javax.imageio.spi.ServiceRegistry;
- import javax.xml.parsers.DocumentBuilder;
- import javax.xml.parsers.DocumentBuilderFactory;
- import javax.xml.parsers.ParserConfigurationException;
- 
 +import org.apache.tika.base.Configurable;
++
  import org.apache.tika.concurrent.ConfigurableThreadPoolExecutor;
  import org.apache.tika.concurrent.SimpleThreadPoolExecutor;
  import org.apache.tika.detect.CompositeDetector;

http://git-wip-us.apache.org/repos/asf/tika/blob/e780d566/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
----------------------------------------------------------------------
diff --cc tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
index 20607d9,2521cc9..e58f5c8
--- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
@@@ -43,11 -53,22 +53,27 @@@ public class ParseContext implements Se
  
      /** Map of objects in this context */
      private final Map<String, Object> context = new HashMap<String, Object>();
+ 
 +    /**
 +     * Map of configurable arguments, keyed by parameter name
 +     */
 +    private final Map<String, String> params = new HashMap<>();
 +
+     private static final EntityResolver IGNORING_SAX_ENTITY_RESOLVER = new EntityResolver() {
+         public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException {
+             return new InputSource(new StringReader("")); // every entity resolves to empty content; publicId/systemId are not consulted
+         }
+     };
+ 
+     private static final XMLResolver IGNORING_STAX_ENTITY_RESOLVER =
+             new XMLResolver() {
+                 @Override
+                 public Object resolveEntity(String publicID, String systemID, String baseURI, String namespace) throws
+                         XMLStreamException {
+                     return ""; // every entity resolves to an empty string; none of the arguments are consulted
+                 }
+             };
+ 
      /**
       * Adds the given value to the context as an implementation of the given
       * interface.
@@@ -150,36 -196,111 +201,144 @@@
      }
  
      /**
 +     * Stores a key=value parameter, replacing any value already stored under the key
 +     * @param key parameter name
 +     * @param value parameter value
 +     */
 +    public void setParam(String key, String value){
 +        this.params.put(key, value);
 +    }
 +
 +    /**
 +     * Returns the value associated with the given parameter, or {@code null} if it has not been set
 +     * @param key parameter name
 +     */
 +    public String getParam(String key){
 +        return this.params.get(key);
 +    }
 +
 +    /**
 +     * Gets all the parameters stored in this context
 +     * @return the backing map of parameter names to values (the internal map itself, not a copy)
 +     */
 +    public Map<String, String> getParams() {
 +        return params;
 +    }
 +
 +    /**
 +     * Checks whether a parameter with the given name has been stored
 +     * @param key parameter name
 +     * @return true if the parameter map contains the key, false otherwise
 +     */
 +    public boolean hasParam(String key){
 +       return params.containsKey(key);
 +    }
++    /**
+      * Returns the DOM builder factory specified in this parsing context.
+      * If a factory is not explicitly specified, then a default factory
+      * instance is created and returned. The default factory instance is
+      * namespace-aware and non-validating; secure processing and an entity
+      * expansion limit are requested, but failures to set them are ignored.
+      *
+      * @since Apache Tika 1.13
+      * @return DOM parser factory
+      */
+     private DocumentBuilderFactory getDocumentBuilderFactory() {
+         //borrowed from Apache POI
+         DocumentBuilderFactory documentBuilderFactory = get(DocumentBuilderFactory.class);
+         if (documentBuilderFactory != null) {
+             return documentBuilderFactory; // a factory registered in the context is returned as-is
+         }
+         documentBuilderFactory = DocumentBuilderFactory.newInstance();
+         documentBuilderFactory.setNamespaceAware(true);
+         documentBuilderFactory.setValidating(false);
+         tryToSetSAXFeatureOnDOMFactory(documentBuilderFactory,
+             XMLConstants.FEATURE_SECURE_PROCESSING, true);
+         tryToSetXercesManager(documentBuilderFactory);
+         return documentBuilderFactory;
+     }
+ 
+     /**
+      * Returns the DOM builder specified in this parsing context.
+      * If a builder is not explicitly specified, a new one is created,
+      * configured with the {@link #IGNORING_SAX_ENTITY_RESOLVER} and with
+      * its ErrorHandler set to <code>null</code>, and returned.
+      *
+      * @since Apache Tika 1.13
+      * @return DOM Builder
+      * @throws TikaException if a new builder cannot be created
+      */
+     public DocumentBuilder getDocumentBuilder() throws TikaException {
+         DocumentBuilder documentBuilder = get(DocumentBuilder.class);
+         if (documentBuilder != null) {
+             return documentBuilder; // a builder registered in the context is returned as-is
+         }
+         try {
+             DocumentBuilderFactory documentBuilderFactory = getDocumentBuilderFactory();
+             documentBuilder = documentBuilderFactory.newDocumentBuilder();
+             documentBuilder.setEntityResolver(IGNORING_SAX_ENTITY_RESOLVER);
+             documentBuilder.setErrorHandler(null);
+             return documentBuilder;
+         } catch (ParserConfigurationException e) {
+             throw new TikaException("XML parser not available", e);
+         }
+     }
+ 
+     /**
+      * Returns the StAX input factory specified in this parsing context.
+      * If a factory is not explicitly specified, then a default factory
+      * instance is created and returned. The default factory is asked to be
+      * namespace-aware and non-validating (rejected properties are ignored)
+      * and resolves entities with the {@link #IGNORING_STAX_ENTITY_RESOLVER}.
+      *
+      * @since Apache Tika 1.13
+      * @return StAX input factory
+      */
+     public XMLInputFactory getXMLInputFactory() {
+         XMLInputFactory factory = get(XMLInputFactory.class);
+         if (factory != null) {
+             return factory; // a factory registered in the context is returned as-is
+         }
+         factory = XMLInputFactory.newFactory();
+ 
+         tryToSetStaxProperty(factory, XMLInputFactory.IS_NAMESPACE_AWARE, true);
+         tryToSetStaxProperty(factory, XMLInputFactory.IS_VALIDATING, false);
+ 
+         factory.setXMLResolver(IGNORING_STAX_ENTITY_RESOLVER);
+         return factory;
+     }
+ 
+     private static void tryToSetSAXFeatureOnDOMFactory(DocumentBuilderFactory dbf, String feature, boolean value) {
+         try {
+             dbf.setFeature(feature, value);
+         } catch (Exception|AbstractMethodError e) { // best effort: a factory that rejects the feature is left unchanged
+         }
+     }
+ 
+     private static void tryToSetXercesManager(DocumentBuilderFactory dbf) {
+         // Try the JVM's built-in Xerces security manager first, then the standalone Xerces one
+         for (String securityManagerClassName : new String[] {
+                 "com.sun.org.apache.xerces.internal.util.SecurityManager",
+                 "org.apache.xerces.util.SecurityManager"
+         }) {
+             try {
+                 Object mgr = Class.forName(securityManagerClassName).newInstance();
+                 Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE);
+                 setLimit.invoke(mgr, 4096); // reflective call to setEntityExpansionLimit(4096)
+                 dbf.setAttribute("http://apache.org/xml/properties/security-manager", mgr);
+                 // Stop once one has been set up without error
+                 return;
+             } catch (Throwable t) { // best effort: on any failure, move on to the next candidate class
+             }
+         }
+     }
+ 
+     private void tryToSetStaxProperty(XMLInputFactory factory, String key, boolean value) {
+         try {
+             factory.setProperty(key, value);
+         } catch (IllegalArgumentException e) {
+             //swallow: best effort, the factory rejected this property so it is left as it was
+         }
+     }
+ 
  }