You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2015/11/04 17:00:33 UTC

svn commit: r1712572 - in /tika/trunk: tika-batch/src/main/java/org/apache/tika/batch/ tika-batch/src/test/java/org/apache/tika/batch/fs/ tika-batch/src/test/resources/ tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/ tika-core/...

Author: tallison
Date: Wed Nov  4 16:00:22 2015
New Revision: 1712572

URL: http://svn.apache.org/viewvc?rev=1712572&view=rev
Log:
TIKA-1786 -- clean up logging in tika-batch

Added:
    tika/trunk/tika-batch/src/test/resources/log4j-on.properties
    tika/trunk/tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/
    tika/trunk/tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/test0_bad_chars.xml
    tika/trunk/tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/test1_ok.xml
Modified:
    tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/FileResourceConsumer.java
    tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java
    tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/FSBatchTestBase.java
    tika/trunk/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java

Modified: tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/FileResourceConsumer.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/FileResourceConsumer.java?rev=1712572&r1=1712571&r2=1712572&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/FileResourceConsumer.java (original)
+++ tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/FileResourceConsumer.java Wed Nov  4 16:00:22 2015
@@ -17,9 +17,6 @@ package org.apache.tika.batch;
  * limitations under the License.
  */
 
-import javax.xml.stream.XMLOutputFactory;
-import javax.xml.stream.XMLStreamException;
-import javax.xml.stream.XMLStreamWriter;
 import java.io.Closeable;
 import java.io.Flushable;
 import java.io.IOException;
@@ -35,9 +32,13 @@ import java.util.concurrent.atomic.Atomi
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.SafeContentHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
 
 
 /**
@@ -77,7 +78,6 @@ public abstract class FileResourceConsum
 
     private final ArrayBlockingQueue<FileResource> fileQueue;
 
-    private final XMLOutputFactory xmlOutputFactory = XMLOutputFactory.newFactory();
     private final int consumerId;
 
     //used to lock checks on state to prevent
@@ -284,34 +284,31 @@ public abstract class FileResourceConsum
      */
     protected String getXMLifiedLogMsg(String type, String resourceId, Throwable t, String... attrs) {
 
-        StringWriter writer = new StringWriter();
+        ContentHandler toXML = new ToXMLContentHandler();
+        SafeContentHandler handler = new SafeContentHandler(toXML);
+        AttributesImpl attributes = new AttributesImpl();
+        attributes.addAttribute("", "resourceId", "resourceId", "", resourceId);
+        for (int i = 0; i < attrs.length - 1; i++) {
+            attributes.addAttribute("", attrs[i], attrs[i], "", attrs[i + 1]);
+        }
         try {
-            XMLStreamWriter xml = xmlOutputFactory.createXMLStreamWriter(writer);
-            xml.writeStartDocument();
-            xml.writeStartElement(type);
-            xml.writeAttribute("resourceId", resourceId);
-            if (attrs != null) {
-                //this assumes args has name value pairs alternating, name0 at 0, val0 at 1, name1 at 2, val2 at 3, etc.
-                for (int i = 0; i < attrs.length - 1; i++) {
-                    xml.writeAttribute(attrs[i], attrs[i + 1]);
-                }
-            }
+            handler.startDocument();
+            handler.startElement("", type, type, attributes);
             if (t != null) {
                 StringWriter stackWriter = new StringWriter();
                 PrintWriter printWriter = new PrintWriter(stackWriter);
                 t.printStackTrace(printWriter);
                 printWriter.flush();
                 stackWriter.flush();
-                xml.writeCharacters(stackWriter.toString());
+                char[] chars = stackWriter.toString().toCharArray();
+                handler.characters(chars, 0, chars.length);
             }
-            xml.writeEndElement();
-            xml.writeEndDocument();
-            xml.flush();
-            xml.close();
-        } catch (XMLStreamException e) {
-            logger.error("error writing xml stream for: " + resourceId, t);
+            handler.endElement("", type, type);
+            handler.endDocument();
+        } catch (SAXException e) {
+            logger.warn("error writing xml stream for: " + resourceId, t);
         }
-        return writer.toString();
+        return handler.toString();
     }
 
     private FileResource getNextFileResource() throws InterruptedException {
@@ -356,7 +353,7 @@ public abstract class FileResourceConsum
             try {
                 closeable.close();
             } catch (IOException e){
-                logger.error(e.getMessage());
+                logger.warn(e.getMessage());
             }
         }
         closeable = null;
@@ -370,7 +367,7 @@ public abstract class FileResourceConsum
             try {
                 ((Flushable)closeable).flush();
             } catch (IOException e) {
-                logger.error(e.getMessage());
+                logger.warn(e.getMessage());
             }
         }
         close(closeable);

Modified: tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java?rev=1712572&r1=1712571&r2=1712572&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java (original)
+++ tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java Wed Nov  4 16:00:22 2015
@@ -291,24 +291,53 @@ public class BatchProcessTest extends FS
         assertFalse(Files.exists(test2));
     }
 
+    @Test
+    public void testHandlingOfIllegalXMLCharsInException() throws Exception {
+        //tests that a parse exception whose message contains characters
+        //that are illegal in XML is still reported as a parse_ex log entry
+        //and does not trigger the "error writing xml stream for" warning.
+        Path outputDir = getNewOutputDir("illegal_xml_chars_in_exception");
+
+        Map<String, String> args = getDefaultArgs("illegal_xml_chars_in_exception", outputDir);
+        args.put("numConsumers", "1");
+        args.put("recursiveParserWrapper", "true");
+        args.put("basicHandlerType", "text");
+        args.put("outputSuffix", "json");
+
+        BatchProcessTestExecutor ex = new BatchProcessTestExecutor(args,
+                "/tika-batch-config-MockConsumersBuilder.xml",
+                "/log4j-on.properties");
+        StreamStrings ss = ex.execute();
+        assertFalse(ss.getOutString().contains("error writing xml stream for"));
+        assertContains("parse_ex resourceId=\"test0_bad_chars.xml\"", ss.getOutString());
+    }
+
     private class BatchProcessTestExecutor {
         private final Map<String, String> args;
         private final String configPath;
+        private final String loggerProps;
         private int exitValue = Integer.MIN_VALUE;
 
         public BatchProcessTestExecutor(Map<String, String> args) {
             this(args, "/tika-batch-config-test.xml");
         }
 
+
+
         public BatchProcessTestExecutor(Map<String, String> args, String configPath) {
+            this(args, configPath, "/log4j_process.properties");
+        }
+
+        public BatchProcessTestExecutor(Map<String, String> args, String configPath, String loggerProps) {
             this.args = args;
             this.configPath = configPath;
+            this.loggerProps = loggerProps;
         }
 
         private StreamStrings execute() {
             Process p = null;
             try {
-                ProcessBuilder b = getNewBatchRunnerProcess(configPath, args);
+                ProcessBuilder b = getNewBatchRunnerProcess(configPath, loggerProps, args);
                 p = b.start();
                 StringStreamGobbler errorGobbler = new StringStreamGobbler(p.getErrorStream());
                 StringStreamGobbler outGobbler = new StringStreamGobbler(p.getInputStream());

Modified: tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/FSBatchTestBase.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/FSBatchTestBase.java?rev=1712572&r1=1712571&r2=1712572&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/FSBatchTestBase.java (original)
+++ tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/FSBatchTestBase.java Wed Nov  4 16:00:22 2015
@@ -150,23 +150,25 @@ public abstract class FSBatchTestBase ex
         return runner;
     }
 
-    public ProcessBuilder getNewBatchRunnerProcess(String testConfig, Map<String, String> args) {
+    public ProcessBuilder getNewBatchRunnerProcess(String testConfig, String loggerProps,
+                                                   Map<String, String> args) {
         List<String> argList = new ArrayList<>();
+
         for (Map.Entry<String, String> e : args.entrySet()) {
             argList.add("-"+e.getKey());
             argList.add(e.getValue());
         }
 
-        String[] fullCommandLine = commandLine(testConfig,
+        String[] fullCommandLine = commandLine(testConfig, loggerProps,
                 argList.toArray(new String[argList.size()]));
         return new ProcessBuilder(fullCommandLine);
     }
 
-    private String[] commandLine(String testConfig, String[] args) {
+    private String[] commandLine(String testConfig, String loggerProps, String[] args) {
         List<String> commandLine = new ArrayList<>();
         commandLine.add("java");
         commandLine.add("-Dlog4j.configuration=file:"+
-            this.getClass().getResource("/log4j_process.properties").getFile());
+            this.getClass().getResource(loggerProps).getFile());
         commandLine.add("-Xmx128m");
         commandLine.add("-cp");
         String cp = System.getProperty("java.class.path");

Added: tika/trunk/tika-batch/src/test/resources/log4j-on.properties
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/log4j-on.properties?rev=1712572&view=auto
==============================================================================
--- tika/trunk/tika-batch/src/test/resources/log4j-on.properties (added)
+++ tika/trunk/tika-batch/src/test/resources/log4j-on.properties Wed Nov  4 16:00:22 2015
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#This is used by the batch process; see log4j.properties for the driver
+
+log4j.rootLogger=WARN,A1
+
+#for debugging
+#log4j.rootLogger=TRACE,A1
+
+log4j.appender.A1=org.apache.log4j.ConsoleAppender
+
+# A1 uses PatternLayout.
+log4j.appender.A1.layout=org.apache.log4j.PatternLayout
+log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n

Added: tika/trunk/tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/test0_bad_chars.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/test0_bad_chars.xml?rev=1712572&view=auto
==============================================================================
--- tika/trunk/tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/test0_bad_chars.xml (added)
+++ tika/trunk/tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/test0_bad_chars.xml Wed Nov  4 16:00:22 2015
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<mock>
+    <metadata action="add" name="author">Nikolai Lobachevsky</metadata>
+    <write element="p">some content</write>
+    <throwIllegalChars />
+</mock>
\ No newline at end of file

Added: tika/trunk/tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/test1_ok.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/test1_ok.xml?rev=1712572&view=auto
==============================================================================
--- tika/trunk/tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/test1_ok.xml (added)
+++ tika/trunk/tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/test1_ok.xml Wed Nov  4 16:00:22 2015
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<mock>
+    <metadata action="add" name="author">Nikolai Lobachevsky</metadata>
+    <write element="p">This is tika-batch's first test file.</write>
+</mock>
\ No newline at end of file

Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java?rev=1712572&r1=1712571&r2=1712572&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java Wed Nov  4 16:00:22 2015
@@ -18,6 +18,8 @@ package org.apache.tika.parser.mock;
  */
 
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;
@@ -49,8 +51,6 @@ import org.w3c.dom.NodeList;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
-import static java.nio.charset.StandardCharsets.UTF_8;
-
 /**
  * This class enables mocking of parser behavior for use in testing
  * wrappers and drivers of parsers.
@@ -123,11 +123,17 @@ public class MockParser extends Abstract
             print(action, name);
         } else if ("embedded".equals(name)) {
             handleEmbedded(action, xhtml, context);
+        } else if ("throwIllegalChars".equals(name)) {
+            throwIllegalChars();
         } else {
             throw new IllegalArgumentException("Didn't recognize mock action: "+name);
         }
     }
 
+    private void throwIllegalChars() throws IOException {
+        throw new IOException("Can't say \u0000 in xml or \u0001 or \u0002 or \u0003");
+    }
+
     private void handleEmbedded(Node action, XHTMLContentHandler handler, ParseContext context)
             throws TikaException, SAXException, IOException {
         String fileName = "";