You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2015/11/04 17:00:33 UTC
svn commit: r1712572 - in /tika/trunk:
tika-batch/src/main/java/org/apache/tika/batch/
tika-batch/src/test/java/org/apache/tika/batch/fs/
tika-batch/src/test/resources/
tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/
tika-core/...
Author: tallison
Date: Wed Nov 4 16:00:22 2015
New Revision: 1712572
URL: http://svn.apache.org/viewvc?rev=1712572&view=rev
Log:
TIKA-1786 -- clean up logging in tika-batch
Added:
tika/trunk/tika-batch/src/test/resources/log4j-on.properties
tika/trunk/tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/
tika/trunk/tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/test0_bad_chars.xml
tika/trunk/tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/test1_ok.xml
Modified:
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/FileResourceConsumer.java
tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java
tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/FSBatchTestBase.java
tika/trunk/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java
Modified: tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/FileResourceConsumer.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/FileResourceConsumer.java?rev=1712572&r1=1712571&r2=1712572&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/FileResourceConsumer.java (original)
+++ tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/FileResourceConsumer.java Wed Nov 4 16:00:22 2015
@@ -17,9 +17,6 @@ package org.apache.tika.batch;
* limitations under the License.
*/
-import javax.xml.stream.XMLOutputFactory;
-import javax.xml.stream.XMLStreamException;
-import javax.xml.stream.XMLStreamWriter;
import java.io.Closeable;
import java.io.Flushable;
import java.io.IOException;
@@ -35,9 +32,13 @@ import java.util.concurrent.atomic.Atomi
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.SafeContentHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
/**
@@ -77,7 +78,6 @@ public abstract class FileResourceConsum
private final ArrayBlockingQueue<FileResource> fileQueue;
- private final XMLOutputFactory xmlOutputFactory = XMLOutputFactory.newFactory();
private final int consumerId;
//used to lock checks on state to prevent
@@ -284,34 +284,31 @@ public abstract class FileResourceConsum
*/
protected String getXMLifiedLogMsg(String type, String resourceId, Throwable t, String... attrs) {
- StringWriter writer = new StringWriter();
+ ContentHandler toXML = new ToXMLContentHandler();
+ SafeContentHandler handler = new SafeContentHandler(toXML);
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "resourceId", "resourceId", "", resourceId);
+ for (int i = 0; i < attrs.length - 1; i++) {
+ attributes.addAttribute("", attrs[i], attrs[i], "", attrs[i + 1]);
+ }
try {
- XMLStreamWriter xml = xmlOutputFactory.createXMLStreamWriter(writer);
- xml.writeStartDocument();
- xml.writeStartElement(type);
- xml.writeAttribute("resourceId", resourceId);
- if (attrs != null) {
- //this assumes args has name value pairs alternating, name0 at 0, val0 at 1, name1 at 2, val2 at 3, etc.
- for (int i = 0; i < attrs.length - 1; i++) {
- xml.writeAttribute(attrs[i], attrs[i + 1]);
- }
- }
+ handler.startDocument();
+ handler.startElement("", type, type, attributes);
if (t != null) {
StringWriter stackWriter = new StringWriter();
PrintWriter printWriter = new PrintWriter(stackWriter);
t.printStackTrace(printWriter);
printWriter.flush();
stackWriter.flush();
- xml.writeCharacters(stackWriter.toString());
+ char[] chars = stackWriter.toString().toCharArray();
+ handler.characters(chars, 0, chars.length);
}
- xml.writeEndElement();
- xml.writeEndDocument();
- xml.flush();
- xml.close();
- } catch (XMLStreamException e) {
- logger.error("error writing xml stream for: " + resourceId, t);
+ handler.endElement("", type, type);
+ handler.endDocument();
+ } catch (SAXException e) {
+ logger.warn("error writing xml stream for: " + resourceId, t);
}
- return writer.toString();
+ return handler.toString();
}
private FileResource getNextFileResource() throws InterruptedException {
@@ -356,7 +353,7 @@ public abstract class FileResourceConsum
try {
closeable.close();
} catch (IOException e){
- logger.error(e.getMessage());
+ logger.warn(e.getMessage());
}
}
closeable = null;
@@ -370,7 +367,7 @@ public abstract class FileResourceConsum
try {
((Flushable)closeable).flush();
} catch (IOException e) {
- logger.error(e.getMessage());
+ logger.warn(e.getMessage());
}
}
close(closeable);
Modified: tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java?rev=1712572&r1=1712571&r2=1712572&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java (original)
+++ tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/BatchProcessTest.java Wed Nov 4 16:00:22 2015
@@ -291,24 +291,53 @@ public class BatchProcessTest extends FS
assertFalse(Files.exists(test2));
}
+ @Test
+ public void testHandlingOfIllegalXMLCharsInException() throws Exception {
+ //tests that illegal XML characters in an exception's message
+ //do not break the XML-ified log message: the "error writing xml stream"
+ //warning must not appear and the parse_ex entry must still be logged.
+ Path outputDir = getNewOutputDir("illegal_xml_chars_in_exception");
+
+ Map<String, String> args = getDefaultArgs("illegal_xml_chars_in_exception", outputDir);
+ args.put("numConsumers", "1");
+ args.put("recursiveParserWrapper", "true");
+ args.put("basicHandlerType", "text");
+ args.put("outputSuffix", "json");
+
+ BatchProcessTestExecutor ex = new BatchProcessTestExecutor(args,
+ "/tika-batch-config-MockConsumersBuilder.xml",
+ "/log4j-on.properties");
+ StreamStrings ss = ex.execute();
+ assertFalse(ss.getOutString().contains("error writing xml stream for"));
+ assertContains("parse_ex resourceId=\"test0_bad_chars.xml\"", ss.getOutString());
+ }
+
private class BatchProcessTestExecutor {
private final Map<String, String> args;
private final String configPath;
+ private final String loggerProps;
private int exitValue = Integer.MIN_VALUE;
public BatchProcessTestExecutor(Map<String, String> args) {
this(args, "/tika-batch-config-test.xml");
}
+
+
public BatchProcessTestExecutor(Map<String, String> args, String configPath) {
+ this(args, configPath, "/log4j_process.properties");
+ }
+
+ public BatchProcessTestExecutor(Map<String, String> args, String configPath, String loggerProps) {
this.args = args;
this.configPath = configPath;
+ this.loggerProps = loggerProps;
}
private StreamStrings execute() {
Process p = null;
try {
- ProcessBuilder b = getNewBatchRunnerProcess(configPath, args);
+ ProcessBuilder b = getNewBatchRunnerProcess(configPath, loggerProps, args);
p = b.start();
StringStreamGobbler errorGobbler = new StringStreamGobbler(p.getErrorStream());
StringStreamGobbler outGobbler = new StringStreamGobbler(p.getInputStream());
Modified: tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/FSBatchTestBase.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/FSBatchTestBase.java?rev=1712572&r1=1712571&r2=1712572&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/FSBatchTestBase.java (original)
+++ tika/trunk/tika-batch/src/test/java/org/apache/tika/batch/fs/FSBatchTestBase.java Wed Nov 4 16:00:22 2015
@@ -150,23 +150,25 @@ public abstract class FSBatchTestBase ex
return runner;
}
- public ProcessBuilder getNewBatchRunnerProcess(String testConfig, Map<String, String> args) {
+ public ProcessBuilder getNewBatchRunnerProcess(String testConfig, String loggerProps,
+ Map<String, String> args) {
List<String> argList = new ArrayList<>();
+
for (Map.Entry<String, String> e : args.entrySet()) {
argList.add("-"+e.getKey());
argList.add(e.getValue());
}
- String[] fullCommandLine = commandLine(testConfig,
+ String[] fullCommandLine = commandLine(testConfig, loggerProps,
argList.toArray(new String[argList.size()]));
return new ProcessBuilder(fullCommandLine);
}
- private String[] commandLine(String testConfig, String[] args) {
+ private String[] commandLine(String testConfig, String loggerProps, String[] args) {
List<String> commandLine = new ArrayList<>();
commandLine.add("java");
commandLine.add("-Dlog4j.configuration=file:"+
- this.getClass().getResource("/log4j_process.properties").getFile());
+ this.getClass().getResource(loggerProps).getFile());
commandLine.add("-Xmx128m");
commandLine.add("-cp");
String cp = System.getProperty("java.class.path");
Added: tika/trunk/tika-batch/src/test/resources/log4j-on.properties
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/log4j-on.properties?rev=1712572&view=auto
==============================================================================
--- tika/trunk/tika-batch/src/test/resources/log4j-on.properties (added)
+++ tika/trunk/tika-batch/src/test/resources/log4j-on.properties Wed Nov 4 16:00:22 2015
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#This is used by the batch process; see log4j.properties for the driver
+
+log4j.rootLogger=WARN,A1
+
+#for debugging
+#log4j.rootLogger=TRACE,A1
+
+log4j.appender.A1=org.apache.log4j.ConsoleAppender
+
+# A1 uses PatternLayout.
+log4j.appender.A1.layout=org.apache.log4j.PatternLayout
+log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
Added: tika/trunk/tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/test0_bad_chars.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/test0_bad_chars.xml?rev=1712572&view=auto
==============================================================================
--- tika/trunk/tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/test0_bad_chars.xml (added)
+++ tika/trunk/tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/test0_bad_chars.xml Wed Nov 4 16:00:22 2015
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<mock>
+ <metadata action="add" name="author">Nikolai Lobachevsky</metadata>
+ <write element="p">some content</write>
+ <throwIllegalChars />
+</mock>
\ No newline at end of file
Added: tika/trunk/tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/test1_ok.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/test1_ok.xml?rev=1712572&view=auto
==============================================================================
--- tika/trunk/tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/test1_ok.xml (added)
+++ tika/trunk/tika-batch/src/test/resources/test-input/illegal_xml_chars_in_exception/test1_ok.xml Wed Nov 4 16:00:22 2015
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<mock>
+ <metadata action="add" name="author">Nikolai Lobachevsky</metadata>
+ <write element="p">This is tika-batch's first test file.</write>
+</mock>
\ No newline at end of file
Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java?rev=1712572&r1=1712571&r2=1712572&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java Wed Nov 4 16:00:22 2015
@@ -18,6 +18,8 @@ package org.apache.tika.parser.mock;
*/
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
@@ -49,8 +51,6 @@ import org.w3c.dom.NodeList;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import static java.nio.charset.StandardCharsets.UTF_8;
-
/**
* This class enables mocking of parser behavior for use in testing
* wrappers and drivers of parsers.
@@ -123,11 +123,17 @@ public class MockParser extends Abstract
print(action, name);
} else if ("embedded".equals(name)) {
handleEmbedded(action, xhtml, context);
+ } else if ("throwIllegalChars".equals(name)) {
+ throwIllegalChars();
} else {
throw new IllegalArgumentException("Didn't recognize mock action: "+name);
}
}
+ private void throwIllegalChars() throws IOException {
+ throw new IOException("Can't say \u0000 in xml or \u0001 or \u0002 or \u0003");
+ }
+
private void handleEmbedded(Node action, XHTMLContentHandler handler, ParseContext context)
throws TikaException, SAXException, IOException {
String fileName = "";