You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/13 15:17:40 UTC
[5/7] tika git commit: Start factoring out "configurable";
change signature of ParseContext's setParam to (Class, Param);
add check for illegal field being specified in TikaConfig.
Start factoring out "configurable"; change signature of ParseContext's setParam to (Class, Param); add check for illegal field being specified in TikaConfig.
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/338db905
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/338db905
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/338db905
Branch: refs/heads/TIKA-1508
Commit: 338db905d4e203d4df4582d5511242eaa922af6b
Parents: ecdc403
Author: tballison <ta...@mitre.org>
Authored: Mon Jun 13 11:14:27 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Mon Jun 13 11:14:27 2016 -0400
----------------------------------------------------------------------
.../java/org/apache/tika/config/TikaConfig.java | 12 ++---
.../org/apache/tika/parser/AbstractParser.java | 24 +---------
.../org/apache/tika/parser/ParseContext.java | 46 +++++++++++++-------
.../org/apache/tika/utils/AnnotationUtils.java | 24 +++++++---
.../tika/parser/ConfigurableParserTest.java | 3 ++
.../tika/parser/DummyConfigurableParser.java | 6 +--
.../tika/parser/DummyParameterizedParser.java | 3 +-
.../tika/parser/ParameterizedParserTest.java | 1 -
.../org/apache/tika/parser/pdf/PDFParser.java | 16 ++++---
.../apache/tika/parser/pdf/PDFParserTest.java | 19 +++++++-
10 files changed, 86 insertions(+), 68 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index 853cdf0..692b007 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -567,15 +567,9 @@ public class TikaConfig {
// Have any decoration performed, eg explicit mimetypes
loaded = decorate(loaded, element);
//if the instance is configurable, then call configure()
- if (loaded instanceof Configurable){
- Map<String, Param<?>> params = getParams(element);
- //Assigning the params to bean fields/setters
- AnnotationUtils.assignFieldParams(loaded, params);
- //invoking the configure() hook
- ParseContext context = new ParseContext();
- context.getParams().putAll(params);
- ((Configurable) loaded).configure(context); // initialize here
- }
+ Map<String, Param<?>> params = getParams(element);
+ //Assigning the params to bean fields/setters
+ AnnotationUtils.assignFieldParams(loaded, params);
// All done with setup
return loaded;
} catch (ClassNotFoundException e) {
http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java b/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
index 5c045db..51687e7 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
@@ -34,7 +34,7 @@ import org.xml.sax.SAXException;
*
* @since Apache Tika 0.10
*/
-public abstract class AbstractParser implements ConfigurableParser {
+public abstract class AbstractParser implements Parser {
/**
* Configuration supplied at runtime
@@ -62,27 +62,5 @@ public abstract class AbstractParser implements ConfigurableParser {
parse(stream, handler, metadata, new ParseContext());
}
- /**
- * called by the framework to supply runtime parameters which may be
- * required for initialization
- * @param context the parser context at runtime
- * @since Apache Tika 1.14
- */
- @Override
- public void configure(ParseContext context) throws TikaConfigException {
- this.context = context;
- }
-
-
- /**
- * Gets Parameters of this configurable instance
- * @return a map of key value pairs
- *
- * @since Apache Tika 1.14
- */
- @Override
- public Map<String, Param<?>> getParams() {
- return this.context.getParams();
- }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
index dc03099..68d5038 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
@@ -29,6 +29,7 @@ import java.io.IOException;
import java.io.Serializable;
import java.io.StringReader;
import java.lang.reflect.Method;
+import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
@@ -55,10 +56,12 @@ public class ParseContext implements Serializable {
/** Map of objects in this context */
private final Map<String, Object> context = new HashMap<String, Object>();
+ private final static Map<String, Param<?>> EMPTY_PARAMS = Collections.EMPTY_MAP;
+
/**
* Map of configurable arguments
*/
- private final Map<String, Param<?>> params = new HashMap<>();
+ private final Map<String, Map<String, Param<?>>> params = new HashMap<>();
private static final EntityResolver IGNORING_SAX_ENTITY_RESOLVER = new EntityResolver() {
public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException {
@@ -202,29 +205,42 @@ public class ParseContext implements Serializable {
}
/**
- * Stores a key=value parameter
- * @param key parameter name
+ * @param clazz class associated with given param name
* @param value value
*/
- public void setParam(String key, Param<?> value){
- this.params.put(key, value);
+ public void setParam(Class clazz, Param<?> value){
+ Map<String, Param<?>> classParams = this.params.get(clazz.getName());
+ if (classParams == null) {
+ classParams = new HashMap<>();
+ }
+ classParams.put(value.getName(), value);
+ this.params.put(clazz.getName(), classParams);
}
/**
- * Gets the value associated with given parameter
+ * Gets the value associated with given class and parameter
+ * @param clazz class
* @param key parameter name
- * @return param value
+ * @return param value or null if the clazz or key doesn't exist
*/
- public Param<?> getParam(String key){
- return this.params.get(key);
+ public Param<?> getParam(Class clazz, String key) {
+ Map<String, Param<?>> classParams = this.params.get(clazz.getName());
+ if (classParams != null) {
+ return classParams.get(key);
+ }
+ return null;
}
/**
- * Gets all the params
- * @return map of key values
+ * Gets all the params for the specified class
+ * @param clazz class for which to grab the params
+ * @return map of key values or null if nothing has been specified
*/
- public Map<String, Param<?>> getParams() {
- return params;
+ public Map<String, Param<?>> getParams(Class clazz) {
+ if (params.containsKey(clazz.getName())) {
+ return params.get(clazz.getName());
+ }
+ return EMPTY_PARAMS;
}
/**
@@ -232,8 +248,8 @@ public class ParseContext implements Serializable {
* @param key parameter name
* @return true if parameter is available, false otherwise
*/
- public boolean hasParam(String key){
- return params.containsKey(key);
+ public boolean hasParam(Class clazz, String key){
+ return params.containsKey(clazz) && params.get(clazz.getName()).containsKey(key);
}
/**
* Returns the DOM builder factory specified in this parsing context.
http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java b/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java
index 08e004b..1f56bc7 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java
@@ -26,11 +26,7 @@ import java.lang.annotation.Annotation;
import java.lang.reflect.AccessibleObject;
import java.security.AccessController;
import java.security.PrivilegedAction;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
/**
* This class contains utilities for dealing with tika annotations
@@ -100,7 +96,11 @@ public class AnnotationUtils {
}
List<ParamField> fields = PARAM_INFO.get(beanClass);
+
+ Set<String> validFieldNames = new HashSet<>();
+
for (ParamField field : fields) {
+ validFieldNames.add(field.getName());
Param<?> param = params.get(field.getName());
if (param != null){
if (field.getType().isAssignableFrom(param.getType())) {
@@ -110,7 +110,7 @@ public class AnnotationUtils {
throw new TikaConfigException(e.getMessage(), e);
}
} else {
- String msg = String.format("Value '%s' of type '%s' cant be" +
+ String msg = String.format(Locale.ROOT, "Value '%s' of type '%s' cant be" +
" assigned to field '%s' of defined type '%s'",
param.getValue(), param.getValue().getClass(),
field.getName(), field.getType());
@@ -118,7 +118,7 @@ public class AnnotationUtils {
}
} else if (field.isRequired()){
//param not supplied but field is declared as required?
- String msg = String.format("Param %s is required for %s," +
+ String msg = String.format(Locale.ROOT, "Param %s is required for %s," +
" but it is not given in config.", field.getName(),
bean.getClass().getName());
throw new TikaConfigException(msg);
@@ -127,5 +127,15 @@ public class AnnotationUtils {
//LOG.debug("Param not supplied, field is not mandatory");
}
}
+ //now test that params doesn't contain a field
+ //not allowed by this object
+ for (String fieldName : params.keySet()) {
+ if (! validFieldNames.contains(fieldName)) {
+ String msg = String.format(Locale.ROOT,
+ "No field '%s' exists for %s",
+ fieldName, bean.getClass().getName());
+ throw new TikaConfigException(msg);
+ }
+ }
}
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java
index dcf188d..ffb632c 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java
@@ -20,6 +20,7 @@ import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.junit.Assert;
+import org.junit.Ignore;
import org.junit.Test;
import java.io.File;
@@ -36,6 +37,7 @@ public class ConfigurableParserTest {
public static final String TEST_PARAM_VAL = "testparamval";
@Test
+ @Ignore
public void testConfigurableParser() throws Exception {
URL configFileUrl = getClass().getClassLoader().getResource(TIKA_CFG_FILE);
assert configFileUrl != null;
@@ -48,6 +50,7 @@ public class ConfigurableParserTest {
}
@Test
+ @Ignore
public void testConfigurableParserTypes() throws Exception {
URL configFileUrl = getClass().getClassLoader().getResource(TIKA_CFG_FILE);
assert configFileUrl != null;
http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java
index 3914b01..15fe060 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java
@@ -39,8 +39,8 @@ import java.util.Set;
* 3. parameters were available at parse
*
*/
-public class DummyConfigurableParser extends AbstractParser {
-
+public class DummyConfigurableParser {
+/*
private static Set<MediaType> MIMES = new HashSet<>();
static {
MIMES.add(MediaType.TEXT_PLAIN);
@@ -63,5 +63,5 @@ public class DummyConfigurableParser extends AbstractParser {
metadata.add(entry.getKey()+"-type", param.getValue().getClass().getName());
}
}
-
+*/
}
http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java
index 848b774..801d65e 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java
@@ -40,8 +40,7 @@ import static org.osgi.util.measurement.Unit.s;
* A test Parsers to test {@link Field}
* @since Apache Tika 1.14
*/
-public class DummyParameterizedParser extends AbstractParser
- implements ConfigurableParser {
+public class DummyParameterizedParser extends AbstractParser {
private static Set<MediaType> MIMES = new HashSet<>();
static {
http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
index e0c3b53..a048f29 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
@@ -77,7 +77,6 @@ public class ParameterizedParserTest {
}
@Test
- @Ignore("can we get this to work, somehow?")
public void testBadParam() throws Exception {
try {
Metadata m = getMetadata("TIKA-1986-bad-parameters.xml");
http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index bacc901..dd03177 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -21,12 +21,7 @@ import javax.xml.stream.XMLStreamException;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Calendar;
-import java.util.Collections;
-import java.util.List;
-import java.util.Locale;
-import java.util.Set;
+import java.util.*;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.jempbox.xmp.XMPMetadata;
@@ -44,6 +39,7 @@ import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.tika.config.Field;
+import org.apache.tika.config.Param;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
@@ -86,7 +82,7 @@ import static org.bouncycastle.asn1.x500.style.RFC4519Style.name;
* turn this feature on, see
* {@link PDFParserConfig#setExtractInlineImages(boolean)}.
*/
-public class PDFParser extends AbstractParser implements ConfigurableParser {
+public class PDFParser extends AbstractParser {
/**
@@ -123,6 +119,12 @@ public class PDFParser extends AbstractParser implements ConfigurableParser {
PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
//TODO: get rid of this after dev of TIKA-1508!!!
localConfig.setSortByPosition(sortByPosition);
+
+ //TODO: this is just a mockup...move elsewhere
+ Map<String, Param<?>> params = context.getParams(PDFParser.class);
+ if (params != null && params.containsKey("sortByPosition")) {
+ localConfig.setSortByPosition((Boolean)params.get("sortByPosition").getValue());
+ }
String password = "";
try {
// PDFBox can process entirely in memory, or can use a temp file
http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index ac54b11..2ef29f3 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -35,6 +35,7 @@ import org.apache.commons.io.IOUtils;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.tika.TikaTest;
+import org.apache.tika.config.Param;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.AccessPermissionException;
import org.apache.tika.exception.EncryptedDocumentException;
@@ -469,7 +470,7 @@ public class PDFParserTest extends TikaTest {
content = content.replaceAll("\\s+", " ");
assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", content);
- parser.getPDFParserConfig().setSortByPosition(true);
+ parser.setSortByPosition(true);
stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
content = getText(stream, parser);
content = content.replaceAll("\\s+", " ");
@@ -1229,6 +1230,22 @@ public class PDFParserTest extends TikaTest {
}
+ @Test
+ public void testParameterizationViaContext() throws Exception {
+ ParseContext context = new ParseContext();
+
+ Param<Boolean> paramVal = new Param<>("sortByPosition", new Boolean(true));
+ context.setParam(PDFParser.class, paramVal);
+
+ Parser p = new AutoDetectParser();
+ String text = getText(getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"), p, context);
+ text = text.replaceAll("\\s+", " ");
+
+ // Column text is now interleaved:
+ assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", text);
+
+ }
+
private void assertException(String path, Parser parser, ParseContext context, Class expected) {
boolean noEx = false;
InputStream is = getResourceAsStream(path);