You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/13 15:17:40 UTC

[5/7] tika git commit: Start factoring out "configurable"; change signature of ParseContext's setParam to (Class, Param); add check for illegal field being specified in TikaConfig.

Start factoring out "configurable"; change signature of ParseContext's setParam to (Class, Param); add check for illegal field being specified in TikaConfig.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/338db905
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/338db905
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/338db905

Branch: refs/heads/TIKA-1508
Commit: 338db905d4e203d4df4582d5511242eaa922af6b
Parents: ecdc403
Author: tballison <ta...@mitre.org>
Authored: Mon Jun 13 11:14:27 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Mon Jun 13 11:14:27 2016 -0400

----------------------------------------------------------------------
 .../java/org/apache/tika/config/TikaConfig.java | 12 ++---
 .../org/apache/tika/parser/AbstractParser.java  | 24 +---------
 .../org/apache/tika/parser/ParseContext.java    | 46 +++++++++++++-------
 .../org/apache/tika/utils/AnnotationUtils.java  | 24 +++++++---
 .../tika/parser/ConfigurableParserTest.java     |  3 ++
 .../tika/parser/DummyConfigurableParser.java    |  6 +--
 .../tika/parser/DummyParameterizedParser.java   |  3 +-
 .../tika/parser/ParameterizedParserTest.java    |  1 -
 .../org/apache/tika/parser/pdf/PDFParser.java   | 16 ++++---
 .../apache/tika/parser/pdf/PDFParserTest.java   | 19 +++++++-
 10 files changed, 86 insertions(+), 68 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index 853cdf0..692b007 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -567,15 +567,9 @@ public class TikaConfig {
                 // Have any decoration performed, eg explicit mimetypes
                 loaded = decorate(loaded, element);
                 //if the instance is configurable, then call configure()
-                if (loaded instanceof Configurable){
-                    Map<String, Param<?>> params = getParams(element);
-                    //Assigning the params to bean fields/setters
-                    AnnotationUtils.assignFieldParams(loaded, params);
-                    //invoking the configure() hook
-                    ParseContext context = new ParseContext();
-                    context.getParams().putAll(params);
-                    ((Configurable) loaded).configure(context); // initialize here
-                }
+                Map<String, Param<?>> params = getParams(element);
+                //Assigning the params to bean fields/setters
+                AnnotationUtils.assignFieldParams(loaded, params);
                 // All done with setup
                 return loaded;
             } catch (ClassNotFoundException e) {

http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java b/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
index 5c045db..51687e7 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
@@ -34,7 +34,7 @@ import org.xml.sax.SAXException;
  *
  * @since Apache Tika 0.10
  */
-public abstract class AbstractParser implements ConfigurableParser {
+public abstract class AbstractParser implements Parser {
 
     /**
      * Configuration supplied at runtime
@@ -62,27 +62,5 @@ public abstract class AbstractParser implements ConfigurableParser {
         parse(stream, handler, metadata, new ParseContext());
     }
 
-    /**
-     * called by the framework to supply runtime parameters which may be
-     * required for initialization
-     * @param context the parser context at runtime
-     * @since Apache Tika 1.14
-     */
-    @Override
-    public void configure(ParseContext context) throws TikaConfigException {
-        this.context = context;
-    }
-
-
-    /**
-     * Gets Parameters of this configurable instance
-     * @return a map of key value pairs
-     *
-     * @since Apache Tika 1.14
-     */
-    @Override
-    public Map<String, Param<?>> getParams() {
-        return this.context.getParams();
-    }
 }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
index dc03099..68d5038 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
@@ -29,6 +29,7 @@ import java.io.IOException;
 import java.io.Serializable;
 import java.io.StringReader;
 import java.lang.reflect.Method;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.Map;
 
@@ -55,10 +56,12 @@ public class ParseContext implements Serializable {
     /** Map of objects in this context */
     private final Map<String, Object> context = new HashMap<String, Object>();
 
+    private final static Map<String, Param<?>> EMPTY_PARAMS = Collections.EMPTY_MAP;
+
     /**
      * Map of configurable arguments
      */
-    private final Map<String, Param<?>> params = new HashMap<>();
+    private final Map<String, Map<String, Param<?>>> params = new HashMap<>();
 
     private static final EntityResolver IGNORING_SAX_ENTITY_RESOLVER = new EntityResolver() {
         public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException {
@@ -202,29 +205,42 @@ public class ParseContext implements Serializable {
     }
 
     /**
-     * Stores a key=value parameter
-     * @param key parameter name
+     * @param clazz class associated with given param name
      * @param value value
      */
-    public void setParam(String key, Param<?> value){
-        this.params.put(key, value);
+    public void setParam(Class clazz, Param<?> value){
+        Map<String, Param<?>> classParams = this.params.get(clazz.getName());
+        if (classParams == null) {
+            classParams = new HashMap<>();
+        }
+        classParams.put(value.getName(), value);
+        this.params.put(clazz.getName(), classParams);
     }
 
     /**
-     * Gets the value associated with given parameter
+     * Gets the value associated with given class and parameter
+     * @param clazz class
      * @param key parameter name
-     * @return param value
+     * @return param value or null if the clazz or key doesn't exist
      */
-    public Param<?> getParam(String key){
-        return this.params.get(key);
+    public Param<?> getParam(Class clazz, String key) {
+        Map<String, Param<?>> classParams = this.params.get(clazz.getName());
+        if (classParams != null) {
+            return classParams.get(key);
+        }
+        return null;
     }
 
     /**
-     * Gets all the params
-     * @return map of key values
+     * Gets all the params for the specified class
+     * @param clazz class for which to grab the params
+     * @return map of key values or null if nothing has been specified
      */
-    public Map<String, Param<?>> getParams() {
-        return params;
+    public Map<String, Param<?>> getParams(Class clazz) {
+        if (params.containsKey(clazz.getName())) {
+            return params.get(clazz.getName());
+        }
+        return EMPTY_PARAMS;
     }
 
     /**
@@ -232,8 +248,8 @@ public class ParseContext implements Serializable {
      * @param key parameter name
      * @return true if parameter is available, false otherwise
      */
-    public boolean hasParam(String key){
-       return params.containsKey(key);
+    public boolean hasParam(Class clazz, String key){
+       return params.containsKey(clazz) && params.get(clazz.getName()).containsKey(key);
     }
     /**
      * Returns the DOM builder factory specified in this parsing context.

http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java b/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java
index 08e004b..1f56bc7 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java
@@ -26,11 +26,7 @@ import java.lang.annotation.Annotation;
 import java.lang.reflect.AccessibleObject;
 import java.security.AccessController;
 import java.security.PrivilegedAction;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 
 /**
  * This class contains utilities for dealing with tika annotations
@@ -100,7 +96,11 @@ public class AnnotationUtils {
         }
 
         List<ParamField> fields = PARAM_INFO.get(beanClass);
+
+        Set<String> validFieldNames = new HashSet<>();
+
         for (ParamField field : fields) {
+            validFieldNames.add(field.getName());
             Param<?> param = params.get(field.getName());
             if (param != null){
                 if (field.getType().isAssignableFrom(param.getType())) {
@@ -110,7 +110,7 @@ public class AnnotationUtils {
                         throw new TikaConfigException(e.getMessage(), e);
                     }
                 } else {
-                    String msg = String.format("Value '%s' of type '%s' cant be" +
+                    String msg = String.format(Locale.ROOT, "Value '%s' of type '%s' cant be" +
                             " assigned to field '%s' of defined type '%s'",
                             param.getValue(), param.getValue().getClass(),
                             field.getName(), field.getType());
@@ -118,7 +118,7 @@ public class AnnotationUtils {
                 }
             } else if (field.isRequired()){
                 //param not supplied but field is declared as required?
-                String msg = String.format("Param %s is required for %s," +
+                String msg = String.format(Locale.ROOT, "Param %s is required for %s," +
                         " but it is not given in config.", field.getName(),
                         bean.getClass().getName());
                 throw new TikaConfigException(msg);
@@ -127,5 +127,15 @@ public class AnnotationUtils {
                 //LOG.debug("Param not supplied, field is not mandatory");
             }
         }
+        //now test that params doesn't contain a field
+        //not allowed by this object
+        for (String fieldName : params.keySet()) {
+            if (! validFieldNames.contains(fieldName)) {
+                String msg = String.format(Locale.ROOT,
+                        "No field '%s' exists for %s",
+                        fieldName, bean.getClass().getName());
+                throw new TikaConfigException(msg);
+            }
+        }
     }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java
index dcf188d..ffb632c 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java
@@ -20,6 +20,7 @@ import org.apache.tika.Tika;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
 import org.junit.Assert;
+import org.junit.Ignore;
 import org.junit.Test;
 
 import java.io.File;
@@ -36,6 +37,7 @@ public class ConfigurableParserTest {
     public static final String TEST_PARAM_VAL = "testparamval";
 
     @Test
+    @Ignore
     public void testConfigurableParser() throws Exception {
         URL configFileUrl = getClass().getClassLoader().getResource(TIKA_CFG_FILE);
         assert configFileUrl != null;
@@ -48,6 +50,7 @@ public class ConfigurableParserTest {
     }
 
     @Test
+    @Ignore
     public void testConfigurableParserTypes() throws Exception {
         URL configFileUrl = getClass().getClassLoader().getResource(TIKA_CFG_FILE);
         assert configFileUrl != null;

http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java
index 3914b01..15fe060 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java
@@ -39,8 +39,8 @@ import java.util.Set;
  * 3. parameters were available at parse
  *
  */
-public class DummyConfigurableParser extends AbstractParser {
-
+public class DummyConfigurableParser {
+/*
     private static Set<MediaType> MIMES = new HashSet<>();
     static {
         MIMES.add(MediaType.TEXT_PLAIN);
@@ -63,5 +63,5 @@ public class DummyConfigurableParser extends AbstractParser {
             metadata.add(entry.getKey()+"-type", param.getValue().getClass().getName());
         }
     }
-
+*/
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java
index 848b774..801d65e 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java
@@ -40,8 +40,7 @@ import static org.osgi.util.measurement.Unit.s;
  * A test Parsers to test {@link Field}
  * @since Apache Tika 1.14
  */
-public class DummyParameterizedParser extends AbstractParser
-        implements ConfigurableParser {
+public class DummyParameterizedParser extends AbstractParser {
 
     private static Set<MediaType> MIMES = new HashSet<>();
     static {

http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
index e0c3b53..a048f29 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
@@ -77,7 +77,6 @@ public class ParameterizedParserTest {
     }
 
     @Test
-    @Ignore("can we get this to work, somehow?")
     public void testBadParam() throws Exception {
         try {
             Metadata m = getMetadata("TIKA-1986-bad-parameters.xml");

http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index bacc901..dd03177 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -21,12 +21,7 @@ import javax.xml.stream.XMLStreamException;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Calendar;
-import java.util.Collections;
-import java.util.List;
-import java.util.Locale;
-import java.util.Set;
+import java.util.*;
 
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.jempbox.xmp.XMPMetadata;
@@ -44,6 +39,7 @@ import org.apache.pdfbox.pdmodel.common.PDMetadata;
 import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
 import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
 import org.apache.tika.config.Field;
+import org.apache.tika.config.Param;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
@@ -86,7 +82,7 @@ import static org.bouncycastle.asn1.x500.style.RFC4519Style.name;
  * turn this feature on, see
  * {@link PDFParserConfig#setExtractInlineImages(boolean)}.
  */
-public class PDFParser extends AbstractParser implements ConfigurableParser {
+public class PDFParser extends AbstractParser {
 
 
     /**
@@ -123,6 +119,12 @@ public class PDFParser extends AbstractParser implements ConfigurableParser {
         PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
         //TODO: get rid of this after dev of TIKA-1508!!!
         localConfig.setSortByPosition(sortByPosition);
+
+        //TODO: this is just a mockup...move elsewhere
+        Map<String, Param<?>> params = context.getParams(PDFParser.class);
+        if (params != null && params.containsKey("sortByPosition")) {
+            localConfig.setSortByPosition((Boolean)params.get("sortByPosition").getValue());
+        }
         String password = "";
         try {
             // PDFBox can process entirely in memory, or can use a temp file

http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index ac54b11..2ef29f3 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -35,6 +35,7 @@ import org.apache.commons.io.IOUtils;
 import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.Param;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.AccessPermissionException;
 import org.apache.tika.exception.EncryptedDocumentException;
@@ -469,7 +470,7 @@ public class PDFParserTest extends TikaTest {
         content = content.replaceAll("\\s+", " ");
         assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", content);
 
-        parser.getPDFParserConfig().setSortByPosition(true);
+        parser.setSortByPosition(true);
         stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
         content = getText(stream, parser);
         content = content.replaceAll("\\s+", " ");
@@ -1229,6 +1230,22 @@ public class PDFParserTest extends TikaTest {
 
     }
 
+    @Test
+    public void testParameterizationViaContext() throws Exception {
+        ParseContext context = new ParseContext();
+
+        Param<Boolean> paramVal = new Param<>("sortByPosition", new Boolean(true));
+        context.setParam(PDFParser.class, paramVal);
+
+        Parser p = new AutoDetectParser();
+        String text = getText(getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"), p, context);
+        text = text.replaceAll("\\s+", " ");
+
+        // Column text is now interleaved:
+        assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", text);
+
+    }
+
     private void assertException(String path, Parser parser, ParseContext context, Class expected) {
         boolean noEx = false;
         InputStream is = getResourceAsStream(path);