You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/13 18:38:42 UTC

tika git commit: Remove Configurable entirely; update PDFParser example for one field.

Repository: tika
Updated Branches:
  refs/heads/TIKA-1508 ef1f7b9ec -> 03d38248f


Remove Configurable entirely; update PDFParser example for one field.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/03d38248
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/03d38248
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/03d38248

Branch: refs/heads/TIKA-1508
Commit: 03d38248f37cca988866484ea759224f865c7765
Parents: ef1f7b9
Author: tballison <ta...@mitre.org>
Authored: Mon Jun 13 14:38:33 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Mon Jun 13 14:38:33 2016 -0400

----------------------------------------------------------------------
 .../java/org/apache/tika/base/Configurable.java | 44 --------------------
 .../main/java/org/apache/tika/config/Field.java |  2 +-
 .../java/org/apache/tika/config/TikaConfig.java |  2 -
 .../apache/tika/parser/ConfigurableParser.java  | 32 --------------
 .../org/apache/tika/parser/pdf/PDFParser.java   | 27 +++++-------
 .../apache/tika/parser/pdf/PDFParserConfig.java |  2 +
 6 files changed, 14 insertions(+), 95 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/03d38248/tika-core/src/main/java/org/apache/tika/base/Configurable.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/base/Configurable.java b/tika-core/src/main/java/org/apache/tika/base/Configurable.java
deleted file mode 100644
index f1eb91a..0000000
--- a/tika-core/src/main/java/org/apache/tika/base/Configurable.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.base;
-
-import org.apache.tika.config.Param;
-import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.parser.ParseContext;
-
-import java.util.Map;
-
-/**
- * Defines contract for configurable services
- * @since Apache Tika 1.14
- */
-public interface Configurable {
-
-    /**
-     * Configure an instance with Tika Context
-     * @param context configuration instance in the form of context
-     * @throws TikaConfigException when an instance fails to work at the given context
-     * @since Apache Tika 1.14
-     */
-    void configure(ParseContext context) throws TikaConfigException;
-
-    /**
-     * Gets parameters of this configurable instance
-     * @return parameters in the form  of a map of key value pairs
-     */
-    Map<String, Param<?>> getParams();
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/03d38248/tika-core/src/main/java/org/apache/tika/config/Field.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/config/Field.java b/tika-core/src/main/java/org/apache/tika/config/Field.java
index f4fe3f2..fb5a25b 100644
--- a/tika-core/src/main/java/org/apache/tika/config/Field.java
+++ b/tika-core/src/main/java/org/apache/tika/config/Field.java
@@ -23,7 +23,7 @@ import java.lang.annotation.Target;
 
 /**
  * Field annotation is a contract for binding {@link Param} value from
- * Tika Configuration to any instance of {@link org.apache.tika.base.Configurable}
+ * Tika Configuration to an object.
  * services
  * @since Apache Tika 1.14
  */

http://git-wip-us.apache.org/repos/asf/tika/blob/03d38248/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index e76b6e6..49c5e26 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -38,8 +38,6 @@ import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ExecutorService;
 
-import org.apache.tika.base.Configurable;
-
 import org.apache.tika.concurrent.ConfigurableThreadPoolExecutor;
 import org.apache.tika.concurrent.SimpleThreadPoolExecutor;
 import org.apache.tika.detect.CompositeDetector;

http://git-wip-us.apache.org/repos/asf/tika/blob/03d38248/tika-core/src/main/java/org/apache/tika/parser/ConfigurableParser.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ConfigurableParser.java b/tika-core/src/main/java/org/apache/tika/parser/ConfigurableParser.java
deleted file mode 100644
index 47feefa..0000000
--- a/tika-core/src/main/java/org/apache/tika/parser/ConfigurableParser.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-import org.apache.tika.base.Configurable;
-
-import java.io.Serializable;
-
-/**
- * Extension of {@link Parser} with {@link Configurable} contract.
- * This interface shall be implemented to create parsers which accepts runtime parameters
- * from tika configuration file
- *
- * @since Tika 1.14
- */
-public interface ConfigurableParser extends Parser,
-        Configurable, Serializable {
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/03d38248/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index dd03177..a5673ee 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -52,17 +52,16 @@ import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ConfigurableParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.parser.image.xmp.JempboxExtractor;
 import org.apache.tika.parser.ocr.TesseractOCRParser;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.AnnotationUtils;
 import org.w3c.dom.Document;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.ErrorHandler;
 import org.xml.sax.SAXException;
-import static org.bouncycastle.asn1.x500.style.RFC4519Style.name;
 
 /**
  * PDF parser.
@@ -105,26 +104,22 @@ public class PDFParser extends AbstractParser {
         return SUPPORTED_TYPES;
     }
 
-    @Field
-    private boolean sortByPosition = false;
-
     public void parse(
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
 
+        //step 1, check to see if there are params for the PDFParser class
+        Map<String, Param<?>> params = context.getParams(PDFParser.class);
+        PDFParserConfig localConfig = new PDFParserConfig();
+        if (params != null) {
+            AnnotationUtils.assignFieldParams(localConfig, params);
+        } else if (context.get(PDFParserConfig.class) != null) {
+            localConfig = context.get(PDFParserConfig.class, defaultConfig);
+        }
         PDDocument pdfDocument = null;
         TemporaryResources tmp = new TemporaryResources();
-        //config from context, or default if not set via context
-        PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
-        //TODO: get rid of this after dev of TIKA-1508!!!
-        localConfig.setSortByPosition(sortByPosition);
 
-        //TODO: this is just a mockup...move elsewhere
-        Map<String, Param<?>> params = context.getParams(PDFParser.class);
-        if (params != null && params.containsKey("sortByPosition")) {
-            localConfig.setSortByPosition((Boolean)params.get("sortByPosition").getValue());
-        }
         String password = "";
         try {
             // PDFBox can process entirely in memory, or can use a temp file
@@ -590,7 +585,7 @@ public class PDFParser extends AbstractParser {
      * @deprecated use {@link #getPDFParserConfig()}
      */
     public boolean getSortByPosition() {
-        return sortByPosition;
+        return defaultConfig.getSortByPosition();
     }
 
     /**
@@ -605,7 +600,7 @@ public class PDFParser extends AbstractParser {
      */
     @Field
     public void setSortByPosition(boolean v) {
-        sortByPosition = v;
+        defaultConfig.setSortByPosition(v);
     }
 
 

http://git-wip-us.apache.org/repos/asf/tika/blob/03d38248/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 296b191..3f8555c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -25,6 +25,7 @@ import java.util.Properties;
 
 import org.apache.pdfbox.rendering.ImageType;
 import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.tika.config.Field;
 
 /**
  * Config for PDFParser.
@@ -79,6 +80,7 @@ public class PDFParserConfig implements Serializable {
 
     // True if we should sort text tokens by position
     // (necessary for some PDFs, but messes up other PDFs):
+    @Field
     private boolean sortByPosition = false;
 
     //True if acroform content should be extracted