You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/13 18:38:42 UTC
tika git commit: Remove Configurable entirely; update PDFParser example for one field.
Repository: tika
Updated Branches:
refs/heads/TIKA-1508 ef1f7b9ec -> 03d38248f
Remove Configurable entirely; update PDFParser example for one field.
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/03d38248
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/03d38248
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/03d38248
Branch: refs/heads/TIKA-1508
Commit: 03d38248f37cca988866484ea759224f865c7765
Parents: ef1f7b9
Author: tballison <ta...@mitre.org>
Authored: Mon Jun 13 14:38:33 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Mon Jun 13 14:38:33 2016 -0400
----------------------------------------------------------------------
.../java/org/apache/tika/base/Configurable.java | 44 --------------------
.../main/java/org/apache/tika/config/Field.java | 2 +-
.../java/org/apache/tika/config/TikaConfig.java | 2 -
.../apache/tika/parser/ConfigurableParser.java | 32 --------------
.../org/apache/tika/parser/pdf/PDFParser.java | 27 +++++-------
.../apache/tika/parser/pdf/PDFParserConfig.java | 2 +
6 files changed, 14 insertions(+), 95 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/03d38248/tika-core/src/main/java/org/apache/tika/base/Configurable.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/base/Configurable.java b/tika-core/src/main/java/org/apache/tika/base/Configurable.java
deleted file mode 100644
index f1eb91a..0000000
--- a/tika-core/src/main/java/org/apache/tika/base/Configurable.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.base;
-
-import org.apache.tika.config.Param;
-import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.parser.ParseContext;
-
-import java.util.Map;
-
-/**
- * Defines contract for configurable services
- * @since Apache Tika 1.14
- */
-public interface Configurable {
-
- /**
- * Configure an instance with Tika Context
- * @param context configuration instance in the form of context
- * @throws TikaConfigException when an instance fails to work at the given context
- * @since Apache Tika 1.14
- */
- void configure(ParseContext context) throws TikaConfigException;
-
- /**
- * Gets parameters of this configurable instance
- * @return parameters in the form of a map of key value pairs
- */
- Map<String, Param<?>> getParams();
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/03d38248/tika-core/src/main/java/org/apache/tika/config/Field.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/config/Field.java b/tika-core/src/main/java/org/apache/tika/config/Field.java
index f4fe3f2..fb5a25b 100644
--- a/tika-core/src/main/java/org/apache/tika/config/Field.java
+++ b/tika-core/src/main/java/org/apache/tika/config/Field.java
@@ -23,7 +23,7 @@ import java.lang.annotation.Target;
/**
* Field annotation is a contract for binding {@link Param} value from
- * Tika Configuration to any instance of {@link org.apache.tika.base.Configurable}
+ * Tika Configuration to an object.
* services
* @since Apache Tika 1.14
*/
http://git-wip-us.apache.org/repos/asf/tika/blob/03d38248/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index e76b6e6..49c5e26 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -38,8 +38,6 @@ import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
-import org.apache.tika.base.Configurable;
-
import org.apache.tika.concurrent.ConfigurableThreadPoolExecutor;
import org.apache.tika.concurrent.SimpleThreadPoolExecutor;
import org.apache.tika.detect.CompositeDetector;
http://git-wip-us.apache.org/repos/asf/tika/blob/03d38248/tika-core/src/main/java/org/apache/tika/parser/ConfigurableParser.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ConfigurableParser.java b/tika-core/src/main/java/org/apache/tika/parser/ConfigurableParser.java
deleted file mode 100644
index 47feefa..0000000
--- a/tika-core/src/main/java/org/apache/tika/parser/ConfigurableParser.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-import org.apache.tika.base.Configurable;
-
-import java.io.Serializable;
-
-/**
- * Extension of {@link Parser} with {@link Configurable} contract.
- * This interface shall be implemented to create parsers which accepts runtime parameters
- * from tika configuration file
- *
- * @since Tika 1.14
- */
-public interface ConfigurableParser extends Parser,
- Configurable, Serializable {
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/03d38248/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index dd03177..a5673ee 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -52,17 +52,16 @@ import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ConfigurableParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.image.xmp.JempboxExtractor;
import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.AnnotationUtils;
import org.w3c.dom.Document;
import org.xml.sax.ContentHandler;
import org.xml.sax.ErrorHandler;
import org.xml.sax.SAXException;
-import static org.bouncycastle.asn1.x500.style.RFC4519Style.name;
/**
* PDF parser.
@@ -105,26 +104,22 @@ public class PDFParser extends AbstractParser {
return SUPPORTED_TYPES;
}
- @Field
- private boolean sortByPosition = false;
-
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
+ //step 1, check to see if there are params for the PDFParser class
+ Map<String, Param<?>> params = context.getParams(PDFParser.class);
+ PDFParserConfig localConfig = new PDFParserConfig();
+ if (params != null) {
+ AnnotationUtils.assignFieldParams(localConfig, params);
+ } else if (context.get(PDFParserConfig.class) != null) {
+ localConfig = context.get(PDFParserConfig.class, defaultConfig);
+ }
PDDocument pdfDocument = null;
TemporaryResources tmp = new TemporaryResources();
- //config from context, or default if not set via context
- PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
- //TODO: get rid of this after dev of TIKA-1508!!!
- localConfig.setSortByPosition(sortByPosition);
- //TODO: this is just a mockup...move elsewhere
- Map<String, Param<?>> params = context.getParams(PDFParser.class);
- if (params != null && params.containsKey("sortByPosition")) {
- localConfig.setSortByPosition((Boolean)params.get("sortByPosition").getValue());
- }
String password = "";
try {
// PDFBox can process entirely in memory, or can use a temp file
@@ -590,7 +585,7 @@ public class PDFParser extends AbstractParser {
* @deprecated use {@link #getPDFParserConfig()}
*/
public boolean getSortByPosition() {
- return sortByPosition;
+ return defaultConfig.getSortByPosition();
}
/**
@@ -605,7 +600,7 @@ public class PDFParser extends AbstractParser {
*/
@Field
public void setSortByPosition(boolean v) {
- sortByPosition = v;
+ defaultConfig.setSortByPosition(v);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/03d38248/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 296b191..3f8555c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -25,6 +25,7 @@ import java.util.Properties;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.tika.config.Field;
/**
* Config for PDFParser.
@@ -79,6 +80,7 @@ public class PDFParserConfig implements Serializable {
// True if we should sort text tokens by position
// (necessary for some PDFs, but messes up other PDFs):
+ @Field
private boolean sortByPosition = false;
//True if acroform content should be extracted