You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/03 20:32:53 UTC
tika git commit: TIKA-1508 proof of concept with one parameter on
PDFParser
Repository: tika
Updated Branches:
refs/heads/TIKA-1508 18ab8f91f -> 853750d47
TIKA-1508 proof of concept with one parameter on PDFParser
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/853750d4
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/853750d4
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/853750d4
Branch: refs/heads/TIKA-1508
Commit: 853750d47fa99afad0df6d4f4727a35c96675254
Parents: 18ab8f9
Author: tballison <ta...@mitre.org>
Authored: Fri Jun 3 16:32:48 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Jun 3 16:32:48 2016 -0400
----------------------------------------------------------------------
.../org/apache/tika/parser/pdf/PDF2XHTML.java | 1 -
.../org/apache/tika/parser/pdf/PDFParser.java | 17 ++++++++----
.../apache/tika/parser/pdf/PDFParserTest.java | 16 ++++++++++++
.../org/apache/tika/parser/pdf/tika-config.xml | 27 ++++++++++++++++++++
4 files changed, 55 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/853750d4/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index ac9823e..34a3aff 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -106,7 +106,6 @@ class PDF2XHTML extends AbstractPDF2XHTML {
// key methods to output to the given content
// handler.
pdf2XHTML = new PDF2XHTML(document, handler, context, metadata, config);
-
config.configure(pdf2XHTML);
pdf2XHTML.writeText(document, new Writer() {
http://git-wip-us.apache.org/repos/asf/tika/blob/853750d4/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 3e33962..bacc901 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -43,6 +43,7 @@ import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
+import org.apache.tika.config.Field;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
@@ -55,6 +56,7 @@ import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ConfigurableParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.image.xmp.JempboxExtractor;
@@ -64,6 +66,7 @@ import org.w3c.dom.Document;
import org.xml.sax.ContentHandler;
import org.xml.sax.ErrorHandler;
import org.xml.sax.SAXException;
+import static org.bouncycastle.asn1.x500.style.RFC4519Style.name;
/**
* PDF parser.
@@ -83,7 +86,7 @@ import org.xml.sax.SAXException;
* turn this feature on, see
* {@link PDFParserConfig#setExtractInlineImages(boolean)}.
*/
-public class PDFParser extends AbstractParser {
+public class PDFParser extends AbstractParser implements ConfigurableParser {
/**
@@ -102,12 +105,13 @@ public class PDFParser extends AbstractParser {
Collections.singleton(MEDIA_TYPE);
private PDFParserConfig defaultConfig = new PDFParserConfig();
-
-
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
+ @Field
+ private boolean sortByPosition = false;
+
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
@@ -117,6 +121,8 @@ public class PDFParser extends AbstractParser {
TemporaryResources tmp = new TemporaryResources();
//config from context, or default if not set via context
PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
+ //TODO: get rid of this after dev of TIKA-1508!!!
+ localConfig.setSortByPosition(sortByPosition);
String password = "";
try {
// PDFBox can process entirely in memory, or can use a temp file
@@ -582,7 +588,7 @@ public class PDFParser extends AbstractParser {
* @deprecated use {@link #getPDFParserConfig()}
*/
public boolean getSortByPosition() {
- return defaultConfig.getSortByPosition();
+ return sortByPosition;
}
/**
@@ -595,8 +601,9 @@ public class PDFParser extends AbstractParser {
*
* @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
*/
+ @Field
public void setSortByPosition(boolean v) {
- defaultConfig.setSortByPosition(v);
+ sortByPosition = v;
}
http://git-wip-us.apache.org/repos/asf/tika/blob/853750d4/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index df2e27c..ac54b11 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -16,9 +16,11 @@
*/
package org.apache.tika.parser.pdf;
+import static org.bouncycastle.crypto.tls.CipherType.stream;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
@@ -33,6 +35,7 @@ import org.apache.commons.io.IOUtils;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.AccessPermissionException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
@@ -1212,6 +1215,19 @@ public class PDFParserTest extends TikaTest {
}
+ @Test
+ public void testInitializationViaConfig() throws Exception {
+ InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/pdf/tika-config.xml");
+ assertNotNull(is);
+ TikaConfig tikaConfig = new TikaConfig(is);
+ Parser p = new AutoDetectParser(tikaConfig);
+ String text = getText(getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"), p);
+ text = text.replaceAll("\\s+", " ");
+
+ // Column text is now interleaved:
+ assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", text);
+
+ }
private void assertException(String path, Parser parser, ParseContext context, Class expected) {
boolean noEx = false;
http://git-wip-us.apache.org/repos/asf/tika/blob/853750d4/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml
new file mode 100644
index 0000000..0b965c7
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/pdf/tika-config.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.pdf.PDFParser">
+ <params>
+ <param name="sortByPosition" type="bool">true</param>
+ </params>
+ </parser>
+
+ </parsers>
+</properties>