You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/12/02 19:33:47 UTC
[tika] branch master updated (e01c753 -> 31fe67d)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git.
from e01c753 TIKA-2925 -- general upgrades for 1.23 -- downgrade required version of maven for Jenkins
new 363c9c2 Update CHANGES.txt for 1.23 release.
new 86dd0cf update rat exclusions
new 381f4ea add required dependency for scm
new adb6545 need to specify javadoc source when building with > javadoc 8
new f5edbbd TIKA-3002 -- fix bug in OCR AUTO mode
new 31fe67d TIKA-2630: Wrong height and width metadata for JPEG images (#255)
The 6 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
Summary of changes:
CHANGES.txt | 12 +++++++++--
pom.xml | 1 +
tika-parent/pom.xml | 6 ++++++
.../tika/parser/image/ImageMetadataExtractor.java | 23 ++++++++++++++++++++--
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 5 +++--
.../apache/tika/parser/jpeg/JpegParserTest.java | 10 +++++-----
.../tika/parser/ocr/TesseractOCRParserTest.java | 2 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 15 ++++++++++++++
.../org/apache/tika/parser/rtf/RTFParserTest.java | 2 +-
9 files changed, 63 insertions(+), 13 deletions(-)
[tika] 01/06: Update CHANGES.txt for 1.23 release.
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 363c9c23b402451b4463461e38fb4114eda03e08
Author: tallison <ta...@apache.org>
AuthorDate: Tue Nov 26 14:46:25 2019 -0500
Update CHANGES.txt for 1.23 release.
# Conflicts:
# CHANGES.txt
---
CHANGES.txt | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 6b532f0..3f0146d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -5,7 +5,11 @@ Release 2.0.0 - ???
Other changes
-Release 1.23
+Release 1.23 - 11/26/2019
+
+ * NOTE: The PDFParser now relies on OCRDPI to render page images when
+ users configure OCR on rendered page images. This will have the effect
+ of increasing rendered image size (TIKA-2624).
* NOTE: tika-server no longer returns 415 for file types for which there
is no parser.
@@ -29,6 +33,10 @@ Release 1.23
Release 1.22 - 07/29/2019
+ * NOTE: tika-server no longer hard-codes the HtmlParser to handle
+ XML files (TIKA-2910). Users must now configure that behavior
+ via a tika-config.xml file.
+
* NOTE: Known regression: PDFBOX-4587 -- PDF passwords with codepoints
between 0xF000 and 0XF0000 will cause an exception.
@@ -36,7 +44,7 @@ Release 1.22 - 07/29/2019
JinSup Kim (ddoleye) (TIKA-2909).
* Fix order of closing streams to avoid "Failed to close temporary resource"
- exception (TIKA-2908).
+ exception in TesseractOCRParser (TIKA-2908).
* Improve AutoDetectReader performance by caching encoding
detector (TIKA-1568).
[tika] 03/06: add required dependency for scm
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 381f4eacbf464bd270dd33dd884de9542e103fe6
Author: tallison <ta...@apache.org>
AuthorDate: Tue Nov 26 15:08:57 2019 -0500
add required dependency for scm
---
tika-parent/pom.xml | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 4c439a0..6a70547 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -456,6 +456,11 @@
<artifactId>maven-scm-provider-gitexe</artifactId>
<version>1.11.2</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.maven.scm</groupId>
+ <artifactId>maven-scm-api</artifactId>
+ <version>1.11.2</version>
+ </dependency>
</dependencies>
</plugin>
<plugin>
[tika] 05/06: TIKA-3002 -- fix bug in OCR AUTO mode
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit f5edbbd60ef22cce3fc2c8c23e617489d42be29f
Author: tallison <ta...@apache.org>
AuthorDate: Mon Dec 2 11:03:02 2019 -0500
TIKA-3002 -- fix bug in OCR AUTO mode
---
.../org/apache/tika/parser/pdf/AbstractPDF2XHTML.java | 5 +++--
.../java/org/apache/tika/parser/pdf/PDFParserTest.java | 15 +++++++++++++++
2 files changed, 18 insertions(+), 2 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 8acc3ff..d3f56f6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -370,8 +370,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
metadata.add(PDF.CHARACTERS_PER_PAGE, totalCharsPerPage);
metadata.add(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE,
unmappedUnicodeCharsPerPage);
- totalCharsPerPage = 0;
- unmappedUnicodeCharsPerPage = 0;
try {
for (PDAnnotation annotation : page.getAnnotations()) {
@@ -458,6 +456,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
throw new IOExceptionWithCause("Unable to end a page", e);
} catch (IOException e) {
handleCatchableIOE(e);
+ } finally {
+ totalCharsPerPage = 0;
+ unmappedUnicodeCharsPerPage = 0;
}
if (config.getExtractFontNames()) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 3ad4dbf..6a816c9 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1285,6 +1285,21 @@ public class PDFParserTest extends TikaTest {
}
@Test
+ public void testOCRAutoMode() throws Exception {
+ assumeTrue("can run OCR", canRunOCR());
+ PDFParserConfig config = new PDFParserConfig();
+ config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.AUTO);
+ ParseContext context = new ParseContext();
+ context.set(PDFParserConfig.class, config);
+ XMLResult xmlResult = getXML("testOCR.pdf", context);
+ assertContains("Happy New Year", xmlResult.xml);
+
+ config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
+ String txt = getText("testOCR.pdf", new Metadata(), context);
+ assertEquals("", txt.trim());
+ }
+
+ @Test
public void testTesseractInitializationWorks() throws Exception {
//TIKA-2970 -- make sure that configurations set on the TesseractOCRParser
//make it through to when the TesseractOCRParser is called via
[tika] 06/06: TIKA-2630: Wrong height and width metadata for JPEG images (#255)
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 31fe67d7ee016579ad3827c44fe7d2bf61f3efbe
Author: Dave Meikle <dm...@apache.org>
AuthorDate: Mon Dec 2 19:03:00 2019 +0000
TIKA-2630: Wrong height and width metadata for JPEG images (#255)
* TIKA-2630:
- Added extraction of image height/width from ExifSubIFDDirectory for compressed images
- Include directory name as key qualifier for Exif directories to avoid clashes
* TIKA-2630: Tidied up code
# Conflicts:
# tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
---
.../tika/parser/image/ImageMetadataExtractor.java | 23 ++++++++++++++++++++--
.../apache/tika/parser/jpeg/JpegParserTest.java | 10 +++++-----
.../tika/parser/ocr/TesseractOCRParserTest.java | 2 +-
.../org/apache/tika/parser/rtf/RTFParserTest.java | 2 +-
4 files changed, 28 insertions(+), 9 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
index aeb0223..f6670d8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
@@ -260,7 +260,11 @@ public class ImageMetadataExtractor {
throws MetadataException {
if (directory.getTags() != null) {
for (Tag tag : directory.getTags()) {
- metadata.set(tag.getTagName(), tag.getDescription());
+ if (directory instanceof ExifDirectoryBase) {
+ metadata.set(directory.getName() + ":" + tag.getTagName(), tag.getDescription());
+ } else {
+ metadata.set(tag.getTagName(), tag.getDescription());
+ }
}
}
}
@@ -288,7 +292,11 @@ public class ImageMetadataExtractor {
} else if (Boolean.FALSE.toString().equalsIgnoreCase(value)) {
value = Boolean.FALSE.toString();
}
- metadata.set(name, value);
+ if (directory instanceof ExifDirectoryBase) {
+ metadata.set(directory.getName() + ":" + name, value);
+ } else {
+ metadata.set(name, value);
+ }
}
}
}
@@ -493,6 +501,17 @@ public class ImageMetadataExtractor {
metadata.set(Metadata.IMAGE_LENGTH,
trimPixels(directory.getDescription(ExifThumbnailDirectory.TAG_IMAGE_HEIGHT)));
}
+
+ // For Compressed Images read from ExifSubIFDDirectory
+ if (directory.containsTag(ExifSubIFDDirectory.TAG_EXIF_IMAGE_WIDTH)) {
+ metadata.set(Metadata.IMAGE_WIDTH,
+ trimPixels(directory.getDescription(ExifSubIFDDirectory.TAG_EXIF_IMAGE_WIDTH)));
+ }
+ if (directory.containsTag(ExifSubIFDDirectory.TAG_EXIF_IMAGE_HEIGHT)) {
+ metadata.set(Metadata.IMAGE_LENGTH,
+ trimPixels(directory.getDescription(ExifSubIFDDirectory.TAG_EXIF_IMAGE_HEIGHT)));
+ }
+
}
/**
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
index c710f23..a1339d5 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
@@ -65,8 +65,8 @@ public class JpegParserTest {
parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
// Core EXIF/TIFF tags
- assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
- assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
+ assertEquals("3888", metadata.get(Metadata.IMAGE_WIDTH));
+ assertEquals("2592", metadata.get(Metadata.IMAGE_LENGTH));
assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
@@ -86,7 +86,7 @@ public class JpegParserTest {
// Check that EXIF/TIFF tags come through with their raw values too
// (This may be removed for Tika 1.0, as we support more of them
// with explicit Metadata entries)
- assertEquals("Canon EOS 40D", metadata.get("Model"));
+ assertEquals("Canon EOS 40D", metadata.get("Exif IFD0:Model"));
// Common tags
assertEquals("2009-10-02T23:02:49", metadata.get(Metadata.LAST_MODIFIED));
@@ -115,8 +115,8 @@ public class JpegParserTest {
assertEquals("-54.1234", metadata.get(Metadata.LONGITUDE));
// Core EXIF/TIFF tags
- assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
- assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
+ assertEquals("3888", metadata.get(Metadata.IMAGE_WIDTH));
+ assertEquals("2592", metadata.get(Metadata.IMAGE_LENGTH));
assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 45ef4e2..7e3f01c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -256,7 +256,7 @@ public class TesseractOCRParserTest extends TikaTest {
m = getXML("testTIFF.tif").metadata;
assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
- assertEquals("72 dots per inch", m.get("Y Resolution"));
+ assertEquals("72 dots per inch", m.get("Exif IFD0:Y Resolution"));
}
//TODO: add unit tests for jp2/jpx/ppm TIKA-2174
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
index 27f3b2a..79c5834 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
@@ -466,7 +466,7 @@ public class RTFParserTest extends TikaTest {
assertEquals("false", meta_jpg_exif.get(RTFMetadata.THUMBNAIL));
assertEquals(51, meta_jpg.names().length);
- assertEquals(110, meta_jpg_exif.names().length);
+ assertEquals(112, meta_jpg_exif.names().length);
}
@Test
[tika] 02/06: update rat exclusions
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 86dd0cf96ab7fe22b910813b14bd20dfc9238b6b
Author: tallison <ta...@apache.org>
AuthorDate: Tue Nov 26 15:04:03 2019 -0500
update rat exclusions
---
pom.xml | 1 +
1 file changed, 1 insertion(+)
diff --git a/pom.xml b/pom.xml
index 5afe327..54ec597 100644
--- a/pom.xml
+++ b/pom.xml
@@ -182,6 +182,7 @@ least three +1 Tika PMC votes are cast.
<configuration>
<excludes>
<exclude>CHANGES.txt</exclude>
+ <exclude>README.md</exclude>
</excludes>
</configuration>
</plugin>
[tika] 04/06: need to specify javadoc source when building with > javadoc 8
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit adb6545c85a31bb78be7d2317dfe55d728722135
Author: tallison <ta...@apache.org>
AuthorDate: Tue Nov 26 15:20:30 2019 -0500
need to specify javadoc source when building with > javadoc 8
---
tika-parent/pom.xml | 1 +
1 file changed, 1 insertion(+)
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 6a70547..7da58de 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -537,6 +537,7 @@
<version>${maven.javadoc.version}</version>
<configuration>
<doclint>none</doclint>
+ <source>1.8</source>
</configuration>
</plugin>
</plugins>