You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@tika.apache.org by "Carina (Jira)" <ji...@apache.org> on 2020/03/05 16:22:00 UTC

[jira] [Updated] (TIKA-3060) Unpack file .ppt leads to TikaException

     [ https://issues.apache.org/jira/browse/TIKA-3060?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Carina updated TIKA-3060:
-------------------------
    Description: 
Processing the attached file leads to a TikaException:
{quote}WARN unpack/all: Text extraction failed (b'data')
org.apache.tika.exception.TikaException: Unexpected RuntimeException from org.apache.tika.parser.microsoft.OfficeParser@46a07b2b
 at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:282)
 at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
 at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:143)
 at org.apache.tika.server.resource.TikaResource.parse(TikaResource.java:409)
 at org.apache.tika.server.resource.UnpackerResource.process(UnpackerResource.java:144)
 at org.apache.tika.server.resource.UnpackerResource.unpackAll(UnpackerResource.java:110)
 at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
 at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
 at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
 at java.base/java.lang.reflect.Method.invoke(Method.java:566)
 at org.apache.cxf.service.invoker.AbstractInvoker.performInvocation(AbstractInvoker.java:179)
 at org.apache.cxf.service.invoker.AbstractInvoker.invoke(AbstractInvoker.java:96)
 at org.apache.cxf.jaxrs.JAXRSInvoker.invoke(JAXRSInvoker.java:201)
 at org.apache.cxf.jaxrs.JAXRSInvoker.invoke(JAXRSInvoker.java:104)
 at org.apache.cxf.interceptor.ServiceInvokerInterceptor$1.run(ServiceInvokerInterceptor.java:59)
 at org.apache.cxf.interceptor.ServiceInvokerInterceptor.handleMessage(ServiceInvokerInterceptor.java:96)
 at org.apache.cxf.phase.PhaseInterceptorChain.doIntercept(PhaseInterceptorChain.java:308)
 at org.apache.cxf.transport.ChainInitiationObserver.onMessage(ChainInitiationObserver.java:121)
 at org.apache.cxf.transport.http.AbstractHTTPDestination.invoke(AbstractHTTPDestination.java:267)
 at org.apache.cxf.transport.http_jetty.JettyHTTPDestination.doService(JettyHTTPDestination.java:247)
 at org.apache.cxf.transport.http_jetty.JettyHTTPHandler.handle(JettyHTTPHandler.java:79)
 at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
 at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:235)
 at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1296)
 at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:190)
 at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1211)
 at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
 at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:221)
 at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
 at org.eclipse.jetty.server.Server.handle(Server.java:500)
 at org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:386)
 at org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:560)
 at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:378)
 at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:268)
 at org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311)
 at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:103)
 at org.eclipse.jetty.io.ChannelEndPoint$2.run(ChannelEndPoint.java:117)
 at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:336)
 at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:313)
 at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:171)
 at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:129)
 at org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:367)
 at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:782)
 at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:914)
 at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.IndexOutOfBoundsException: Block 69124 not found
 at org.apache.poi.poifs.filesystem.POIFSFileSystem.getBlockAt(POIFSFileSystem.java:429)
 at org.apache.poi.poifs.filesystem.POIFSFileSystem.readBAT(POIFSFileSystem.java:399)
 at org.apache.poi.poifs.filesystem.POIFSFileSystem.readCoreContents(POIFSFileSystem.java:373)
 at org.apache.poi.poifs.filesystem.POIFSFileSystem.<init>(POIFSFileSystem.java:232)
 at org.apache.poi.poifs.filesystem.POIFSFileSystem.<init>(POIFSFileSystem.java:170)
 at org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:121)
 at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
 ... 44 more
Caused by: java.lang.IndexOutOfBoundsException: Position 35392000 past the end of the file
 at org.apache.poi.poifs.nio.FileBackedDataSource.read(FileBackedDataSource.java:84)
 at org.apache.poi.poifs.filesystem.POIFSFileSystem.getBlockAt(POIFSFileSystem.java:427)
 ... 50 more
{quote}
 

  was:
Processing the attached file leads to a TikaException:

 

{{WARN unpack/all: Text extraction failed (b'data')}}
{{org.apache.tika.exception.TikaException: Unexpected RuntimeException from org.apache.tika.parser.microsoft.OfficeParser@46a07b2b}}
{{ at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:282)}}
{{ at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)}}
{{ at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:143)}}
{{ at org.apache.tika.server.resource.TikaResource.parse(TikaResource.java:409)}}
{{ at org.apache.tika.server.resource.UnpackerResource.process(UnpackerResource.java:144)}}
{{ at org.apache.tika.server.resource.UnpackerResource.unpackAll(UnpackerResource.java:110)}}
{{ at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)}}
{{ at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)}}
{{ at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)}}
{{ at java.base/java.lang.reflect.Method.invoke(Method.java:566)}}
{{ at org.apache.cxf.service.invoker.AbstractInvoker.performInvocation(AbstractInvoker.java:179)}}
{{ at org.apache.cxf.service.invoker.AbstractInvoker.invoke(AbstractInvoker.java:96)}}
{{ at org.apache.cxf.jaxrs.JAXRSInvoker.invoke(JAXRSInvoker.java:201)}}
{{ at org.apache.cxf.jaxrs.JAXRSInvoker.invoke(JAXRSInvoker.java:104)}}
{{ at org.apache.cxf.interceptor.ServiceInvokerInterceptor$1.run(ServiceInvokerInterceptor.java:59)}}
{{ at org.apache.cxf.interceptor.ServiceInvokerInterceptor.handleMessage(ServiceInvokerInterceptor.java:96)}}
{{ at org.apache.cxf.phase.PhaseInterceptorChain.doIntercept(PhaseInterceptorChain.java:308)}}
{{ at org.apache.cxf.transport.ChainInitiationObserver.onMessage(ChainInitiationObserver.java:121)}}
{{ at org.apache.cxf.transport.http.AbstractHTTPDestination.invoke(AbstractHTTPDestination.java:267)}}
{{ at org.apache.cxf.transport.http_jetty.JettyHTTPDestination.doService(JettyHTTPDestination.java:247)}}
{{ at org.apache.cxf.transport.http_jetty.JettyHTTPHandler.handle(JettyHTTPHandler.java:79)}}
{{ at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)}}
{{ at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:235)}}
{{ at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1296)}}
{{ at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:190)}}
{{ at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1211)}}
{{ at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)}}
{{ at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:221)}}
{{ at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)}}
{{ at org.eclipse.jetty.server.Server.handle(Server.java:500)}}
{{ at org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:386)}}
{{ at org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:560)}}
{{ at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:378)}}
{{ at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:268)}}
{{ at org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311)}}
{{ at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:103)}}
{{ at org.eclipse.jetty.io.ChannelEndPoint$2.run(ChannelEndPoint.java:117)}}
{{ at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:336)}}
{{ at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:313)}}
{{ at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:171)}}
{{ at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:129)}}
{{ at org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:367)}}
{{ at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:782)}}
{{ at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:914)}}
{{ at java.base/java.lang.Thread.run(Thread.java:834)}}
{{Caused by: java.lang.IndexOutOfBoundsException: Block 69124 not found}}
{{ at org.apache.poi.poifs.filesystem.POIFSFileSystem.getBlockAt(POIFSFileSystem.java:429)}}
{{ at org.apache.poi.poifs.filesystem.POIFSFileSystem.readBAT(POIFSFileSystem.java:399)}}
{{ at org.apache.poi.poifs.filesystem.POIFSFileSystem.readCoreContents(POIFSFileSystem.java:373)}}
{{ at org.apache.poi.poifs.filesystem.POIFSFileSystem.<init>(POIFSFileSystem.java:232)}}
{{ at org.apache.poi.poifs.filesystem.POIFSFileSystem.<init>(POIFSFileSystem.java:170)}}
{{ at org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:121)}}
{{ at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)}}
{{ ... 44 more}}
{{Caused by: java.lang.IndexOutOfBoundsException: Position 35392000 past the end of the file}}
{{ at org.apache.poi.poifs.nio.FileBackedDataSource.read(FileBackedDataSource.java:84)}}
{{ at org.apache.poi.poifs.filesystem.POIFSFileSystem.getBlockAt(POIFSFileSystem.java:427)}}
{{ ... 50 more}}

 


> Unpack file .ppt leads to TikaException
> ---------------------------------------
>
>                 Key: TIKA-3060
>                 URL: https://issues.apache.org/jira/browse/TIKA-3060
>             Project: Tika
>          Issue Type: Bug
>          Components: server
>    Affects Versions: 1.23
>         Environment: Tika server: docker image apache/tika:1.23 with command:
> java -jar /tika-server-1.23.jar -spawnChild -JXmx4g -JXms512m -maxFiles 10000 -h 0.0.0.0 -log info
>            Reporter: Carina
>            Priority: Major
>         Attachments: LGT_CIEMAT_CONTRIBUTION_TO_CERN_20161011.ppt
>
>
> Processing the attached file leads to a TikaException:
> {quote}WARN unpack/all: Text extraction failed (b'data')
> org.apache.tika.exception.TikaException: Unexpected RuntimeException from org.apache.tika.parser.microsoft.OfficeParser@46a07b2b
>  at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:282)
>  at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
>  at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:143)
>  at org.apache.tika.server.resource.TikaResource.parse(TikaResource.java:409)
>  at org.apache.tika.server.resource.UnpackerResource.process(UnpackerResource.java:144)
>  at org.apache.tika.server.resource.UnpackerResource.unpackAll(UnpackerResource.java:110)
>  at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>  at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
>  at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>  at java.base/java.lang.reflect.Method.invoke(Method.java:566)
>  at org.apache.cxf.service.invoker.AbstractInvoker.performInvocation(AbstractInvoker.java:179)
>  at org.apache.cxf.service.invoker.AbstractInvoker.invoke(AbstractInvoker.java:96)
>  at org.apache.cxf.jaxrs.JAXRSInvoker.invoke(JAXRSInvoker.java:201)
>  at org.apache.cxf.jaxrs.JAXRSInvoker.invoke(JAXRSInvoker.java:104)
>  at org.apache.cxf.interceptor.ServiceInvokerInterceptor$1.run(ServiceInvokerInterceptor.java:59)
>  at org.apache.cxf.interceptor.ServiceInvokerInterceptor.handleMessage(ServiceInvokerInterceptor.java:96)
>  at org.apache.cxf.phase.PhaseInterceptorChain.doIntercept(PhaseInterceptorChain.java:308)
>  at org.apache.cxf.transport.ChainInitiationObserver.onMessage(ChainInitiationObserver.java:121)
>  at org.apache.cxf.transport.http.AbstractHTTPDestination.invoke(AbstractHTTPDestination.java:267)
>  at org.apache.cxf.transport.http_jetty.JettyHTTPDestination.doService(JettyHTTPDestination.java:247)
>  at org.apache.cxf.transport.http_jetty.JettyHTTPHandler.handle(JettyHTTPHandler.java:79)
>  at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
>  at org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:235)
>  at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1296)
>  at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:190)
>  at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1211)
>  at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
>  at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:221)
>  at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)
>  at org.eclipse.jetty.server.Server.handle(Server.java:500)
>  at org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:386)
>  at org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:560)
>  at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:378)
>  at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:268)
>  at org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311)
>  at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:103)
>  at org.eclipse.jetty.io.ChannelEndPoint$2.run(ChannelEndPoint.java:117)
>  at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:336)
>  at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:313)
>  at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:171)
>  at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:129)
>  at org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:367)
>  at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:782)
>  at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:914)
>  at java.base/java.lang.Thread.run(Thread.java:834)
> Caused by: java.lang.IndexOutOfBoundsException: Block 69124 not found
>  at org.apache.poi.poifs.filesystem.POIFSFileSystem.getBlockAt(POIFSFileSystem.java:429)
>  at org.apache.poi.poifs.filesystem.POIFSFileSystem.readBAT(POIFSFileSystem.java:399)
>  at org.apache.poi.poifs.filesystem.POIFSFileSystem.readCoreContents(POIFSFileSystem.java:373)
>  at org.apache.poi.poifs.filesystem.POIFSFileSystem.<init>(POIFSFileSystem.java:232)
>  at org.apache.poi.poifs.filesystem.POIFSFileSystem.<init>(POIFSFileSystem.java:170)
>  at org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:121)
>  at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
>  ... 44 more
> Caused by: java.lang.IndexOutOfBoundsException: Position 35392000 past the end of the file
>  at org.apache.poi.poifs.nio.FileBackedDataSource.read(FileBackedDataSource.java:84)
>  at org.apache.poi.poifs.filesystem.POIFSFileSystem.getBlockAt(POIFSFileSystem.java:427)
>  ... 50 more
> {quote}
>  



--
This message was sent by Atlassian Jira
(v8.3.4#803005)