You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/09/19 16:51:38 UTC

[tika] branch master updated: TIKA-2730 -- allow last frame to be truncated w/o throwing an EOF

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new e4d6d15  TIKA-2730 -- allow last frame to be truncated w/o throwing an EOF
e4d6d15 is described below

commit e4d6d15f705232b0a422c01e217919e53b118fdf
Author: TALLISON <ta...@apache.org>
AuthorDate: Wed Sep 19 12:51:23 2018 -0400

    TIKA-2730 -- allow last frame to be truncated w/o throwing an EOF
---
 .../java/org/apache/tika/parser/mp3/Mp3Parser.java |   9 ++++--
 .../org/apache/tika/parser/mp3/MpegStream.java     |  11 ++++---
 .../org/apache/tika/parser/mp3/Mp3ParserTest.java  |  33 ++++++++++++++++++++-
 .../test-documents/testMP3i18n_truncated.mp3       | Bin 0 -> 40672 bytes
 4 files changed, 43 insertions(+), 10 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java b/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
index 3b79f31..345f486 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
@@ -194,15 +194,18 @@ public class Mp3Parser extends AbstractParser {
         // Now iterate over all audio frames in the file
         AudioFrame frame = mpegStream.nextFrame();
         float duration = 0;
-        while (frame != null)
+        boolean skipped = true;
+        while (frame != null && skipped)
         {
             duration += frame.getDuration();
             if (firstAudio == null)
             {
                 firstAudio = frame;
             }
-            mpegStream.skipFrame();
-            frame = mpegStream.nextFrame();
+            skipped = mpegStream.skipFrame();
+            if (skipped) {
+                frame = mpegStream.nextFrame();
+            }
         }
 
        // ID3v1 tags live at the end of the file
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mp3/MpegStream.java b/tika-parsers/src/main/java/org/apache/tika/parser/mp3/MpegStream.java
index 1814c12..4984fea 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mp3/MpegStream.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mp3/MpegStream.java
@@ -150,10 +150,10 @@ class MpegStream extends PushbackInputStream
      * Skips the current MPEG frame. This method can be called after a valid
      * MPEG header has been retrieved using {@code nextFrame()}. In this case
      * the underlying stream is advanced to the end of the associated MPEG
-     * frame. Otherwise, this method has no effect. The return value indicates
-     * whether a frame could be skipped.
+     * frame or until the EOF is reached. The return value indicates
+     * whether the full frame could be skipped.
      * 
-     * @return <b>true</b> if a frame could be skipped, <b>false</b> otherwise
+     * @return <b>true</b> if the full frame could be skipped, <b>false</b> otherwise (e.g. EOF was reached before the end of the frame)
      * @throws IOException if an IO error occurs
      */
     public boolean skipFrame() throws IOException
@@ -162,11 +162,10 @@ class MpegStream extends PushbackInputStream
         {
             long toSkip = currentHeader.getLength() - HEADER_SIZE;
             long skipped = IOUtils.skipFully(in, toSkip);
+            currentHeader = null;
             if (skipped < toSkip) {
-                throw new EOFException("EOF: tried to skip "+toSkip +
-                        " but could only skip "+skipped);
+                return false;
             }
-            currentHeader = null;
             return true;
         }
         return false;
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
index ae9d06b..951e46d 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
@@ -243,7 +243,38 @@ public class Mp3ParserTest {
        assertEquals("1", metadata.get("channels"));
        checkDuration(metadata, 2);
    }
-    
+    /**
+     * Tests that a file with the last frame slightly
+     * truncated does not cause an EOF and does
+     * not lead to an infinite loop.
+     */
+    @Test
+    public void testMp3ParsingID3i18nTruncated() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+                "/test-documents/testMP3i18n_truncated.mp3")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Une chason en Fran\u00e7ais", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Test Artist \u2468\u2460", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Test Artist \u2468\u2460", metadata.get(XMPDM.ARTIST));
+        assertEquals("Test Album \u2460\u2468", metadata.get(XMPDM.ALBUM));
+
+        assertEquals(
+                "Eng - Comment Desc\nThis is a \u1357\u2468\u2460 Comment",
+                metadata.get(XMPDM.LOG_COMMENT)
+        );
+
+        assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+        assertEquals("44100", metadata.get("samplerate"));
+        assertEquals("1", metadata.get("channels"));
+        checkDuration(metadata, 2);
+    }
     
     /**
      * Tests that a file with both lyrics and
diff --git a/tika-parsers/src/test/resources/test-documents/testMP3i18n_truncated.mp3 b/tika-parsers/src/test/resources/test-documents/testMP3i18n_truncated.mp3
new file mode 100644
index 0000000..c2cd30d
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testMP3i18n_truncated.mp3 differ