You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/11/18 16:53:42 UTC

svn commit: r1410909 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/mp3/ tika-parsers/src/test/java/org/apache/tika/parser/mp3/ tika-parsers/src/test/resources/test-documents/

Author: mikemccand
Date: Sun Nov 18 15:53:41 2012
New Revision: 1410909

URL: http://svn.apache.org/viewvc?rev=1410909&view=rev
Log:
TIKA-1024: don't return naked BOM for MP3 ID3 tag values

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testNakedUTF16BOM.mp3   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1410909&r1=1410908&r2=1410909&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sun Nov 18 15:53:41 2012
@@ -42,6 +42,10 @@ Release 1.3 - Current Development
   * MHTML: fixed Null charset name exception when a mime part has an
     unrecognized charset (TIKA-1011).
 
+  * MP3: if an ID3 tag was encoded in UTF-16 with only the BOM then on
+    certain JVMs this would incorrectly extract the BOM as the tag's
+    value (TIKA-1024).
+
 Release 1.2 - 07/10/2012
 ---------------------------------
 

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java?rev=1410909&r1=1410908&r2=1410909&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java Sun Nov 18 15:53:41 2012
@@ -227,6 +227,16 @@ public class ID3v2Frame implements MP3Fr
            return "";
         }
 
+        // TIKA-1024: If it's UTF-16 (with BOM) and all we
+        // have is a naked BOM then short-circuit here
+        // (return empty string), because new String(..)
+        // gives different results on different JVMs
+        if (encoding.encoding.equals("UTF-16") && actualLength == 2 &&
+            ((data[offset] == (byte) 0xff && data[offset+1] == (byte) 0xfe) ||
+             (data[offset] == (byte) 0xfe && data[offset+1] == (byte) 0xff))) {
+          return "";
+        }
+
         try {
             // Build the base string
             return new String(data, offset, actualLength, encoding.encoding);

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java?rev=1410909&r1=1410908&r2=1410909&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java Sun Nov 18 15:53:41 2012
@@ -347,7 +347,7 @@ public class Mp3ParserTest extends TestC
            stream.close();
        }
 
-       // Check we coud get the headers from the start
+       // Check we could get the headers from the start
        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("Girl you have no faith in medicine", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("The White Stripes", metadata.get(TikaCoreProperties.CREATOR));
@@ -364,4 +364,22 @@ public class Mp3ParserTest extends TestC
        assertEquals(null, metadata.get("samplerate"));
        assertEquals(null, metadata.get("channels"));
     }
+
+    // TIKA-1024
+    public void testNakedUTF16BOM() throws Exception {
+       Parser parser = new AutoDetectParser(); // Should auto-detect!
+       ContentHandler handler = new BodyContentHandler();
+       Metadata metadata = new Metadata();
+
+       InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+               "/test-documents/testNakedUTF16BOM.mp3");
+       
+       try {
+           parser.parse(stream, handler, metadata, new ParseContext());
+       } finally {
+           stream.close();
+       }
+       assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+       assertEquals("", metadata.get(XMPDM.GENRE));
+    }
 }

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testNakedUTF16BOM.mp3
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testNakedUTF16BOM.mp3?rev=1410909&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testNakedUTF16BOM.mp3
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream