You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/11/18 16:53:42 UTC
svn commit: r1410909 - in /tika/trunk: ./
tika-parsers/src/main/java/org/apache/tika/parser/mp3/
tika-parsers/src/test/java/org/apache/tika/parser/mp3/
tika-parsers/src/test/resources/test-documents/
Author: mikemccand
Date: Sun Nov 18 15:53:41 2012
New Revision: 1410909
URL: http://svn.apache.org/viewvc?rev=1410909&view=rev
Log:
TIKA-1024: don't return naked BOM for MP3 ID3 tag values
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testNakedUTF16BOM.mp3 (with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1410909&r1=1410908&r2=1410909&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sun Nov 18 15:53:41 2012
@@ -42,6 +42,10 @@ Release 1.3 - Current Development
* MHTML: fixed Null charset name exception when a mime part has an
unrecognized charset (TIKA-1011).
+ * MP3: if an ID3 tag was encoded in UTF-16 with only the BOM then on
+ certain JVMs this would incorrectly extract the BOM as the tag's
+ value (TIKA-1024).
+
Release 1.2 - 07/10/2012
---------------------------------
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java?rev=1410909&r1=1410908&r2=1410909&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java Sun Nov 18 15:53:41 2012
@@ -227,6 +227,16 @@ public class ID3v2Frame implements MP3Fr
return "";
}
+ // TIKA-1024: If it's UTF-16 (with BOM) and all we
+ // have is a naked BOM then short-circuit here
+ // (return empty string), because new String(..)
+ // gives different results on different JVMs
+ if (encoding.encoding.equals("UTF-16") && actualLength == 2 &&
+ ((data[offset] == (byte) 0xff && data[offset+1] == (byte) 0xfe) ||
+ (data[offset] == (byte) 0xfe && data[offset+1] == (byte) 0xff))) {
+ return "";
+ }
+
try {
// Build the base string
return new String(data, offset, actualLength, encoding.encoding);
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java?rev=1410909&r1=1410908&r2=1410909&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java Sun Nov 18 15:53:41 2012
@@ -347,7 +347,7 @@ public class Mp3ParserTest extends TestC
stream.close();
}
- // Check we coud get the headers from the start
+ // Check we could get the headers from the start
assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("Girl you have no faith in medicine", metadata.get(TikaCoreProperties.TITLE));
assertEquals("The White Stripes", metadata.get(TikaCoreProperties.CREATOR));
@@ -364,4 +364,22 @@ public class Mp3ParserTest extends TestC
assertEquals(null, metadata.get("samplerate"));
assertEquals(null, metadata.get("channels"));
}
+
+ // TIKA-1024
+ public void testNakedUTF16BOM() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+ "/test-documents/testNakedUTF16BOM.mp3");
+
+ try {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ } finally {
+ stream.close();
+ }
+ assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("", metadata.get(XMPDM.GENRE));
+ }
}
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testNakedUTF16BOM.mp3
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testNakedUTF16BOM.mp3?rev=1410909&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testNakedUTF16BOM.mp3
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream