You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by ni...@apache.org on 2016/06/30 21:20:24 UTC
svn commit: r1750866 -
/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/FileInformationBlock.java
Author: nick
Date: Thu Jun 30 21:20:23 2016
New Revision: 1750866
URL: http://svn.apache.org/viewvc?rev=1750866&view=rev
Log:
#59739 When testing for a valid NFIB in a FileInformationBlock, accept common "nearby" values as found from CommonCrawl
Modified:
poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/FileInformationBlock.java
Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/FileInformationBlock.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/FileInformationBlock.java?rev=1750866&r1=1750865&r2=1750866&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/FileInformationBlock.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hwpf/model/FileInformationBlock.java Thu Jun 30 21:20:23 2016
@@ -143,36 +143,45 @@ public final class FileInformationBlock
private void assertCbRgFcLcb()
{
- switch ( getNFib() )
+ int nfib = getNFib();
+ String nfibHex = String.format("%04X", nfib);
+
+ // Note - CommonCrawl shows there's more variation in these than
+ // the documentation suggests, so accept common clusters around
+ // the "correct" value as well
+ switch ( nfib )
{
- case 0x00C1:
- assertCbRgFcLcb( "0x00C1", 0x005D, "0x005D", _cbRgFcLcb );
+ case 0x0071:
+ // Found in CommonCrawl corpus but not in the docs...
break;
- case 0x00D9:
- assertCbRgFcLcb( "0x00D9", 0x006C, "0x006C", _cbRgFcLcb );
+ case 0x00BE:
+ case 0x00BF:
+ case 0x00C0:
+ case 0x00C1: // Docs "official"
+ case 0x00C2:
+ case 0x00C3:
+ assertCbRgFcLcb(nfibHex, 0x005D, "0x005D", _cbRgFcLcb );
+ break;
+ case 0x00D8:
+ case 0x00D9: // Docs "official"
+ assertCbRgFcLcb(nfibHex, 0x006C, "0x006C", _cbRgFcLcb );
break;
case 0x0101:
assertCbRgFcLcb( "0x0101", 0x0088, "0x0088", _cbRgFcLcb );
break;
- case 0x010C:
- assertCbRgFcLcb( "0x010C", 0x00A4, "0x00A4", _cbRgFcLcb );
+ // TODO Does the NFIB 265 (0x0109) seen in CommonCrawl belong with 0x0101 above or with 0x010B/0x010C below?
+ case 0x010B:
+ case 0x010C: // Docs "official"
+ assertCbRgFcLcb(nfibHex, 0x00A4, "0x00A4", _cbRgFcLcb );
break;
case 0x0112:
assertCbRgFcLcb( "0x0112", 0x00B7, "0x00B7", _cbRgFcLcb );
break;
default:
- /*
-When running with the large CommonCrawl corpus we found the following ids in documents that are processed fine:
-java.lang.IllegalStateException: Invalid file format version number: 113
-java.lang.IllegalStateException: Invalid file format version number: 191
-java.lang.IllegalStateException: Invalid file format version number: 192
-java.lang.IllegalStateException: Invalid file format version number: 194
-java.lang.IllegalStateException: Invalid file format version number: 195
-java.lang.IllegalStateException: Invalid file format version number: 216
-java.lang.IllegalStateException: Invalid file format version number: 265
-java.lang.IllegalStateException: Invalid file format version number: 267
+ /* The Word spec has a much smaller list of "valid" values
+ * than what the large CommonCrawl corpus contains!
*/
- logger.log(POILogger.WARN, "Invalid file format version number: " + getNFib());
+ logger.log(POILogger.WARN, "Invalid file format version number: " + nfib + "("+nfibHex+")");
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org