You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2011/04/27 13:18:11 UTC
svn commit: r1097084 -
/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
Author: maxcom
Date: Wed Apr 27 11:18:11 2011
New Revision: 1097084
URL: http://svn.apache.org/viewvc?rev=1097084&view=rev
Log:
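OfficeParser: HWPF: ignore invalid style references (when a paragraph's style index is outside the style sheet, fall back to a plain "p" tag with no style class)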
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1097084&r1=1097083&r2=1097084&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Wed Apr 27 11:18:11 2011
@@ -147,18 +147,22 @@ public class WordExtractor extends Abstr
return (t.numParagraphs()-1);
}
- StyleDescription style =
- document.getStyleSheet().getStyleDescription(p.getStyleIndex());
- TagAndStyle tas = buildParagraphTagAndStyle(
- style.getName(), (parentTableLevel>0)
- );
+ TagAndStyle tas;
+
+ if (document.getStyleSheet().numStyles()>p.getStyleIndex()) {
+ StyleDescription style =
+ document.getStyleSheet().getStyleDescription(p.getStyleIndex());
+ tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel>0));
+ } else {
+ tas = new TagAndStyle("p", null);
+ }
if(tas.getStyleClass() != null) {
- xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
+ xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
} else {
- xhtml.startElement(tas.getTag());
+ xhtml.startElement(tas.getTag());
}
-
+
for(int j=0; j<p.numCharacterRuns(); j++) {
CharacterRun cr = p.getCharacterRun(j);