You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/05/17 20:37:39 UTC
svn commit: r1104455 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/html/
main/java/org/apache/tika/parser/microsoft/
test/java/org/apache/tika/parser/html/
test/java/org/apache/tika/parser/microsoft/
Author: jukka
Date: Tue May 17 18:37:39 2011
New Revision: 1104455
URL: http://svn.apache.org/viewvc?rev=1104455&view=rev
Log:
TIKA-650: Missing required alt attribute on img tag
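Add alt attributes to img tags created by the WordExtractor and HtmlHandler classes: HtmlHandler adds an empty alt attribute when the img tag has none, and WordExtractor uses the embedded picture's filename as the alt text.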
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=1104455&r1=1104454&r2=1104455&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java Tue May 17 18:37:39 2011
@@ -185,7 +185,7 @@ class HtmlHandler extends TextContentHan
} else {
// We have a remapped attribute name, so set it as it might have changed.
newAttributes.setLocalName(att, normAttrName);
-
+
// And resolve relative links. Eventually this should be pushed
// into the HtmlMapper code.
if (URI_ATTRIBUTES.contains(normAttrName)) {
@@ -194,6 +194,10 @@ class HtmlHandler extends TextContentHan
}
}
+ if ("img".equals(name) && newAttributes.getValue("", "alt") == null) {
+ newAttributes.addAttribute("", "alt", "alt", "CDATA", "");
+ }
+
xhtml.startElement(name, newAttributes);
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1104455&r1=1104454&r2=1104455&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Tue May 17 18:37:39 2011
@@ -46,6 +46,7 @@ import org.apache.tika.io.TikaInputStrea
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
public class WordExtractor extends AbstractPOIFSExtractor {
@@ -316,9 +317,12 @@ public class WordExtractor extends Abstr
// Grab the mime type for the picture
String mimeType = picture.getMimeType();
-
+
// Output the img tag
- xhtml.startElement("img", "src", "embedded:" + filename);
+ AttributesImpl attr = new AttributesImpl();
+ attr.addAttribute("", "src", "src", "CDATA", "embedded:" + filename);
+ attr.addAttribute("", "alt", "alt", "CDATA", filename);
+ xhtml.startElement("img", attr);
xhtml.endElement("img");
// Have we already output this one?
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=1104455&r1=1104454&r2=1104455&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Tue May 17 18:37:39 2011
@@ -460,7 +460,7 @@ public class HtmlParserTest extends Test
String result = sw.toString();
// <img> tag should exist, with fully resolved URL
- assertTrue(Pattern.matches("(?s).*<img src=\"http://domain.com/image.jpg\"/>.*$", result));
+ assertTrue(Pattern.matches("(?s).*src=\"http://domain.com/image.jpg\".*$", result));
}
/**
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1104455&r1=1104454&r2=1104455&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Tue May 17 18:37:39 2011
@@ -125,9 +125,9 @@ public class WordParserTest extends Test
String xml = sw.toString();
// Images 1-3
- assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image1.png\"/>"));
- assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image2.jpg\"/>"));
- assertTrue("Image not found in:\n"+xml, xml.contains("<img src=\"embedded:image3.png\"/>"));
+ assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image1.png\""));
+ assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image2.jpg\""));
+ assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image3.png\""));
// Text too
assertTrue(xml.contains("<p>The end!"));