You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by th...@apache.org on 2014/07/24 11:42:14 UTC
svn commit: r1613051 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/code/SourceCodeParser.java
test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
Author: thaichat04
Date: Thu Jul 24 09:42:13 2014
New Revision: 1613051
URL: http://svn.apache.org/r1613051
Log:
TIKA-1373 - Send HTML content as SAX events by using TagSoup
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java?rev=1613051&r1=1613050&r2=1613051&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java Thu Jul 24 09:42:13 2014
@@ -22,6 +22,7 @@ import static com.uwyn.jhighlight.render
import java.io.IOException;
import java.io.InputStream;
+import java.io.StringReader;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
@@ -38,8 +39,10 @@ import org.apache.tika.metadata.TikaCore
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.XHTMLContentHandler;
+import org.ccil.cowan.tagsoup.HTMLSchema;
+import org.ccil.cowan.tagsoup.Schema;
import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import com.uwyn.jhighlight.renderer.Renderer;
@@ -66,7 +69,10 @@ public class SourceCodeParser implements
};
private static final ServiceLoader LOADER = new ServiceLoader(SourceCodeParser.class.getClassLoader());
-
+
+ //Parse the HTML document
+ private static final Schema HTML_SCHEMA = new HTMLSchema();
+
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
return TYPES_TO_RENDERER.keySet();
@@ -99,12 +105,16 @@ public class SourceCodeParser implements
nbLines ++;
}
metadata.set("LoC", String.valueOf(nbLines));
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
Renderer renderer = getRenderer(type.toString());
+
String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);
- xhtml.startDocument();
- xhtml.element("p", codeAsHtml);
- xhtml.endDocument();
+
+ Schema schema = context.get(Schema.class, HTML_SCHEMA);
+
+ org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();
+ parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
+ parser.setContentHandler(handler);
+ parser.parse(new InputSource(new StringReader(codeAsHtml)));
}
} finally {
reader.close();
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java?rev=1613051&r1=1613050&r2=1613051&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java Thu Jul 24 09:42:13 2014
@@ -20,6 +20,7 @@ import static junit.framework.Assert.ass
import static junit.framework.Assert.assertTrue;
import static org.junit.Assert.assertEquals;
+import java.io.ByteArrayInputStream;
import java.util.Set;
import org.apache.tika.TikaTest;
@@ -45,17 +46,25 @@ public class SourceCodeParserTest extend
}
@Test
- public void testHTMLRender() throws Exception {
- String htmlContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source")).xml;
- assertTrue(htmlContent.indexOf("><code><span class=\"java_javadoc_comment\">") > 0);
- assertTrue(htmlContent.indexOf("<span class=\"java_type\">HelloWorld</span>") > 0);
- assertTrue(htmlContent.indexOf("><span class=\"java_keyword\">public<") > 0);
- }
-
- @Test
public void testHTMLRenderWithReturnLine() throws Exception {
String htmlContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source")).xml;
- assertTrue(htmlContent.indexOf("<span class=\"java_javadoc_comment\">&nbsp;*</span><br />") > 0);
+
+ assertTrue(htmlContent.indexOf("<html:html lang=\"en\" xml:lang=\"en\"") == 0);
+ assertTrue(htmlContent.indexOf("<html:span class=\"java_keyword\">public</span><html:span class=\"java_plain\">") > 0);
+ assertTrue(htmlContent.indexOf("<html:span class=\"java_keyword\">static</span>") > 0);
+ assertTrue(htmlContent.indexOf("<html:br clear=\"none\" />") > 0);
+ }
+
+ @Test
+ public void testTextRender() throws Exception {
+ String textContent = getText(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source"));
+
+ assertTrue(textContent.length() > 0);
+ assertTrue(textContent.indexOf("html") < 0);
+
+ textContent = getText(new ByteArrayInputStream("public class HelloWorld {}".getBytes()), sourceCodeParser, createMetadata("text/x-java-source"));
+ assertTrue(textContent.length() > 0);
+ assertTrue(textContent.indexOf("html") < 0);
}
@Test