You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by th...@apache.org on 2014/07/24 11:42:14 UTC

svn commit: r1613051 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/code/SourceCodeParser.java test/java/org/apache/tika/parser/code/SourceCodeParserTest.java

Author: thaichat04
Date: Thu Jul 24 09:42:13 2014
New Revision: 1613051

URL: http://svn.apache.org/r1613051
Log:
TIKA-1373 - Send HTML content as SAX events by using TagSoup

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java?rev=1613051&r1=1613050&r2=1613051&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java Thu Jul 24 09:42:13 2014
@@ -22,6 +22,7 @@ import static com.uwyn.jhighlight.render
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.StringReader;
 import java.nio.charset.Charset;
 import java.util.HashMap;
 import java.util.Map;
@@ -38,8 +39,10 @@ import org.apache.tika.metadata.TikaCore
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.XHTMLContentHandler;
+import org.ccil.cowan.tagsoup.HTMLSchema;
+import org.ccil.cowan.tagsoup.Schema;
 import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 
 import com.uwyn.jhighlight.renderer.Renderer;
@@ -66,7 +69,10 @@ public class SourceCodeParser implements
   };
 
   private static final ServiceLoader LOADER = new ServiceLoader(SourceCodeParser.class.getClassLoader());
-
+  
+  // Default TagSoup HTML schema, used when the ParseContext does not supply one
+  private static final Schema HTML_SCHEMA = new HTMLSchema();
+  
   @Override
   public Set<MediaType> getSupportedTypes(ParseContext context) {
     return TYPES_TO_RENDERER.keySet();
@@ -99,12 +105,16 @@ public class SourceCodeParser implements
             nbLines ++;
         }
         metadata.set("LoC", String.valueOf(nbLines));
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         Renderer renderer = getRenderer(type.toString());
+        
         String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);
-        xhtml.startDocument();
-        xhtml.element("p", codeAsHtml);
-        xhtml.endDocument();
+        
+        Schema schema = context.get(Schema.class, HTML_SCHEMA);
+
+        org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();
+        parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
+        parser.setContentHandler(handler);
+        parser.parse(new InputSource(new StringReader(codeAsHtml)));
       }
     } finally {
       reader.close();

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java?rev=1613051&r1=1613050&r2=1613051&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java Thu Jul 24 09:42:13 2014
@@ -20,6 +20,7 @@ import static junit.framework.Assert.ass
 import static junit.framework.Assert.assertTrue;
 import static org.junit.Assert.assertEquals;
 
+import java.io.ByteArrayInputStream;
 import java.util.Set;
 
 import org.apache.tika.TikaTest;
@@ -45,17 +46,25 @@ public class SourceCodeParserTest extend
   }
 
   @Test
-  public void testHTMLRender() throws Exception {
-    String htmlContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source")).xml;
-    assertTrue(htmlContent.indexOf("&gt;&lt;code&gt;&lt;span class=\"java_javadoc_comment\"&gt;") > 0);
-    assertTrue(htmlContent.indexOf("&lt;span class=\"java_type\"&gt;HelloWorld&lt;/span&gt;") > 0);
-    assertTrue(htmlContent.indexOf("&gt;&lt;span class=\"java_keyword\"&gt;public&lt;") > 0);
-  }
-
-  @Test
   public void testHTMLRenderWithReturnLine() throws Exception {
     String htmlContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source")).xml;
-    assertTrue(htmlContent.indexOf("&lt;span class=\"java_javadoc_comment\"&gt;&amp;nbsp;*&lt;/span&gt;&lt;br /&gt;") > 0);
+    
+    assertTrue(htmlContent.indexOf("<html:html lang=\"en\" xml:lang=\"en\"") == 0);
+    assertTrue(htmlContent.indexOf("<html:span class=\"java_keyword\">public</span><html:span class=\"java_plain\">") > 0);
+    assertTrue(htmlContent.indexOf("<html:span class=\"java_keyword\">static</span>") > 0);
+    assertTrue(htmlContent.indexOf("<html:br clear=\"none\" />") > 0);
+  }
+  
+  @Test
+  public void testTextRender() throws Exception {
+    String textContent = getText(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source"));
+    
+    assertTrue(textContent.length() > 0);
+    assertTrue(textContent.indexOf("html") < 0);
+    
+    textContent = getText(new ByteArrayInputStream("public class HelloWorld {}".getBytes()), sourceCodeParser, createMetadata("text/x-java-source"));
+    assertTrue(textContent.length() > 0);
+    assertTrue(textContent.indexOf("html") < 0);
   }
 
   @Test