You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/12/13 22:22:03 UTC

svn commit: r890122 - /lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java

Author: jukka
Date: Sun Dec 13 21:21:58 2009
New Revision: 890122

URL: http://svn.apache.org/viewvc?rev=890122&view=rev
Log:
TIKA-343: some parsers produce glued words

The <address> tag can be semantically useful, so pass it through as a safe element.

Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=890122&r1=890121&r2=890122&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Sun Dec 13 21:21:58 2009
@@ -213,6 +213,8 @@
         if ("TH".equals(name)) return "th";
         if ("TD".equals(name)) return "td";
 
+        if ("ADDRESS".equals(name)) return "address";
+
         return null;
     }