You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/04/27 00:46:46 UTC

svn commit: r768822 - in /lucene/tika/trunk/src/main/java/org/apache/tika: parser/txt/TXTParser.java utils/Utils.java

Author: jukka
Date: Sun Apr 26 22:46:46 2009
New Revision: 768822

URL: http://svn.apache.org/viewvc?rev=768822&view=rev
Log:
TIKA-209: Language detection is weak

Move the text parsing stuff to TXTParser and deprecate Utils.getUTF8Reader().

Modified:
    lucene/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
    lucene/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=768822&r1=768821&r2=768822&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java Sun Apr 26 22:46:46 2009
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.parser.txt;
 
+import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
@@ -24,10 +25,12 @@
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.XHTMLContentHandler;
-import org.apache.tika.utils.Utils;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
+import com.ibm.icu.text.CharsetDetector;
+import com.ibm.icu.text.CharsetMatch;
+
 /**
  * Text parser
  */
@@ -36,8 +39,34 @@
     public void parse(
             InputStream stream, ContentHandler handler, Metadata metadata)
             throws IOException, SAXException, TikaException {
-        Reader reader = Utils.getUTF8Reader(stream, metadata);
+        CharsetDetector detector = new CharsetDetector();
+
+        // Use the declared character encoding, if available
+        String encoding = metadata.get(Metadata.CONTENT_ENCODING);
+        if (encoding != null) {
+            detector.setDeclaredEncoding(encoding);
+        }
+
+        // CharsetDetector expects a stream to support marks
+        if (!stream.markSupported()) {
+            stream = new BufferedInputStream(stream);
+        }
+        detector.setText(stream);
+    
+        CharsetMatch match = detector.detect();
+        if (match == null) {
+            throw new TikaException("Unable to detect character encoding");
+        }
+
+        Reader reader = match.getReader();
         metadata.set(Metadata.CONTENT_TYPE, "text/plain");
+        metadata.set(Metadata.CONTENT_ENCODING, match.getName());
+
+        String language = match.getLanguage();
+        if (language != null) {
+            metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage());
+            metadata.set(Metadata.LANGUAGE, match.getLanguage());
+        }
 
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java?rev=768822&r1=768821&r2=768822&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java Sun Apr 26 22:46:46 2009
@@ -35,6 +35,8 @@
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.HttpHeaders;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParsingReader;
+import org.apache.tika.parser.txt.TXTParser;
 
 import com.ibm.icu.text.CharsetDetector;
 import com.ibm.icu.text.CharsetMatch;
@@ -101,36 +103,10 @@
      * and content language ({@link HttpHeaders#CONTENT_LANGUAGE}).
      * 
      * @return Reader to utf8 encoded reader.
+     * @deprecated use {@link TXTParser} instead
      */
     public static Reader getUTF8Reader(InputStream stream, Metadata metadata) throws TikaException, IOException{
-        CharsetDetector detector = new CharsetDetector();
-    
-        // Use the declared character encoding, if available
-        String encoding = metadata.get(Metadata.CONTENT_ENCODING);
-        if (encoding != null) {
-            detector.setDeclaredEncoding(encoding);
-        }
-    
-        // CharsetDetector expects a stream to support marks
-        if (!stream.markSupported()) {
-            stream = new BufferedInputStream(stream);
-        }
-    
-        detector.setText(stream);
-    
-        CharsetMatch match = detector.detect();
-        if (match == null) {
-            throw new TikaException("Unable to detect character encoding");
-        }
-        
-        metadata.set(Metadata.CONTENT_ENCODING, match.getName());
-        String language = match.getLanguage();
-        if (language != null) {
-            metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage());
-            metadata.set(Metadata.LANGUAGE, match.getLanguage());
-        }
-        
-        return match.getReader();
+        return new ParsingReader(new TXTParser(), stream, metadata);
     }
 
 }