You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/04/27 00:46:46 UTC
svn commit: r768822 - in /lucene/tika/trunk/src/main/java/org/apache/tika:
parser/txt/TXTParser.java utils/Utils.java
Author: jukka
Date: Sun Apr 26 22:46:46 2009
New Revision: 768822
URL: http://svn.apache.org/viewvc?rev=768822&view=rev
Log:
TIKA-209: Language detection is weak
Move the text parsing stuff to TXTParser and deprecate Utils.getUTF8Reader().
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
lucene/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java?rev=768822&r1=768821&r2=768822&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/txt/TXTParser.java Sun Apr 26 22:46:46 2009
@@ -16,6 +16,7 @@
*/
package org.apache.tika.parser.txt;
+import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
@@ -24,10 +25,12 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.apache.tika.utils.Utils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import com.ibm.icu.text.CharsetDetector;
+import com.ibm.icu.text.CharsetMatch;
+
/**
* Text parser
*/
@@ -36,8 +39,34 @@
public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)
throws IOException, SAXException, TikaException {
- Reader reader = Utils.getUTF8Reader(stream, metadata);
+ CharsetDetector detector = new CharsetDetector();
+
+ // Use the declared character encoding, if available
+ String encoding = metadata.get(Metadata.CONTENT_ENCODING);
+ if (encoding != null) {
+ detector.setDeclaredEncoding(encoding);
+ }
+
+ // CharsetDetector expects a stream to support marks
+ if (!stream.markSupported()) {
+ stream = new BufferedInputStream(stream);
+ }
+ detector.setText(stream);
+
+ CharsetMatch match = detector.detect();
+ if (match == null) {
+ throw new TikaException("Unable to detect character encoding");
+ }
+
+ Reader reader = match.getReader();
metadata.set(Metadata.CONTENT_TYPE, "text/plain");
+ metadata.set(Metadata.CONTENT_ENCODING, match.getName());
+
+ String language = match.getLanguage();
+ if (language != null) {
+ metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage());
+ metadata.set(Metadata.LANGUAGE, match.getLanguage());
+ }
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java?rev=768822&r1=768821&r2=768822&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java Sun Apr 26 22:46:46 2009
@@ -35,6 +35,8 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParsingReader;
+import org.apache.tika.parser.txt.TXTParser;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
@@ -101,36 +103,10 @@
* and content language ({@link HttpHeaders#CONTENT_LANGUAGE}).
*
* @return Reader to utf8 encoded reader.
+ * @deprecated use {@link TXTParser} instead
*/
public static Reader getUTF8Reader(InputStream stream, Metadata metadata) throws TikaException, IOException{
- CharsetDetector detector = new CharsetDetector();
-
- // Use the declared character encoding, if available
- String encoding = metadata.get(Metadata.CONTENT_ENCODING);
- if (encoding != null) {
- detector.setDeclaredEncoding(encoding);
- }
-
- // CharsetDetector expects a stream to support marks
- if (!stream.markSupported()) {
- stream = new BufferedInputStream(stream);
- }
-
- detector.setText(stream);
-
- CharsetMatch match = detector.detect();
- if (match == null) {
- throw new TikaException("Unable to detect character encoding");
- }
-
- metadata.set(Metadata.CONTENT_ENCODING, match.getName());
- String language = match.getLanguage();
- if (language != null) {
- metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage());
- metadata.set(Metadata.LANGUAGE, match.getLanguage());
- }
-
- return match.getReader();
+ return new ParsingReader(new TXTParser(), stream, metadata);
}
}