You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by mr...@apache.org on 2008/08/29 17:01:07 UTC
svn commit: r690282 - /jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java
Author: mreutegg
Date: Fri Aug 29 08:01:07 2008
New Revision: 690282
URL: http://svn.apache.org/viewvc?rev=690282&view=rev
Log:
JCR-1727: HTMLTextExtractor modifying UTF-8 encoded String
Modified:
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java
Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java?rev=690282&r1=690281&r2=690282&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java Fri Aug 29 08:01:07 2008
@@ -31,6 +31,7 @@
import java.io.InputStream;
import java.io.IOException;
import java.io.StringReader;
+import java.io.InputStreamReader;
/**
* Text extractor for HyperText Markup Language (HTML).
@@ -64,7 +65,13 @@
HTMLParser parser = new HTMLParser();
SAXResult result = new SAXResult(new DefaultHandler());
- SAXSource source = new SAXSource(parser, new InputSource(stream));
+ Reader reader;
+ if (encoding != null) {
+ reader = new InputStreamReader(stream, encoding);
+ } else {
+ reader = new InputStreamReader(stream);
+ }
+ SAXSource source = new SAXSource(parser, new InputSource(reader));
transformer.transform(source, result);
return new StringReader(parser.getContents());