You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ko...@apache.org on 2011/12/28 08:35:33 UTC
svn commit: r1225131 - in /lucene/dev/branches/branch_3x: ./ lucene/
lucene/contrib/benchmark/ lucene/src/java/org/apache/lucene/analysis/
lucene/src/test/org/apache/lucene/analysis/ solr/ solr/contrib/extraction/
solr/contrib/extraction/src/java/org/a...
Author: koji
Date: Wed Dec 28 07:35:33 2011
New Revision: 1225131
URL: http://svn.apache.org/viewvc?rev=1225131&view=rev
Log:
SOLR-2346: Add a chance to set content encoding explicitly via content type of stream.
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/lucene/ (props changed)
lucene/dev/branches/branch_3x/lucene/contrib/benchmark/ (props changed)
lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/CharTokenizer.java (props changed)
lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestCharTokenizers.java (props changed)
lucene/dev/branches/branch_3x/solr/ (props changed)
lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt
lucene/dev/branches/branch_3x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
Modified: lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt?rev=1225131&r1=1225130&r2=1225131&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt Wed Dec 28 07:35:33 2011
@@ -26,7 +26,9 @@ $Id$
================== Release 3.6.0 ==================
-(No Changes)
+* SOLR-2346: Add a chance to set content encoding explicitly via content type of stream.
+ This is convenient when Tika's auto detector cannot detect encoding, especially
+ when the text file is too short to detect encoding. (koji)
================== Release 3.5.0 ==================
Modified: lucene/dev/branches/branch_3x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=1225131&r1=1225130&r2=1225131&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Wed Dec 28 07:35:33 2011
@@ -26,6 +26,7 @@ import org.apache.solr.common.SolrExcept
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.common.util.ContentStream;
+import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.handler.ContentStreamLoader;
import org.apache.solr.request.SolrQueryRequest;
@@ -177,6 +178,12 @@ public class ExtractingDocumentLoader ex
InputStream inputStream = null;
try {
inputStream = stream.getStream();
+ // HtmlParser and TXTParser honor Metadata.CONTENT_ENCODING when it is set in the metadata
+ String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
+ if(charset != null){
+ metadata.add(Metadata.CONTENT_ENCODING, charset);
+ }
+
String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
ContentHandler parsingHandler = handler;