You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ko...@apache.org on 2011/12/28 08:35:33 UTC

svn commit: r1225131 - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/contrib/benchmark/ lucene/src/java/org/apache/lucene/analysis/ lucene/src/test/org/apache/lucene/analysis/ solr/ solr/contrib/extraction/ solr/contrib/extraction/src/java/org/a...

Author: koji
Date: Wed Dec 28 07:35:33 2011
New Revision: 1225131

URL: http://svn.apache.org/viewvc?rev=1225131&view=rev
Log:
SOLR-2346: Add a chance to set content encoding explicitly via content type of stream.

Modified:
    lucene/dev/branches/branch_3x/   (props changed)
    lucene/dev/branches/branch_3x/lucene/   (props changed)
    lucene/dev/branches/branch_3x/lucene/contrib/benchmark/   (props changed)
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/CharTokenizer.java   (props changed)
    lucene/dev/branches/branch_3x/lucene/src/test/org/apache/lucene/analysis/TestCharTokenizers.java   (props changed)
    lucene/dev/branches/branch_3x/solr/   (props changed)
    lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt
    lucene/dev/branches/branch_3x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java

Modified: lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt?rev=1225131&r1=1225130&r2=1225131&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt Wed Dec 28 07:35:33 2011
@@ -26,7 +26,9 @@ $Id$
 
 ================== Release 3.6.0 ==================
 
-(No Changes)
+* SOLR-2346: Add a chance to set content encoding explicitly via content type of stream.
+  This is convenient when Tika's auto detector cannot detect the encoding, especially
+  when the text file is too short for detection to work. (koji)
 
 ================== Release 3.5.0 ==================
 

Modified: lucene/dev/branches/branch_3x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=1225131&r1=1225130&r2=1225131&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Wed Dec 28 07:35:33 2011
@@ -26,6 +26,7 @@ import org.apache.solr.common.SolrExcept
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.params.UpdateParams;
 import org.apache.solr.common.util.ContentStream;
+import org.apache.solr.common.util.ContentStreamBase;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.handler.ContentStreamLoader;
 import org.apache.solr.request.SolrQueryRequest;
@@ -177,6 +178,12 @@ public class ExtractingDocumentLoader ex
       InputStream inputStream = null;
       try {
         inputStream = stream.getStream();
+        // HtmlParser and TXTParser honor Metadata.CONTENT_ENCODING when it is set in the metadata
+        String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
+        if(charset != null){
+          metadata.add(Metadata.CONTENT_ENCODING, charset);
+        }
+
         String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
         boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
         ContentHandler parsingHandler = handler;