You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@tika.apache.org by "Khare, Kushal (MIND)" <Ku...@mind-infotech.com> on 2019/11/01 07:01:24 UTC
TextHandler extracting content when running code as Java App but not as Web App

Hello Mates, hope you are all doing good !

I am trying to integrate my Solr & Tika search utility code with my java web application.
The issue I am facing is that I get no results (the text handler does not extract any data) when I go through my web app's search page, that is, when I run my application on the server. However, when I run my utility method independently using the main() function, I do get the results.
Please help me with this, as I am unable to figure it out. I have checked all the jars, libs, etc., and everything looks fine.
Following is the code that I am using :

package com.mind.qdms.utility;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;


/**
 * Static helpers that index a directory tree into Solr using Tika-extracted
 * text and then run a keyword query against the same core.
 *
 * <p>All methods are static and report progress and errors on standard output.
 */
public class QdmsSolrUtilityMethods {

    /** Base URL of the Solr server, without a core name. */
    private static final String SOLR_BASE_URL = "http://localhost:8983/solr";

    /** Name of the Solr core that holds the extracted documents. */
    private static final String TIKA_COLLECTION = "tika";

    /** Number of documents buffered before they are sent to Solr in one request. */
    private static final int BATCH_SIZE = 1000;

    /** "Commit within" window passed to Solr with every batch (5 minutes). */
    private static final int COMMIT_WITHIN_MS = 300000;

    /**
     * Command-line entry point: indexes the document directory and queries for
     * the fixed keyword {@code "kushal"}.
     *
     * @param args ignored
     * @throws IOException         on file or network I/O failure
     * @throws SolrServerException if Solr reports a server-side error
     */
    public static void main(String[] args) throws IOException, SolrServerException {
        QdmsSolrUtilityMethods.getDocsList("kushal");
    }

    /**
     * Indexes everything under {@code D:\docs} and returns the ids of the
     * documents matching {@code keyword}.
     *
     * <p>Note that the whole directory is re-indexed on every call.
     *
     * @param keyword query string passed to Solr as the {@code q} parameter
     * @return ids (canonical file paths) of matching documents; never {@code null}
     * @throws IOException         on file or network I/O failure
     * @throws SolrServerException if Solr reports a server-side error
     */
    public static List<String> getDocsList(String keyword) throws IOException, SolrServerException {
        System.out.println("in util");
        // try-with-resources so the underlying HTTP client is released after each call.
        try (HttpSolrClient client =
                new HttpSolrClient.Builder(SOLR_BASE_URL + "/" + TIKA_COLLECTION).build()) {
            AutoDetectParser autoParser = new AutoDetectParser();
            indexTikaDocuments(new File("D:\\docs"), client, autoParser);
            return queryDocuments(client, keyword);
        }
    }

    /**
     * Recursively indexes every regular file below {@code root}.
     *
     * <p>Each file's text is extracted with {@code autoParser} and stored in the
     * {@code _text_} field; the canonical path is used as the document id.
     * Files that cannot be opened or parsed are reported and skipped. Documents
     * are sent in batches, and a commit is issued once per directory level.
     *
     * @param root       directory to index; if it cannot be listed (not a
     *                   directory, or not readable by the current process) a
     *                   message is printed and nothing is indexed
     * @param client     Solr client the documents are added to
     * @param autoParser Tika parser used to extract text
     * @throws IOException         on I/O failure while talking to Solr or
     *                             resolving canonical paths
     * @throws SolrServerException if Solr reports a server-side error
     */
    public static void indexTikaDocuments(File root, HttpSolrClient client,
            AutoDetectParser autoParser) throws IOException, SolrServerException {
        // listFiles() returns null (not an empty array) when root is not a
        // directory or cannot be read; guard against that instead of hitting an NPE.
        File[] children = root.listFiles();
        if (children == null) {
            System.out.println("Cannot list directory: " + root.getAbsolutePath());
            return;
        }

        int totalTika = 0;
        List<SolrInputDocument> docList = new ArrayList<>();

        for (File file : children) {
            if (file.isDirectory()) {
                indexTikaDocuments(file, client, autoParser);
                continue;
            }

            String text = extractText(file, autoParser);
            if (text == null) {
                // Failure already reported by extractText; move on to the next file.
                continue;
            }

            String path = file.getCanonicalPath();
            SolrInputDocument doc = new SolrInputDocument();
            doc.addField("id", path);
            doc.addField("_text_", text);
            docList.add(doc);
            System.out.println(text);
            System.out.println(path);
            ++totalTika;

            // Completely arbitrary, just batch up more than one document for throughput!
            if (docList.size() >= BATCH_SIZE) {
                sendBatch(client, docList);
            }
        }

        if (!docList.isEmpty()) {
            sendBatch(client, docList);
        }
        client.commit();
        System.out.println("indexed " + totalTika + " documents");
    }

    /**
     * Extracts the body text of {@code file}, closing the input stream afterwards.
     *
     * @return the extracted text, or {@code null} if the file could not be
     *         opened or parsed (the failure is printed to standard output)
     */
    private static String extractText(File file, AutoDetectParser autoParser) throws IOException {
        // -1 disables BodyContentHandler's write limit.
        ContentHandler textHandler = new BodyContentHandler(-1);
        try (InputStream input = new FileInputStream(file)) {
            autoParser.parse(input, textHandler, new Metadata(), new ParseContext());
        } catch (Exception e) {
            System.out.println(String.format("File %s failed", file.getCanonicalPath()));
            e.printStackTrace();
            return null;
        }
        return textHandler.toString();
    }

    /** Sends {@code batch} to Solr, reports a non-zero status, and empties the batch. */
    private static void sendBatch(HttpSolrClient client, List<SolrInputDocument> batch)
            throws IOException, SolrServerException {
        UpdateResponse resp = client.add(batch, COMMIT_WITHIN_MS);
        if (resp.getStatus() != 0) {
            System.out.println("Some horrible error has occurred, status is: " + resp.getStatus());
        }
        batch.clear();
    }

    /**
     * Queries the {@code tika} core and returns the ids of the matching documents.
     *
     * @param client1   currently unused; the query runs on a short-lived client
     *                  created against the Solr base URL. Kept so existing
     *                  callers continue to compile.
     * @param queryTerm value of the Solr {@code q} parameter
     * @return ids of up to 5000 matching documents; empty (never {@code null})
     *         when nothing matches
     * @throws SolrServerException if Solr reports a server-side error
     * @throws IOException         on network I/O failure
     */
    public static List<String> queryDocuments(HttpSolrClient client1, String queryTerm)
            throws SolrServerException, IOException {
        // Must be a real list: the previous null initialiser made the add() below throw.
        List<String> resultList = new ArrayList<>();

        Map<String, String> queryParamMap = new HashMap<>();
        queryParamMap.put("q", queryTerm);
        queryParamMap.put("rows", "5000");
        MapSolrParams queryParams = new MapSolrParams(queryParamMap);

        try (HttpSolrClient client = new HttpSolrClient.Builder(SOLR_BASE_URL).build()) {
            QueryResponse response = client.query(TIKA_COLLECTION, queryParams);
            SolrDocumentList docList = response.getResults();

            System.out.println("docList ::  " + docList);
            System.out.println("docList size ::  " + docList.size());

            for (SolrDocument document : docList) {
                resultList.add((String) document.getFirstValue("id"));
            }
        }
        return resultList;
    }

}


Kindly help me get through this.
Thanks !

________________________________

The information contained in this electronic message and any attachments to this message are intended for the exclusive use of the addressee(s) and may contain proprietary, confidential or privileged information. If you are not the intended recipient, you should not disseminate, distribute or copy this e-mail. Please notify the sender immediately and destroy all copies of this message and any attachments. WARNING: Computer viruses can be transmitted via email. The recipient should check this email and any attachments for the presence of viruses. The company accepts no liability for any damage caused by any virus/trojan/worms/malicious code transmitted by this email. www.motherson.com