You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-user@lucene.apache.org by Sergiu Gordea <gs...@ifit.uni-klu.ac.at> on 2004/07/13 09:41:45 UTC

Re: Indexing word documents

NATARAJAN THILLAI wrote:
 Hi Sergiu,
    I am Natarajan from India, and I am currently working on a search engine
project. I saw your article on the net
(http://article.gmane.org/gmane.comp.jakarta.poi.user/4851). It's very
nice and useful to me.

  I want to index exe files, so please send me your
"com.configworks.cwk.share.Utils" file.
 
Advance Thanks.
 
Regards
Natarajan.
 

------------------------------------------------------------------------

Hi,

 I'm glad to find that the code I submitted (I cannot claim it is mine) is
useful for other programmers.
 I can send you the Utils class, no problem. But we are not indexing
exe files. The ExeConverterImpl is an external converter that converts
different
file formats to text in batch mode. For example, antiword is such a
converter.
Also, the ppt converter I submitted throws an OutOfMemoryError. I'll send
the code with the bugfix.
 
   I wish you luck in your work, and here are the classes:

  Sergiu

/******* Util class *******/
package com.configworks.cwk.share;

import com.configworks.cwk.be.system.CwkConfigurationFactory;
import com.configworks.cwk.be.system.Debug;
import com.configworks.cwk.be.system.ICwkConfiguration;
import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.turbine.services.intake.model.Field;


/**
 * KM-Portal utilities: assorted helper methods intended to make programming
 * with the KM-Portal easier.
 *
 * @author Christine Keim
 * @version 1
 */
public class Utils {

    /** Commons-logging logger for this class. */
    private static final Log logger = LogFactory.getLog(Utils.class.getName());

    /** Day-only date pattern (day.month.year). */
    public static final String DATE_PATTERN = "dd.MM.yyyy";
    /** Date-and-time pattern (day.month.year hours:minutes:seconds, 24h clock). */
    public static final String DATE_TIME_PATTERN = "dd.MM.yyyy HH:mm:ss";

    /**
     * Shared formatter for {@link #DATE_PATTERN}.
     * NOTE: java.text.SimpleDateFormat instances are not synchronized; callers that
     * use this shared instance from several threads must synchronize externally.
     */
    public static final SimpleDateFormat DATE_FORMAT = new SimpleDateFormat(DATE_PATTERN);
    /**
     * Shared formatter for {@link #DATE_TIME_PATTERN}. The same thread-safety caveat
     * as for {@link #DATE_FORMAT} applies.
     */
    public final static SimpleDateFormat DATE_TIME_FORMAT = new SimpleDateFormat(DATE_TIME_PATTERN);

    /**
     * Maximum file name length, parsed from the project configuration when the class
     * is initialised. Not referenced by any method visible in this class.
     */
    private static final int MAX_FILENAME_LENGTH = Integer.parseInt(
        CwkConfigurationFactory.getConfigurationInstance(
            CwkConfigurationFactory.PROP_FILE_CONFIGURATION)
                .getProperty(ICwkConfiguration.FILENAMES_LENGTH_MAX));

    /** Icon file names, indexed by the numeric type passed to getTypeImage(int). */
    private static final String[] typeimages = {"pdf.gif", "audio.gif", "video.gif", "image.gif",
                                                "office.gif", "data.gif", "archive.gif", "link.gif",
                                                "unknown.gif", "word.gif", "excel.gif", "powerpoint.gif"};

    /** Icon returned by getTypeImage(int) when the type is outside the table. */
    private static final String unknownTypeimage = "unknown.gif";

    /**
     * Instance-level accessor for the day-only date pattern constant.
     *
     * @return the value of DATE_PATTERN
     */
    public String getDatePattern() {
        final String pattern = DATE_PATTERN;
        return pattern;
    }

    /**
     * Tells whether the given string carries no content: it is null, has zero
     * length, or consists only of characters removed by {@link String#trim()}.
     *
     * @param v string to check (may be null)
     * @return true if the string is null or blank, otherwise false
     */
    public static boolean isEmpty(String v) {
        if (v == null) {
            return true;
        }
        String trimmed = v.trim();
        return trimmed.length() == 0;
    }

    /**
     * Maps a rating value to the name of the star image that represents it.
     * Negative values yield the "unrated" image, exactly zero yields the zero-star
     * image, and every further whole step up to 9 adds half a star. Anything
     * above 9 (and NaN, for which every comparison fails) yields five stars.
     *
     * @param current the rating value
     * @return the image file name for that rating
     * @see com.configworks.cwk.fe.tools.CwkToolkit#getRatingImage(float)
     * @deprecated Use cwktoolkit.getRatingImage instead
     */
    public static String getRatingImage(float current) {
        if (current < 0) {
            return "unrated.gif";
        }
        if (current == 0) {
            return "stars-0-0.gif";
        }

        // halfStarImages[k] is used for ratings in the interval (k, k + 1].
        final String[] halfStarImages = {
            "stars-0-5.gif", "stars-1-0.gif", "stars-1-5.gif",
            "stars-2-0.gif", "stars-2-5.gif", "stars-3-0.gif",
            "stars-3-5.gif", "stars-4-0.gif", "stars-4-5.gif"
        };
        for (int upper = 1; upper <= halfStarImages.length; upper++) {
            if (current <= upper) {
                return halfStarImages[upper - 1];
            }
        }
        return "stars-5-0.gif";
    }

    /**
     * Looks up the icon file name for a numeric document type.
     *
     * @param type index into the type-image table
     * @return the icon for that type, or the "unknown" icon when the index is
     *         negative or beyond the end of the table
     * @see com.configworks.cwk.fe.tools.CwkToolkit#getTypeImage(int)
     * @deprecated use cwktoolkit.getTypeImage instead
     */
    public static String getTypeImage(int type) {
        boolean outOfRange = (type < 0) || (type >= typeimages.length);
        if (outOfRange) {
            return unknownTypeimage;
        }
        return typeimages[type];
    }

    /**
     * Expresses {@code quality} as a whole-number percentage of {@code maxvalue}.
     * Negative results are clamped to 0. A {@code maxvalue} of zero yields "0"
     * instead of the meaningless "2147483647" that rounding an infinite quotient
     * would produce.
     *
     * @param quality  the achieved value
     * @param maxvalue the value that corresponds to 100 percent
     * @return the rounded percentage as a decimal string, never negative
     */
    public static String calculatePercentage(float quality, float maxvalue) {
        if (maxvalue == 0) {
            // No meaningful percentage of nothing; matches what 0/0 already produced.
            return "0";
        }

        float result = (quality * 100) / maxvalue;
        if (result < 0) {
            result = 0;
        }

        return String.valueOf(Math.round(result));
    }

    /**
     * Returns the string with its first character converted to upper case.
     * Null and empty strings are returned unchanged instead of raising an
     * exception.
     *
     * @param s the string to capitalize (may be null or empty)
     * @return the capitalized string, or {@code s} itself when it is null or empty
     */
    public static String capitalize(String s) {
        if (s == null || s.length() == 0) {
            return s;
        }
        return s.substring(0, 1).toUpperCase() + s.substring(1);
    }

    /**
     * Strips the directory part from a path and returns the bare file name.
     * Paths starting with "/" are treated as Unix paths and cut at the last "/".
     * All other paths are cut at the last "\" or "/", whichever comes later, so
     * Windows paths, drive-letter paths written with forward slashes and
     * relative Unix paths are all handled.
     *
     * @param fn the path to shorten (may be null)
     * @return the part after the last separator; {@code fn} itself when it
     *         contains no separator; null when {@code fn} is null
     */
    public static String cutFileName(String fn) {
        if (fn == null) {
            return null;
        }

        int lastSeparator;
        if (fn.startsWith("/")) {
            lastSeparator = fn.lastIndexOf('/');
        } else {
            lastSeparator = Math.max(fn.lastIndexOf('\\'), fn.lastIndexOf('/'));
        }

        return (lastSeparator >= 0) ? fn.substring(lastSeparator + 1) : fn;
    }

    /**
     * Cuts the simple class name out of a fully qualified class name, e.g.
     * com.configworks.cwk.share.om.Tutorial becomes Tutorial.
     *
     * @param fn fully qualified (or already simple) class name
     * @return the text after the last dot, or {@code fn} unchanged when it
     *         contains no dot
     */
    public static String cutClassName(String fn) {
        // lastIndexOf yields -1 when there is no dot, so the substring then starts at 0.
        int nameStart = fn.lastIndexOf('.') + 1;
        return fn.substring(nameStart);
    }
    /**
     * Formats a date with the shared day-only formatter.
     *
     * @param date the date to format (may be null)
     * @return the formatted date, or null when {@code date} is null
     * @see com.configworks.cwk.fe.tools.CwkToolkit#formatDate(Date, Locale)
     * @deprecated use cwktoolkit.formatDate instead
     */
    public static String dateToString(Date date) {
        if (date == null) {
            return null;
        }
        return DATE_FORMAT.format(date);
    }

    /**
     * Formats a date with a caller-supplied SimpleDateFormat pattern. A new
     * formatter is created on every call.
     *
     * @param date   the date to format (may be null)
     * @param format a SimpleDateFormat pattern
     * @return the formatted date, or null when {@code date} is null
     * @deprecated use cwktoolkit.formatDate or cwktoolkit.formatDateTime instead
     */
    public static String dateToString(Date date, String format) {
        if (date == null) {
            return null;
        }
        return new SimpleDateFormat(format).format(date);
    }

    /**
     * Formats the value held by an intake field with the shared day-only
     * formatter.
     *
     * @param date the field whose value is formatted (may be null)
     * @return the formatted value, or null when the field or its value is null
     * @deprecated use cwktoolkit.formatDate or cwktoolkit.formatDateTime instead
     */
    public static String dateToString(Field date) {
        if ((date == null) || (date.getValue() == null)) {
            return null;
        }
        return DATE_FORMAT.format(date.getValue());
    }

    /**
     * Converts a Java-style camel-case name to a Torque-style underscore name:
     * the first character is lower-cased and every later upper-case character is
     * replaced by an underscore followed by its lower-case form.
     * Null and empty names are returned unchanged instead of raising an exception.
     *
     * @param javaname the camel-case name (may be null or empty)
     * @return the underscore name, or {@code javaname} itself when null or empty
     */
    public static String jToU(String javaname) {
        if (javaname == null || javaname.length() == 0) {
            return javaname;
        }

        char[] chars = javaname.toCharArray();
        StringBuffer underscore = new StringBuffer();
        underscore.append(Character.toLowerCase(chars[0]));

        for (int i = 1; i < chars.length; i++) {
            if (Character.isUpperCase(chars[i])) {
                underscore.append("_");
                underscore.append(Character.toLowerCase(chars[i]));
            } else {
                underscore.append(chars[i]);
            }
        }

        Debug.println(javaname + " =>" + underscore);

        return underscore.toString();
    }

    /**
     * Replaces every line-feed character in the text with the HTML tag
     * {@code <br>}.
     *
     * @param in the text to convert (must not be null)
     * @return the text with each "\n" replaced by "&lt;br&gt;"
     */
    public static String nl2br(String in) {
        StringBuffer html = new StringBuffer(in.length());
        int from = 0;
        for (int at = in.indexOf('\n'); at >= 0; at = in.indexOf('\n', from)) {
            html.append(in.substring(from, at)).append("<br>");
            from = at + 1;
        }
        html.append(in.substring(from));
        return html.toString();
    }

    /**
     * Prepares text for output by delegating to {@link #nl2br(String)}.
     *
     * @param in the text to convert
     * @return the result of {@code nl2br(in)}
     * @see com.configworks.cwk.fe.tools.CwkToolkit#htmlEncode(String)
     * @deprecated use cwktoolkit.htmlencode instead
     */
    public static String out(String in) {
        String converted = nl2br(in);
        return converted;
    }

    /**
     * Converts a Torque-style underscore name to a Java-style name: each
     * underscore is dropped and the character following it is upper-cased.
     * An underscore that is the last character is simply dropped; previously it
     * caused an ArrayIndexOutOfBoundsException.
     *
     * @param uname the underscore name (must not be null)
     * @return the Java-style name
     */
    public static String uToJ(String uname) {
        char[] chars = uname.toCharArray();
        StringBuffer java = new StringBuffer();

        for (int i = 0; i < chars.length; i++) {
            if (chars[i] == '_') {
                i++;
                // Guard against a trailing underscore with nothing after it.
                if (i < chars.length) {
                    java.append(Character.toUpperCase(chars[i]));
                }
            } else {
                java.append(chars[i]);
            }
        }

        Debug.println(uname + " =>" + java);

        return java.toString();
    }

    /**
     * this method is used to execute an OS COmmand
     *
     * @param execPath        the execution path (path to executable file)
     * @param sourcePath      the source path (path to imput file)
     * @param destinationPath the destination path (path to output file)
     * @param params          aditional parameters (if null or ""  this 
parameter is ignored)
     * @return a refference of the created proccess
     * @throws IOException
     */
    public static Process executeOSCommand(String execPath, String 
sourcePath,
                                           String destinationPath, 
String params)
        throws IOException {
        final String source = "<source>";
        final String destination = "<destination>";

        Runtime runtime = Runtime.getRuntime();

        if (execPath == null) {
            if (logger.isErrorEnabled())
                logger.error("Execution command is not specified!");
            return null;
        }

        int sourceIndex = execPath.indexOf(source);
        int destinationIndex = execPath.indexOf(destination);

        if ((sourceIndex >= 0) && (destinationIndex >= 0)) {
            String execCommand = execPath.substring(0, sourceIndex) + 
sourcePath + execPath.substring(
                sourceIndex + source.length(), destinationIndex) + 
destinationPath + execPath.substring(
                    destinationIndex + destination.length());

            if (Utils.notEmptyString(params))
                execCommand += " " + params;
           
            //for windows change all / in the path to \ otherwise the 
command cannot be executed
            if (File.separator.equals("\\"))
                execCommand = execCommand.replace('/', '\\');

            System.out.println(execCommand);
            if (logger.isTraceEnabled())
                logger.trace("Executing command: " + execCommand);
            return runtime.exec(execCommand);
        }
        return null;
    }

    /**
     * Negation of {@link #isEmpty(String)}.
     *
     * @param s the string to check
     * @return true when {@code isEmpty(s)} is false
     * @see #isEmpty(String)
     * @since CWK 1.4.0
     */
    public static boolean notEmptyString(String s) {
        boolean empty = isEmpty(s);
        return !empty;
    }

    /**
     * Tells whether the string is null or has zero length. Unlike
     * {@link #isEmpty(String)}, a string made up only of blanks is NOT
     * considered empty here.
     *
     * @param s the string to check (may be null)
     * @return true if {@code s} is null or ""
     * @see #isEmpty(String)
     * @since CWK 1.4.0
     * @deprecated use isempty instead
     */
    public static boolean isEmptyString(String s) {
        if (s == null) {
            return true;
        }
        return s.length() == 0;
    }

}

/****************************** ppt converter implementation 
***************/

/* @(#) CWK 1.5 23.06.2004
 *
 * Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH
 * Universitätsstr. 94/7 9020 Klagenfurt Austria
 * www.configworks.com
 * All rights reserved.
 */

package com.configworks.cwk.be.search.converters;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
import org.apache.poi.util.LittleEndian;

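/**
 * Converter that extracts text from a PowerPoint (.ppt) file into a temporary
 * plain-text file and returns a reader over that file.
 *
 * @author sergiu
 * @version 1.0
 * @since CWK 1.5
 */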
public class PPTConverterImpl extends Converter {

    // Platform line separator; not referenced by any method visible in this class.
    static final String lineSeparator = 
System.getProperty("line.separator");
    // Output stream the extracted text bytes are written to; assigned by
    // convertSource and initWriter, closed by convertSource and closeWriter.
    private BufferedOutputStream txtFileWriter = null;
   
    // Temporary text file that convertSource creates for the extracted text.
    File dest = null;
   
    /* (non-Javadoc)
     * @see 
com.configworks.cwk.be.search.converters.Converter#convertSource(java.io.File)
     */
    public Reader convertSource(File source) {
        if (source == null)
            return null;
        Reader reader = null;
        InputStream inputStream = null;
       
        try {
            String filename = source.getName();
            filename = filename.replace('.', '_');
            filename += ".txt";
            File tmpDir = new File(_config.getTempDirectory());
            tmpDir.mkdirs();
            dest = new File(tmpDir.getPath(), filename);
            boolean created = dest.createNewFile();           
           
            //create the input and output streams
            txtFileWriter = new BufferedOutputStream(
                    new FileOutputStream(dest));
            inputStream = new FileInputStream(source);
           
            extractText(inputStream);
           
            if (!dest.exists())
                return null;
            dest.deleteOnExit();
            reader = new BufferedReader(new FileReader(dest));

        } catch (Exception e) {
            getLogger().error("JavaDocumentConverter cannot convert the 
source file: "
                    + source.getAbsolutePath(), e);
            reader = null;
        }finally{
            try {
                if(txtFileWriter != null)
                    txtFileWriter.close();
                if(inputStream != null)
                    inputStream.close();
            } catch (IOException ex) {
                if(getLogger().isDebugEnabled())
                    getLogger().error("Cannot close the stream: " + ex);
            }
        }
        return reader;

    }
   
   
    /**
     * Extract the text from a number of presentations.
     */
    public boolean extractText(InputStream  reader) throws IOException{
           
            if(txtFileWriter == null)
                throw new IOException("Writer Not Iititalized!");
           
            POIFSReader r = new POIFSReader();

            /* Register a listener for *all* documents. */
            PptDocReaderListener listener = new 
PptDocReaderListener(txtFileWriter);
            r.registerListener(listener, "PowerPoint Document");
            r.read(reader);
            //if no exception was trown, consider that the conversion 
was successful 
            return true;
    }   
   
    class PptDocReaderListener implements POIFSReaderListener{
        private BufferedOutputStream writer = null;
       
        public PptDocReaderListener(){
        }
       
        public PptDocReaderListener(BufferedOutputStream writer){
            this.writer = writer;
        }

        public void processPOIFSReaderEvent(POIFSReaderEvent event) {
           
            try{
           
                org.apache.poi.poifs.filesystem.DocumentInputStream dis 
= null;
                if(!event.getName().equalsIgnoreCase("PowerPoint Document"))
                    return;
               
                dis=event.getStream();
               
                byte btoWrite[]= new byte[12];
                dis.read(btoWrite);
               
                btoWrite = new byte[dis.available()];
                dis.read(btoWrite, 0, dis.available());

                long type = 0;
                long size = 0;
                int offset = 0;
                int length = 0;
               
                for(int i=0; i<btoWrite.length-20; i++){

                    type=LittleEndian.getUShort(btoWrite,i+2);
                    size=LittleEndian.getUInt(btoWrite,i+4);
                   
                    if (type==4008){
                            length = (int)size+3;
                            offset = i+4+1;
                            writer.write(btoWrite, offset, length);
                       
                        //skip the bytes that were already read   
                        i = Math.max(i, (offset + length));
                    }
                }
               
                PropertySetFactory.create(event.getStream());
            }catch (Exception e){
                String msg = "Cannot index ppt file: \n";
                if(getLogger().isErrorEnabled())
                    getLogger().error(msg + e);
            }   
        }   
    }

    /**
     * Opens the text writer on the given file unless a writer is already set.
     *
     * @param dest the file the extracted text is written to
     * @throws IOException if the file cannot be opened for writing
     * @see com.configworks.cwk.be.search.converters.JavaDocumentConverter#initWriter(java.io.File)
     */
    public void initWriter(File dest) throws IOException {
        if (txtFileWriter != null) {
            return;
        }
        FileOutputStream target = new FileOutputStream(dest);
        txtFileWriter = new BufferedOutputStream(target);
    }

    /* (non-Javadoc)
     * @see 
com.configworks.cwk.be.search.converters.JavaDocumentConverter#closeWriter()
     */
    public void closeWriter() throws IOException {
        if(txtFileWriter != null)
            txtFileWriter.close();
        else
            throw new IOException("Cannot close the writer, the object 
is Null!");
    }
}





> Do you Yahoo!?
> Take Yahoo! Mail with you! 
> <http://us.rd.yahoo.com/mail_us/taglines/mobile/*http://mobile.yahoo.com/maildemo> 
> Get it on your mobile phone. 




---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-user-help@jakarta.apache.org