You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@poi.apache.org by Sergiu Gordea <gs...@ifit.uni-klu.ac.at> on 2004/07/13 09:41:45 UTC
Re: Indexing word documents
NATARAJAN THILLAI wrote:
Hi Sergiu,
I am Natarajan from India and now I was working search engine
project. I saw u r article in the net
(http://article.gmane.org/gmane.comp.jakarta.poi.user/4851). It's very
nice and useful to me.
I want to Indexing exe file so pls send me your
"com.configworks.cwk.share.Utils" file.
Advance Thanks.
Regards
Natarajan.
------------------------------------------------------------------------
Hi,
I'm glad to find that the code I submitted (I cannot claim it is mine) is
useful to other programmers.
I can send you the Utils class, no problem. But we are not indexing
exe files. The ExeConverterImpl is an external converter that converts
different file formats to text in batch mode. For example, antiword is
such a converter.
Also the ppt converter I submitted throws an OutOfMemoryError. I'll send
the code with the bugfix.
I wish you luck in your work, and here are the classes:
Sergiu
/******* Util class *******/
package com.configworks.cwk.share;
import com.configworks.cwk.be.system.CwkConfigurationFactory;
import com.configworks.cwk.be.system.Debug;
import com.configworks.cwk.be.system.ICwkConfiguration;
import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.turbine.services.intake.model.Field;
/**
 * KM-Portal utilities: a collection of assorted helper methods that are
 * intended to make programming with the KM-Portal easier.
 *
 * @author Christine Keim
 * @version 1
 */
public class Utils {
// Logger used by executeOSCommand for error and trace output.
private static final Log logger =
LogFactory.getLog(Utils.class.getName());
// Pattern for a date without a time part.
public static final String DATE_PATTERN = "dd.MM.yyyy";
// Pattern for a date including the time of day.
public static final String DATE_TIME_PATTERN = "dd.MM.yyyy HH:mm:ss";
// Shared formatter for DATE_PATTERN. SimpleDateFormat is not thread-safe,
// so callers sharing this instance across threads must synchronize.
public static final SimpleDateFormat DATE_FORMAT = new
SimpleDateFormat(DATE_PATTERN);
// Shared formatter for DATE_TIME_PATTERN; the same thread-safety caveat applies.
public final static SimpleDateFormat DATE_TIME_FORMAT = new
SimpleDateFormat(DATE_TIME_PATTERN);
// Read once, during class initialization, from the configuration property
// ICwkConfiguration.FILENAMES_LENGTH_MAX. Integer.parseInt throws if the
// property is missing or not numeric, which makes class initialization fail.
// Not referenced by any method shown in this class.
private static final int MAX_FILENAME_LENGTH =
Integer.parseInt(CwkConfigurationFactory.getConfigurationInstance(
CwkConfigurationFactory.PROP_FILE_CONFIGURATION).getProperty(ICwkConfiguration.FILENAMES_LENGTH_MAX));
// Image file names, indexed by the numeric type passed to getTypeImage.
private static String[] typeimages = {"pdf.gif", "audio.gif",
"video.gif", "image.gif",
"office.gif", "data.gif",
"archive.gif", "link.gif",
"unknown.gif", "word.gif",
"excel.gif", "powerpoint.gif"};
// Image returned by getTypeImage when the type is outside the table above.
private static String unknownTypeimage = "unknown.gif";
/**
 * Instance accessor for the date-only pattern.
 *
 * @return the value of {@link #DATE_PATTERN}
 */
public String getDatePattern() {
    final String pattern = DATE_PATTERN;
    return pattern;
}
/**
 * Tells whether a string carries no visible content: it is {@code null},
 * has length zero, or consists only of characters removed by
 * {@link String#trim()}.
 *
 * @param v the string to check, may be {@code null}
 * @return {@code true} if the string is null or blank, {@code false} otherwise
 */
public static boolean isEmpty(String v) {
    if (v == null) {
        return true;
    }
    return v.trim().length() == 0;
}
/**
 * Maps a rating value to the file name of the matching star image.
 * Negative values give the "unrated" image, exactly zero gives the
 * zero-star image, and every further whole step up to 9 advances the
 * image by half a star. Anything above 9 gives the five-star image.
 *
 * @param current the rating value
 * @return the image file name for the rating
 * @see com.configworks.cwk.fe.tools.CwkToolkit#getRatingImage(float)
 * @deprecated Use cwktoolkit.getRatingImage instead
 */
public static String getRatingImage(float current) {
    if (current < 0) {
        return "unrated.gif";
    }
    if (current == 0) {
        return "stars-0-0.gif";
    }
    // Entry k is the image for ratings in the interval (k, k + 1].
    final String[] halfStarImages = {
        "stars-0-5.gif", "stars-1-0.gif", "stars-1-5.gif",
        "stars-2-0.gif", "stars-2-5.gif", "stars-3-0.gif",
        "stars-3-5.gif", "stars-4-0.gif", "stars-4-5.gif"
    };
    for (int upperBound = 1; upperBound <= halfStarImages.length; upperBound++) {
        if (current <= upperBound) {
            return halfStarImages[upperBound - 1];
        }
    }
    return "stars-5-0.gif";
}
/**
 * Looks up the image file name for a numeric document type.
 *
 * @param type index into the table of type images
 * @return the image for the type, or the "unknown" image when the index
 *         lies outside the table
 * @see com.configworks.cwk.fe.tools.CwkToolkit#getTypeImage(int)
 * @deprecated use cwktoolkit.getTypeImage instead
 */
public static String getTypeImage(int type) {
    final boolean outOfRange = (type < 0) || (type >= typeimages.length);
    return outOfRange ? unknownTypeimage : typeimages[type];
}
/**
 * Expresses {@code quality} as a whole-number percentage of
 * {@code maxvalue}, rounded to the nearest integer.
 *
 * @param quality  the value to express as a percentage
 * @param maxvalue the value that corresponds to 100 percent
 * @return the rounded percentage as a decimal string; "0" when the result
 *         would be negative or when {@code maxvalue} is zero
 */
public static String calculatePercentage(float quality, float maxvalue) {
    // Dividing by zero would give Infinity, which Math.round turns into
    // Integer.MAX_VALUE; no meaningful percentage exists in that case.
    if (maxvalue == 0) {
        return "0";
    }
    float result = (quality * 100) / maxvalue;
    if (result < 0) {
        result = 0;
    }
    return String.valueOf(Math.round(result));
}
/**
 * Returns the string with its first character converted to upper case.
 *
 * @param s the string to capitalize, may be {@code null} or empty
 * @return the capitalized string; {@code null} or the empty string is
 *         returned unchanged
 */
public static String capitalize(String s) {
    // Nothing to capitalize; substring(0, 1) would throw on an empty string.
    if (s == null || s.length() == 0) {
        return s;
    }
    return s.substring(0, 1).toUpperCase() + s.substring(1);
}
/**
 * Strips any directory part from a path and returns the bare file name.
 * Both '/' and '\' are accepted as separators regardless of the platform
 * and regardless of whether the path is absolute or relative.
 *
 * @param fn the path to shorten, may be {@code null}
 * @return the text after the last separator, the input itself when it
 *         contains no separator, or {@code null} for a {@code null} input
 */
public static String cutFileName(String fn) {
    if (fn == null) {
        return null;
    }
    // Use whichever separator occurs last, so relative Unix paths and
    // Windows paths written with forward slashes are handled as well.
    int lastSeparator = Math.max(fn.lastIndexOf('/'), fn.lastIndexOf('\\'));
    return fn.substring(lastSeparator + 1);
}
/**
 * Extracts the simple class name from a fully qualified one, e.g.
 * com.configworks.cwk.share.om.Tutorial becomes Tutorial.
 *
 * @param fn the (possibly package-qualified) class name
 * @return the text after the last '.', or the input itself when it
 *         contains no '.'
 */
public static String cutClassName(String fn) {
    final int lastDot = fn.lastIndexOf('.');
    if (lastDot < 0) {
        return fn;
    }
    return fn.substring(lastDot + 1);
}
/**
 * Formats a date with the shared {@link #DATE_FORMAT} formatter.
 *
 * @param date the date to format, may be {@code null}
 * @return the formatted date, or {@code null} when {@code date} is null
 * @see com.configworks.cwk.fe.tools.CwkToolkit#formatDate(Date, Locale)
 * @deprecated use cwktoolkit.formatDate instead
 */
public static String dateToString(Date date) {
    if (date == null) {
        return null;
    }
    // SimpleDateFormat keeps mutable state while formatting, so the shared
    // instance must not be used by two threads at the same time.
    synchronized (DATE_FORMAT) {
        return DATE_FORMAT.format(date);
    }
}
/**
 * Formats a date using a caller-supplied {@link SimpleDateFormat} pattern.
 * A new formatter is created for every call.
 *
 * @param date   the date to format, may be {@code null}
 * @param format the pattern to apply
 * @return the formatted date, or {@code null} when {@code date} is null
 * @deprecated use cwktoolkit.formatDate or cwktoolkit.formatDateTime instead
 */
public static String dateToString(Date date, String format) {
    if (date == null) {
        return null;
    }
    return new SimpleDateFormat(format).format(date);
}
/**
 * Formats the value held by an intake field with the shared
 * {@link #DATE_FORMAT} formatter.
 *
 * @param date the field whose value is formatted, may be {@code null}
 * @return the formatted value, or {@code null} when the field or its
 *         value is null
 * @deprecated use cwktoolkit.formatDate or cwktoolkit.formatDateTime instead
 */
public static String dateToString(Field date) {
    if (date == null) {
        return null;
    }
    // Read the value once so the null check and the formatting see the same object.
    Object value = date.getValue();
    if (value == null) {
        return null;
    }
    // SimpleDateFormat keeps mutable state while formatting, so the shared
    // instance must not be used by two threads at the same time.
    synchronized (DATE_FORMAT) {
        return DATE_FORMAT.format(value);
    }
}
/**
 * Converts a Java-style camel-case name into a Torque-style underscore
 * name: the first character is lower-cased and every later upper-case
 * character is replaced by '_' followed by its lower-case form.
 *
 * @param javaname the camel-case name, may be {@code null} or empty
 * @return the underscore name; {@code null} or the empty string is
 *         returned unchanged
 */
public static String jToU(String javaname) {
    // There is no first character to lower-case; indexing would throw.
    if (javaname == null || javaname.length() == 0) {
        return javaname;
    }
    char[] chars = javaname.toCharArray();
    StringBuffer underscore = new StringBuffer(chars.length + 4);
    underscore.append(Character.toLowerCase(chars[0]));
    for (int i = 1; i < chars.length; i++) {
        if (Character.isUpperCase(chars[i])) {
            underscore.append('_');
            underscore.append(Character.toLowerCase(chars[i]));
        } else {
            underscore.append(chars[i]);
        }
    }
    Debug.println(javaname + " =>" + underscore);
    return underscore.toString();
}
/**
 * Replaces every line-feed character in the input with the HTML line
 * break tag {@code <br>}. All other characters are copied unchanged.
 *
 * @param in the text to convert; must not be {@code null}
 * @return the text with each '\n' replaced by {@code <br>}
 */
public static String nl2br(String in) {
    final int length = in.length();
    StringBuffer html = new StringBuffer(length);
    for (int pos = 0; pos < length; pos++) {
        char c = in.charAt(pos);
        if (c == '\n') {
            html.append("<br>");
        } else {
            html.append(c);
        }
    }
    return html.toString();
}
/**
 * Prepares text for output by delegating to {@link #nl2br(String)}.
 *
 * @param in the text to convert
 * @return the result of {@code nl2br(in)}
 * @see com.configworks.cwk.fe.tools.CwkToolkit#htmlEncode(String)
 * @deprecated use cwktoolkit.htmlencode instead
 */
public static String out(String in) {
    final String converted = nl2br(in);
    return converted;
}
/**
 * Converts a Torque-style underscore name into a Java-style name: each
 * '_' is dropped and the character that follows it is upper-cased.
 *
 * @param uname the underscore name; must not be {@code null}
 * @return the converted name; a trailing '_' with nothing after it is
 *         simply dropped
 */
public static String uToJ(String uname) {
    char[] chars = uname.toCharArray();
    StringBuffer java = new StringBuffer(chars.length);
    for (int i = 0; i < chars.length; i++) {
        if (chars[i] != '_') {
            java.append(chars[i]);
            continue;
        }
        // Skip the underscore and upper-case its successor, if there is one;
        // without the bounds check a trailing '_' would index past the array.
        i++;
        if (i < chars.length) {
            java.append(Character.toUpperCase(chars[i]));
        }
    }
    Debug.println(uname + " =>" + java);
    return java.toString();
}
/**
* this method is used to execute an OS COmmand
*
* @param execPath the execution path (path to executable file)
* @param sourcePath the source path (path to imput file)
* @param destinationPath the destination path (path to output file)
* @param params aditional parameters (if null or "" this
parameter is ignored)
* @return a refference of the created proccess
* @throws IOException
*/
public static Process executeOSCommand(String execPath, String
sourcePath,
String destinationPath,
String params)
throws IOException {
final String source = "<source>";
final String destination = "<destination>";
Runtime runtime = Runtime.getRuntime();
if (execPath == null) {
if (logger.isErrorEnabled())
logger.error("Execution command is not specified!");
return null;
}
int sourceIndex = execPath.indexOf(source);
int destinationIndex = execPath.indexOf(destination);
if ((sourceIndex >= 0) && (destinationIndex >= 0)) {
String execCommand = execPath.substring(0, sourceIndex) +
sourcePath + execPath.substring(
sourceIndex + source.length(), destinationIndex) +
destinationPath + execPath.substring(
destinationIndex + destination.length());
if (Utils.notEmptyString(params))
execCommand += " " + params;
//for windows change all / in the path to \ otherwise the
command cannot be executed
if (File.separator.equals("\\"))
execCommand = execCommand.replace('/', '\\');
System.out.println(execCommand);
if (logger.isTraceEnabled())
logger.trace("Executing command: " + execCommand);
return runtime.exec(execCommand);
}
return null;
}
/**
 * Negation of {@link #isEmpty(String)}.
 *
 * @param s the string to check
 * @return {@code true} when {@code isEmpty(s)} is false
 * @see #isEmpty(String)
 * @since CWK 1.4.0
 */
public static boolean notEmptyString(String s) {
    final boolean empty = isEmpty(s);
    return !empty;
}
/**
 * Tells whether a string is {@code null} or has length zero. Unlike
 * {@link #isEmpty(String)}, a string made up of blanks counts as
 * non-empty here.
 *
 * @param s the string to check
 * @return {@code true} if {@code s} is null or ""
 * @see #isEmpty(String)
 * @since CWK 1.4.0
 * @deprecated use isempty instead
 */
public static boolean isEmptyString(String s) {
    if (s == null) {
        return true;
    }
    return s.length() == 0;
}
}
/****************************** ppt converter implementation
***************/
/* @(#) CWK 1.5 23.06.2004
*
* Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH
* Universitätsstr. 94/7 9020 Klagenfurt Austria
* www.configworks.com
* All rights reserved.
*/
package com.configworks.cwk.be.search.converters;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
import org.apache.poi.util.LittleEndian;
/**
 * Converter for PowerPoint files. The "PowerPoint Document" stream of the
 * source file is read through POI's event-based POIFS reader, the text
 * bytes found in it are written to a text file in the configured temp
 * directory, and a Reader over that file is handed back to the caller.
 *
 * @author sergiu
 * @version 1.0
 * @since CWK 1.5
 */
public class PPTConverterImpl extends Converter {
// Platform line separator; not referenced by the code shown in this class.
static final String lineSeparator =
System.getProperty("line.separator");
// Output stream for the extracted text; set by convertSource and initWriter.
private BufferedOutputStream txtFileWriter = null;
// The text file produced by the most recent convertSource call.
File dest = null;
/* (non-Javadoc)
* @see
com.configworks.cwk.be.search.converters.Converter#convertSource(java.io.File)
*/
public Reader convertSource(File source) {
if (source == null)
return null;
Reader reader = null;
InputStream inputStream = null;
try {
String filename = source.getName();
filename = filename.replace('.', '_');
filename += ".txt";
File tmpDir = new File(_config.getTempDirectory());
tmpDir.mkdirs();
dest = new File(tmpDir.getPath(), filename);
boolean created = dest.createNewFile();
//create the input and output streams
txtFileWriter = new BufferedOutputStream(
new FileOutputStream(dest));
inputStream = new FileInputStream(source);
extractText(inputStream);
if (!dest.exists())
return null;
dest.deleteOnExit();
reader = new BufferedReader(new FileReader(dest));
} catch (Exception e) {
getLogger().error("JavaDocumentConverter cannot convert the
source file: "
+ source.getAbsolutePath(), e);
reader = null;
}finally{
try {
if(txtFileWriter != null)
txtFileWriter.close();
if(inputStream != null)
inputStream.close();
} catch (IOException ex) {
if(getLogger().isDebugEnabled())
getLogger().error("Cannot close the stream: " + ex);
}
}
return reader;
}
/**
* Extract the text from a number of presentations.
*/
public boolean extractText(InputStream reader) throws IOException{
if(txtFileWriter == null)
throw new IOException("Writer Not Iititalized!");
POIFSReader r = new POIFSReader();
/* Register a listener for *all* documents. */
PptDocReaderListener listener = new
PptDocReaderListener(txtFileWriter);
r.registerListener(listener, "PowerPoint Document");
r.read(reader);
//if no exception was trown, consider that the conversion
was successful
return true;
}
class PptDocReaderListener implements POIFSReaderListener{
private BufferedOutputStream writer = null;
public PptDocReaderListener(){
}
public PptDocReaderListener(BufferedOutputStream writer){
this.writer = writer;
}
public void processPOIFSReaderEvent(POIFSReaderEvent event) {
try{
org.apache.poi.poifs.filesystem.DocumentInputStream dis
= null;
if(!event.getName().equalsIgnoreCase("PowerPoint Document"))
return;
dis=event.getStream();
byte btoWrite[]= new byte[12];
dis.read(btoWrite);
btoWrite = new byte[dis.available()];
dis.read(btoWrite, 0, dis.available());
long type = 0;
long size = 0;
int offset = 0;
int length = 0;
for(int i=0; i<btoWrite.length-20; i++){
type=LittleEndian.getUShort(btoWrite,i+2);
size=LittleEndian.getUInt(btoWrite,i+4);
if (type==4008){
length = (int)size+3;
offset = i+4+1;
writer.write(btoWrite, offset, length);
//skip the bytes that were already read
i = Math.max(i, (offset + length));
}
}
PropertySetFactory.create(event.getStream());
}catch (Exception e){
String msg = "Cannot index ppt file: \n";
if(getLogger().isErrorEnabled())
getLogger().error(msg + e);
}
}
}
/**
 * Opens the text writer on the given file unless a writer is already set.
 *
 * @param dest the file that receives the extracted text
 * @throws IOException if the file cannot be opened for writing
 * @see com.configworks.cwk.be.search.converters.JavaDocumentConverter#initWriter(java.io.File)
 */
public void initWriter(File dest) throws IOException {
    if (txtFileWriter != null) {
        return;
    }
    FileOutputStream rawOut = new FileOutputStream(dest);
    txtFileWriter = new BufferedOutputStream(rawOut);
}
/* (non-Javadoc)
* @see
com.configworks.cwk.be.search.converters.JavaDocumentConverter#closeWriter()
*/
public void closeWriter() throws IOException {
if(txtFileWriter != null)
txtFileWriter.close();
else
throw new IOException("Cannot close the writer, the object
is Null!");
}
}
> Do you Yahoo!?
> Take Yahoo! Mail with you!
> <http://us.rd.yahoo.com/mail_us/taglines/mobile/*http://mobile.yahoo.com/maildemo>
> Get it on your mobile phone.
---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org