You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@poi.apache.org by Dimitri Pissarenko <di...@gmx.net> on 2004/01/27 22:54:39 UTC
Word to plain text converter
Hello!
I want to convert several Microsoft Word files to plain text files, so
that I can search through them with grep (or with analogous search
functions under Windows).
Has someone already written such a converter? Is this tool perhaps
open-source?
TIA
dap
---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org
RE: Word to plain text converter
Posted by Abdul Wahab <aw...@erp21.com.my>.
Hi Paolo,
When I try to use the http://textmining.org lib, I get the text, but only
after a long "beep" sound and a "system hang", if my Word document contains
a table. How can I control that?
-----Original Message-----
From: Paolo Tortora [mailto:p.tortora@eusysgroup.it]
Sent: Wednesday, January 28, 2004 5:49 PM
To: POI Users List; Ryan Ackley
Subject: R: Word to plain text converter
Here is a piece of Java code to extract plain text from Word:
/**************** START SOURCE CODE *******************/
import org.apache.poi.poifs.filesystem.*;
import org.apache.poi.util.LittleEndian;
import java.util.ArrayList;
import java.io.InputStream;
import java.io.IOException;
class WordExtractor {
public WordExtractor()
{
}
public String extractText(InputStream in) throws IOException
{
ArrayList text = new ArrayList();
POIFSFileSystem fsys = new POIFSFileSystem(in);
DocumentEntry headerProps =
(DocumentEntry)fsys.getRoot().getEntry("WordDocument");
DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
byte[] header = new byte[headerProps.getSize()];
din.read(header);
din.close();
// Prende le informazioni dall'header del documento
int info = LittleEndian.getShort(header, 0xa);
boolean useTable1 = (info & 0x200) != 0;
// Prende informazioni dalla piece table
int complexOffset = LittleEndian.getInt(header, 0x1a2);
String tableName = null;
if (useTable1)
{
tableName = "1Table";
}
else
{
tableName = "0Table";
}
DocumentEntry table = (DocumentEntry)fsys.getRoot().getEntry(tableName);
byte[] tableStream = new byte[table.getSize()];
din = fsys.createDocumentInputStream(tableName);
din.read(tableStream);
din.close();
din = null;
fsys = null;
table = null;
headerProps = null;
int multiple = findText(tableStream, complexOffset, text);
StringBuffer sb = new StringBuffer();
int size = text.size();
tableStream = null;
for (int x = 0; x < size; x++)
{
WordTextPiece nextPiece = (WordTextPiece)text.get(x);
int start = nextPiece.getStart();
int length = nextPiece.getLength();
boolean unicode = nextPiece.usesUnicode();
String toStr = null;
if (unicode)
{
toStr = new String(header, start, length * multiple, "UTF-16LE");
}
else
{
toStr = new String(header, start, length , "ISO-8859-1");
}
sb.append(toStr).append(" ");
}
return sb.toString();
}
private static int findText(byte[] tableStream, int complexOffset,
ArrayList text) throws IOException
{
//actual text
int pos = complexOffset;
int multiple = 2;
//skips through the prms before we reach the piece table. These contain
data
//for actual fast saved files
while(tableStream[pos] == 1)
{
pos++;
int skip = LittleEndian.getShort(tableStream, pos);
pos += 2 + skip;
}
if(tableStream[pos] != 2)
{
throw new IOException("corrupted Word file");
}
else
{
//parse out the text pieces
int pieceTableSize = LittleEndian.getInt(tableStream, ++pos);
pos += 4;
int pieces = (pieceTableSize - 4) / 12;
for (int x = 0; x < pieces; x++)
{
int filePos = LittleEndian.getInt(tableStream, pos + ((pieces + 1) * 4) +
(x * 8) + 2);
boolean unicode = false;
if ((filePos & 0x40000000) == 0)
{
unicode = true;
}
else
{
unicode = false;
multiple = 1;
filePos &= ~(0x40000000);//gives me FC in doc stream
filePos /= 2;
}
int totLength = LittleEndian.getInt(tableStream, pos + (x + 1) * 4) -
LittleEndian.getInt(tableStream, pos + (x * 4));
WordTextPiece piece = new WordTextPiece(filePos, totLength, unicode);
text.add(piece);
}
}
return multiple;
}
}
/**
 * Immutable description of one text piece found in a Word piece table:
 * where it starts, how long it is, and how its characters are encoded.
 */
class WordTextPiece
{
    /** Byte offset of the piece within the main document stream. */
    private final int _fcStart;
    /** True for 16-bit (UTF-16LE) text, false for 8-bit text. */
    private final boolean _usesUnicode;
    /** Length of the piece in characters. */
    private final int _length;

    /**
     * @param start byte offset of the piece's first character
     * @param length number of characters in the piece
     * @param unicode true if the piece is stored as 16-bit characters
     */
    public WordTextPiece(int start, int length, boolean unicode)
    {
        _usesUnicode = unicode;
        _length = length;
        _fcStart = start;
    }

    /** @return true if the piece is stored as 16-bit characters */
    public boolean usesUnicode()
    {
        return _usesUnicode;
    }

    /** @return byte offset of the piece's first character */
    public int getStart()
    {
        return _fcStart;
    }

    /** @return number of characters in the piece */
    public int getLength()
    {
        return _length;
    }
}
/************** END SOURCE CODE **************/
-----Messaggio originale-----
Da: Ryan Ackley [mailto:sackley@apache.org]
Inviato: Wednesday, January 28, 2004 12:16 AM
A: POI Users List
Oggetto: Re: Word to plain text converter
http://textmining.org
----- Original Message -----
From: "Dimitri Pissarenko" <di...@gmx.net>
To: <po...@jakarta.apache.org>
Sent: Tuesday, January 27, 2004 4:54 PM
Subject: Word to plain text converter
Hello!
I want to convert several Microsoft Word files to plain text files, so
that I can search through them with grep (or with analogous search
functions under Windows).
Has someone already written such a converter? Is this tool perhaps
open-source?
TIA
dap
---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org
---
[This E-mail scanned for viruses by iRepublics.com Anti Virus Solutions]
p.s. get your web hosted for free at iRepublics.com
33MB webspace free, SMS messaging, 30 email accounts
---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org
R: Word to plain text converter
Posted by Paolo Tortora <p....@eusysgroup.it>.
Here is a piece of Java code to extract plain text from Word:
/**************** START SOURCE CODE *******************/
import org.apache.poi.poifs.filesystem.*;
import org.apache.poi.util.LittleEndian;
import java.util.ArrayList;
import java.io.InputStream;
import java.io.IOException;
class WordExtractor {
public WordExtractor()
{
}
public String extractText(InputStream in) throws IOException
{
ArrayList text = new ArrayList();
POIFSFileSystem fsys = new POIFSFileSystem(in);
DocumentEntry headerProps =
(DocumentEntry)fsys.getRoot().getEntry("WordDocument");
DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
byte[] header = new byte[headerProps.getSize()];
din.read(header);
din.close();
// Prende le informazioni dall'header del documento
int info = LittleEndian.getShort(header, 0xa);
boolean useTable1 = (info & 0x200) != 0;
// Prende informazioni dalla piece table
int complexOffset = LittleEndian.getInt(header, 0x1a2);
String tableName = null;
if (useTable1)
{
tableName = "1Table";
}
else
{
tableName = "0Table";
}
DocumentEntry table = (DocumentEntry)fsys.getRoot().getEntry(tableName);
byte[] tableStream = new byte[table.getSize()];
din = fsys.createDocumentInputStream(tableName);
din.read(tableStream);
din.close();
din = null;
fsys = null;
table = null;
headerProps = null;
int multiple = findText(tableStream, complexOffset, text);
StringBuffer sb = new StringBuffer();
int size = text.size();
tableStream = null;
for (int x = 0; x < size; x++)
{
WordTextPiece nextPiece = (WordTextPiece)text.get(x);
int start = nextPiece.getStart();
int length = nextPiece.getLength();
boolean unicode = nextPiece.usesUnicode();
String toStr = null;
if (unicode)
{
toStr = new String(header, start, length * multiple, "UTF-16LE");
}
else
{
toStr = new String(header, start, length , "ISO-8859-1");
}
sb.append(toStr).append(" ");
}
return sb.toString();
}
private static int findText(byte[] tableStream, int complexOffset,
ArrayList text) throws IOException
{
//actual text
int pos = complexOffset;
int multiple = 2;
//skips through the prms before we reach the piece table. These contain
data
//for actual fast saved files
while(tableStream[pos] == 1)
{
pos++;
int skip = LittleEndian.getShort(tableStream, pos);
pos += 2 + skip;
}
if(tableStream[pos] != 2)
{
throw new IOException("corrupted Word file");
}
else
{
//parse out the text pieces
int pieceTableSize = LittleEndian.getInt(tableStream, ++pos);
pos += 4;
int pieces = (pieceTableSize - 4) / 12;
for (int x = 0; x < pieces; x++)
{
int filePos = LittleEndian.getInt(tableStream, pos + ((pieces + 1) * 4) +
(x * 8) + 2);
boolean unicode = false;
if ((filePos & 0x40000000) == 0)
{
unicode = true;
}
else
{
unicode = false;
multiple = 1;
filePos &= ~(0x40000000);//gives me FC in doc stream
filePos /= 2;
}
int totLength = LittleEndian.getInt(tableStream, pos + (x + 1) * 4) -
LittleEndian.getInt(tableStream, pos + (x * 4));
WordTextPiece piece = new WordTextPiece(filePos, totLength, unicode);
text.add(piece);
}
}
return multiple;
}
}
/**
 * Immutable description of one text piece found in a Word piece table:
 * where it starts, how long it is, and how its characters are encoded.
 */
class WordTextPiece
{
    /** Byte offset of the piece within the main document stream. */
    private final int _fcStart;
    /** True for 16-bit (UTF-16LE) text, false for 8-bit text. */
    private final boolean _usesUnicode;
    /** Length of the piece in characters. */
    private final int _length;

    /**
     * @param start byte offset of the piece's first character
     * @param length number of characters in the piece
     * @param unicode true if the piece is stored as 16-bit characters
     */
    public WordTextPiece(int start, int length, boolean unicode)
    {
        _usesUnicode = unicode;
        _length = length;
        _fcStart = start;
    }

    /** @return true if the piece is stored as 16-bit characters */
    public boolean usesUnicode()
    {
        return _usesUnicode;
    }

    /** @return byte offset of the piece's first character */
    public int getStart()
    {
        return _fcStart;
    }

    /** @return number of characters in the piece */
    public int getLength()
    {
        return _length;
    }
}
/************** END SOURCE CODE **************/
-----Messaggio originale-----
Da: Ryan Ackley [mailto:sackley@apache.org]
Inviato: Wednesday, January 28, 2004 12:16 AM
A: POI Users List
Oggetto: Re: Word to plain text converter
http://textmining.org
----- Original Message -----
From: "Dimitri Pissarenko" <di...@gmx.net>
To: <po...@jakarta.apache.org>
Sent: Tuesday, January 27, 2004 4:54 PM
Subject: Word to plain text converter
Hello!
I want to convert several Microsoft Word files to plain text files, so
that I can search through them with grep (or with analogous search
functions under Windows).
Has someone already written such a converter? Is this tool perhaps
open-source?
TIA
dap
---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org
Re: Word to plain text converter
Posted by Ryan Ackley <sa...@apache.org>.
http://textmining.org
----- Original Message -----
From: "Dimitri Pissarenko" <di...@gmx.net>
To: <po...@jakarta.apache.org>
Sent: Tuesday, January 27, 2004 4:54 PM
Subject: Word to plain text converter
Hello!
I want to convert several Microsoft Word files to plain text files, so
that I can search through them with grep (or with analogous search
functions under Windows).
Has someone already written such a converter? Is this tool perhaps
open-source?
TIA
dap
---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org
R: Word to plain text converter
Posted by Paolo Tortora <p....@eusysgroup.it>.
Thank you
-----Messaggio originale-----
Da: Ryan Ackley [mailto:sackley@apache.org]
Inviato: Wednesday, January 28, 2004 2:02 PM
A: POI Users List
Oggetto: Re: Word to plain text converter
Paolo,
I appreciate your good intentions by posting this code but you're violating
the terms of the license agreement. You don't give any credit to
textmining.org. There are no restrictions to you using the code except that
you have to give credit. It was part of the license agreement that was in
the zip file when you downloaded the library. It's the same when you use
Apache stuff. I'm sure you didn't realise this so it's no big deal. Now you
know :-)
-Ryan
----- Original Message -----
From: "Paolo Tortora" <p....@eusysgroup.it>
To: "POI Users List" <po...@jakarta.apache.org>
Sent: Wednesday, January 28, 2004 4:50 AM
Subject: R: Word to plain text converter
> Here is java source code to extract plain text from Word:
>
> /**************** START SOURCE CODE *******************/
> import org.apache.poi.poifs.filesystem.*;
> import org.apache.poi.util.LittleEndian;
>
> import java.util.ArrayList;
> import java.io.InputStream;
>
> import java.io.IOException;
>
> class WordExtractor {
>
> public WordExtractor()
> {
> }
>
> public String extractText(InputStream in) throws IOException
> {
> ArrayList text = new ArrayList();
> POIFSFileSystem fsys = new POIFSFileSystem(in);
>
> DocumentEntry headerProps =
> (DocumentEntry)fsys.getRoot().getEntry("WordDocument");
> DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
> byte[] header = new byte[headerProps.getSize()];
>
> din.read(header);
> din.close();
> // Prende le informazioni dall'header del documento
> int info = LittleEndian.getShort(header, 0xa);
>
> boolean useTable1 = (info & 0x200) != 0;
>
> // Prende informazioni dalla piece table
> int complexOffset = LittleEndian.getInt(header, 0x1a2);
>
>
> String tableName = null;
> if (useTable1)
> {
> tableName = "1Table";
> }
> else
> {
> tableName = "0Table";
> }
>
> DocumentEntry table = (DocumentEntry)fsys.getRoot().getEntry(tableName);
> byte[] tableStream = new byte[table.getSize()];
>
> din = fsys.createDocumentInputStream(tableName);
>
> din.read(tableStream);
> din.close();
>
> din = null;
> fsys = null;
> table = null;
> headerProps = null;
>
> int multiple = findText(tableStream, complexOffset, text);
>
> StringBuffer sb = new StringBuffer();
> int size = text.size();
> tableStream = null;
>
> for (int x = 0; x < size; x++)
> {
> WordTextPiece nextPiece = (WordTextPiece)text.get(x);
> int start = nextPiece.getStart();
> int length = nextPiece.getLength();
>
> boolean unicode = nextPiece.usesUnicode();
> String toStr = null;
> if (unicode)
> {
> toStr = new String(header, start, length * multiple, "UTF-16LE");
> }
> else
> {
> toStr = new String(header, start, length , "ISO-8859-1");
> }
> sb.append(toStr).append(" ");
>
> }
> return sb.toString();
> }
>
> private static int findText(byte[] tableStream, int complexOffset,
> ArrayList text) throws IOException
> {
> //actual text
> int pos = complexOffset;
> int multiple = 2;
> //skips through the prms before we reach the piece table. These contain
> data
> //for actual fast saved files
> while(tableStream[pos] == 1)
> {
> pos++;
> int skip = LittleEndian.getShort(tableStream, pos);
> pos += 2 + skip;
> }
> if(tableStream[pos] != 2)
> {
> throw new IOException("corrupted Word file");
> }
> else
> {
> //parse out the text pieces
> int pieceTableSize = LittleEndian.getInt(tableStream, ++pos);
> pos += 4;
> int pieces = (pieceTableSize - 4) / 12;
> for (int x = 0; x < pieces; x++)
> {
> int filePos = LittleEndian.getInt(tableStream, pos + ((pieces + 1) * 4) +
> (x * 8) + 2);
> boolean unicode = false;
> if ((filePos & 0x40000000) == 0)
> {
> unicode = true;
> }
> else
> {
> unicode = false;
> multiple = 1;
> filePos &= ~(0x40000000);//gives me FC in doc stream
> filePos /= 2;
> }
> int totLength = LittleEndian.getInt(tableStream, pos + (x + 1) * 4) -
> LittleEndian.getInt(tableStream, pos + (x * 4));
>
> WordTextPiece piece = new WordTextPiece(filePos, totLength, unicode);
> text.add(piece);
>
> }
>
> }
> return multiple;
> }
>
> }
>
> class WordTextPiece
> {
> private int _fcStart;
> private boolean _usesUnicode;
> private int _length;
>
> public WordTextPiece(int start, int length, boolean unicode)
> {
> _usesUnicode = unicode;
> _length = length;
> _fcStart = start;
> }
> public boolean usesUnicode()
> {
> return _usesUnicode;
> }
>
> public int getStart()
> {
> return _fcStart;
> }
> public int getLength()
> {
> return _length;
> }
>
> }
>
> /************** END SOURCE CODE **************/
>
>
> -----Messaggio originale-----
> Da: Dimitri Pissarenko [mailto:dimitri.pissarenko@gmx.net]
> Inviato: Tuesday, January 27, 2004 10:55 PM
> A: poi-user@jakarta.apache.org
> Oggetto: Word to plain text converter
>
>
> Hello!
>
> I want to convert several Microsoft Word files to plain text files, so
> that I can search through them with grep (or with analogous search
> functions under Windows).
>
> Has someone already written such a converter? Is this tool perhaps
> open-source?
>
> TIA
>
> dap
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
> For additional commands, e-mail: poi-user-help@jakarta.apache.org
>
>
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
> For additional commands, e-mail: poi-user-help@jakarta.apache.org
>
---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org
Re: Word to plain text converter
Posted by Ryan Ackley <sa...@apache.org>.
Paolo,
I appreciate your good intentions by posting this code but you're violating
the terms of the license agreement. You don't give any credit to
textmining.org. There are no restrictions to you using the code except that
you have to give credit. It was part of the license agreement that was in
the zip file when you downloaded the library. It's the same when you use
Apache stuff. I'm sure you didn't realise this so it's no big deal. Now you
know :-)
-Ryan
----- Original Message -----
From: "Paolo Tortora" <p....@eusysgroup.it>
To: "POI Users List" <po...@jakarta.apache.org>
Sent: Wednesday, January 28, 2004 4:50 AM
Subject: R: Word to plain text converter
> Here is java source code to extract plain text from Word:
>
> /**************** START SOURCE CODE *******************/
> import org.apache.poi.poifs.filesystem.*;
> import org.apache.poi.util.LittleEndian;
>
> import java.util.ArrayList;
> import java.io.InputStream;
>
> import java.io.IOException;
>
> class WordExtractor {
>
> public WordExtractor()
> {
> }
>
> public String extractText(InputStream in) throws IOException
> {
> ArrayList text = new ArrayList();
> POIFSFileSystem fsys = new POIFSFileSystem(in);
>
> DocumentEntry headerProps =
> (DocumentEntry)fsys.getRoot().getEntry("WordDocument");
> DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
> byte[] header = new byte[headerProps.getSize()];
>
> din.read(header);
> din.close();
> // Prende le informazioni dall'header del documento
> int info = LittleEndian.getShort(header, 0xa);
>
> boolean useTable1 = (info & 0x200) != 0;
>
> // Prende informazioni dalla piece table
> int complexOffset = LittleEndian.getInt(header, 0x1a2);
>
>
> String tableName = null;
> if (useTable1)
> {
> tableName = "1Table";
> }
> else
> {
> tableName = "0Table";
> }
>
> DocumentEntry table = (DocumentEntry)fsys.getRoot().getEntry(tableName);
> byte[] tableStream = new byte[table.getSize()];
>
> din = fsys.createDocumentInputStream(tableName);
>
> din.read(tableStream);
> din.close();
>
> din = null;
> fsys = null;
> table = null;
> headerProps = null;
>
> int multiple = findText(tableStream, complexOffset, text);
>
> StringBuffer sb = new StringBuffer();
> int size = text.size();
> tableStream = null;
>
> for (int x = 0; x < size; x++)
> {
> WordTextPiece nextPiece = (WordTextPiece)text.get(x);
> int start = nextPiece.getStart();
> int length = nextPiece.getLength();
>
> boolean unicode = nextPiece.usesUnicode();
> String toStr = null;
> if (unicode)
> {
> toStr = new String(header, start, length * multiple, "UTF-16LE");
> }
> else
> {
> toStr = new String(header, start, length , "ISO-8859-1");
> }
> sb.append(toStr).append(" ");
>
> }
> return sb.toString();
> }
>
> private static int findText(byte[] tableStream, int complexOffset,
> ArrayList text) throws IOException
> {
> //actual text
> int pos = complexOffset;
> int multiple = 2;
> //skips through the prms before we reach the piece table. These contain
> data
> //for actual fast saved files
> while(tableStream[pos] == 1)
> {
> pos++;
> int skip = LittleEndian.getShort(tableStream, pos);
> pos += 2 + skip;
> }
> if(tableStream[pos] != 2)
> {
> throw new IOException("corrupted Word file");
> }
> else
> {
> //parse out the text pieces
> int pieceTableSize = LittleEndian.getInt(tableStream, ++pos);
> pos += 4;
> int pieces = (pieceTableSize - 4) / 12;
> for (int x = 0; x < pieces; x++)
> {
> int filePos = LittleEndian.getInt(tableStream, pos + ((pieces + 1) * 4) +
> (x * 8) + 2);
> boolean unicode = false;
> if ((filePos & 0x40000000) == 0)
> {
> unicode = true;
> }
> else
> {
> unicode = false;
> multiple = 1;
> filePos &= ~(0x40000000);//gives me FC in doc stream
> filePos /= 2;
> }
> int totLength = LittleEndian.getInt(tableStream, pos + (x + 1) * 4) -
> LittleEndian.getInt(tableStream, pos + (x * 4));
>
> WordTextPiece piece = new WordTextPiece(filePos, totLength, unicode);
> text.add(piece);
>
> }
>
> }
> return multiple;
> }
>
> }
>
> class WordTextPiece
> {
> private int _fcStart;
> private boolean _usesUnicode;
> private int _length;
>
> public WordTextPiece(int start, int length, boolean unicode)
> {
> _usesUnicode = unicode;
> _length = length;
> _fcStart = start;
> }
> public boolean usesUnicode()
> {
> return _usesUnicode;
> }
>
> public int getStart()
> {
> return _fcStart;
> }
> public int getLength()
> {
> return _length;
> }
>
> }
>
> /************** END SOURCE CODE **************/
>
>
> -----Messaggio originale-----
> Da: Dimitri Pissarenko [mailto:dimitri.pissarenko@gmx.net]
> Inviato: Tuesday, January 27, 2004 10:55 PM
> A: poi-user@jakarta.apache.org
> Oggetto: Word to plain text converter
>
>
> Hello!
>
> I want to convert several Microsoft Word files to plain text files, so
> that I can search through them with grep (or with analogous search
> functions under Windows).
>
> Has someone already written such a converter? Is this tool perhaps
> open-source?
>
> TIA
>
> dap
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
> For additional commands, e-mail: poi-user-help@jakarta.apache.org
>
>
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
> For additional commands, e-mail: poi-user-help@jakarta.apache.org
>
---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org
R: Word to plain text converter
Posted by Paolo Tortora <p....@eusysgroup.it>.
Here is java source code to extract plain text from Word:
/**************** START SOURCE CODE *******************/
import org.apache.poi.poifs.filesystem.*;
import org.apache.poi.util.LittleEndian;
import java.util.ArrayList;
import java.io.InputStream;
import java.io.IOException;
class WordExtractor {
public WordExtractor()
{
}
public String extractText(InputStream in) throws IOException
{
ArrayList text = new ArrayList();
POIFSFileSystem fsys = new POIFSFileSystem(in);
DocumentEntry headerProps =
(DocumentEntry)fsys.getRoot().getEntry("WordDocument");
DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
byte[] header = new byte[headerProps.getSize()];
din.read(header);
din.close();
// Prende le informazioni dall'header del documento
int info = LittleEndian.getShort(header, 0xa);
boolean useTable1 = (info & 0x200) != 0;
// Prende informazioni dalla piece table
int complexOffset = LittleEndian.getInt(header, 0x1a2);
String tableName = null;
if (useTable1)
{
tableName = "1Table";
}
else
{
tableName = "0Table";
}
DocumentEntry table = (DocumentEntry)fsys.getRoot().getEntry(tableName);
byte[] tableStream = new byte[table.getSize()];
din = fsys.createDocumentInputStream(tableName);
din.read(tableStream);
din.close();
din = null;
fsys = null;
table = null;
headerProps = null;
int multiple = findText(tableStream, complexOffset, text);
StringBuffer sb = new StringBuffer();
int size = text.size();
tableStream = null;
for (int x = 0; x < size; x++)
{
WordTextPiece nextPiece = (WordTextPiece)text.get(x);
int start = nextPiece.getStart();
int length = nextPiece.getLength();
boolean unicode = nextPiece.usesUnicode();
String toStr = null;
if (unicode)
{
toStr = new String(header, start, length * multiple, "UTF-16LE");
}
else
{
toStr = new String(header, start, length , "ISO-8859-1");
}
sb.append(toStr).append(" ");
}
return sb.toString();
}
private static int findText(byte[] tableStream, int complexOffset,
ArrayList text) throws IOException
{
//actual text
int pos = complexOffset;
int multiple = 2;
//skips through the prms before we reach the piece table. These contain
data
//for actual fast saved files
while(tableStream[pos] == 1)
{
pos++;
int skip = LittleEndian.getShort(tableStream, pos);
pos += 2 + skip;
}
if(tableStream[pos] != 2)
{
throw new IOException("corrupted Word file");
}
else
{
//parse out the text pieces
int pieceTableSize = LittleEndian.getInt(tableStream, ++pos);
pos += 4;
int pieces = (pieceTableSize - 4) / 12;
for (int x = 0; x < pieces; x++)
{
int filePos = LittleEndian.getInt(tableStream, pos + ((pieces + 1) * 4) +
(x * 8) + 2);
boolean unicode = false;
if ((filePos & 0x40000000) == 0)
{
unicode = true;
}
else
{
unicode = false;
multiple = 1;
filePos &= ~(0x40000000);//gives me FC in doc stream
filePos /= 2;
}
int totLength = LittleEndian.getInt(tableStream, pos + (x + 1) * 4) -
LittleEndian.getInt(tableStream, pos + (x * 4));
WordTextPiece piece = new WordTextPiece(filePos, totLength, unicode);
text.add(piece);
}
}
return multiple;
}
}
/**
 * Immutable description of one text piece found in a Word piece table:
 * where it starts, how long it is, and how its characters are encoded.
 */
class WordTextPiece
{
    /** Byte offset of the piece within the main document stream. */
    private final int _fcStart;
    /** True for 16-bit (UTF-16LE) text, false for 8-bit text. */
    private final boolean _usesUnicode;
    /** Length of the piece in characters. */
    private final int _length;

    /**
     * @param start byte offset of the piece's first character
     * @param length number of characters in the piece
     * @param unicode true if the piece is stored as 16-bit characters
     */
    public WordTextPiece(int start, int length, boolean unicode)
    {
        _usesUnicode = unicode;
        _length = length;
        _fcStart = start;
    }

    /** @return true if the piece is stored as 16-bit characters */
    public boolean usesUnicode()
    {
        return _usesUnicode;
    }

    /** @return byte offset of the piece's first character */
    public int getStart()
    {
        return _fcStart;
    }

    /** @return number of characters in the piece */
    public int getLength()
    {
        return _length;
    }
}
/************** END SOURCE CODE **************/
-----Messaggio originale-----
Da: Dimitri Pissarenko [mailto:dimitri.pissarenko@gmx.net]
Inviato: Tuesday, January 27, 2004 10:55 PM
A: poi-user@jakarta.apache.org
Oggetto: Word to plain text converter
Hello!
I want to convert several Microsoft Word files to plain text files, so
that I can search through them with grep (or with analogous search
functions under Windows).
Has someone already written such a converter? Is this tool perhaps
open-source?
TIA
dap
---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org