You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@poi.apache.org by Dimitri Pissarenko <di...@gmx.net> on 2004/01/27 22:54:39 UTC

Word to plain text converter

Hello!

I want to convert several Microsoft Word files to plain text files, so
that I can search through them with grep (or with analogous search
functions under Windows).

Has someone already written such a converter? Is this tool perhaps
open-source?

TIA

dap

---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org


RE: Word to plain text converter

Posted by Abdul Wahab <aw...@erp21.com.my>.
Hi Paolo,

When I try to use the http://textmining.org library, I get the text, but only
after a long "beep" sound and a "system hang" if my Word document contains a
table. How can I control that?



-----Original Message-----
From: Paolo Tortora [mailto:p.tortora@eusysgroup.it]
Sent: Wednesday, January 28, 2004 5:49 PM
To: POI Users List; Ryan Ackley
Subject: R: Word to plain text converter


Here is a piece of Java code to extract plain text from Word:


/**************** START SOURCE CODE *******************/
import org.apache.poi.poifs.filesystem.*;
import org.apache.poi.util.LittleEndian;

import java.util.ArrayList;
import java.io.InputStream;

import java.io.IOException;

class WordExtractor {

  public WordExtractor()
  {
  }

    public String extractText(InputStream in) throws IOException
  {
	ArrayList text = new ArrayList();
	POIFSFileSystem fsys = new POIFSFileSystem(in);

	DocumentEntry headerProps =
		(DocumentEntry)fsys.getRoot().getEntry("WordDocument");
	DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
	byte[] header = new byte[headerProps.getSize()];

	din.read(header);
	din.close();
	// Prende le informazioni dall'header del documento
	int info = LittleEndian.getShort(header, 0xa);

	boolean useTable1 = (info & 0x200) != 0;

	// Prende informazioni dalla piece table
	int complexOffset = LittleEndian.getInt(header, 0x1a2);


	String tableName = null;
	if (useTable1)
	{
	  tableName = "1Table";
	}
	else
	{
	  tableName = "0Table";
	}

	DocumentEntry table = (DocumentEntry)fsys.getRoot().getEntry(tableName);
	byte[] tableStream = new byte[table.getSize()];

	din = fsys.createDocumentInputStream(tableName);

	din.read(tableStream);
	din.close();

	din = null;
	fsys = null;
	table = null;
	headerProps = null;

	int multiple = findText(tableStream, complexOffset, text);

	StringBuffer sb = new StringBuffer();
	int size = text.size();
	tableStream = null;

	for (int x = 0; x < size; x++)
	{
	  WordTextPiece nextPiece = (WordTextPiece)text.get(x);
	  int start = nextPiece.getStart();
	  int length = nextPiece.getLength();

	  boolean unicode = nextPiece.usesUnicode();
	  String toStr = null;
	  if (unicode)
	  {
		toStr = new String(header, start, length * multiple, "UTF-16LE");
	  }
	  else
	  {
		toStr = new String(header, start, length , "ISO-8859-1");
	  }
	  sb.append(toStr).append(" ");

	}
	return sb.toString();
  }

   private static int findText(byte[] tableStream, int complexOffset,
ArrayList text) throws IOException
  {
	//actual text
	int pos = complexOffset;
	int multiple = 2;
	//skips through the prms before we reach the piece table. These contain
data
	//for actual fast saved files
	while(tableStream[pos] == 1)
	{
		pos++;
		int skip = LittleEndian.getShort(tableStream, pos);
		pos += 2 + skip;
	}
	if(tableStream[pos] != 2)
	{
		throw new IOException("corrupted Word file");
	}
	else
	{
		//parse out the text pieces
		int pieceTableSize = LittleEndian.getInt(tableStream, ++pos);
		pos += 4;
		int pieces = (pieceTableSize - 4) / 12;
		for (int x = 0; x < pieces; x++)
		{
			int filePos = LittleEndian.getInt(tableStream, pos + ((pieces + 1) * 4) +
(x * 8) + 2);
			boolean unicode = false;
			if ((filePos & 0x40000000) == 0)
			{
				unicode = true;
			}
			else
			{
				unicode = false;
				multiple = 1;
				filePos &= ~(0x40000000);//gives me FC in doc stream
				filePos /= 2;
			}
			int totLength = LittleEndian.getInt(tableStream, pos + (x + 1) * 4) -
							LittleEndian.getInt(tableStream, pos + (x * 4));

			WordTextPiece piece = new WordTextPiece(filePos, totLength, unicode);
			text.add(piece);

		}

	}
	return multiple;
  }

}

/**
 * Immutable record of one text piece: where it starts, how long it is, and
 * whether it is flagged as Unicode.
 */
class WordTextPiece
{
  /** Start position of the piece, as passed to the constructor. */
  private final int offset;
  /** Length of the piece, as passed to the constructor. */
  private final int charCount;
  /** True when the piece is flagged as Unicode text. */
  private final boolean unicodeEncoded;

  /**
   * @param start start position of the piece
   * @param length length of the piece
   * @param unicode whether the piece is Unicode text
   */
  public WordTextPiece(int start, int length, boolean unicode)
  {
    this.offset = start;
    this.charCount = length;
    this.unicodeEncoded = unicode;
  }

  /** @return the start position given at construction */
  public int getStart()
  {
    return this.offset;
  }

  /** @return the length given at construction */
  public int getLength()
  {
    return this.charCount;
  }

  /** @return the Unicode flag given at construction */
  public boolean usesUnicode()
  {
    return this.unicodeEncoded;
  }

}

/************** END SOURCE CODE **************/


-----Messaggio originale-----
Da: Ryan Ackley [mailto:sackley@apache.org]
Inviato: Wednesday, January 28, 2004 12:16 AM
A: POI Users List
Oggetto: Re: Word to plain text converter


http://textmining.org

----- Original Message -----
From: "Dimitri Pissarenko" <di...@gmx.net>
To: <po...@jakarta.apache.org>
Sent: Tuesday, January 27, 2004 4:54 PM
Subject: Word to plain text converter


Hello!

I want to convert several Microsoft Word files to plain text files, so
that I can search through them with grep (or with analogous search
functions under Windows).

Has someone already written such a converter? Is this tool perhaps
open-source?

TIA

dap

---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org




---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org


---
[This E-mail scanned for viruses by iRepublics.com Anti Virus Solutions]


p.s. get your web hosted for free at iRepublics.com
33MB webspace free, SMS messaging, 30 email accounts


---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org


R: Word to plain text converter

Posted by Paolo Tortora <p....@eusysgroup.it>.
Here is a piece of Java code to extract plain text from Word:


/**************** START SOURCE CODE *******************/
import org.apache.poi.poifs.filesystem.*;
import org.apache.poi.util.LittleEndian;

import java.util.ArrayList;
import java.io.InputStream;

import java.io.IOException;

class WordExtractor {

  public WordExtractor()
  {
  }

    public String extractText(InputStream in) throws IOException
  {
	ArrayList text = new ArrayList();
	POIFSFileSystem fsys = new POIFSFileSystem(in);

	DocumentEntry headerProps =
		(DocumentEntry)fsys.getRoot().getEntry("WordDocument");
	DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
	byte[] header = new byte[headerProps.getSize()];

	din.read(header);
	din.close();
	// Prende le informazioni dall'header del documento
	int info = LittleEndian.getShort(header, 0xa);

	boolean useTable1 = (info & 0x200) != 0;

	// Prende informazioni dalla piece table
	int complexOffset = LittleEndian.getInt(header, 0x1a2);


	String tableName = null;
	if (useTable1)
	{
	  tableName = "1Table";
	}
	else
	{
	  tableName = "0Table";
	}

	DocumentEntry table = (DocumentEntry)fsys.getRoot().getEntry(tableName);
	byte[] tableStream = new byte[table.getSize()];

	din = fsys.createDocumentInputStream(tableName);

	din.read(tableStream);
	din.close();

	din = null;
	fsys = null;
	table = null;
	headerProps = null;

	int multiple = findText(tableStream, complexOffset, text);

	StringBuffer sb = new StringBuffer();
	int size = text.size();
	tableStream = null;

	for (int x = 0; x < size; x++)
	{
	  WordTextPiece nextPiece = (WordTextPiece)text.get(x);
	  int start = nextPiece.getStart();
	  int length = nextPiece.getLength();

	  boolean unicode = nextPiece.usesUnicode();
	  String toStr = null;
	  if (unicode)
	  {
		toStr = new String(header, start, length * multiple, "UTF-16LE");
	  }
	  else
	  {
		toStr = new String(header, start, length , "ISO-8859-1");
	  }
	  sb.append(toStr).append(" ");

	}
	return sb.toString();
  }

   private static int findText(byte[] tableStream, int complexOffset,
ArrayList text) throws IOException
  {
	//actual text
	int pos = complexOffset;
	int multiple = 2;
	//skips through the prms before we reach the piece table. These contain
data
	//for actual fast saved files
	while(tableStream[pos] == 1)
	{
		pos++;
		int skip = LittleEndian.getShort(tableStream, pos);
		pos += 2 + skip;
	}
	if(tableStream[pos] != 2)
	{
		throw new IOException("corrupted Word file");
	}
	else
	{
		//parse out the text pieces
		int pieceTableSize = LittleEndian.getInt(tableStream, ++pos);
		pos += 4;
		int pieces = (pieceTableSize - 4) / 12;
		for (int x = 0; x < pieces; x++)
		{
			int filePos = LittleEndian.getInt(tableStream, pos + ((pieces + 1) * 4) +
(x * 8) + 2);
			boolean unicode = false;
			if ((filePos & 0x40000000) == 0)
			{
				unicode = true;
			}
			else
			{
				unicode = false;
				multiple = 1;
				filePos &= ~(0x40000000);//gives me FC in doc stream
				filePos /= 2;
			}
			int totLength = LittleEndian.getInt(tableStream, pos + (x + 1) * 4) -
							LittleEndian.getInt(tableStream, pos + (x * 4));

			WordTextPiece piece = new WordTextPiece(filePos, totLength, unicode);
			text.add(piece);

		}

	}
	return multiple;
  }

}

/**
 * Immutable record of one text piece: where it starts, how long it is, and
 * whether it is flagged as Unicode.
 */
class WordTextPiece
{
  /** Start position of the piece, as passed to the constructor. */
  private final int offset;
  /** Length of the piece, as passed to the constructor. */
  private final int charCount;
  /** True when the piece is flagged as Unicode text. */
  private final boolean unicodeEncoded;

  /**
   * @param start start position of the piece
   * @param length length of the piece
   * @param unicode whether the piece is Unicode text
   */
  public WordTextPiece(int start, int length, boolean unicode)
  {
    this.offset = start;
    this.charCount = length;
    this.unicodeEncoded = unicode;
  }

  /** @return the start position given at construction */
  public int getStart()
  {
    return this.offset;
  }

  /** @return the length given at construction */
  public int getLength()
  {
    return this.charCount;
  }

  /** @return the Unicode flag given at construction */
  public boolean usesUnicode()
  {
    return this.unicodeEncoded;
  }

}

/************** END SOURCE CODE **************/


-----Messaggio originale-----
Da: Ryan Ackley [mailto:sackley@apache.org]
Inviato: Wednesday, January 28, 2004 12:16 AM
A: POI Users List
Oggetto: Re: Word to plain text converter


http://textmining.org

----- Original Message -----
From: "Dimitri Pissarenko" <di...@gmx.net>
To: <po...@jakarta.apache.org>
Sent: Tuesday, January 27, 2004 4:54 PM
Subject: Word to plain text converter


Hello!

I want to convert several Microsoft Word files to plain text files, so
that I can search through them with grep (or with analogous search
functions under Windows).

Has someone already written such a converter? Is this tool perhaps
open-source?

TIA

dap

---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org




---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org


Re: Word to plain text converter

Posted by Ryan Ackley <sa...@apache.org>.
http://textmining.org

----- Original Message ----- 
From: "Dimitri Pissarenko" <di...@gmx.net>
To: <po...@jakarta.apache.org>
Sent: Tuesday, January 27, 2004 4:54 PM
Subject: Word to plain text converter


Hello!

I want to convert several Microsoft Word files to plain text files, so
that I can search through them with grep (or with analogous search
functions under Windows).

Has someone already written such a converter? Is this tool perhaps
open-source?

TIA

dap

---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org


R: Word to plain text converter

Posted by Paolo Tortora <p....@eusysgroup.it>.
Thank you

-----Messaggio originale-----
Da: Ryan Ackley [mailto:sackley@apache.org]
Inviato: Wednesday, January 28, 2004 2:02 PM
A: POI Users List
Oggetto: Re: Word to plain text converter


Paolo,

I appreciate your good intentions in posting this code, but you're violating
the terms of the license agreement. You don't give any credit to
textmining.org. There are no restrictions on your using the code except that
you have to give credit. It was part of the license agreement that was in
the zip file when you downloaded the library. It's the same when you use
Apache stuff. I'm sure you didn't realise this, so it's no big deal. Now you
know :-)

-Ryan


----- Original Message ----- 
From: "Paolo Tortora" <p....@eusysgroup.it>
To: "POI Users List" <po...@jakarta.apache.org>
Sent: Wednesday, January 28, 2004 4:50 AM
Subject: R: Word to plain text converter


> Here is java source code to extract plain text from Word:
>
> /**************** START SOURCE CODE *******************/
> import org.apache.poi.poifs.filesystem.*;
> import org.apache.poi.util.LittleEndian;
>
> import java.util.ArrayList;
> import java.io.InputStream;
>
> import java.io.IOException;
>
> class WordExtractor {
>
>   public WordExtractor()
>   {
>   }
>
>     public String extractText(InputStream in) throws IOException
>   {
> ArrayList text = new ArrayList();
> POIFSFileSystem fsys = new POIFSFileSystem(in);
>
> DocumentEntry headerProps =
> (DocumentEntry)fsys.getRoot().getEntry("WordDocument");
> DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
> byte[] header = new byte[headerProps.getSize()];
>
> din.read(header);
> din.close();
> // Prende le informazioni dall'header del documento
> int info = LittleEndian.getShort(header, 0xa);
>
> boolean useTable1 = (info & 0x200) != 0;
>
> // Prende informazioni dalla piece table
> int complexOffset = LittleEndian.getInt(header, 0x1a2);
>
>
> String tableName = null;
> if (useTable1)
> {
>   tableName = "1Table";
> }
> else
> {
>   tableName = "0Table";
> }
>
> DocumentEntry table = (DocumentEntry)fsys.getRoot().getEntry(tableName);
> byte[] tableStream = new byte[table.getSize()];
>
> din = fsys.createDocumentInputStream(tableName);
>
> din.read(tableStream);
> din.close();
>
> din = null;
> fsys = null;
> table = null;
> headerProps = null;
>
> int multiple = findText(tableStream, complexOffset, text);
>
> StringBuffer sb = new StringBuffer();
> int size = text.size();
> tableStream = null;
>
> for (int x = 0; x < size; x++)
> {
>   WordTextPiece nextPiece = (WordTextPiece)text.get(x);
>   int start = nextPiece.getStart();
>   int length = nextPiece.getLength();
>
>   boolean unicode = nextPiece.usesUnicode();
>   String toStr = null;
>   if (unicode)
>   {
> toStr = new String(header, start, length * multiple, "UTF-16LE");
>   }
>   else
>   {
> toStr = new String(header, start, length , "ISO-8859-1");
>   }
>   sb.append(toStr).append(" ");
>
> }
> return sb.toString();
>   }
>
>    private static int findText(byte[] tableStream, int complexOffset,
> ArrayList text) throws IOException
>   {
> //actual text
> int pos = complexOffset;
> int multiple = 2;
> //skips through the prms before we reach the piece table. These contain
> data
> //for actual fast saved files
> while(tableStream[pos] == 1)
> {
> pos++;
> int skip = LittleEndian.getShort(tableStream, pos);
> pos += 2 + skip;
> }
> if(tableStream[pos] != 2)
> {
> throw new IOException("corrupted Word file");
> }
> else
> {
> //parse out the text pieces
> int pieceTableSize = LittleEndian.getInt(tableStream, ++pos);
> pos += 4;
> int pieces = (pieceTableSize - 4) / 12;
> for (int x = 0; x < pieces; x++)
> {
> int filePos = LittleEndian.getInt(tableStream, pos + ((pieces + 1) * 4) +
> (x * 8) + 2);
> boolean unicode = false;
> if ((filePos & 0x40000000) == 0)
> {
> unicode = true;
> }
> else
> {
> unicode = false;
> multiple = 1;
> filePos &= ~(0x40000000);//gives me FC in doc stream
> filePos /= 2;
> }
> int totLength = LittleEndian.getInt(tableStream, pos + (x + 1) * 4) -
> LittleEndian.getInt(tableStream, pos + (x * 4));
>
> WordTextPiece piece = new WordTextPiece(filePos, totLength, unicode);
> text.add(piece);
>
> }
>
> }
> return multiple;
>   }
>
> }
>
> class WordTextPiece
> {
>   private int _fcStart;
>   private boolean _usesUnicode;
>   private int _length;
>
>   public WordTextPiece(int start, int length, boolean unicode)
>   {
> _usesUnicode = unicode;
> _length = length;
> _fcStart = start;
>   }
>    public boolean usesUnicode()
>   {
>   return _usesUnicode;
>   }
>
>   public int getStart()
>   {
>   return _fcStart;
>   }
>   public int getLength()
>   {
> return _length;
>   }
>
> }
>
> /************** END SOURCE CODE **************/
>
>
> -----Messaggio originale-----
> Da: Dimitri Pissarenko [mailto:dimitri.pissarenko@gmx.net]
> Inviato: Tuesday, January 27, 2004 10:55 PM
> A: poi-user@jakarta.apache.org
> Oggetto: Word to plain text converter
>
>
> Hello!
>
> I want to convert several Microsoft Word files to plain text files, so
> that I can search through them with grep (or with analogous search
> functions under Windows).
>
> Has someone already written such a converter? Is this tool perhaps
> open-source?
>
> TIA
>
> dap
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
> For additional commands, e-mail: poi-user-help@jakarta.apache.org
>
>
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
> For additional commands, e-mail: poi-user-help@jakarta.apache.org
>


---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org




---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org


Re: Word to plain text converter

Posted by Ryan Ackley <sa...@apache.org>.
Paolo,

I appreciate your good intentions in posting this code, but you're violating
the terms of the license agreement. You don't give any credit to
textmining.org. There are no restrictions on your using the code except that
you have to give credit. It was part of the license agreement that was in
the zip file when you downloaded the library. It's the same when you use
Apache stuff. I'm sure you didn't realise this, so it's no big deal. Now you
know :-)

-Ryan


----- Original Message ----- 
From: "Paolo Tortora" <p....@eusysgroup.it>
To: "POI Users List" <po...@jakarta.apache.org>
Sent: Wednesday, January 28, 2004 4:50 AM
Subject: R: Word to plain text converter


> Here is java source code to extract plain text from Word:
>
> /**************** START SOURCE CODE *******************/
> import org.apache.poi.poifs.filesystem.*;
> import org.apache.poi.util.LittleEndian;
>
> import java.util.ArrayList;
> import java.io.InputStream;
>
> import java.io.IOException;
>
> class WordExtractor {
>
>   public WordExtractor()
>   {
>   }
>
>     public String extractText(InputStream in) throws IOException
>   {
> ArrayList text = new ArrayList();
> POIFSFileSystem fsys = new POIFSFileSystem(in);
>
> DocumentEntry headerProps =
> (DocumentEntry)fsys.getRoot().getEntry("WordDocument");
> DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
> byte[] header = new byte[headerProps.getSize()];
>
> din.read(header);
> din.close();
> // Prende le informazioni dall'header del documento
> int info = LittleEndian.getShort(header, 0xa);
>
> boolean useTable1 = (info & 0x200) != 0;
>
> // Prende informazioni dalla piece table
> int complexOffset = LittleEndian.getInt(header, 0x1a2);
>
>
> String tableName = null;
> if (useTable1)
> {
>   tableName = "1Table";
> }
> else
> {
>   tableName = "0Table";
> }
>
> DocumentEntry table = (DocumentEntry)fsys.getRoot().getEntry(tableName);
> byte[] tableStream = new byte[table.getSize()];
>
> din = fsys.createDocumentInputStream(tableName);
>
> din.read(tableStream);
> din.close();
>
> din = null;
> fsys = null;
> table = null;
> headerProps = null;
>
> int multiple = findText(tableStream, complexOffset, text);
>
> StringBuffer sb = new StringBuffer();
> int size = text.size();
> tableStream = null;
>
> for (int x = 0; x < size; x++)
> {
>   WordTextPiece nextPiece = (WordTextPiece)text.get(x);
>   int start = nextPiece.getStart();
>   int length = nextPiece.getLength();
>
>   boolean unicode = nextPiece.usesUnicode();
>   String toStr = null;
>   if (unicode)
>   {
> toStr = new String(header, start, length * multiple, "UTF-16LE");
>   }
>   else
>   {
> toStr = new String(header, start, length , "ISO-8859-1");
>   }
>   sb.append(toStr).append(" ");
>
> }
> return sb.toString();
>   }
>
>    private static int findText(byte[] tableStream, int complexOffset,
> ArrayList text) throws IOException
>   {
> //actual text
> int pos = complexOffset;
> int multiple = 2;
> //skips through the prms before we reach the piece table. These contain
> data
> //for actual fast saved files
> while(tableStream[pos] == 1)
> {
> pos++;
> int skip = LittleEndian.getShort(tableStream, pos);
> pos += 2 + skip;
> }
> if(tableStream[pos] != 2)
> {
> throw new IOException("corrupted Word file");
> }
> else
> {
> //parse out the text pieces
> int pieceTableSize = LittleEndian.getInt(tableStream, ++pos);
> pos += 4;
> int pieces = (pieceTableSize - 4) / 12;
> for (int x = 0; x < pieces; x++)
> {
> int filePos = LittleEndian.getInt(tableStream, pos + ((pieces + 1) * 4) +
> (x * 8) + 2);
> boolean unicode = false;
> if ((filePos & 0x40000000) == 0)
> {
> unicode = true;
> }
> else
> {
> unicode = false;
> multiple = 1;
> filePos &= ~(0x40000000);//gives me FC in doc stream
> filePos /= 2;
> }
> int totLength = LittleEndian.getInt(tableStream, pos + (x + 1) * 4) -
> LittleEndian.getInt(tableStream, pos + (x * 4));
>
> WordTextPiece piece = new WordTextPiece(filePos, totLength, unicode);
> text.add(piece);
>
> }
>
> }
> return multiple;
>   }
>
> }
>
> class WordTextPiece
> {
>   private int _fcStart;
>   private boolean _usesUnicode;
>   private int _length;
>
>   public WordTextPiece(int start, int length, boolean unicode)
>   {
> _usesUnicode = unicode;
> _length = length;
> _fcStart = start;
>   }
>    public boolean usesUnicode()
>   {
>   return _usesUnicode;
>   }
>
>   public int getStart()
>   {
>   return _fcStart;
>   }
>   public int getLength()
>   {
> return _length;
>   }
>
> }
>
> /************** END SOURCE CODE **************/
>
>
> -----Messaggio originale-----
> Da: Dimitri Pissarenko [mailto:dimitri.pissarenko@gmx.net]
> Inviato: Tuesday, January 27, 2004 10:55 PM
> A: poi-user@jakarta.apache.org
> Oggetto: Word to plain text converter
>
>
> Hello!
>
> I want to convert several Microsoft Word files to plain text files, so
> that I can search through them with grep (or with analogous search
> functions under Windows).
>
> Has someone already written such a converter? Is this tool perhaps
> open-source?
>
> TIA
>
> dap
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
> For additional commands, e-mail: poi-user-help@jakarta.apache.org
>
>
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
> For additional commands, e-mail: poi-user-help@jakarta.apache.org
>


---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org


R: Word to plain text converter

Posted by Paolo Tortora <p....@eusysgroup.it>.
Here is java source code to extract plain text from Word:

/**************** START SOURCE CODE *******************/
import org.apache.poi.poifs.filesystem.*;
import org.apache.poi.util.LittleEndian;

import java.util.ArrayList;
import java.io.InputStream;

import java.io.IOException;

class WordExtractor {

  public WordExtractor()
  {
  }

    public String extractText(InputStream in) throws IOException
  {
	ArrayList text = new ArrayList();
	POIFSFileSystem fsys = new POIFSFileSystem(in);

	DocumentEntry headerProps =
		(DocumentEntry)fsys.getRoot().getEntry("WordDocument");
	DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
	byte[] header = new byte[headerProps.getSize()];

	din.read(header);
	din.close();
	// Prende le informazioni dall'header del documento
	int info = LittleEndian.getShort(header, 0xa);

	boolean useTable1 = (info & 0x200) != 0;

	// Prende informazioni dalla piece table
	int complexOffset = LittleEndian.getInt(header, 0x1a2);


	String tableName = null;
	if (useTable1)
	{
	  tableName = "1Table";
	}
	else
	{
	  tableName = "0Table";
	}

	DocumentEntry table = (DocumentEntry)fsys.getRoot().getEntry(tableName);
	byte[] tableStream = new byte[table.getSize()];

	din = fsys.createDocumentInputStream(tableName);

	din.read(tableStream);
	din.close();

	din = null;
	fsys = null;
	table = null;
	headerProps = null;

	int multiple = findText(tableStream, complexOffset, text);

	StringBuffer sb = new StringBuffer();
	int size = text.size();
	tableStream = null;

	for (int x = 0; x < size; x++)
	{
	  WordTextPiece nextPiece = (WordTextPiece)text.get(x);
	  int start = nextPiece.getStart();
	  int length = nextPiece.getLength();

	  boolean unicode = nextPiece.usesUnicode();
	  String toStr = null;
	  if (unicode)
	  {
		toStr = new String(header, start, length * multiple, "UTF-16LE");
	  }
	  else
	  {
		toStr = new String(header, start, length , "ISO-8859-1");
	  }
	  sb.append(toStr).append(" ");

	}
	return sb.toString();
  }

   private static int findText(byte[] tableStream, int complexOffset,
ArrayList text) throws IOException
  {
	//actual text
	int pos = complexOffset;
	int multiple = 2;
	//skips through the prms before we reach the piece table. These contain
data
	//for actual fast saved files
	while(tableStream[pos] == 1)
	{
		pos++;
		int skip = LittleEndian.getShort(tableStream, pos);
		pos += 2 + skip;
	}
	if(tableStream[pos] != 2)
	{
		throw new IOException("corrupted Word file");
	}
	else
	{
		//parse out the text pieces
		int pieceTableSize = LittleEndian.getInt(tableStream, ++pos);
		pos += 4;
		int pieces = (pieceTableSize - 4) / 12;
		for (int x = 0; x < pieces; x++)
		{
			int filePos = LittleEndian.getInt(tableStream, pos + ((pieces + 1) * 4) +
(x * 8) + 2);
			boolean unicode = false;
			if ((filePos & 0x40000000) == 0)
			{
				unicode = true;
			}
			else
			{
				unicode = false;
				multiple = 1;
				filePos &= ~(0x40000000);//gives me FC in doc stream
				filePos /= 2;
			}
			int totLength = LittleEndian.getInt(tableStream, pos + (x + 1) * 4) -
							LittleEndian.getInt(tableStream, pos + (x * 4));

			WordTextPiece piece = new WordTextPiece(filePos, totLength, unicode);
			text.add(piece);

		}

	}
	return multiple;
  }

}

/**
 * Immutable record of one text piece: where it starts, how long it is, and
 * whether it is flagged as Unicode.
 */
class WordTextPiece
{
  /** Start position of the piece, as passed to the constructor. */
  private final int offset;
  /** Length of the piece, as passed to the constructor. */
  private final int charCount;
  /** True when the piece is flagged as Unicode text. */
  private final boolean unicodeEncoded;

  /**
   * @param start start position of the piece
   * @param length length of the piece
   * @param unicode whether the piece is Unicode text
   */
  public WordTextPiece(int start, int length, boolean unicode)
  {
    this.offset = start;
    this.charCount = length;
    this.unicodeEncoded = unicode;
  }

  /** @return the start position given at construction */
  public int getStart()
  {
    return this.offset;
  }

  /** @return the length given at construction */
  public int getLength()
  {
    return this.charCount;
  }

  /** @return the Unicode flag given at construction */
  public boolean usesUnicode()
  {
    return this.unicodeEncoded;
  }

}

/************** END SOURCE CODE **************/


-----Messaggio originale-----
Da: Dimitri Pissarenko [mailto:dimitri.pissarenko@gmx.net]
Inviato: Tuesday, January 27, 2004 10:55 PM
A: poi-user@jakarta.apache.org
Oggetto: Word to plain text converter


Hello!

I want to convert several Microsoft Word files to plain text files, so
that I can search through them with grep (or with analogous search
functions under Windows).

Has someone already written such a converter? Is this tool perhaps
open-source?

TIA

dap

---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org




---------------------------------------------------------------------
To unsubscribe, e-mail: poi-user-unsubscribe@jakarta.apache.org
For additional commands, e-mail: poi-user-help@jakarta.apache.org