You are viewing a plain text version of this content. The canonical link for it is here.

Posted to users@opennlp.apache.org by lina abu jaradeh <li...@hotmail.com> on 2014/05/18 21:32:26 UTC

FW: Problem with my project

I am trying to train an en-ner-location.bin model using OpenNLP in Java. My training text file is in the following format:
<START:location> Fontana <END> .
<START:location> Palo Verde <END> .
<START:location> Picacho <END> .
I stored it in a text file called citytrain.txt; the file contains 120770 lines of city names. Then I used the following code to train the model:
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Collections;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.namefind.NameSampleDataStream;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;

/**
 * Trains an OpenNLP name-finder model for the "location" entity type from
 * the annotated samples in {@code citytrain.txt} and writes the resulting
 * model to {@code en-ner-locationNews.bin}.
 *
 * NOTE(review): the training file described in this thread holds one bare
 * {@code <START:location> ... <END> .} entry per line, with no surrounding
 * sentence. The reply in this thread reports that training on full
 * sentences fixed the same symptom; the lack of context is presumably why
 * the model tags the whole input. Confirm against the OpenNLP manual.
 */
public class TrainNames {

    /**
     * Reads the training samples, trains the model and serializes it to disk.
     *
     * Note that this is an ordinary method that happens to share the class
     * name (it declares a {@code void} return type), not a constructor, so
     * it must be called explicitly: {@code new TrainNames().TrainNames()}.
     *
     * @throws IOException if the training file cannot be read or the model
     *                     file cannot be written
     */
    @SuppressWarnings("deprecation")
    public void TrainNames() throws IOException {
        File inFile = new File("citytrain.txt");
        // NOTE(review): FileReader decodes with the platform default charset;
        // confirm that this matches the encoding of citytrain.txt.
        NameSampleDataStream nss = new NameSampleDataStream(
            new PlainTextByLineStream(
                new java.io.FileReader(inFile)));

        int iterations = 100;
        int cutoff = 5;
        TokenNameFinderModel model;
        try {
            model = NameFinderME.train(
                "en",       // language
                "location", // type
                nss,
                (AdaptiveFeatureGenerator) null,
                Collections.<String,Object>emptyMap(),
                iterations,
                cutoff);
        } finally {
            // Release the sample stream (and the file reader it wraps) even
            // when training fails.
            nss.close();
        }

        File outFile = new File("en-ner-locationNews.bin");
        FileOutputStream outFileStream = new FileOutputStream(outFile);
        try {
            model.serialize(outFileStream);
        } finally {
            // Close explicitly so the model is flushed and the file handle
            // is released; the original never closed this stream.
            outFileStream.close();
        }
    }
}

When I used the output .bin file to test it on a string, to pull "Fontana" out of the string, the result returned was the whole string. I don't know why, or what I am doing wrong. The following is the code I used to extract Fontana from a string:
import java.io.FileInputStream;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.Span;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;

import org.xml.sax.SAXException;


/**
 * Tokenizes a sentence with an OpenNLP tokenizer model and then runs two
 * name-finder models over the tokens: one for locations and one for
 * organizations. Each finder returns its matches as newline-terminated text.
 */
public class CityFinder {

    /**
     * Location model file. The trainer shown in this thread writes
     * "en-ner-locationNews.bin"; the original code here loaded
     * "en-ner-locationNew.bin" (missing the trailing "s"), so it would not
     * pick up the freshly trained model.
     */
    private static final String LOCATION_MODEL = "en-ner-locationNews.bin";

    /** Organization name-finder model file. */
    private static final String ORGANIZATION_MODEL = "en-ner-organization.bin";

    /** Tokenizer model file. */
    private static final String TOKENIZER_MODEL = "en-token.bin";

    /**
     * Tokens produced by the last successful {@link #tokenization(String)}
     * call; stays {@code null} until one succeeds.
     */
    public String Tokens[];

    /**
     * Demo entry point: tokenizes a fixed sample sentence and prints the
     * location and organization names found in it.
     */
    public static void main(String[] args) throws IOException, SAXException {

        CityFinder toi = new CityFinder();
        String cnt;
        cnt="John is planning to specialize in Electrical Engineering in UC Fontana and pursue a career with IBM.";
        toi.tokenization(cnt);
        String cities = toi.namefind(toi.Tokens);
        String org = toi.orgfind(toi.Tokens);

        System.out.println("City name is : "+cities);
        System.out.println("organization name is: "+org);

    }

    /**
     * Finds location names in the given tokens.
     *
     * @param cnt tokens of one sentence; may be {@code null}
     * @return each match followed by a newline, or an empty string when
     *         nothing was found, the input was null/empty, or the model
     *         could not be loaded (the error is printed to stderr)
     */
    public String namefind(String cnt[]) {
        return findEntities(LOCATION_MODEL, cnt);
    }

    /**
     * Finds organization names in the given tokens.
     *
     * @param cnt tokens of one sentence; may be {@code null}
     * @return each match followed by a newline, or an empty string when
     *         nothing was found, the input was null/empty, or the model
     *         could not be loaded (the error is printed to stderr)
     */
    public String orgfind(String cnt[]) {
        return findEntities(ORGANIZATION_MODEL, cnt);
    }

    /**
     * Splits the given text into tokens with the tokenizer model and stores
     * them in {@link #Tokens}. On an I/O failure the stack trace is printed
     * and {@link #Tokens} is left unchanged.
     *
     * @param tokens the raw text to tokenize
     */
    public void tokenization(String tokens) {
        InputStream is = null;
        try {
            is = new FileInputStream(TOKENIZER_MODEL);
            TokenizerModel tm = new TokenizerModel(is);
            Tokenizer tz = new TokenizerME(tm);
            Tokens = tz.tokenize(tokens);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(is);
        }
    }

    /**
     * Loads the name-finder model at {@code modelPath}, runs it over
     * {@code tokens} and joins the matched spans, one per line.
     */
    private static String findEntities(String modelPath, String[] tokens) {
        // A failed tokenization leaves the token array null; report "no
        // matches" instead of failing with a NullPointerException.
        if (tokens == null || tokens.length == 0) {
            return "";
        }

        String result = "";
        InputStream is = null;
        try {
            is = new FileInputStream(modelPath);
            TokenNameFinderModel model = new TokenNameFinderModel(is);
            NameFinderME finder = new NameFinderME(model);
            Span[] spans = finder.find(tokens);
            String[] names = Span.spansToStrings(spans, tokens);

            StringBuilder joined = new StringBuilder();
            for (int i = 0; i < names.length; i++) {
                joined.append(names[i]).append("\n");
            }
            result = joined.toString();
        } catch (IOException e) {
            // Also covers FileNotFoundException and InvalidFormatException,
            // which the original caught separately with identical handling.
            e.printStackTrace();
        } finally {
            closeQuietly(is);
        }
        return result;
    }

    /** Closes the stream if it was opened, printing any failure to stderr. */
    private static void closeQuietly(InputStream in) {
        if (in == null) {
            return;
        }
        try {
            in.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}
Can you please let me know where I went wrong?

Re: FW: Problem with my project

Posted by Sasikumar N <sa...@gmail.com>.

Hi Lina,
             I also tried this approach, as you did, with a data set of
persons' names, and ended up in the same situation as you. Then I collected
some sentences (more than 100 for a single person's name) from Wikipedia and
Google, related to the persons I wanted to add to the training file, and
tagged them the way you did; finally, it worked for me.

Ex. <START:person>  Albert Einstein <END> was a German-born theoretical
physicist. <START:person> Albert Einstein <END> was born at Ulm, in
Württemberg, Germany, on March 14, 1879. <START:person> Albert Einstein
<END> is best known for his mass–energy equivalence formula E = mc².

Regards,
Sasikumar N.


On Mon, May 19, 2014 at 1:02 AM, lina abu jaradeh <li...@hotmail.com>wrote:

> I am trying to train en-ner-location.bin file using opennlp in java The
> thing is i got the training text file in the following format
> <START:location> Fontana <END> .<START:location> Palo Verde <END> .
> <START:location> Picacho <END> .
> I stored it in a text file called citytrain.txt the file contains 120770
> line of city namesthen i used the following code to train the fileimport
> java.io.BufferedOutputStream;
> import java.io.BufferedReader;
> import java.io.File;
> import java.io.FileInputStream;
> import java.io.FileOutputStream;
> import java.io.FileReader;
> import java.io.IOException;
> import java.io.InputStream;
> import java.nio.charset.Charset;
> import java.util.Collections;
>
> import opennlp.tools.namefind.NameFinderME;
> import opennlp.tools.namefind.NameSample;
> import opennlp.tools.namefind.NameSampleDataStream;
> import opennlp.tools.namefind.TokenNameFinderModel;
> import opennlp.tools.tokenize.Tokenizer;
> import opennlp.tools.tokenize.TokenizerME;
> import opennlp.tools.tokenize.TokenizerModel;
> import opennlp.tools.util.ObjectStream;
> import opennlp.tools.util.PlainTextByLineStream;
> import opennlp.tools.util.Span;
> import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
>
> public class TrainNames {
>                 @SuppressWarnings("deprecation")
>                 public void TrainNames() throws IOException{
>                         //File baseDir = new File("src/test/resources");
>                     //File destDir = new File("target");
>                     //<start id="ne-train"/>
>                     File inFile = new File("citytrain.txt");
>                     NameSampleDataStream nss = new NameSampleDataStream(
> //<co id="co.opennlp.name.initnamestream"/>
>                       new PlainTextByLineStream(
>                         new java.io.FileReader(inFile)));
>
>                     int iterations = 100;
>                     int cutoff = 5;
>                     TokenNameFinderModel model = NameFinderME.train( //<co
> id="co.opennlp.name.train"/>
>                         "en", // language
>                         "location", // type
>                         nss,
>                         (AdaptiveFeatureGenerator) null,
>                         Collections.<String,Object>emptyMap(),
>                         iterations,
>                         cutoff);
>
>                     File outFile = new File("en-ner-locationNews.bin");
>                     FileOutputStream outFileStream = new
> FileOutputStream(outFile);
>                     model.serialize(outFileStream);
>                 }
> }
>
> when i used the output bi file to test it on a string to pull up Fontana
> from a string the result returned the whole string....don't know why or
> what i am doing wrongthe following code i used to extract Fontana from a
> stringimport java.io.FileInputStream;
>
> import java.io.FileNotFoundException;
> import java.io.IOException;
> import java.io.InputStream;
> import opennlp.tools.namefind.NameFinderME;
> import opennlp.tools.namefind.TokenNameFinderModel;
> import opennlp.tools.util.InvalidFormatException;
> import opennlp.tools.util.Span;
> import opennlp.tools.tokenize.Tokenizer;
> import opennlp.tools.tokenize.TokenizerME;
> import opennlp.tools.tokenize.TokenizerModel;
> import opennlp.tools.tokenize.SimpleTokenizer;
> import opennlp.tools.sentdetect.SentenceDetectorME;
> import opennlp.tools.sentdetect.SentenceModel;
>
> import org.xml.sax.SAXException;
>
>
> public class CityFinder {
>
>     public String Tokens[];
>
>     public static void main(String[] args) throws IOException,
> SAXException {
>
>         CityFinder toi = new CityFinder();
>         String cnt;
>         cnt="John is planning to specialize in Electrical Engineering in
> UC Fontana and pursue a career with IBM.";
>         toi.tokenization(cnt);
>         String cities = toi.namefind(toi.Tokens);
>         String org = toi.orgfind(toi.Tokens);
>
>         System.out.println("City name is : "+cities);
>         System.out.println("organization name is: "+org);
>
>     }
>         public String namefind(String cnt[]) {
>         InputStream is;
>         TokenNameFinderModel tnf;
>         NameFinderME nf;
>         String sd = "";
>         try {
>             is = new FileInputStream("en-ner-locationNew.bin");
>             tnf = new TokenNameFinderModel(is);
>             nf = new NameFinderME(tnf);
>             Span sp[] = nf.find(cnt);
>             String a[] = Span.spansToStrings(sp, cnt);
>             StringBuilder fd = new StringBuilder();
>             int l = a.length;
>
>             for (int j = 0; j < l; j++) {
>                 fd = fd.append(a[j] + "\n");
>
>             }
>             sd = fd.toString();
>
>         } catch (FileNotFoundException e) {
>
>             e.printStackTrace();
>         } catch (InvalidFormatException e) {
>
>             e.printStackTrace();
>         } catch (IOException e) {
>
>             e.printStackTrace();
>         }
>         return sd;
>     }
>
>     public String orgfind(String cnt[]) {
>         InputStream is;
>         TokenNameFinderModel tnf;
>         NameFinderME nf;
>         String sd = "";
>         try {
>             is = new FileInputStream("en-ner-organization.bin");
>             tnf = new TokenNameFinderModel(is);
>             nf = new NameFinderME(tnf);
>             Span sp[] = nf.find(cnt);
>             String a[] = Span.spansToStrings(sp, cnt);
>             StringBuilder fd = new StringBuilder();
>             int l = a.length;
>             for (int j = 0; j < l; j++) {
>                 fd = fd.append(a[j] + "\n");
>
>             }
>
>             sd = fd.toString();
>
>         } catch (FileNotFoundException e) {
>
>             e.printStackTrace();
>         } catch (InvalidFormatException e) {
>
>             e.printStackTrace();
>         } catch (IOException e) {
>
>             e.printStackTrace();
>         }
>         return sd;
>
>     }
>     public void tokenization(String tokens) {
>
>         InputStream is;
>         TokenizerModel tm;
>         try {
>             is = new FileInputStream("en-token.bin");
>             tm = new TokenizerModel(is);
>             Tokenizer tz = new TokenizerME(tm);
>             Tokens = tz.tokenize(tokens);
>             // System.out.println(Tokens[1]);
>         } catch (IOException e) {
>             e.printStackTrace();
>         }
>     }
>
> }
> can you please let me know where i did wrong...??
>




-- 
Regards,
SASIKUMAR N