You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by sm...@apache.org on 2017/04/25 12:06:01 UTC

opennlp git commit: OPENNLP-1035: Add unit tests and javadocs for BrownBigramFeatureGenerator, closes apache/opennlp#174

Repository: opennlp
Updated Branches:
  refs/heads/master 406021733 -> 60595251e


OPENNLP-1035: Add unit tests and javadocs for BrownBigramFeatureGenerator, closes apache/opennlp#174


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/60595251
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/60595251
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/60595251

Branch: refs/heads/master
Commit: 60595251eec5979e14540c6d00043e24905a7404
Parents: 4060217
Author: jzonthemtn <je...@mtnfog.com>
Authored: Tue Apr 25 08:05:49 2017 -0400
Committer: smarthi <sm...@apache.org>
Committed: Tue Apr 25 08:05:49 2017 -0400

----------------------------------------------------------------------
 .../featuregen/BrownBigramFeatureGenerator.java |  20 +-
 .../BrownBigramFeatureGeneratorTest.java        |  87 +++
 .../opennlp/tools/formats/brown-cluster.txt     | 665 +++++++++++++++++++
 3 files changed, 764 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/60595251/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java
index 4f0a24a..f16ba97 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java
@@ -24,25 +24,30 @@ import java.util.List;
  */
 public class BrownBigramFeatureGenerator implements AdaptiveFeatureGenerator {
 
-  private BrownCluster brownLexicon;
-
-  public BrownBigramFeatureGenerator(BrownCluster dict) {
-    this.brownLexicon = dict;
+  private BrownCluster brownCluster;
+
+  /**
+   * Creates a new Brown Cluster bigram feature generator.
+   * @param brownCluster A {@link BrownCluster}.
+   */
+  public BrownBigramFeatureGenerator(BrownCluster brownCluster) {
+    this.brownCluster = brownCluster;
   }
 
+  @Override
   public void createFeatures(List<String> features, String[] tokens, int index,
       String[] previousOutcomes) {
 
-    List<String> wordClasses = BrownTokenClasses.getWordClasses(tokens[index], brownLexicon);
+    List<String> wordClasses = BrownTokenClasses.getWordClasses(tokens[index], brownCluster);
     if (index > 0) {
-      List<String> prevWordClasses = BrownTokenClasses.getWordClasses(tokens[index - 1], brownLexicon);
+      List<String> prevWordClasses = BrownTokenClasses.getWordClasses(tokens[index - 1], brownCluster);
       for (int i = 0; i < wordClasses.size() && i < prevWordClasses.size(); i++)
       features.add("p" + "browncluster" + "," + "browncluster" + "="
           + prevWordClasses.get(i) + "," + wordClasses.get(i));
     }
 
     if (index + 1 < tokens.length) {
-      List<String> nextWordClasses = BrownTokenClasses.getWordClasses(tokens[index + 1], brownLexicon);
+      List<String> nextWordClasses = BrownTokenClasses.getWordClasses(tokens[index + 1], brownCluster);
       for (int i = 0; i < wordClasses.size() && i < nextWordClasses.size(); i++) {
         features.add("browncluster" + "," + "n" + "browncluster" + "="
             + wordClasses.get(i) + "," + nextWordClasses.get(i));
@@ -51,4 +56,3 @@ public class BrownBigramFeatureGenerator implements AdaptiveFeatureGenerator {
   }
 
 }
-

http://git-wip-us.apache.org/repos/asf/opennlp/blob/60595251/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/BrownBigramFeatureGeneratorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/BrownBigramFeatureGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/BrownBigramFeatureGeneratorTest.java
new file mode 100644
index 0000000..03810e8
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/BrownBigramFeatureGeneratorTest.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.featuregen;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+
+public class BrownBigramFeatureGeneratorTest {
+  // Exercises BrownBigramFeatureGenerator against the brown-cluster.txt test resource.
+  private AdaptiveFeatureGenerator generator;
+  
+  @Before
+  public void setup() throws IOException {
+    // JUnit runs this before each test, so every test gets a generator built from the resource.
+    ResourceAsStreamFactory stream = new ResourceAsStreamFactory(
+        getClass(), "/opennlp/tools/formats/brown-cluster.txt");
+
+    BrownCluster brownCluster = new BrownCluster(stream.createInputStream()); 
+    
+    generator = new BrownBigramFeatureGenerator(brownCluster);
+
+  }
+
+  @Test
+  public void createFeaturesTest() throws IOException {
+    // Last token: only the previous-token branch (index > 0) of createFeatures can apply.
+    String[] tokens = new String[] {"he", "went", "with", "you"};
+
+    List<String> features = new ArrayList<>();
+    generator.createFeatures(features, tokens, 3, null);
+    // The resource lists "with" under 01010 and "you" under 00101; expect prefix and full-path pairs.
+    Assert.assertEquals(2, features.size());
+    Assert.assertTrue(features.contains("pbrowncluster,browncluster=0101,0010"));
+    Assert.assertTrue(features.contains("pbrowncluster,browncluster=01010,00101"));
+    
+  }
+  
+  @Test
+  public void createFeaturesSuccessiveTokensTest() throws IOException {
+    // Same focus token ("you") but followed by more tokens, so the next-token branch applies too.
+    final String[] testSentence = new String[] {"he", "went", "with", "you", "in", "town"};
+
+    List<String> features = new ArrayList<>();
+    generator.createFeatures(features, testSentence, 3, null);
+    // Adds one feature pairing "you" with "in" (listed under 0000 in the resource).
+    Assert.assertEquals(3, features.size());
+    Assert.assertTrue(features.contains("pbrowncluster,browncluster=0101,0010"));
+    Assert.assertTrue(features.contains("pbrowncluster,browncluster=01010,00101"));
+    Assert.assertTrue(features.contains("browncluster,nbrowncluster=0010,0000"));
+    
+  }
+  
+  @Test
+  public void noFeaturesTest() throws IOException {
+    // First token: no previous token exists, so only the ("he", "went") pair is considered.
+    final String[] testSentence = new String[] {"he", "went", "with", "you"};
+
+    List<String> features = new ArrayList<>();
+    generator.createFeatures(features, testSentence, 0, null);
+    // NOTE(review): both words are in the resource, so presumably one is filtered at load time - confirm.
+    Assert.assertEquals(0, features.size());
+    
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/60595251/opennlp-tools/src/test/resources/opennlp/tools/formats/brown-cluster.txt
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/brown-cluster.txt b/opennlp-tools/src/test/resources/opennlp/tools/formats/brown-cluster.txt
new file mode 100644
index 0000000..df31bc7
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/brown-cluster.txt
@@ -0,0 +1,665 @@
+0000	18,	1
+0000	wedding	1
+0000	A	1
+0000	No,	1
+0000	prefered	1
+0000	hurry	1
+0000	address?	1
+0000	sounds	1
+0000	any	1
+0000	soon,	1
+0000	in	56
+0000	Worcesterstreet	1
+00010	summer.	1
+00010	56473	1
+00010	different	1
+00010	20193	1
+00010	Ulm	1
+00010	17818	1
+00010	beautiful	1
+00010	23213	1
+00010	12424	1
+00010	Rue-de-Grandes-Illusions	1
+00010	good.	1
+00010	Barmerstr.	1
+00010	81737	1
+00010	order	1
+00010	1912	1
+00010	63737	1
+00010	Chesterstr.	1
+00010	80333	1
+00010	81234	1
+00010	that's	1
+00010	78181	1
+00010	30291	1
+00010	84630	1
+00010	25334	1
+00010	30303	2
+00010	Leipzig.	2
+00010	your	3
+00010	her	10
+000110	5.	1
+000110	Hamburg,	1
+000110	contact	1
+000110	faked.	1
+000110	streetname	1
+000110	34.	1
+000110	83939	1
+000110	25.	1
+000110	2.	1
+000110	part-time	1
+000110	help-wanted	1
+000110	11	1
+000110	some	1
+000110	Gauting.	1
+000110	address.	1
+000110	parent's	1
+000110	reply.	1
+000110	touch	1
+000110	Berlin.	5
+000110	Munich.	5
+000111	there,	1
+000111	Schulz	1
+000111	Paris	1
+000111	Edinburgh,	1
+000111	day	1
+000111	1	1
+000111	you?	1
+000111	saw	1
+000111	see	1
+000111	house	1
+000111	recently	1
+000111	Don't	1
+000111	back	1
+000111	apartment	1
+000111	12,	1
+000111	Are	2
+000111	Could	2
+000111	did	2
+000111	job	2
+000111	still	3
+000111	Thank	3
+000111	up	3
+00100	30202.	1
+00100	Yesterday,	1
+00100	ad	1
+00100	homesick,	1
+00100	Now,	1
+00100	man	1
+00100	help.	1
+00100	area.	1
+00100	"Westbad".	1
+00100	or	2
+00100	It's	2
+00100	It	2
+00100	The	7
+00100	As	3
+00101	Arent't	1
+00101	offer.	1
+00101	celebrated	1
+00101	available.	1
+00101	spontaneously.	1
+00101	sounding	1
+00101	party	2
+00101	you	12
+001100	last	1
+001100	called,	1
+001100	That	1
+001100	life	1
+001100	pointed	1
+001100	building	1
+001100	restaurant	1
+001100	5,	1
+001100	one	1
+001100	interested	1
+001100	located	1
+001100	Please	1
+001100	answered	1
+001100	Hospital	1
+001100	112,	2
+001100	arrived	3
+001100	lived	4
+001100	lives	4
+001101	Unter-den-Linden	1
+001101	this	1
+001101	moment.	1
+001101	tip	1
+001101	10th	1
+001101	reckon.	1
+001101	factory	1
+001101	line	1
+001101	Paracelsus	1
+001101	Alan	1
+001101	it's	2
+001101	company	2
+001101	who	4
+001110	didn't	1
+001110	postcode	1
+001110	police	1
+001110	building.	1
+001110	concierge	1
+001110	flaring	1
+001110	finally	3
+001110	she	7
+001110	Last	4
+001110	She	5
+0011110	Erding,	1
+0011110	Spain,	1
+0011110	resident,	1
+0011110	lady,	1
+0011110	later	1
+0011110	business	1
+0011110	idea	1
+0011110	Berlin	1
+0011110	England,	1
+0011110	Sure,	1
+0011110	,	10
+0011110	longer	1
+0011111	is.	1
+0011111	15	1
+0011111	Schneider	1
+0011111	Hinterhofer	1
+0011111	me.	1
+0011111	Our	1
+0011111	Seile	1
+0011111	Meier	1
+0011111	Bauer	1
+0011111	Sander	1
+0011111	Clara	1
+0011111	Schmidt	2
+0011111	minutes	2
+0011111	Miller	5
+0100	school	1
+0100	They	1
+0100	8	1
+0100	9	1
+0100	Europe.	1
+0100	those	1
+0100	Baumann,	1
+0100	a	38
+0100	high	1
+01010	About	1
+01010	has	1
+01010	us,	1
+01010	13,	1
+01010	university.	1
+01010	tell	1
+01010	On	2
+01010	than	2
+01010	An	2
+01010	Alisa	2
+01010	on	3
+01010	with	7
+01010	called	5
+01010	got	5
+01011	through	1
+01011	shoes?	1
+01011	city.	1
+01011	quickly	1
+01011	trauma,	1
+01011	situate	1
+01011	much!	1
+01011	then,	1
+01011	friday!	1
+01011	about	1
+01011	knew	2
+01011	of	17
+01011	him	3
+011000	drove	1
+011000	Yes,	1
+011000	away.	1
+011000	parents'	1
+011000	life-threatening,	1
+011000	Weilheim,	1
+011000	15.	1
+011000	33,	1
+011000	86th	1
+011000	1995.	1
+011000	apartment,	1
+011000	took	2
+011000	where	3
+011000	if	5
+011000	But	7
+011001	the	54
+011001	Blumenweg	1
+011010	problem	1
+011010	country	1
+011010	Her	1
+011010	rumour	1
+011010	middle-aged	1
+011010	police.	1
+011010	exhibition.	1
+011010	empty	1
+011010	hours	1
+011010	father	1
+011010	area	1
+011010	staff	1
+011010	Reichstag.	1
+011010	"Tapasbar"	1
+011010	to.	1
+011010	Lenbachhaus	1
+011010	complete	1
+011010	owner	1
+011010	1.	1
+011010	11,	1
+011010	15,	2
+011010	street	2
+011010	accident	2
+011010	Ostbahnhof	2
+011010	address	3
+0110110	help	1
+0110110	grateful	1
+0110110	singer	1
+0110110	new	1
+0110110	moment	1
+0110110	costumers	1
+0110110	ancestors.	1
+0110110	Schubert	1
+0110110	ups	1
+0110110	pedestrians.	1
+0110110	hint	1
+0110110	semester,	1
+0110110	aunt	1
+0110110	face-to-face,	1
+0110110	guests	1
+0110110	happy	1
+0110110	number	2
+0110110	6,	2
+0110110	name	8
+01101110	French	1
+01101110	Luise	1
+01101110	knowledge	1
+01101110	pictures	1
+01101110	them	2
+01101110	away	2
+01101110	out	4
+01101110	years	2
+01101111	pain,	1
+01101111	Is	1
+01101111	sign	1
+01101111	home,	1
+01101111	14,	1
+01101111	appreciated	1
+01101111	happened	1
+01101111	by	1
+01101111	point:	1
+01101111	opened	2
+01101111	near	4
+01101111	instantly	3
+01110	taxi	1
+01110	p.m.!	1
+01110	13	1
+01110	barbecue.	1
+01110	speed	1
+01110	tree.	1
+01110	tenant	1
+01110	metropolis	1
+01110	delivery	1
+01110	family	1
+01110	list	1
+01110	week.	1
+01110	student,	1
+01110	delicious	1
+01110	good	1
+01110	well-payed	1
+01110	student	1
+01110	person!	1
+01110	smaller	1
+01110	small	2
+01110	more	2
+01110	look	2
+01110	quite	2
+01110	bigger	2
+01110	young	2
+01110	tourist	2
+01110	great	3
+01110	letter	3
+01110	friend	4
+0111100	Elenor	1
+0111100	definitely	1
+0111100	Gina	1
+0111100	currently	1
+0111100	Marie	1
+0111100	McKennedy	1
+0111100	ten	1
+0111100	sometimes.	1
+0111100	Michael	1
+0111100	Michel	1
+0111100	competent	1
+0111100	Gerhard	1
+0111100	Stefanie	2
+0111100	five	2
+0111100	Mike	2
+0111100	Stefan	3
+0111101	particulary	1
+0111101	broken.	1
+0111101	10	1
+0111101	leather?	1
+0111101	grandaunt.	1
+0111101	90	1
+0111101	Julie	1
+0111101	badly	1
+0111101	you:	1
+0111101	July	1
+0111101	painfully	1
+0111101	founded	1
+0111101	Fernandes	1
+0111101	old	2
+0111101	elderly	2
+0111101	March	2
+0111101	him.	2
+0111101	2	2
+0111101	an	5
+0111110	6th	1
+0111110	Peter	1
+0111110	turbulent	1
+0111110	German	1
+0111110	informatics,	1
+0111110	phone	1
+0111110	October	1
+0111110	directly	1
+0111110	His	2
+0111110	My	4
+0111110	his	5
+0111110	our	5
+01111110	Oh	1
+01111110	mortal	1
+01111110	Natalie	1
+01111110	83454	1
+01111110	programming	1
+01111110	she's	2
+01111110	Hi	2
+01111110	that	9
+01111111	attention.	1
+01111111	central	1
+01111111	town.	1
+01111111	town	1
+01111111	Spanish	1
+01111111	lodge	1
+01111111	right	1
+01111111	married	2
+01111111	later,	2
+01111111	from	9
+01111111	local	2
+1000	information.	1
+1000	capital.	1
+1000	officer.	1
+1000	retired	1
+1000	most.	1
+1000	reception	1
+1000	wounds	1
+1000	12	1
+1000	personal	1
+1000	colour.	1
+1000	shoes	1
+1000	030/827234.	1
+1000	inquiries?	1
+1000	Brandenburger	1
+1000	computer...	1
+1000	underground	1
+1000	smalltown	1
+1000	city	2
+1000	only	2
+1000	first	4
+1000	home	3
+1000	woman	3
+1000	famous	4
+1001	multiple	1
+1001	France	1
+1001	care	1
+1001	burnt	1
+1001	birthday	1
+1001	there	2
+1001	they	3
+1001	it	8
+1001	He	4
+1001	which	4
+1010	Now	1
+1010	off	1
+1010	yes,	1
+1010	too.	1
+1010	and	30
+1010	56,	1
+10110	Euro,	1
+10110	Heidelberg.	1
+10110	countries,	1
+10110	injured.	1
+10110	widow.	1
+10110	danger.	1
+10110	fact	1
+10110	magazine.	1
+10110	12.	1
+10110	anniversary.	1
+10110	traditional	1
+10110	up,	1
+10110	that?	1
+10110	Fritsch.	1
+10110	amazing,	1
+10110	"Twentytwo".	1
+10110	am	1
+10110	Ottobrunn.	1
+10110	years.	1
+10110	her.	1
+10110	whom	2
+10110	Hamburg.	4
+10110	.	4
+10110	So	6
+10111	photo	1
+10111	place.	1
+10111	p.m..	1
+10111	Heidelberg's	1
+10111	September,	1
+10111	21,	1
+10111	jacket,	1
+10111	anyway,	1
+10111	Therefore,	1
+10111	couple,	1
+10111	so	2
+10111	When	2
+10111	year,	3
+10111	husband	2
+1100	place,	1
+1100	Convulsed	1
+1100	Driving	1
+1100	notable	1
+1100	album	1
+1100	meal.	1
+1100	I've	2
+1100	Hi,	2
+1100	We	2
+1100	I	37
+110100	takes	1
+110100	reported	1
+110100	is	15
+110100	wasn't	3
+110101	Bye!	1
+110101	He's	1
+110101	bike	1
+110101	can	1
+110101	agency	1
+110101	Highfly-Hotel	1
+110101	shop	1
+110101	"Daily's"	1
+110101	was	15
+110101	depended	1
+110110	Afterwards,	1
+110110	maps.	1
+110110	Lenbachhaus.	1
+110110	flair	1
+110110	immediately	1
+110110	weren't	1
+110110	addresses	1
+110110	desk	1
+110110	station	1
+110110	I'll	1
+110110	Tor	1
+110110	hospital	1
+110110	because	2
+110110	own	2
+110110	into	6
+110110	as	4
+1101110	frequented	1
+1101110	yet	1
+1101110	Since	1
+1101110	made	1
+1101110	what	1
+1101110	he	9
+1101110	information	2
+1101111	Italian.	1
+1101111	entertainer	1
+1101111	foreign	1
+1101111	delighted.	1
+1101111	George	3
+1101111	we	7
+111000	wrote	1
+111000	hadnt't	1
+111000	looking	1
+111000	just	1
+111000	realized	1
+111000	their	1
+111000	never	1
+111000	love	1
+111000	brought	2
+111000	really	2
+111000	heard	2
+111000	Although	2
+111000	like	7
+1110010	live	1
+1110010	don't	1
+1110010	injured	1
+1110010	first,	1
+1110010	hope	1
+1110010	want	1
+1110010	didn`t	1
+1110010	knows	1
+1110010	merely	1
+1110010	two	1
+1110010	worked	2
+1110010	tried	2
+1110010	no	2
+1110010	moved	4
+1110010	best	2
+1110011	need	1
+1110011	always	1
+1110011	alone	1
+1110011	liked	1
+1110011	forward	1
+1110011	proposed	1
+1110011	came	1
+1110011	talking	1
+1110011	pick	1
+1110011	told	2
+1110011	went	2
+1110011	decided	3
+1110011	wanted	3
+1110011	how	3
+1110011	have	4
+1110100	gave	1
+1110100	downs	1
+1110100	appartment	1
+1110100	hospital.	1
+1110100	last-minute.	1
+1110100	languages,	1
+1110100	sights,	1
+1110100	enjoyed	1
+1110100	I'm	6
+1110100	I'd	4
+1110101	felt	1
+1110101	flames	1
+1110101	enjoy	1
+1110101	deem	1
+1110101	called?	1
+1110101	hardly	1
+1110101	spent	1
+1110101	asked	2
+1110101	had	7
+1110101	found	3
+1110110	Munich,	1
+1110110	Scotland,	1
+1110110	day,	1
+1110110	study	1
+1110110	friend.	1
+1110110	after	1
+1110110	apartments	1
+1110110	show	1
+1110110	there.	1
+1110110	read	2
+1110110	get	3
+1110110	know	6
+1110111	right?	1
+1110111	soon	1
+1110111	uni.	1
+1110111	ambulance.	1
+1110111	Sunday	1
+1110111	before.	1
+1110111	possible.	1
+1110111	my	9
+1110111	he'd	2
+111100	you'll	1
+111100	?	1
+111100	not	2
+111100	to	42
+111101	it.	1
+111101	call	1
+111101	One	1
+111101	Bruno	1
+111101	once	1
+111101	around	1
+111101	for	7
+111101	at	13
+1111100	Hauptbahnhof?	1
+1111100	hesitant	1
+1111100	visit	1
+1111100	completely	1
+1111100	start	1
+1111100	managed	1
+1111100	money	1
+1111100	go	1
+1111100	offered	1
+1111100	possible	1
+1111100	afford	1
+1111100	driver	2
+1111100	write	3
+1111100	easy	2
+1111101	relaxed	1
+1111101	simply	1
+1111101	sure.	1
+1111101	starts	1
+1111101	friendly	1
+1111101	give	1
+1111101	sitting	1
+1111101	going	1
+1111101	urgent	1
+1111101	please	2
+1111101	next	3
+1111101	very	6
+1111110	who's	1
+1111110	much,	1
+1111110	friday?	1
+1111110	explained	1
+1111110	met	1
+1111110	Where	1
+1111110	How	2
+1111110	much	2
+1111110	are	2
+1111110	could	2
+1111110	me	6
+1111110	enough	3
+1111111	seen	1
+1111111	papers	1
+1111111	"Mondnacht"	1
+1111111	both.	1
+1111111	crashed	1
+1111111	studies	1
+1111111	bring	1
+1111111	pull	1
+1111111	teacher	1
+1111111	boy	1
+1111111	far	1
+1111111	move	1
+1111111	travelling	1
+1111111	Yeah	2
+1111111	ring	2
+1111111	meet	2
+1111111	find	5
+1111111	be	3
\ No newline at end of file