You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by sm...@apache.org on 2017/04/25 12:06:01 UTC
opennlp git commit: OPENNLP-1035:Add unit tests and javadocs for
BrownBigramFeatureGenerator, closes apache/opennlp#174
Repository: opennlp
Updated Branches:
refs/heads/master 406021733 -> 60595251e
OPENNLP-1035:Add unit tests and javadocs for BrownBigramFeatureGenerator, closes apache/opennlp#174
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/60595251
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/60595251
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/60595251
Branch: refs/heads/master
Commit: 60595251eec5979e14540c6d00043e24905a7404
Parents: 4060217
Author: jzonthemtn <je...@mtnfog.com>
Authored: Tue Apr 25 08:05:49 2017 -0400
Committer: smarthi <sm...@apache.org>
Committed: Tue Apr 25 08:05:49 2017 -0400
----------------------------------------------------------------------
.../featuregen/BrownBigramFeatureGenerator.java | 20 +-
.../BrownBigramFeatureGeneratorTest.java | 87 +++
.../opennlp/tools/formats/brown-cluster.txt | 665 +++++++++++++++++++
3 files changed, 764 insertions(+), 8 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/60595251/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java
index 4f0a24a..f16ba97 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java
@@ -24,25 +24,30 @@ import java.util.List;
*/
public class BrownBigramFeatureGenerator implements AdaptiveFeatureGenerator {
- private BrownCluster brownLexicon;
-
- public BrownBigramFeatureGenerator(BrownCluster dict) {
- this.brownLexicon = dict;
+ private BrownCluster brownCluster;
+
+ /**
+ * Creates a new Brown Cluster bigram feature generator.
+ * @param brownCluster A {@link BrownCluster}.
+ */
+ public BrownBigramFeatureGenerator(BrownCluster brownCluster) {
+ this.brownCluster = brownCluster;
}
+ @Override
public void createFeatures(List<String> features, String[] tokens, int index,
String[] previousOutcomes) {
- List<String> wordClasses = BrownTokenClasses.getWordClasses(tokens[index], brownLexicon);
+ List<String> wordClasses = BrownTokenClasses.getWordClasses(tokens[index], brownCluster);
if (index > 0) {
- List<String> prevWordClasses = BrownTokenClasses.getWordClasses(tokens[index - 1], brownLexicon);
+ List<String> prevWordClasses = BrownTokenClasses.getWordClasses(tokens[index - 1], brownCluster);
for (int i = 0; i < wordClasses.size() && i < prevWordClasses.size(); i++)
features.add("p" + "browncluster" + "," + "browncluster" + "="
+ prevWordClasses.get(i) + "," + wordClasses.get(i));
}
if (index + 1 < tokens.length) {
- List<String> nextWordClasses = BrownTokenClasses.getWordClasses(tokens[index + 1], brownLexicon);
+ List<String> nextWordClasses = BrownTokenClasses.getWordClasses(tokens[index + 1], brownCluster);
for (int i = 0; i < wordClasses.size() && i < nextWordClasses.size(); i++) {
features.add("browncluster" + "," + "n" + "browncluster" + "="
+ wordClasses.get(i) + "," + nextWordClasses.get(i));
@@ -51,4 +56,3 @@ public class BrownBigramFeatureGenerator implements AdaptiveFeatureGenerator {
}
}
-
http://git-wip-us.apache.org/repos/asf/opennlp/blob/60595251/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/BrownBigramFeatureGeneratorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/BrownBigramFeatureGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/BrownBigramFeatureGeneratorTest.java
new file mode 100644
index 0000000..03810e8
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/BrownBigramFeatureGeneratorTest.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.featuregen;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+
+public class BrownBigramFeatureGeneratorTest {
+
+ private AdaptiveFeatureGenerator generator;
+
+ @Before
+ public void setup() throws IOException {
+
+ ResourceAsStreamFactory stream = new ResourceAsStreamFactory(
+ getClass(), "/opennlp/tools/formats/brown-cluster.txt");
+
+ BrownCluster brownCluster = new BrownCluster(stream.createInputStream());
+
+ generator = new BrownBigramFeatureGenerator(brownCluster);
+
+ }
+
+ @Test
+ public void createFeaturesTest() throws IOException {
+
+ String[] tokens = new String[] {"he", "went", "with", "you"};
+
+ List<String> features = new ArrayList<>();
+ generator.createFeatures(features, tokens, 3, null);
+
+ Assert.assertEquals(2, features.size());
+ Assert.assertTrue(features.contains("pbrowncluster,browncluster=0101,0010"));
+ Assert.assertTrue(features.contains("pbrowncluster,browncluster=01010,00101"));
+
+ }
+
+ @Test
+ public void createFeaturesSuccessiveTokensTest() throws IOException {
+
+ final String[] testSentence = new String[] {"he", "went", "with", "you", "in", "town"};
+
+ List<String> features = new ArrayList<>();
+ generator.createFeatures(features, testSentence, 3, null);
+
+ Assert.assertEquals(3, features.size());
+ Assert.assertTrue(features.contains("pbrowncluster,browncluster=0101,0010"));
+ Assert.assertTrue(features.contains("pbrowncluster,browncluster=01010,00101"));
+ Assert.assertTrue(features.contains("browncluster,nbrowncluster=0010,0000"));
+
+ }
+
+ @Test
+ public void noFeaturesTest() throws IOException {
+
+ final String[] testSentence = new String[] {"he", "went", "with", "you"};
+
+ List<String> features = new ArrayList<>();
+ generator.createFeatures(features, testSentence, 0, null);
+
+ Assert.assertEquals(0, features.size());
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/60595251/opennlp-tools/src/test/resources/opennlp/tools/formats/brown-cluster.txt
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/brown-cluster.txt b/opennlp-tools/src/test/resources/opennlp/tools/formats/brown-cluster.txt
new file mode 100644
index 0000000..df31bc7
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/brown-cluster.txt
@@ -0,0 +1,665 @@
+0000 18, 1
+0000 wedding 1
+0000 A 1
+0000 No, 1
+0000 prefered 1
+0000 hurry 1
+0000 address? 1
+0000 sounds 1
+0000 any 1
+0000 soon, 1
+0000 in 56
+0000 Worcesterstreet 1
+00010 summer. 1
+00010 56473 1
+00010 different 1
+00010 20193 1
+00010 Ulm 1
+00010 17818 1
+00010 beautiful 1
+00010 23213 1
+00010 12424 1
+00010 Rue-de-Grandes-Illusions 1
+00010 good. 1
+00010 Barmerstr. 1
+00010 81737 1
+00010 order 1
+00010 1912 1
+00010 63737 1
+00010 Chesterstr. 1
+00010 80333 1
+00010 81234 1
+00010 that's 1
+00010 78181 1
+00010 30291 1
+00010 84630 1
+00010 25334 1
+00010 30303 2
+00010 Leipzig. 2
+00010 your 3
+00010 her 10
+000110 5. 1
+000110 Hamburg, 1
+000110 contact 1
+000110 faked. 1
+000110 streetname 1
+000110 34. 1
+000110 83939 1
+000110 25. 1
+000110 2. 1
+000110 part-time 1
+000110 help-wanted 1
+000110 11 1
+000110 some 1
+000110 Gauting. 1
+000110 address. 1
+000110 parent's 1
+000110 reply. 1
+000110 touch 1
+000110 Berlin. 5
+000110 Munich. 5
+000111 there, 1
+000111 Schulz 1
+000111 Paris 1
+000111 Edinburgh, 1
+000111 day 1
+000111 1 1
+000111 you? 1
+000111 saw 1
+000111 see 1
+000111 house 1
+000111 recently 1
+000111 Don't 1
+000111 back 1
+000111 apartment 1
+000111 12, 1
+000111 Are 2
+000111 Could 2
+000111 did 2
+000111 job 2
+000111 still 3
+000111 Thank 3
+000111 up 3
+00100 30202. 1
+00100 Yesterday, 1
+00100 ad 1
+00100 homesick, 1
+00100 Now, 1
+00100 man 1
+00100 help. 1
+00100 area. 1
+00100 "Westbad". 1
+00100 or 2
+00100 It's 2
+00100 It 2
+00100 The 7
+00100 As 3
+00101 Arent't 1
+00101 offer. 1
+00101 celebrated 1
+00101 available. 1
+00101 spontaneously. 1
+00101 sounding 1
+00101 party 2
+00101 you 12
+001100 last 1
+001100 called, 1
+001100 That 1
+001100 life 1
+001100 pointed 1
+001100 building 1
+001100 restaurant 1
+001100 5, 1
+001100 one 1
+001100 interested 1
+001100 located 1
+001100 Please 1
+001100 answered 1
+001100 Hospital 1
+001100 112, 2
+001100 arrived 3
+001100 lived 4
+001100 lives 4
+001101 Unter-den-Linden 1
+001101 this 1
+001101 moment. 1
+001101 tip 1
+001101 10th 1
+001101 reckon. 1
+001101 factory 1
+001101 line 1
+001101 Paracelsus 1
+001101 Alan 1
+001101 it's 2
+001101 company 2
+001101 who 4
+001110 didn't 1
+001110 postcode 1
+001110 police 1
+001110 building. 1
+001110 concierge 1
+001110 flaring 1
+001110 finally 3
+001110 she 7
+001110 Last 4
+001110 She 5
+0011110 Erding, 1
+0011110 Spain, 1
+0011110 resident, 1
+0011110 lady, 1
+0011110 later 1
+0011110 business 1
+0011110 idea 1
+0011110 Berlin 1
+0011110 England, 1
+0011110 Sure, 1
+0011110 , 10
+0011110 longer 1
+0011111 is. 1
+0011111 15 1
+0011111 Schneider 1
+0011111 Hinterhofer 1
+0011111 me. 1
+0011111 Our 1
+0011111 Seile 1
+0011111 Meier 1
+0011111 Bauer 1
+0011111 Sander 1
+0011111 Clara 1
+0011111 Schmidt 2
+0011111 minutes 2
+0011111 Miller 5
+0100 school 1
+0100 They 1
+0100 8 1
+0100 9 1
+0100 Europe. 1
+0100 those 1
+0100 Baumann, 1
+0100 a 38
+0100 high 1
+01010 About 1
+01010 has 1
+01010 us, 1
+01010 13, 1
+01010 university. 1
+01010 tell 1
+01010 On 2
+01010 than 2
+01010 An 2
+01010 Alisa 2
+01010 on 3
+01010 with 7
+01010 called 5
+01010 got 5
+01011 through 1
+01011 shoes? 1
+01011 city. 1
+01011 quickly 1
+01011 trauma, 1
+01011 situate 1
+01011 much! 1
+01011 then, 1
+01011 friday! 1
+01011 about 1
+01011 knew 2
+01011 of 17
+01011 him 3
+011000 drove 1
+011000 Yes, 1
+011000 away. 1
+011000 parents' 1
+011000 life-threatening, 1
+011000 Weilheim, 1
+011000 15. 1
+011000 33, 1
+011000 86th 1
+011000 1995. 1
+011000 apartment, 1
+011000 took 2
+011000 where 3
+011000 if 5
+011000 But 7
+011001 the 54
+011001 Blumenweg 1
+011010 problem 1
+011010 country 1
+011010 Her 1
+011010 rumour 1
+011010 middle-aged 1
+011010 police. 1
+011010 exhibition. 1
+011010 empty 1
+011010 hours 1
+011010 father 1
+011010 area 1
+011010 staff 1
+011010 Reichstag. 1
+011010 "Tapasbar" 1
+011010 to. 1
+011010 Lenbachhaus 1
+011010 complete 1
+011010 owner 1
+011010 1. 1
+011010 11, 1
+011010 15, 2
+011010 street 2
+011010 accident 2
+011010 Ostbahnhof 2
+011010 address 3
+0110110 help 1
+0110110 grateful 1
+0110110 singer 1
+0110110 new 1
+0110110 moment 1
+0110110 costumers 1
+0110110 ancestors. 1
+0110110 Schubert 1
+0110110 ups 1
+0110110 pedestrians. 1
+0110110 hint 1
+0110110 semester, 1
+0110110 aunt 1
+0110110 face-to-face, 1
+0110110 guests 1
+0110110 happy 1
+0110110 number 2
+0110110 6, 2
+0110110 name 8
+01101110 French 1
+01101110 Luise 1
+01101110 knowledge 1
+01101110 pictures 1
+01101110 them 2
+01101110 away 2
+01101110 out 4
+01101110 years 2
+01101111 pain, 1
+01101111 Is 1
+01101111 sign 1
+01101111 home, 1
+01101111 14, 1
+01101111 appreciated 1
+01101111 happened 1
+01101111 by 1
+01101111 point: 1
+01101111 opened 2
+01101111 near 4
+01101111 instantly 3
+01110 taxi 1
+01110 p.m.! 1
+01110 13 1
+01110 barbecue. 1
+01110 speed 1
+01110 tree. 1
+01110 tenant 1
+01110 metropolis 1
+01110 delivery 1
+01110 family 1
+01110 list 1
+01110 week. 1
+01110 student, 1
+01110 delicious 1
+01110 good 1
+01110 well-payed 1
+01110 student 1
+01110 person! 1
+01110 smaller 1
+01110 small 2
+01110 more 2
+01110 look 2
+01110 quite 2
+01110 bigger 2
+01110 young 2
+01110 tourist 2
+01110 great 3
+01110 letter 3
+01110 friend 4
+0111100 Elenor 1
+0111100 definitely 1
+0111100 Gina 1
+0111100 currently 1
+0111100 Marie 1
+0111100 McKennedy 1
+0111100 ten 1
+0111100 sometimes. 1
+0111100 Michael 1
+0111100 Michel 1
+0111100 competent 1
+0111100 Gerhard 1
+0111100 Stefanie 2
+0111100 five 2
+0111100 Mike 2
+0111100 Stefan 3
+0111101 particulary 1
+0111101 broken. 1
+0111101 10 1
+0111101 leather? 1
+0111101 grandaunt. 1
+0111101 90 1
+0111101 Julie 1
+0111101 badly 1
+0111101 you: 1
+0111101 July 1
+0111101 painfully 1
+0111101 founded 1
+0111101 Fernandes 1
+0111101 old 2
+0111101 elderly 2
+0111101 March 2
+0111101 him. 2
+0111101 2 2
+0111101 an 5
+0111110 6th 1
+0111110 Peter 1
+0111110 turbulent 1
+0111110 German 1
+0111110 informatics, 1
+0111110 phone 1
+0111110 October 1
+0111110 directly 1
+0111110 His 2
+0111110 My 4
+0111110 his 5
+0111110 our 5
+01111110 Oh 1
+01111110 mortal 1
+01111110 Natalie 1
+01111110 83454 1
+01111110 programming 1
+01111110 she's 2
+01111110 Hi 2
+01111110 that 9
+01111111 attention. 1
+01111111 central 1
+01111111 town. 1
+01111111 town 1
+01111111 Spanish 1
+01111111 lodge 1
+01111111 right 1
+01111111 married 2
+01111111 later, 2
+01111111 from 9
+01111111 local 2
+1000 information. 1
+1000 capital. 1
+1000 officer. 1
+1000 retired 1
+1000 most. 1
+1000 reception 1
+1000 wounds 1
+1000 12 1
+1000 personal 1
+1000 colour. 1
+1000 shoes 1
+1000 030/827234. 1
+1000 inquiries? 1
+1000 Brandenburger 1
+1000 computer... 1
+1000 underground 1
+1000 smalltown 1
+1000 city 2
+1000 only 2
+1000 first 4
+1000 home 3
+1000 woman 3
+1000 famous 4
+1001 multiple 1
+1001 France 1
+1001 care 1
+1001 burnt 1
+1001 birthday 1
+1001 there 2
+1001 they 3
+1001 it 8
+1001 He 4
+1001 which 4
+1010 Now 1
+1010 off 1
+1010 yes, 1
+1010 too. 1
+1010 and 30
+1010 56, 1
+10110 Euro, 1
+10110 Heidelberg. 1
+10110 countries, 1
+10110 injured. 1
+10110 widow. 1
+10110 danger. 1
+10110 fact 1
+10110 magazine. 1
+10110 12. 1
+10110 anniversary. 1
+10110 traditional 1
+10110 up, 1
+10110 that? 1
+10110 Fritsch. 1
+10110 amazing, 1
+10110 "Twentytwo". 1
+10110 am 1
+10110 Ottobrunn. 1
+10110 years. 1
+10110 her. 1
+10110 whom 2
+10110 Hamburg. 4
+10110 . 4
+10110 So 6
+10111 photo 1
+10111 place. 1
+10111 p.m.. 1
+10111 Heidelberg's 1
+10111 September, 1
+10111 21, 1
+10111 jacket, 1
+10111 anyway, 1
+10111 Therefore, 1
+10111 couple, 1
+10111 so 2
+10111 When 2
+10111 year, 3
+10111 husband 2
+1100 place, 1
+1100 Convulsed 1
+1100 Driving 1
+1100 notable 1
+1100 album 1
+1100 meal. 1
+1100 I've 2
+1100 Hi, 2
+1100 We 2
+1100 I 37
+110100 takes 1
+110100 reported 1
+110100 is 15
+110100 wasn't 3
+110101 Bye! 1
+110101 He's 1
+110101 bike 1
+110101 can 1
+110101 agency 1
+110101 Highfly-Hotel 1
+110101 shop 1
+110101 "Daily's" 1
+110101 was 15
+110101 depended 1
+110110 Afterwards, 1
+110110 maps. 1
+110110 Lenbachhaus. 1
+110110 flair 1
+110110 immediately 1
+110110 weren't 1
+110110 addresses 1
+110110 desk 1
+110110 station 1
+110110 I'll 1
+110110 Tor 1
+110110 hospital 1
+110110 because 2
+110110 own 2
+110110 into 6
+110110 as 4
+1101110 frequented 1
+1101110 yet 1
+1101110 Since 1
+1101110 made 1
+1101110 what 1
+1101110 he 9
+1101110 information 2
+1101111 Italian. 1
+1101111 entertainer 1
+1101111 foreign 1
+1101111 delighted. 1
+1101111 George 3
+1101111 we 7
+111000 wrote 1
+111000 hadnt't 1
+111000 looking 1
+111000 just 1
+111000 realized 1
+111000 their 1
+111000 never 1
+111000 love 1
+111000 brought 2
+111000 really 2
+111000 heard 2
+111000 Although 2
+111000 like 7
+1110010 live 1
+1110010 don't 1
+1110010 injured 1
+1110010 first, 1
+1110010 hope 1
+1110010 want 1
+1110010 didn`t 1
+1110010 knows 1
+1110010 merely 1
+1110010 two 1
+1110010 worked 2
+1110010 tried 2
+1110010 no 2
+1110010 moved 4
+1110010 best 2
+1110011 need 1
+1110011 always 1
+1110011 alone 1
+1110011 liked 1
+1110011 forward 1
+1110011 proposed 1
+1110011 came 1
+1110011 talking 1
+1110011 pick 1
+1110011 told 2
+1110011 went 2
+1110011 decided 3
+1110011 wanted 3
+1110011 how 3
+1110011 have 4
+1110100 gave 1
+1110100 downs 1
+1110100 appartment 1
+1110100 hospital. 1
+1110100 last-minute. 1
+1110100 languages, 1
+1110100 sights, 1
+1110100 enjoyed 1
+1110100 I'm 6
+1110100 I'd 4
+1110101 felt 1
+1110101 flames 1
+1110101 enjoy 1
+1110101 deem 1
+1110101 called? 1
+1110101 hardly 1
+1110101 spent 1
+1110101 asked 2
+1110101 had 7
+1110101 found 3
+1110110 Munich, 1
+1110110 Scotland, 1
+1110110 day, 1
+1110110 study 1
+1110110 friend. 1
+1110110 after 1
+1110110 apartments 1
+1110110 show 1
+1110110 there. 1
+1110110 read 2
+1110110 get 3
+1110110 know 6
+1110111 right? 1
+1110111 soon 1
+1110111 uni. 1
+1110111 ambulance. 1
+1110111 Sunday 1
+1110111 before. 1
+1110111 possible. 1
+1110111 my 9
+1110111 he'd 2
+111100 you'll 1
+111100 ? 1
+111100 not 2
+111100 to 42
+111101 it. 1
+111101 call 1
+111101 One 1
+111101 Bruno 1
+111101 once 1
+111101 around 1
+111101 for 7
+111101 at 13
+1111100 Hauptbahnhof? 1
+1111100 hesitant 1
+1111100 visit 1
+1111100 completely 1
+1111100 start 1
+1111100 managed 1
+1111100 money 1
+1111100 go 1
+1111100 offered 1
+1111100 possible 1
+1111100 afford 1
+1111100 driver 2
+1111100 write 3
+1111100 easy 2
+1111101 relaxed 1
+1111101 simply 1
+1111101 sure. 1
+1111101 starts 1
+1111101 friendly 1
+1111101 give 1
+1111101 sitting 1
+1111101 going 1
+1111101 urgent 1
+1111101 please 2
+1111101 next 3
+1111101 very 6
+1111110 who's 1
+1111110 much, 1
+1111110 friday? 1
+1111110 explained 1
+1111110 met 1
+1111110 Where 1
+1111110 How 2
+1111110 much 2
+1111110 are 2
+1111110 could 2
+1111110 me 6
+1111110 enough 3
+1111111 seen 1
+1111111 papers 1
+1111111 "Mondnacht" 1
+1111111 both. 1
+1111111 crashed 1
+1111111 studies 1
+1111111 bring 1
+1111111 pull 1
+1111111 teacher 1
+1111111 boy 1
+1111111 far 1
+1111111 move 1
+1111111 travelling 1
+1111111 Yeah 2
+1111111 ring 2
+1111111 meet 2
+1111111 find 5
+1111111 be 3
\ No newline at end of file