You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by nj...@apache.org on 2018/06/27 22:15:05 UTC
madlib git commit: Bugfix: Fix failing dev check in CRF
Repository: madlib
Updated Branches:
refs/heads/master 887a086cb -> 32cf6ba19
Bugfix: Fix failing dev check in CRF
A couple of dev-check files in CRF did not include the label table creation,
which resulted in dev-check failures (it is unclear how these checks were
passing earlier). This commit consists of changes to fix those failures.
Closes #283
Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/32cf6ba1
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/32cf6ba1
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/32cf6ba1
Branch: refs/heads/master
Commit: 32cf6ba1926af1aaf0babbdc210d46df1dc42360
Parents: 887a086
Author: Nandish Jayaram <nj...@apache.org>
Authored: Wed Jun 27 11:25:46 2018 -0700
Committer: Nandish Jayaram <nj...@apache.org>
Committed: Wed Jun 27 15:06:04 2018 -0700
----------------------------------------------------------------------
.../modules/crf/test/crf_test_small.sql_in | 10 +--
.../modules/crf/test/crf_train_large.sql_in | 84 ++++++++++++--------
2 files changed, 54 insertions(+), 40 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/madlib/blob/32cf6ba1/src/ports/postgres/modules/crf/test/crf_test_small.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/crf/test/crf_test_small.sql_in b/src/ports/postgres/modules/crf/test/crf_test_small.sql_in
index 1e9533b..5d1f4bc 100644
--- a/src/ports/postgres/modules/crf/test/crf_test_small.sql_in
+++ b/src/ports/postgres/modules/crf/test/crf_test_small.sql_in
@@ -14,7 +14,7 @@
(2,E'But analysts reckon underlying support for sterling has been eroded by the chancellor ''s failure to announce any new policy measures in his Mansion House speech last Thursday .'),
(3,E'His actions prevent disaster.');
analyze crf_document;
-
+
-- Features table
CREATE TABLE crf_feature_test (id integer,name text,prev_label_id integer,label_id integer,weight float);
INSERT INTO crf_feature_test VALUES
@@ -90,7 +90,7 @@
(18,'PRP$'),(19,'RB'), (20,'RBR'), (21,'RBS'), (22,'RP'), (23,'SYM'), (24,'TO'), (25,'UH'), (26,'VB'),
(27,'VBD'), (28,'VBG'),(29,'VBN'), (30,'VBP'), (31,'VBZ'),(32,'WDT'), (33,'WP'), (34,'WP$'),(35,'WRB'),
(36,'$'), (37,'#'), (38,''''''), (39,'``'), (40,'('), (41,')'), (42,','), (43,'.'), (44,':');
- analyze crf_label;
+ analyze test_crf_label;
-- Segment table
CREATE TABLE test_segmenttbl (start_pos integer,doc_id integer,seg_text text,max_pos integer);
@@ -106,15 +106,15 @@
(13,2,'''s',28), (14,2,'failure',28), (15,2,'to',28), (16,2,'announce',28), (17,2,'any',28),
(18,2,'new',28), (19,2,'policy',28), (20,2,'measures',28),(21,2,'in',28), (22,2,'his',28),
(23,2,'mansion',28), (24,2,'house',28), (25,2,'speech',28), (26,2,'last',28), (27,2,'thursday',28),
- (28,2,'.',28), (0,3,'his',4), (1,3,'actions',4), (2,3,'prevent',4), (3,3,'disaster',4),
+ (28,2,'.',28), (0,3,'his',4), (1,3,'actions',4), (2,3,'prevent',4), (3,3,'disaster',4),
(4,3,'.',4);
analyze test_segmenttbl;
- -- extract features for tokens stored in segmenttbl
+ -- extract features for tokens stored in segmenttbl
SELECT crf_test_fgen('test_segmenttbl','crf_dictionary','test_crf_label','crf_regex','crf_feature_test','viterbi_mtbl','viterbi_rtbl');
-- Expected viterbi labeling result
- -- The result is produced from Dr. Sunita's CRF java package with the same input
+ -- The result is produced from Dr. Sunita's CRF java package with the same input
CREATE TABLE expected_extraction(doc_id integer, start_pos integer, seg_text text, label character varying);
INSERT INTO expected_extraction VALUES
(1,0,'chancellor','NNP'),(1,1,'of','IN'), (1,2,'the','DT'), (1,3,'exchequer','NNP'), (1,4,'nigel','NNP'),
http://git-wip-us.apache.org/repos/asf/madlib/blob/32cf6ba1/src/ports/postgres/modules/crf/test/crf_train_large.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/crf/test/crf_train_large.sql_in b/src/ports/postgres/modules/crf/test/crf_train_large.sql_in
index d4a2d86..b6d4e7c 100644
--- a/src/ports/postgres/modules/crf/test/crf_train_large.sql_in
+++ b/src/ports/postgres/modules/crf/test/crf_train_large.sql_in
@@ -234,26 +234,40 @@ INSERT INTO train_new_segmenttbl VALUES
(30, 7, 'years', 13, 31),
(31, 7, '.', 44, 31);
-CREATE TABLE train_new_regex(pattern text,name text);
+CREATE TABLE train_new_regex(pattern text,name text);
INSERT INTO train_new_regex VALUES
-('^[A-Z][a-z]+$','InitCapital'), ('^[A-Z]+$','isAllCapital'),
+ ('^[A-Z][a-z]+$','InitCapital'), ('^[A-Z]+$','isAllCapital'),
('^.*[0-9]+.*$','containsDigit'),('^.+[.]$','endsWithDot'),
('^.+[,]$','endsWithComma'), ('^.+er$','endsWithER'),
('^.+est$','endsWithEst'), ('^.+ed$','endsWithED'),
('^.+s$','endsWithS'), ('^.+ing$','endsWithIng'),
('^.+ly$','endsWithly'), ('^.+-.+$','isDashSeparatedWords'),
('^.*@.*$','isEmailId');
- analyze train_new_regex;
+analyze train_new_regex;
- SELECT crf_train_fgen('train_new_segmenttbl', 'train_new_regex', 'crf_label', 'train_new_dictionary', 'train_new_featuretbl','train_new_featureset');
+CREATE TABLE crf_label_new (id integer,label character varying);
+INSERT INTO crf_label_new VALUES
+ (0,'CC'), (1,'CD'), (2,'DT'), (3,'EX'), (4,'FW'), (5,'IN'), (6,'JJ'), (7,'JJR'), (8,'JJS'),
+ (9,'LS'), (10,'MD'), (11,'NN'), (12,'NNS'), (13,'NNP'),(14,'NNPS'),(15,'PDT'),(16,'POS'),(17,'PRP'),
+ (18,'PRP$'),(19,'RB'), (20,'RBR'), (21,'RBS'), (22,'RP'), (23,'SYM'), (24,'TO'), (25,'UH'), (26,'VB'),
+ (27,'VBD'), (28,'VBG'),(29,'VBN'), (30,'VBP'), (31,'VBZ'),(32,'WDT'), (33,'WP'), (34,'WP$'),(35,'WRB'),
+ (36,'$'), (37,'#'), (38,'''''');
+INSERT INTO crf_label_new VALUES
+ (39,<!'``'!>);
+m4_changequote(,)
+INSERT INTO crf_label_new VALUES
+ (40,'('), (41,')'), (42,','), (43,'.'), (44,':');
+analyze crf_label_new;
- SELECT lincrf_train('train_new_featuretbl', 'train_new_featureset', 'crf_label', 'train_new_stats', 'train_new_crf_feature', 30);
+SELECT crf_train_fgen('train_new_segmenttbl', 'train_new_regex', 'crf_label_new', 'train_new_dictionary', 'train_new_featuretbl','train_new_featureset');
- -- Expected feature table
- -- The result is produced from Dr. Sunita's CRF java package with the same input
- CREATE TABLE expected_crf_feature_new(id integer,name text,prev_label integer,label integer,weight float);
+SELECT lincrf_train('train_new_featuretbl', 'train_new_featureset', 'crf_label_new', 'train_new_stats', 'train_new_crf_feature', 30);
- INSERT INTO expected_crf_feature_new VALUES
+-- Expected feature table
+-- The result is produced from Dr. Sunita's CRF java package with the same input
+CREATE TABLE expected_crf_feature_new(id integer,name text,prev_label integer,label integer,weight float);
+
+INSERT INTO expected_crf_feature_new VALUES
(0, 'S.', -1, 12, 0.5516753522178934),
(1, 'W_freight', -1, 12, 5.959241076198326),
(2, 'E.', 12, 13, 2.0789747316372034),
@@ -545,33 +559,33 @@ INSERT INTO train_new_regex VALUES
(288, 'E.', 27, 13, 0.6748848167259296),
(289, 'W_past', -1, 7, 2.852378831268221);
- SELECT assert(
- SUM(abs(c1.weight-c2.weight)) < 0.1,
- 'Total difference between extracted feature weights and expected feature weights is > 0.1.')
- FROM expected_crf_feature_new c1, train_new_crf_feature c2
- WHERE c1.name = c2.name AND c1.prev_label = c2.prev_label_id AND c1.label = c2.label_id;;
+SELECT assert(
+ SUM(abs(c1.weight-c2.weight)) < 0.1,
+ 'Total difference between extracted feature weights and expected feature weights is > 0.1.')
+FROM expected_crf_feature_new c1, train_new_crf_feature c2
+WHERE c1.name = c2.name AND c1.prev_label = c2.prev_label_id AND c1.label = c2.label_id;;
- -- Compare the expected features and the extraction features. It fails
- -- if the features do not match.
- SELECT assert(s1.count+s2.count = 0, 'Features extracted do not match expected features.')
- FROM (
- SELECT count(*) FROM(
- SELECT name, prev_label, label
- FROM expected_crf_feature_new
- EXCEPT ALL
- SELECT name, prev_label_id, label_id
- FROM train_new_crf_feature
- ) AS U
- )s1,
- (
- SELECT count(*) FROM(
- SELECT name, prev_label_id, label_id
- FROM train_new_crf_feature
- EXCEPT ALL
- SELECT name, prev_label, label
- FROM expected_crf_feature_new
- ) AS U
- )s2;
+-- Compare the expected features and the extraction features. It fails
+-- if the features do not match.
+SELECT assert(s1.count+s2.count = 0, 'Features extracted do not match expected features.')
+FROM (
+ SELECT count(*) FROM(
+ SELECT name, prev_label, label
+ FROM expected_crf_feature_new
+ EXCEPT ALL
+ SELECT name, prev_label_id, label_id
+ FROM train_new_crf_feature
+ ) AS U
+)s1,
+(
+ SELECT count(*) FROM(
+ SELECT name, prev_label_id, label_id
+ FROM train_new_crf_feature
+ EXCEPT ALL
+ SELECT name, prev_label, label
+ FROM expected_crf_feature_new
+ ) AS U
+)s2;
!>)
m4_changequote(<!`!>,<!'!>)