You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@madlib.apache.org by nj...@apache.org on 2018/06/27 22:15:05 UTC

madlib git commit: Bugfix: Fix failing dev check in CRF

Repository: madlib
Updated Branches:
  refs/heads/master 887a086cb -> 32cf6ba19


Bugfix: Fix failing dev check in CRF

A couple of dev-check files in CRF did not include the label table creation,
which resulted in dev-check failures (it is not clear how these checks were
passing earlier). This commit consists of changes to fix those failures.

Closes #283


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/32cf6ba1
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/32cf6ba1
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/32cf6ba1

Branch: refs/heads/master
Commit: 32cf6ba1926af1aaf0babbdc210d46df1dc42360
Parents: 887a086
Author: Nandish Jayaram <nj...@apache.org>
Authored: Wed Jun 27 11:25:46 2018 -0700
Committer: Nandish Jayaram <nj...@apache.org>
Committed: Wed Jun 27 15:06:04 2018 -0700

----------------------------------------------------------------------
 .../modules/crf/test/crf_test_small.sql_in      | 10 +--
 .../modules/crf/test/crf_train_large.sql_in     | 84 ++++++++++++--------
 2 files changed, 54 insertions(+), 40 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/32cf6ba1/src/ports/postgres/modules/crf/test/crf_test_small.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/crf/test/crf_test_small.sql_in b/src/ports/postgres/modules/crf/test/crf_test_small.sql_in
index 1e9533b..5d1f4bc 100644
--- a/src/ports/postgres/modules/crf/test/crf_test_small.sql_in
+++ b/src/ports/postgres/modules/crf/test/crf_test_small.sql_in
@@ -14,7 +14,7 @@
         (2,E'But  analysts  reckon  underlying  support  for  sterling  has  been  eroded  by  the  chancellor ''s  failure  to  announce  any  new  policy  measures  in  his  Mansion  House  speech  last  Thursday  .'),
         (3,E'His actions prevent disaster.');
 	analyze crf_document;
-        
+
 	-- Features table
 	CREATE TABLE crf_feature_test (id integer,name text,prev_label_id integer,label_id integer,weight float);
         INSERT INTO crf_feature_test VALUES
@@ -90,7 +90,7 @@
         (18,'PRP$'),(19,'RB'), (20,'RBR'),  (21,'RBS'), (22,'RP'), (23,'SYM'), (24,'TO'), (25,'UH'), (26,'VB'),
         (27,'VBD'), (28,'VBG'),(29,'VBN'),  (30,'VBP'), (31,'VBZ'),(32,'WDT'), (33,'WP'), (34,'WP$'),(35,'WRB'),
         (36,'$'),   (37,'#'),  (38,''''''), (39,'``'),  (40,'('),  (41,')'),   (42,','),  (43,'.'),  (44,':');
-	analyze crf_label;
+	analyze test_crf_label;
 
 	-- Segment table
 	CREATE TABLE test_segmenttbl (start_pos integer,doc_id integer,seg_text text,max_pos integer);
@@ -106,15 +106,15 @@
 	(13,2,'''s',28),      (14,2,'failure',28), (15,2,'to',28),      (16,2,'announce',28), (17,2,'any',28),
 	(18,2,'new',28),      (19,2,'policy',28),  (20,2,'measures',28),(21,2,'in',28),       (22,2,'his',28),
 	(23,2,'mansion',28),  (24,2,'house',28),   (25,2,'speech',28),  (26,2,'last',28),     (27,2,'thursday',28),
-	(28,2,'.',28),        (0,3,'his',4),       (1,3,'actions',4),   (2,3,'prevent',4),    (3,3,'disaster',4), 
+	(28,2,'.',28),        (0,3,'his',4),       (1,3,'actions',4),   (2,3,'prevent',4),    (3,3,'disaster',4),
         (4,3,'.',4);
 	analyze test_segmenttbl;
 
-	-- extract features for tokens stored in segmenttbl 
+	-- extract features for tokens stored in segmenttbl
 	SELECT crf_test_fgen('test_segmenttbl','crf_dictionary','test_crf_label','crf_regex','crf_feature_test','viterbi_mtbl','viterbi_rtbl');
 
         -- Expected viterbi labeling result
-        -- The result is produced from Dr. Sunita's CRF java package with the same input 
+        -- The result is produced from Dr. Sunita's CRF java package with the same input
         CREATE TABLE expected_extraction(doc_id integer, start_pos integer, seg_text text, label character varying);
         INSERT INTO expected_extraction VALUES
 	(1,0,'chancellor','NNP'),(1,1,'of','IN'),         (1,2,'the','DT'),        (1,3,'exchequer','NNP'), (1,4,'nigel','NNP'),

http://git-wip-us.apache.org/repos/asf/madlib/blob/32cf6ba1/src/ports/postgres/modules/crf/test/crf_train_large.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/crf/test/crf_train_large.sql_in b/src/ports/postgres/modules/crf/test/crf_train_large.sql_in
index d4a2d86..b6d4e7c 100644
--- a/src/ports/postgres/modules/crf/test/crf_train_large.sql_in
+++ b/src/ports/postgres/modules/crf/test/crf_train_large.sql_in
@@ -234,26 +234,40 @@ INSERT INTO train_new_segmenttbl VALUES
 (30, 7, 'years', 13, 31),
 (31, 7, '.', 44, 31);
 
-CREATE TABLE train_new_regex(pattern text,name text); 
+CREATE TABLE train_new_regex(pattern text,name text);
 INSERT INTO train_new_regex VALUES
-('^[A-Z][a-z]+$','InitCapital'), ('^[A-Z]+$','isAllCapital'),
+        ('^[A-Z][a-z]+$','InitCapital'), ('^[A-Z]+$','isAllCapital'),
         ('^.*[0-9]+.*$','containsDigit'),('^.+[.]$','endsWithDot'),
         ('^.+[,]$','endsWithComma'),     ('^.+er$','endsWithER'),
         ('^.+est$','endsWithEst'),       ('^.+ed$','endsWithED'),
         ('^.+s$','endsWithS'),           ('^.+ing$','endsWithIng'),
         ('^.+ly$','endsWithly'),         ('^.+-.+$','isDashSeparatedWords'),
         ('^.*@.*$','isEmailId');
-        analyze train_new_regex;
+analyze train_new_regex;
 
-        SELECT crf_train_fgen('train_new_segmenttbl', 'train_new_regex', 'crf_label', 'train_new_dictionary', 'train_new_featuretbl','train_new_featureset');
+CREATE TABLE crf_label_new (id integer,label character varying);
+INSERT INTO crf_label_new VALUES
+        (0,'CC'),   (1,'CD'),  (2,'DT'),    (3,'EX'),   (4,'FW'),  (5,'IN'),   (6,'JJ'),  (7,'JJR'), (8,'JJS'),
+        (9,'LS'),   (10,'MD'), (11,'NN'),   (12,'NNS'), (13,'NNP'),(14,'NNPS'),(15,'PDT'),(16,'POS'),(17,'PRP'),
+        (18,'PRP$'),(19,'RB'), (20,'RBR'),  (21,'RBS'), (22,'RP'), (23,'SYM'), (24,'TO'), (25,'UH'), (26,'VB'),
+        (27,'VBD'), (28,'VBG'),(29,'VBN'),  (30,'VBP'), (31,'VBZ'),(32,'WDT'), (33,'WP'), (34,'WP$'),(35,'WRB'),
+        (36,'$'),   (37,'#'),  (38,'''''');
+INSERT INTO crf_label_new VALUES
+        (39,<!'``'!>);
+m4_changequote(,)
+INSERT INTO crf_label_new VALUES
+        (40,'('),  (41,')'),   (42,','),  (43,'.'),  (44,':');
+analyze crf_label_new;
 
-        SELECT lincrf_train('train_new_featuretbl', 'train_new_featureset', 'crf_label', 'train_new_stats', 'train_new_crf_feature', 30);
+SELECT crf_train_fgen('train_new_segmenttbl', 'train_new_regex', 'crf_label_new', 'train_new_dictionary', 'train_new_featuretbl','train_new_featureset');
 
-        -- Expected feature table
-        -- The result is produced from Dr. Sunita's CRF java package with the same input
-        CREATE TABLE expected_crf_feature_new(id integer,name text,prev_label integer,label integer,weight float);
+SELECT lincrf_train('train_new_featuretbl', 'train_new_featureset', 'crf_label_new', 'train_new_stats', 'train_new_crf_feature', 30);
 
-	INSERT INTO expected_crf_feature_new VALUES
+-- Expected feature table
+-- The result is produced from Dr. Sunita's CRF java package with the same input
+CREATE TABLE expected_crf_feature_new(id integer,name text,prev_label integer,label integer,weight float);
+
+INSERT INTO expected_crf_feature_new VALUES
 (0, 'S.', -1, 12, 0.5516753522178934),
 (1, 'W_freight', -1, 12, 5.959241076198326),
 (2, 'E.', 12, 13, 2.0789747316372034),
@@ -545,33 +559,33 @@ INSERT INTO train_new_regex VALUES
 (288, 'E.', 27, 13, 0.6748848167259296),
 (289, 'W_past', -1, 7, 2.852378831268221);
 
-	SELECT assert(
-		SUM(abs(c1.weight-c2.weight)) < 0.1,  
-		'Total difference between extracted feature weights and expected feature weights is > 0.1.') 
-	FROM  expected_crf_feature_new c1, train_new_crf_feature c2
-        WHERE c1.name = c2.name AND c1.prev_label = c2.prev_label_id AND c1.label = c2.label_id;;
+SELECT assert(
+	SUM(abs(c1.weight-c2.weight)) < 0.1,
+	'Total difference between extracted feature weights and expected feature weights is > 0.1.')
+FROM  expected_crf_feature_new c1, train_new_crf_feature c2
+WHERE c1.name = c2.name AND c1.prev_label = c2.prev_label_id AND c1.label = c2.label_id;;
 
-	-- Compare the expected features and the extraction features.  It fails
-	-- if the features do not match.
-	SELECT assert(s1.count+s2.count = 0,  'Features extracted do not match expected features.')
-	FROM (
-		SELECT count(*) FROM(
-			SELECT name, prev_label, label
-			FROM expected_crf_feature_new 
-			EXCEPT ALL
-			SELECT name, prev_label_id, label_id
-			FROM train_new_crf_feature
-		) AS U
-	)s1,
-	(
-		SELECT count(*) FROM(
-			SELECT name, prev_label_id, label_id
-			FROM  train_new_crf_feature
-			EXCEPT ALL
-			SELECT name, prev_label, label
-			FROM expected_crf_feature_new 
-		) AS U
-	)s2;
+-- Compare the expected features and the extraction features.  It fails
+-- if the features do not match.
+SELECT assert(s1.count+s2.count = 0,  'Features extracted do not match expected features.')
+FROM (
+	SELECT count(*) FROM(
+		SELECT name, prev_label, label
+		FROM expected_crf_feature_new
+		EXCEPT ALL
+		SELECT name, prev_label_id, label_id
+		FROM train_new_crf_feature
+	) AS U
+)s1,
+(
+	SELECT count(*) FROM(
+		SELECT name, prev_label_id, label_id
+		FROM  train_new_crf_feature
+		EXCEPT ALL
+		SELECT name, prev_label, label
+		FROM expected_crf_feature_new
+	) AS U
+)s2;
 
 !>)
 m4_changequote(<!`!>,<!'!>)