You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by du...@apache.org on 2016/01/26 02:12:46 UTC
[22/55] [partial] incubator-systemml git commit: [SYSTEMML-482]
[SYSTEMML-480] Adding a Git attributes file to enforce Unix-styled line
endings, and normalizing all of the line endings.
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/816e2db8/src/test/scripts/applications/impute/wfundInputGenerator1.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/impute/wfundInputGenerator1.dml b/src/test/scripts/applications/impute/wfundInputGenerator1.dml
index 7507958..8457fbd 100644
--- a/src/test/scripts/applications/impute/wfundInputGenerator1.dml
+++ b/src/test/scripts/applications/impute/wfundInputGenerator1.dml
@@ -1,469 +1,469 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# 2013-10-08: THIS IS THE ATTEMPT TO IMPLEMENT HIDDEN STATE AS "HIDDEN REPORTS"
-# THE FIRST TERMS IN THE REPORTS MATRIX ARE THE HIDDEN REPORTS, THE LAST ARE THE KNOWN REPORTS
-#
-# THIS VERSION IS WITH "TRADITIONAL" REGRESSIONS
-#
-
-# hadoop jar SystemML.jar -f test/scripts/applications/impute/wfundInputGenerator.dml -exec singlenode
-# -args
-# test/scripts/applications/impute/initial_reports
-# test/scripts/applications/impute/initial_reports_preprocessed
-# test/scripts/applications/impute/CReps
-# test/scripts/applications/impute/RegresValueMap
-# test/scripts/applications/impute/RegresFactorDefault
-# test/scripts/applications/impute/RegresParamMap
-# test/scripts/applications/impute/RegresCoeffDefault
-# test/scripts/applications/impute/RegresScaleMult
-
-is_GROUP_4_ENABLED = 1; # = 1 or 0
-num_known_terms = 6; # 20; # The number of known term reports, feel free to change
-num_predicted_terms = 1; # The number of predicted term reports, feel free to change
-
-num_terms = 2 * num_known_terms + num_predicted_terms;
-num_attrs = 19;
-
-# Indicator matrix for the "known" values that should not be matched to hidden reports:
-disabled_known_values = matrix (0.0, rows = num_attrs, cols = num_known_terms);
-disabled_known_values [4, 3] = 1.0;
-disabled_known_values [5, 3] = 1.0;
-disabled_known_values [6, 3] = 1.0;
-disabled_known_values [7, 3] = 1.0;
-
-initial_reports_unprocessed = read ($1);
-initial_reports = matrix (0.0, rows = num_attrs, cols = num_terms);
-initial_reports [, 1:num_known_terms] =
- initial_reports_unprocessed [, 1:num_known_terms];
-initial_reports [, (num_known_terms + num_predicted_terms + 1):num_terms] =
- initial_reports_unprocessed [, 1:num_known_terms];
-
-num_frees_per_term = 13;
-if (is_GROUP_4_ENABLED == 1) {
- num_frees_per_term = 15;
-}
-
-num_frees = (num_known_terms + num_predicted_terms) * num_frees_per_term;
-
-zero = matrix (0.0, rows = 1, cols = 1);
-
-# ---------------------------------------------------------
-# GENERATE AN AFFINE MAP FROM FREE VARIABLES TO THE REPORTS
-# AFFINE MAP = LINEAR MAP + INITIAL (DEFAULT) REPORTS
-# All free variables are mapped to the "HIDDEN" reports
-# ---------------------------------------------------------
-
-CReps = matrix (0.0, rows = (num_terms * num_attrs), cols = num_frees);
-
-for (t in 1:(num_known_terms + num_predicted_terms))
-{
- dt = (t-1) * num_attrs;
- df = (t-1) * num_frees_per_term;
-# constraint that row1 = row2 + row3 + row4 + row5 + row6 + row7
-# translated to free vars: row1 = free1 + free2 + free3 + free4 + free5 + free6
- CReps [dt + 1, df + 1] = 1.0 + zero;
- CReps [dt + 1, df + 2] = 1.0 + zero;
- CReps [dt + 1, df + 3] = 1.0 + zero;
- CReps [dt + 1, df + 4] = 1.0 + zero;
- CReps [dt + 1, df + 5] = 1.0 + zero;
- CReps [dt + 1, df + 6] = 1.0 + zero;
- CReps [dt + 2, df + 1] = 1.0 + zero;
- CReps [dt + 3, df + 2] = 1.0 + zero;
- CReps [dt + 4, df + 3] = 1.0 + zero;
- CReps [dt + 5, df + 4] = 1.0 + zero;
- CReps [dt + 6, df + 5] = 1.0 + zero;
- CReps [dt + 7, df + 6] = 1.0 + zero;
-
-# row 8 is free variable not appearing in any non-free variable
- CReps [dt + 8, df + 7] = 1.0 + zero;
-
-# constraint that row9 = row10 + row11 + row12 + row13 + row14 + row15
-# translated to free vars: row9 = free8 + free9 + free10 + free11 + free12 + free13
- CReps [dt + 9, df + 8] = 1.0 + zero;
- CReps [dt + 9, df + 9] = 1.0 + zero;
- CReps [dt + 9, df + 10] = 1.0 + zero;
- CReps [dt + 9, df + 11] = 1.0 + zero;
- CReps [dt + 9, df + 12] = 1.0 + zero;
- CReps [dt + 9, df + 13] = 1.0 + zero;
- CReps [dt + 10, df + 8] = 1.0 + zero;
- CReps [dt + 11, df + 9] = 1.0 + zero;
- CReps [dt + 12, df + 10] = 1.0 + zero;
- CReps [dt + 13, df + 11] = 1.0 + zero;
- CReps [dt + 14, df + 12] = 1.0 + zero;
- CReps [dt + 15, df + 13] = 1.0 + zero;
-
-# constraint that row16 = row14 + row15
-# translated to free vars: row16 = free14 + free15
- if (is_GROUP_4_ENABLED == 1) {
- CReps [dt + 16, df + 14] = 1.0 + zero;
- CReps [dt + 16, df + 15] = 1.0 + zero;
- CReps [dt + 17, df + 14] = 1.0 + zero;
- CReps [dt + 18, df + 15] = 1.0 + zero;
- }
-
-# constraint that row19 = total cost (all free variables)
-# translated to free vars: row19 = all free variables
- CReps [dt + 19, df + 1] = 1.0 + zero;
- CReps [dt + 19, df + 2] = 1.0 + zero;
- CReps [dt + 19, df + 3] = 1.0 + zero;
- CReps [dt + 19, df + 4] = 1.0 + zero;
- CReps [dt + 19, df + 5] = 1.0 + zero;
- CReps [dt + 19, df + 6] = 1.0 + zero;
- CReps [dt + 19, df + 7] = 1.0 + zero;
- CReps [dt + 19, df + 8] = 1.0 + zero;
- CReps [dt + 19, df + 9] = 1.0 + zero;
- CReps [dt + 19, df + 10] = 1.0 + zero;
- CReps [dt + 19, df + 11] = 1.0 + zero;
- CReps [dt + 19, df + 12] = 1.0 + zero;
- CReps [dt + 19, df + 13] = 1.0 + zero;
- if (is_GROUP_4_ENABLED == 1) {
- CReps [dt + 19, df + 14] = 1.0 + zero;
- CReps [dt + 19, df + 15] = 1.0 + zero;
- }
-}
-
-
-# ---------------------------------------------------------
-# GENERATE AN AFFINE MAP FROM REPORTS TO REGRESSION FACTORS
-# AFFINE MAP = LINEAR MAP + A VECTOR OF DEFAULTS
-# ---------------------------------------------------------
-
-# We have three types of regressions:
-# 1. For "hidden" reports:
-# x[t] ~ aggregate[t], x[t-1], (x[t-1] - x[t-2])
-# 2. For "observed" reports:
-# y[t] ~ x[t] (with coefficient 1)
-# 3. For some parameters: the regularization equations.
-# All regressions follow the 4-factor pattern.
-num_factors = 4;
-
-# We have one regression equation per time-term for each attribute,
-# plus a few "special" regularization regression equations:
-num_regularization_regs = 12;
-if (is_GROUP_4_ENABLED == 1) {
- num_regularization_regs = 16;
-}
-
-num_reg_eqs = num_terms * num_attrs + num_regularization_regs;
-
-RegresValueMap = matrix (0.0, rows = (num_reg_eqs * num_factors), cols = (num_terms * num_attrs));
-RegresFactorDefault = matrix (0.0, rows = (num_reg_eqs * num_factors), cols = 1);
-
-# All regression equations for the same attribute share the same parameters, regardless
-# of the term; some parameters are shared across multiple attributes, (those attributes
-# whose behavior is believed to be similar) as specified in the table below:
-
-num_params = 28;
-if (is_GROUP_4_ENABLED == 1) {
- num_params = 35;
-}
-
-# Factors: -self[t] total[t] self[t-1] self[t-1]-
-# self[t-2]
-# PARAMS:
-# Group 1: 1.0 prm#01 prm#08 prm#09 Row #01 = free#01 + ... + free#06
-# Group 1: " prm#02 prm#10 prm#11 Row #02 = free#01
-# Group 1: " prm#03 " " Row #03 = free#02
-# Group 1: " prm#04 " " Row #04 = free#03
-# Group 1: " prm#05 " " Row #05 = free#04
-# Group 1: " prm#06 " " Row #06 = free#05
-# Group 1: " prm#07 " " Row #07 = free#06
-# --------------------------------------------------------------------
-# Group 2: 1.0 prm#12 prm#13 prm#14 Row #08 = free#07
-# --------------------------------------------------------------------
-# Group 3: 1.0 prm#15 prm#22 prm#23 Row #09 = free#08 + ... + free#13
-# Group 3: " prm#16 prm#24 prm#25 Row #10 = free#08
-# Group 3: " prm#17 " " Row #11 = free#09
-# Group 3: " prm#18 " " Row #12 = free#10
-# Group 3: " prm#19 " " Row #13 = free#11
-# Group 3: " prm#20 " " Row #14 = free#12
-# Group 3: " prm#21 " " Row #15 = free#13
-# --------------------------------------------------------------------
-# GROUP-4 ZEROS: FIVE PARAMETERS REVOKED
-# Group 4: 1.0 prm#29 prm#32 prm#33 Row #16 = free#14 + free#15
-# Group 4: " prm#30 prm#34 prm#35 Row #17 = free#14
-# Group 4: " prm#31 " " Row #18 = free#15
-# --------------------------------------------------------------------
-# Group 5: 1.0 prm#26 prm#27 prm#28 Row #19 = free#01 + ... + free#15
-#
-# (The aggregates in Groups 1..4 regress on the total cost in Group 5;
-# the total cost in Group 5 regresses on the intercept.)
-
-# THE LAST REGULARIZATION "REGRESSION" EQUATIONS:
-# Factors: 1.0 -1.0 0.0 0.0
-# PARAMS:
-# prm#27 1.0 0.0 0.0
-# prm#28 0.0 0.0 0.0
-# prm#08 0.0 0.0 0.0
-# prm#09 0.0 0.0 0.0
-# prm#10 0.0 0.0 0.0
-# prm#11 0.0 0.0 0.0
-# prm#13 0.0 0.0 0.0
-# prm#14 0.0 0.0 0.0
-# prm#22 0.0 0.0 0.0
-# prm#23 0.0 0.0 0.0
-# prm#24 0.0 0.0 0.0
-# prm#25 0.0 0.0 0.0
-# prm#32 0.0 0.0 0.0 # GROUP-4 ZEROS:
-# prm#33 0.0 0.0 0.0 # THESE EQUATIONS
-# prm#34 0.0 0.0 0.0 # USE REVOKED PARAMETERS
-# prm#35 0.0 0.0 0.0 # AND DO NOT APPEAR
-
-
-# --------------------------------------------------------------
-# FIRST, AN AFFINE MAP FROM HIDDEN REPORTS TO REGRESSION FACTORS
-# --------------------------------------------------------------
-
-for (t in 1 : (num_known_terms + num_predicted_terms))
-{
- for (i in 1 : num_attrs)
- {
- reg_index = ((t-1) * num_attrs - 1 + i) * num_factors;
-
- if (i < 19)
- {
- agg_i = 19;
- if (i >= 2 & 1 <= 7) {agg_i = 1;}
- if (i >= 10 & 1 <= 15) {agg_i = 9;}
- if (i >= 17 & 1 <= 18) {agg_i = 16;}
-
- RegresValueMap [reg_index + 1, (t-1) * num_attrs + i] = -1.0 + zero; # 1st factor: -x[t]
- RegresValueMap [reg_index + 2, (t-1) * num_attrs + agg_i] = 1.0 + zero; # 2nd factor: aggregate[t]
- if (t == 1) {
- RegresValueMap [reg_index + 3, i] = 1.0 + zero; # For t = 1 the 3rd factor is x[t] = x[1]
- } else {
- RegresValueMap [reg_index + 3, (t-2) * num_attrs + i] = 1.0 + zero; # 3rd factor: x[t-1]
- }
- if (t >= 3) {
- RegresValueMap [reg_index + 4, (t-2) * num_attrs + i] = 1.0 + zero; # 4th factor is
- RegresValueMap [reg_index + 4, (t-3) * num_attrs + i] = -1.0 + zero; # x[t-1] - x[t-2]
- }
- }
-
-# Regression for the TOTAL:
-
- if (i == 19)
- {
- if (t >= 2) {
- RegresValueMap [reg_index + 1, (t-1) * num_attrs + 19] = -1.0 + zero; # 1st factor: -x[t]
- RegresFactorDefault [reg_index + 2, 1] = 1.0 + zero; # 2nd factor: Intercept
- RegresValueMap [reg_index + 3, (t-2) * num_attrs + 19] = 1.0 + zero; # 3rd factor: x[t-1]
- }
- if (t >= 3) {
- RegresValueMap [reg_index + 4, (t-2) * num_attrs + 19] = 1.0 + zero; # 4th factor is
- RegresValueMap [reg_index + 4, (t-3) * num_attrs + 19] = -1.0 + zero; # x[t-1] - x[t-2]
- }
- }
- }
-}
-
-# -----------------------------------------------------------------
-# SECOND, AN AFFINE MAP FROM OBSERVED REPORTS TO REGRESSION FACTORS
-# -----------------------------------------------------------------
-
-for (t in (num_known_terms + num_predicted_terms + 1) : num_terms)
-{
- t2 = t - (num_known_terms + num_predicted_terms);
- for (i in 1 : num_attrs) {
- reg_index = ((t-1) * num_attrs - 1 + i) * num_factors;
- RegresValueMap [reg_index + 1, (t - 1) * num_attrs + i] = -1.0 + zero; # 1st factor: -y[t]
- RegresValueMap [reg_index + 2, (t2 - 1) * num_attrs + i] = 1.0 + zero; # 2nd factor: x[t]
- }
-}
-
-# -----------------------------------------------------
-# THIRD, AN AFFINE MAP FOR REGULARIZATION "REGRESSIONS"
-# -----------------------------------------------------
-
-reg_index = num_terms * num_attrs * num_factors;
-for (i in 1:num_regularization_regs)
-{
- RegresFactorDefault [reg_index + 1, 1] = 1.0 + zero;
- RegresFactorDefault [reg_index + 2, 1] = -1.0 + zero;
- reg_index = reg_index + num_factors;
-}
-
-
-# ----------------------------------------------------------
-# GENERATE AN AFFINE MAP FROM PARAMETERS TO THE COEFFICIENTS
-# AT REGRESSION FACTORS: A LINEAR MAP + A VECTOR OF DEFAULTS
-# ----------------------------------------------------------
-
-RegresParamMap = matrix (0.0, rows = (num_reg_eqs * num_factors), cols = num_params);
-RegresCoeffDefault = matrix (0.0, rows = (num_reg_eqs * num_factors), cols = 1);
-
-# -----------------------------------------------------------
-# FIRST, AN AFFINE MAP THAT COVERS HIDDEN REPORTS REGRESSIONS
-# -----------------------------------------------------------
-
-for (t in 1 : (num_known_terms + num_predicted_terms)) {
-# Group 1 attributes:
- reg_index = ((t-1) * num_attrs - 1 + 1) * num_factors;
- RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero; # Default coefficient = 1.0
- RegresParamMap [reg_index + 2, 1] = 1.0 + zero; # Param #01
- RegresParamMap [reg_index + 3, 8] = 1.0 + zero; # Param #08
- RegresParamMap [reg_index + 4, 9] = 1.0 + zero; # Param #09
- for (i in 2 : 7) {
- reg_index = ((t-1) * num_attrs - 1 + i) * num_factors;
- RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero; # Default coefficient = 1.0
- RegresParamMap [reg_index + 2, i] = 1.0 + zero; # Param #02-#07
- RegresParamMap [reg_index + 3, 10] = 1.0 + zero; # Param #10
- RegresParamMap [reg_index + 4, 11] = 1.0 + zero; # Param #11
- }
-# Group 2 attribute:
- reg_index = ((t-1) * num_attrs - 1 + 8) * num_factors;
- RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero; # Default coefficient = 1.0
- RegresParamMap [reg_index + 2, 12] = 1.0 + zero; # Param #12
- RegresParamMap [reg_index + 3, 13] = 1.0 + zero; # Param #13
- RegresParamMap [reg_index + 4, 14] = 1.0 + zero; # Param #14
-# Group 3 attributes:
- reg_index = ((t-1) * num_attrs - 1 + 9) * num_factors;
- RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero; # Default coefficient = 1.0
- RegresParamMap [reg_index + 2, 15] = 1.0 + zero; # Param #17
- RegresParamMap [reg_index + 3, 22] = 1.0 + zero; # Param #22
- RegresParamMap [reg_index + 4, 23] = 1.0 + zero; # Param #23
- for (i in 10 : 15) {
- reg_index = ((t-1) * num_attrs - 1 + i) * num_factors;
- RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero; # Default coefficient = 1.0
- RegresParamMap [reg_index + 2, 6 + i] = 1.0 + zero; # Param #16-#21
- RegresParamMap [reg_index + 3, 24] = 1.0 + zero; # Param #24
- RegresParamMap [reg_index + 4, 25] = 1.0 + zero; # Param #25
- }
-
-# Group 4 attributes:
-if (is_GROUP_4_ENABLED == 1) {
- reg_index = ((t-1) * num_attrs - 1 + 16) * num_factors;
- RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero; # Default coefficient = 1.0
- RegresParamMap [reg_index + 2, 29] = 1.0 + zero; # Param #29
- RegresParamMap [reg_index + 3, 32] = 1.0 + zero; # Param #32
- RegresParamMap [reg_index + 4, 33] = 1.0 + zero; # Param #33
- for (i in 17 : 18) {
- reg_index = ((t-1) * num_attrs - 1 + i) * num_factors;
- RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero; # Default coefficient = 1.0
- RegresParamMap [reg_index + 2, 13 + i] = 1.0 + zero; # Param #30-#31
- RegresParamMap [reg_index + 3, 34] = 1.0 + zero; # Param #34
- RegresParamMap [reg_index + 4, 35] = 1.0 + zero; # Param #35
- }
-}
-
-# Group 5 attribute:
- reg_index = ((t-1) * num_attrs - 1 + 19) * num_factors;
- RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero; # Default coefficient = 1.0
- RegresParamMap [reg_index + 2, 26] = 1.0 + zero; # Param #26
- RegresParamMap [reg_index + 3, 27] = 1.0 + zero; # Param #27
- RegresParamMap [reg_index + 4, 28] = 1.0 + zero; # Param #28
-}
-
-# --------------------------------------------------------------
-# SECOND, AN AFFINE MAP THAT COVERS OBSERVED REPORTS REGRESSIONS
-# --------------------------------------------------------------
-
-for (t in (num_known_terms + num_predicted_terms + 1) : num_terms)
-{
- for (i in 1 : num_attrs) {
- if (castAsScalar (disabled_known_values [i, t - (num_known_terms + num_predicted_terms)]) == 0.0)
- {
- reg_index = ((t-1) * num_attrs - 1 + i) * num_factors;
- RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero; # Default coefficient = 1.0
- RegresCoeffDefault [reg_index + 2, 1] = 1.0 + zero; # Default coefficient = 1.0
- }
- }
-}
-
-# -------------------------------------------------------------
-# THIRD, AN AFFINE MAP THAT COVERS REGULARIZATION "REGRESSIONS"
-# -------------------------------------------------------------
-
-reg_index = num_terms * num_attrs * num_factors;
- RegresParamMap [reg_index + 1, 27] = 1.0 + zero; # Param #27
- RegresCoeffDefault [reg_index + 2, 1] = 1.0 + zero;
-reg_index = reg_index + num_factors;
- RegresParamMap [reg_index + 1, 28] = 1.0 + zero; # Param #28
-reg_index = reg_index + num_factors;
- RegresParamMap [reg_index + 1, 08] = 1.0 + zero; # Param #08
-reg_index = reg_index + num_factors;
- RegresParamMap [reg_index + 1, 09] = 1.0 + zero; # Param #09
-reg_index = reg_index + num_factors;
- RegresParamMap [reg_index + 1, 10] = 1.0 + zero; # Param #10
-reg_index = reg_index + num_factors;
- RegresParamMap [reg_index + 1, 11] = 1.0 + zero; # Param #11
-reg_index = reg_index + num_factors;
- RegresParamMap [reg_index + 1, 13] = 1.0 + zero; # Param #13
-reg_index = reg_index + num_factors;
- RegresParamMap [reg_index + 1, 14] = 1.0 + zero; # Param #14
-reg_index = reg_index + num_factors;
- RegresParamMap [reg_index + 1, 22] = 1.0 + zero; # Param #22
-reg_index = reg_index + num_factors;
- RegresParamMap [reg_index + 1, 23] = 1.0 + zero; # Param #23
-reg_index = reg_index + num_factors;
- RegresParamMap [reg_index + 1, 24] = 1.0 + zero; # Param #24
-reg_index = reg_index + num_factors;
- RegresParamMap [reg_index + 1, 25] = 1.0 + zero; # Param #25
-
-if (is_GROUP_4_ENABLED == 1) {
- reg_index = reg_index + num_factors;
- RegresParamMap [reg_index + 1, 32] = 1.0 + zero; # Param #32
- reg_index = reg_index + num_factors;
- RegresParamMap [reg_index + 1, 33] = 1.0 + zero; # Param #33
- reg_index = reg_index + num_factors;
- RegresParamMap [reg_index + 1, 34] = 1.0 + zero; # Param #34
- reg_index = reg_index + num_factors;
- RegresParamMap [reg_index + 1, 35] = 1.0 + zero; # Param #35
-}
-
-# ----------------------------------------------------------
-# GENERATE A VECTOR OF SCALE MULTIPLIERS, ONE PER REGRESSION
-# ----------------------------------------------------------
-
-RegresScaleMult = matrix (1.0, rows = num_reg_eqs, cols = 1);
-
-global_weight = 0.5 + zero;
-
-attribute_size = rowMeans (abs (initial_reports [, 1:num_known_terms]));
-max_attr_size = max (attribute_size);
-
-for (t in 1 : num_terms) {
- for (i in 1 : num_attrs) {
- regeqn = (t-1) * num_attrs + i;
- scale_down = sqrt (attribute_size [i, 1] / max_attr_size) * 0.999 + 0.001;
- acceptable_drift = scale_down * max_attr_size * 0.001;
- RegresScaleMult [regeqn, 1] = global_weight / (acceptable_drift^2);
- }
-}
-
-for (i in 1 : num_regularization_regs) {
- regeqn = num_terms * num_attrs + i;
- acceptable_drift = 0.01;
- RegresScaleMult [regeqn, 1] = global_weight / (acceptable_drift^2);
-}
-
-# --------------------------------
-# WRITE OUT ALL GENERATED MATRICES
-# --------------------------------
-
-write (initial_reports, $2, format="text");
-write (CReps, $3, format="text");
-write (RegresValueMap, $4, format="text");
-write (RegresFactorDefault,$5, format="text");
-write (RegresParamMap, $6, format="text");
-write (RegresCoeffDefault, $7, format="text");
-write (RegresScaleMult, $8, format="text");
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# 2013-10-08: THIS IS THE ATTEMPT TO IMPLEMENT HIDDEN STATE AS "HIDDEN REPORTS"
+# THE FIRST TERMS IN THE REPORTS MATRIX ARE THE HIDDEN REPORTS, THE LAST ARE THE KNOWN REPORTS
+#
+# THIS VERSION IS WITH "TRADITIONAL" REGRESSIONS
+#
+
+# hadoop jar SystemML.jar -f test/scripts/applications/impute/wfundInputGenerator.dml -exec singlenode
+# -args
+# test/scripts/applications/impute/initial_reports
+# test/scripts/applications/impute/initial_reports_preprocessed
+# test/scripts/applications/impute/CReps
+# test/scripts/applications/impute/RegresValueMap
+# test/scripts/applications/impute/RegresFactorDefault
+# test/scripts/applications/impute/RegresParamMap
+# test/scripts/applications/impute/RegresCoeffDefault
+# test/scripts/applications/impute/RegresScaleMult
+
+is_GROUP_4_ENABLED = 1; # 1 = include Group 4 (report rows 16-18, free variables 14-15, parameters 29-35); 0 = leave it out
+num_known_terms = 6; # 20; # The number of known term reports, feel free to change
+num_predicted_terms = 1; # The number of predicted term reports, feel free to change
+# Report columns: hidden reports for the known and predicted terms first, then the observed copies of the known terms.
+num_terms = 2 * num_known_terms + num_predicted_terms;
+num_attrs = 19; # Attributes (rows) per report
+
+# Indicator matrix for the "known" values that should not be matched to hidden reports:
+disabled_known_values = matrix (0.0, rows = num_attrs, cols = num_known_terms);
+disabled_known_values [4, 3] = 1.0;
+disabled_known_values [5, 3] = 1.0;
+disabled_known_values [6, 3] = 1.0;
+disabled_known_values [7, 3] = 1.0;
+# The known reports seed both the leading (hidden) columns and the trailing (observed) columns; predicted columns stay 0.
+initial_reports_unprocessed = read ($1);
+initial_reports = matrix (0.0, rows = num_attrs, cols = num_terms);
+initial_reports [, 1:num_known_terms] =
+ initial_reports_unprocessed [, 1:num_known_terms];
+initial_reports [, (num_known_terms + num_predicted_terms + 1):num_terms] =
+ initial_reports_unprocessed [, 1:num_known_terms];
+
+num_frees_per_term = 13; # Free variables per hidden report without Group 4
+if (is_GROUP_4_ENABLED == 1) {
+ num_frees_per_term = 15; # Group 4 adds free variables 14 and 15
+}
+
+num_frees = (num_known_terms + num_predicted_terms) * num_frees_per_term; # Only hidden reports own free variables
+# 1x1 zero matrix added to scalars below (e.g. 1.0 + zero) before cell assignment; presumably to make the value a matrix - TODO confirm
+zero = matrix (0.0, rows = 1, cols = 1);
+
+# ---------------------------------------------------------
+# GENERATE AN AFFINE MAP FROM FREE VARIABLES TO THE REPORTS
+# AFFINE MAP = LINEAR MAP + INITIAL (DEFAULT) REPORTS
+# All free variables are mapped to the "HIDDEN" reports
+# ---------------------------------------------------------
+
+CReps = matrix (0.0, rows = (num_terms * num_attrs), cols = num_frees); # One row per (term, attribute) report cell, one column per free variable
+
+for (t in 1:(num_known_terms + num_predicted_terms))
+{
+ dt = (t-1) * num_attrs; # Row offset of term t's block of attributes
+ df = (t-1) * num_frees_per_term; # Column offset of term t's block of free variables
+# constraint that row1 = row2 + row3 + row4 + row5 + row6 + row7
+# translated to free vars: row1 = free1 + free2 + free3 + free4 + free5 + free6
+ CReps [dt + 1, df + 1] = 1.0 + zero;
+ CReps [dt + 1, df + 2] = 1.0 + zero;
+ CReps [dt + 1, df + 3] = 1.0 + zero;
+ CReps [dt + 1, df + 4] = 1.0 + zero;
+ CReps [dt + 1, df + 5] = 1.0 + zero;
+ CReps [dt + 1, df + 6] = 1.0 + zero;
+ CReps [dt + 2, df + 1] = 1.0 + zero;
+ CReps [dt + 3, df + 2] = 1.0 + zero;
+ CReps [dt + 4, df + 3] = 1.0 + zero;
+ CReps [dt + 5, df + 4] = 1.0 + zero;
+ CReps [dt + 6, df + 5] = 1.0 + zero;
+ CReps [dt + 7, df + 6] = 1.0 + zero;
+
+# row 8 is free variable not appearing in any non-free variable
+ CReps [dt + 8, df + 7] = 1.0 + zero;
+
+# constraint that row9 = row10 + row11 + row12 + row13 + row14 + row15
+# translated to free vars: row9 = free8 + free9 + free10 + free11 + free12 + free13
+ CReps [dt + 9, df + 8] = 1.0 + zero;
+ CReps [dt + 9, df + 9] = 1.0 + zero;
+ CReps [dt + 9, df + 10] = 1.0 + zero;
+ CReps [dt + 9, df + 11] = 1.0 + zero;
+ CReps [dt + 9, df + 12] = 1.0 + zero;
+ CReps [dt + 9, df + 13] = 1.0 + zero;
+ CReps [dt + 10, df + 8] = 1.0 + zero;
+ CReps [dt + 11, df + 9] = 1.0 + zero;
+ CReps [dt + 12, df + 10] = 1.0 + zero;
+ CReps [dt + 13, df + 11] = 1.0 + zero;
+ CReps [dt + 14, df + 12] = 1.0 + zero;
+ CReps [dt + 15, df + 13] = 1.0 + zero;
+
+# constraint that row16 = row17 + row18
+# translated to free vars: row16 = free14 + free15 (row17 = free14, row18 = free15)
+ if (is_GROUP_4_ENABLED == 1) {
+ CReps [dt + 16, df + 14] = 1.0 + zero;
+ CReps [dt + 16, df + 15] = 1.0 + zero;
+ CReps [dt + 17, df + 14] = 1.0 + zero;
+ CReps [dt + 18, df + 15] = 1.0 + zero;
+ }
+
+# constraint that row19 = total cost (all free variables)
+# translated to free vars: row19 = all free variables
+ CReps [dt + 19, df + 1] = 1.0 + zero;
+ CReps [dt + 19, df + 2] = 1.0 + zero;
+ CReps [dt + 19, df + 3] = 1.0 + zero;
+ CReps [dt + 19, df + 4] = 1.0 + zero;
+ CReps [dt + 19, df + 5] = 1.0 + zero;
+ CReps [dt + 19, df + 6] = 1.0 + zero;
+ CReps [dt + 19, df + 7] = 1.0 + zero;
+ CReps [dt + 19, df + 8] = 1.0 + zero;
+ CReps [dt + 19, df + 9] = 1.0 + zero;
+ CReps [dt + 19, df + 10] = 1.0 + zero;
+ CReps [dt + 19, df + 11] = 1.0 + zero;
+ CReps [dt + 19, df + 12] = 1.0 + zero;
+ CReps [dt + 19, df + 13] = 1.0 + zero;
+ if (is_GROUP_4_ENABLED == 1) {
+ CReps [dt + 19, df + 14] = 1.0 + zero;
+ CReps [dt + 19, df + 15] = 1.0 + zero;
+ }
+}
+
+
+# ---------------------------------------------------------
+# GENERATE AN AFFINE MAP FROM REPORTS TO REGRESSION FACTORS
+# AFFINE MAP = LINEAR MAP + A VECTOR OF DEFAULTS
+# ---------------------------------------------------------
+
+# We have three types of regressions:
+# 1. For "hidden" reports:
+# x[t] ~ aggregate[t], x[t-1], (x[t-1] - x[t-2])
+# 2. For "observed" reports:
+# y[t] ~ x[t] (with coefficient 1)
+# 3. For some parameters: the regularization equations.
+# All regressions follow the 4-factor pattern.
+num_factors = 4; # Every regression equation occupies num_factors consecutive rows in the maps below
+
+# We have one regression equation per time-term for each attribute,
+# plus a few "special" regularization regression equations:
+num_regularization_regs = 12; # One per regularized parameter: #27, #28, #08-#11, #13, #14, #22-#25
+if (is_GROUP_4_ENABLED == 1) {
+ num_regularization_regs = 16; # Group 4 adds parameters #32-#35
+}
+
+num_reg_eqs = num_terms * num_attrs + num_regularization_regs;
+# Rows: one per (equation, factor); RegresValueMap columns: one per (term, attribute) report cell.
+RegresValueMap = matrix (0.0, rows = (num_reg_eqs * num_factors), cols = (num_terms * num_attrs));
+RegresFactorDefault = matrix (0.0, rows = (num_reg_eqs * num_factors), cols = 1);
+
+# All regression equations for the same attribute share the same parameters, regardless
+# of the term; some parameters are shared across multiple attributes, (those attributes
+# whose behavior is believed to be similar) as specified in the table below:
+
+num_params = 28; # Parameters #01-#28 (Groups 1, 2, 3 and 5)
+if (is_GROUP_4_ENABLED == 1) {
+ num_params = 35; # Group 4 adds parameters #29-#35
+}
+
+# Factors: -self[t] total[t] self[t-1] self[t-1]-
+# self[t-2]
+# PARAMS:
+# Group 1: 1.0 prm#01 prm#08 prm#09 Row #01 = free#01 + ... + free#06
+# Group 1: " prm#02 prm#10 prm#11 Row #02 = free#01
+# Group 1: " prm#03 " " Row #03 = free#02
+# Group 1: " prm#04 " " Row #04 = free#03
+# Group 1: " prm#05 " " Row #05 = free#04
+# Group 1: " prm#06 " " Row #06 = free#05
+# Group 1: " prm#07 " " Row #07 = free#06
+# --------------------------------------------------------------------
+# Group 2: 1.0 prm#12 prm#13 prm#14 Row #08 = free#07
+# --------------------------------------------------------------------
+# Group 3: 1.0 prm#15 prm#22 prm#23 Row #09 = free#08 + ... + free#13
+# Group 3: " prm#16 prm#24 prm#25 Row #10 = free#08
+# Group 3: " prm#17 " " Row #11 = free#09
+# Group 3: " prm#18 " " Row #12 = free#10
+# Group 3: " prm#19 " " Row #13 = free#11
+# Group 3: " prm#20 " " Row #14 = free#12
+# Group 3: " prm#21 " " Row #15 = free#13
+# --------------------------------------------------------------------
+# GROUP-4 ZEROS: FIVE PARAMETERS REVOKED
+# Group 4: 1.0 prm#29 prm#32 prm#33 Row #16 = free#14 + free#15
+# Group 4: " prm#30 prm#34 prm#35 Row #17 = free#14
+# Group 4: " prm#31 " " Row #18 = free#15
+# --------------------------------------------------------------------
+# Group 5: 1.0 prm#26 prm#27 prm#28 Row #19 = free#01 + ... + free#15
+#
+# (The aggregates in Groups 1..4 regress on the total cost in Group 5;
+# the total cost in Group 5 regresses on the intercept.)
+
+# THE LAST REGULARIZATION "REGRESSION" EQUATIONS:
+# Factors: 1.0 -1.0 0.0 0.0
+# PARAMS:
+# prm#27 1.0 0.0 0.0
+# prm#28 0.0 0.0 0.0
+# prm#08 0.0 0.0 0.0
+# prm#09 0.0 0.0 0.0
+# prm#10 0.0 0.0 0.0
+# prm#11 0.0 0.0 0.0
+# prm#13 0.0 0.0 0.0
+# prm#14 0.0 0.0 0.0
+# prm#22 0.0 0.0 0.0
+# prm#23 0.0 0.0 0.0
+# prm#24 0.0 0.0 0.0
+# prm#25 0.0 0.0 0.0
+# prm#32 0.0 0.0 0.0 # GROUP-4 ZEROS:
+# prm#33 0.0 0.0 0.0 # THESE EQUATIONS
+# prm#34 0.0 0.0 0.0 # USE REVOKED PARAMETERS
+# prm#35 0.0 0.0 0.0 # AND DO NOT APPEAR
+
+
+# --------------------------------------------------------------
+# FIRST, AN AFFINE MAP FROM HIDDEN REPORTS TO REGRESSION FACTORS
+# --------------------------------------------------------------
+
+for (t in 1 : (num_known_terms + num_predicted_terms)) # Fill the factor rows of every hidden-report equation (t, i)
+{
+ for (i in 1 : num_attrs)
+ {
+ reg_index = ((t-1) * num_attrs - 1 + i) * num_factors;
+# reg_index is the row offset just before the num_factors rows that belong to equation (t, i).
+ if (i < 19)
+ {
+ agg_i = 19; # Default regressor is the total (row 19): applies to rows 1, 8, 9 and 16
+ if (i >= 2 & i <= 7) {agg_i = 1;} # Rows 2-7 regress on their aggregate, row 1
+ if (i >= 10 & i <= 15) {agg_i = 9;} # Rows 10-15 regress on their aggregate, row 9
+ if (i >= 17 & i <= 18) {agg_i = 16;} # Rows 17-18 regress on their aggregate, row 16
+
+ RegresValueMap [reg_index + 1, (t-1) * num_attrs + i] = -1.0 + zero; # 1st factor: -x[t]
+ RegresValueMap [reg_index + 2, (t-1) * num_attrs + agg_i] = 1.0 + zero; # 2nd factor: aggregate[t]
+ if (t == 1) {
+ RegresValueMap [reg_index + 3, i] = 1.0 + zero; # For t = 1 the 3rd factor is x[t] = x[1]
+ } else {
+ RegresValueMap [reg_index + 3, (t-2) * num_attrs + i] = 1.0 + zero; # 3rd factor: x[t-1]
+ }
+ if (t >= 3) {
+ RegresValueMap [reg_index + 4, (t-2) * num_attrs + i] = 1.0 + zero; # 4th factor is
+ RegresValueMap [reg_index + 4, (t-3) * num_attrs + i] = -1.0 + zero; # x[t-1] - x[t-2]
+ }
+ }
+
+# Regression for the TOTAL (no equation is generated for t = 1; the 4th factor needs t >= 3):
+
+ if (i == 19)
+ {
+ if (t >= 2) {
+ RegresValueMap [reg_index + 1, (t-1) * num_attrs + 19] = -1.0 + zero; # 1st factor: -x[t]
+ RegresFactorDefault [reg_index + 2, 1] = 1.0 + zero; # 2nd factor: Intercept
+ RegresValueMap [reg_index + 3, (t-2) * num_attrs + 19] = 1.0 + zero; # 3rd factor: x[t-1]
+ }
+ if (t >= 3) {
+ RegresValueMap [reg_index + 4, (t-2) * num_attrs + 19] = 1.0 + zero; # 4th factor is
+ RegresValueMap [reg_index + 4, (t-3) * num_attrs + 19] = -1.0 + zero; # x[t-1] - x[t-2]
+ }
+ }
+ }
+}
+
+# -----------------------------------------------------------------
+# SECOND, AN AFFINE MAP FROM OBSERVED REPORTS TO REGRESSION FACTORS
+# -----------------------------------------------------------------
+
+for (t in (num_known_terms + num_predicted_terms + 1) : num_terms) # t runs over the observed-report columns
+{
+ t2 = t - (num_known_terms + num_predicted_terms); # Hidden report paired with observed report t (1 .. num_known_terms)
+ for (i in 1 : num_attrs) {
+ reg_index = ((t-1) * num_attrs - 1 + i) * num_factors; # Row offset just before equation (t, i)'s factor rows
+ RegresValueMap [reg_index + 1, (t - 1) * num_attrs + i] = -1.0 + zero; # 1st factor: -y[t]
+ RegresValueMap [reg_index + 2, (t2 - 1) * num_attrs + i] = 1.0 + zero; # 2nd factor: x[t]
+ }
+}
+
+# -----------------------------------------------------
+# THIRD, AN AFFINE MAP FOR REGULARIZATION "REGRESSIONS"
+# -----------------------------------------------------
+
+reg_index = num_terms * num_attrs * num_factors; # Row offset just past all per-report equations
+for (i in 1:num_regularization_regs)
+{
+ RegresFactorDefault [reg_index + 1, 1] = 1.0 + zero; # 1st factor: constant 1.0
+ RegresFactorDefault [reg_index + 2, 1] = -1.0 + zero; # 2nd factor: constant -1.0 (3rd and 4th stay 0.0)
+ reg_index = reg_index + num_factors; # Advance to the next regularization equation
+}
+
+
+# ----------------------------------------------------------
+# GENERATE AN AFFINE MAP FROM PARAMETERS TO THE COEFFICIENTS
+# AT REGRESSION FACTORS: A LINEAR MAP + A VECTOR OF DEFAULTS
+# ----------------------------------------------------------
+
+RegresParamMap = matrix (0.0, rows = (num_reg_eqs * num_factors), cols = num_params); # linear part: one row per factor, one column per parameter
+RegresCoeffDefault = matrix (0.0, rows = (num_reg_eqs * num_factors), cols = 1); # constant part: default coefficient per factor
+
+# -----------------------------------------------------------
+# FIRST, AN AFFINE MAP THAT COVERS HIDDEN REPORTS REGRESSIONS
+# -----------------------------------------------------------
+
+for (t in 1 : (num_known_terms + num_predicted_terms)) { # same parameter layout is repeated for every such term t
+# Group 1 attributes:
+ reg_index = ((t-1) * num_attrs - 1 + 1) * num_factors; # factor block of attribute 1
+ RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero; # Default coefficient = 1.0
+ RegresParamMap [reg_index + 2, 1] = 1.0 + zero; # Param #01
+ RegresParamMap [reg_index + 3, 8] = 1.0 + zero; # Param #08
+ RegresParamMap [reg_index + 4, 9] = 1.0 + zero; # Param #09
+ for (i in 2 : 7) { # attributes 2-7: own 2nd-factor param, shared params #10 and #11
+ reg_index = ((t-1) * num_attrs - 1 + i) * num_factors;
+ RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero; # Default coefficient = 1.0
+ RegresParamMap [reg_index + 2, i] = 1.0 + zero; # Param #02-#07
+ RegresParamMap [reg_index + 3, 10] = 1.0 + zero; # Param #10
+ RegresParamMap [reg_index + 4, 11] = 1.0 + zero; # Param #11
+ }
+# Group 2 attribute:
+ reg_index = ((t-1) * num_attrs - 1 + 8) * num_factors; # factor block of attribute 8
+ RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero; # Default coefficient = 1.0
+ RegresParamMap [reg_index + 2, 12] = 1.0 + zero; # Param #12
+ RegresParamMap [reg_index + 3, 13] = 1.0 + zero; # Param #13
+ RegresParamMap [reg_index + 4, 14] = 1.0 + zero; # Param #14
+# Group 3 attributes:
+ reg_index = ((t-1) * num_attrs - 1 + 9) * num_factors; # factor block of attribute 9
+ RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero; # Default coefficient = 1.0
+ RegresParamMap [reg_index + 2, 15] = 1.0 + zero; # Param #15
+ RegresParamMap [reg_index + 3, 22] = 1.0 + zero; # Param #22
+ RegresParamMap [reg_index + 4, 23] = 1.0 + zero; # Param #23
+ for (i in 10 : 15) { # attributes 10-15: own 2nd-factor param, shared params #24 and #25
+ reg_index = ((t-1) * num_attrs - 1 + i) * num_factors;
+ RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero; # Default coefficient = 1.0
+ RegresParamMap [reg_index + 2, 6 + i] = 1.0 + zero; # Param #16-#21
+ RegresParamMap [reg_index + 3, 24] = 1.0 + zero; # Param #24
+ RegresParamMap [reg_index + 4, 25] = 1.0 + zero; # Param #25
+ }
+
+# Group 4 attributes:
+if (is_GROUP_4_ENABLED == 1) { # attributes 16-18 get coefficients only when group 4 is switched on
+ reg_index = ((t-1) * num_attrs - 1 + 16) * num_factors; # factor block of attribute 16
+ RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero; # Default coefficient = 1.0
+ RegresParamMap [reg_index + 2, 29] = 1.0 + zero; # Param #29
+ RegresParamMap [reg_index + 3, 32] = 1.0 + zero; # Param #32
+ RegresParamMap [reg_index + 4, 33] = 1.0 + zero; # Param #33
+ for (i in 17 : 18) { # attributes 17-18: own 2nd-factor param, shared params #34 and #35
+ reg_index = ((t-1) * num_attrs - 1 + i) * num_factors;
+ RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero; # Default coefficient = 1.0
+ RegresParamMap [reg_index + 2, 13 + i] = 1.0 + zero; # Param #30-#31
+ RegresParamMap [reg_index + 3, 34] = 1.0 + zero; # Param #34
+ RegresParamMap [reg_index + 4, 35] = 1.0 + zero; # Param #35
+ }
+}
+
+# Group 5 attribute:
+ reg_index = ((t-1) * num_attrs - 1 + 19) * num_factors; # factor block of attribute 19 (the TOTAL)
+ RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero; # Default coefficient = 1.0
+ RegresParamMap [reg_index + 2, 26] = 1.0 + zero; # Param #26
+ RegresParamMap [reg_index + 3, 27] = 1.0 + zero; # Param #27
+ RegresParamMap [reg_index + 4, 28] = 1.0 + zero; # Param #28
+}
+
+# --------------------------------------------------------------
+# SECOND, AN AFFINE MAP THAT COVERS OBSERVED REPORTS REGRESSIONS
+# --------------------------------------------------------------
+
+for (t in (num_known_terms + num_predicted_terms + 1) : num_terms) # terms that follow the known + predicted ones
+{
+ for (i in 1 : num_attrs) {
+ if (castAsScalar (disabled_known_values [i, t - (num_known_terms + num_predicted_terms)]) == 0.0) # only where the value is not flagged as disabled
+ {
+ reg_index = ((t-1) * num_attrs - 1 + i) * num_factors; # row offset of the factor block for (term t, attribute i)
+ RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero; # Default coefficient = 1.0
+ RegresCoeffDefault [reg_index + 2, 1] = 1.0 + zero; # Default coefficient = 1.0
+ }
+ }
+}
+
+# -------------------------------------------------------------
+# THIRD, AN AFFINE MAP THAT COVERS REGULARIZATION "REGRESSIONS"
+# -------------------------------------------------------------
+
+reg_index = num_terms * num_attrs * num_factors; # first row after all (term, attribute) factor blocks
+ RegresParamMap [reg_index + 1, 27] = 1.0 + zero; # Param #27
+ RegresCoeffDefault [reg_index + 2, 1] = 1.0 + zero; # only this equation also sets a default 1.0 on its 2nd factor
+reg_index = reg_index + num_factors; # each step below moves to the next equation's factor block
+ RegresParamMap [reg_index + 1, 28] = 1.0 + zero; # Param #28
+reg_index = reg_index + num_factors;
+ RegresParamMap [reg_index + 1, 08] = 1.0 + zero; # Param #08
+reg_index = reg_index + num_factors;
+ RegresParamMap [reg_index + 1, 09] = 1.0 + zero; # Param #09
+reg_index = reg_index + num_factors;
+ RegresParamMap [reg_index + 1, 10] = 1.0 + zero; # Param #10
+reg_index = reg_index + num_factors;
+ RegresParamMap [reg_index + 1, 11] = 1.0 + zero; # Param #11
+reg_index = reg_index + num_factors;
+ RegresParamMap [reg_index + 1, 13] = 1.0 + zero; # Param #13
+reg_index = reg_index + num_factors;
+ RegresParamMap [reg_index + 1, 14] = 1.0 + zero; # Param #14
+reg_index = reg_index + num_factors;
+ RegresParamMap [reg_index + 1, 22] = 1.0 + zero; # Param #22
+reg_index = reg_index + num_factors;
+ RegresParamMap [reg_index + 1, 23] = 1.0 + zero; # Param #23
+reg_index = reg_index + num_factors;
+ RegresParamMap [reg_index + 1, 24] = 1.0 + zero; # Param #24
+reg_index = reg_index + num_factors;
+ RegresParamMap [reg_index + 1, 25] = 1.0 + zero; # Param #25
+
+if (is_GROUP_4_ENABLED == 1) { # four more equations, for the group 4 params, when group 4 is switched on
+ reg_index = reg_index + num_factors;
+ RegresParamMap [reg_index + 1, 32] = 1.0 + zero; # Param #32
+ reg_index = reg_index + num_factors;
+ RegresParamMap [reg_index + 1, 33] = 1.0 + zero; # Param #33
+ reg_index = reg_index + num_factors;
+ RegresParamMap [reg_index + 1, 34] = 1.0 + zero; # Param #34
+ reg_index = reg_index + num_factors;
+ RegresParamMap [reg_index + 1, 35] = 1.0 + zero; # Param #35
+}
+
+# ----------------------------------------------------------
+# GENERATE A VECTOR OF SCALE MULTIPLIERS, ONE PER REGRESSION
+# ----------------------------------------------------------
+
+RegresScaleMult = matrix (1.0, rows = num_reg_eqs, cols = 1); # one multiplier per regression equation, initially 1.0
+
+global_weight = 0.5 + zero; # common numerator of every multiplier set below
+
+attribute_size = rowMeans (abs (initial_reports [, 1:num_known_terms])); # mean absolute value of each row over the first num_known_terms columns
+max_attr_size = max (attribute_size); # largest of those row means
+
+for (t in 1 : num_terms) {
+ for (i in 1 : num_attrs) {
+ regeqn = (t-1) * num_attrs + i; # equation number for (term t, attribute i)
+ scale_down = sqrt (attribute_size [i, 1] / max_attr_size) * 0.999 + 0.001; # sqrt of relative size, mapped into [0.001, 1.0]
+ acceptable_drift = scale_down * max_attr_size * 0.001; # 0.1% of the largest attribute size, scaled down for smaller attributes
+ RegresScaleMult [regeqn, 1] = global_weight / (acceptable_drift^2); # smaller drift => larger multiplier
+ }
+}
+
+for (i in 1 : num_regularization_regs) {
+ regeqn = num_terms * num_attrs + i; # regularization equations are numbered after the (term, attribute) ones
+ acceptable_drift = 0.01; # same fixed drift for every regularization equation
+ RegresScaleMult [regeqn, 1] = global_weight / (acceptable_drift^2);
+}
+
+# --------------------------------
+# WRITE OUT ALL GENERATED MATRICES
+# --------------------------------
+
+write (initial_reports, $2, format="text"); # $2..$8 are positional script arguments used as the output file names
+write (CReps, $3, format="text");
+write (RegresValueMap, $4, format="text");
+write (RegresFactorDefault,$5, format="text");
+write (RegresParamMap, $6, format="text");
+write (RegresCoeffDefault, $7, format="text");
+write (RegresScaleMult, $8, format="text");
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/816e2db8/src/test/scripts/applications/impute/wfundInputGenerator2.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/impute/wfundInputGenerator2.dml b/src/test/scripts/applications/impute/wfundInputGenerator2.dml
index 52845ea..e6d302d 100644
--- a/src/test/scripts/applications/impute/wfundInputGenerator2.dml
+++ b/src/test/scripts/applications/impute/wfundInputGenerator2.dml
@@ -1,446 +1,446 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# 2013-10-08: THIS IS THE ATTEMPT TO IMPLEMENT HIDDEN STATE AS "HIDDEN REPORTS"
-# THE FIRST TERMS IN THE REPORTS MATRIX ARE THE HIDDEN REPORTS, THE LAST ARE THE KNOWN REPORTS
-#
-# THIS VERSION IS WITH "DIFFERENTIAL" REGRESSIONS & AUXILIARY ATTRIBUTES
-#
-# hadoop jar SystemML.jar -f test/scripts/applications/impute/wfundInputGenerator2.dml -exec singlenode
-# -args
-# test/scripts/applications/impute/initial_reports_unprocessed
-# test/scripts/applications/impute/initial_reports_preprocessed
-# test/scripts/applications/impute/CReps
-# test/scripts/applications/impute/RegresValueMap
-# test/scripts/applications/impute/RegresFactorDefault
-# test/scripts/applications/impute/RegresParamMap
-# test/scripts/applications/impute/RegresCoeffDefault
-# test/scripts/applications/impute/RegresScaleMult
-
-num_observed_attrs = 19; # The number of attributes in the report
-num_auxiliary_attrs = 5; # The number of extra attributes used to decompose the observed ones
-num_attrs = num_observed_attrs + num_auxiliary_attrs;
-zero = matrix (0.0, rows = 1, cols = 1);
-
-# -------------------------------------------
-# FEEL FREE / DON'T FORGET TO CHANGE THESE:
-# -------------------------------------------
-
-is_GROUP_4_ENABLED = 0; # = 1 or 0
-is_FLIPPING_ENABLED = 0; # = 1 or 0 DISABLE THIS!
-is_QUARTERLY_ENABLED = 1; # = 1 or 0 (enabled for sabesp)
-is_OCTALLY_ENABLED = 0; # = 1 or 0 DISABLE THIS!
-
-num_known_terms = 20; # The number of known term reports
-num_predicted_terms = 1; # The number of predicted term reports
-num_state_terms = num_known_terms + num_predicted_terms;
-
-# Indicator matrix to show which report values should NOT be penalized
-# because of their difference between "observed" and "hidden" reports:
-
-disabled_known_values = matrix (0.0, rows = num_observed_attrs, cols = num_known_terms);
-# disabled_known_values [4, 3] = 1.0 + zero;
-# disabled_known_values [5, 3] = 1.0 + zero;
-# disabled_known_values [6, 3] = 1.0 + zero;
-# disabled_known_values [7, 3] = 1.0 + zero;
-
-
-# --------------------------------------------------------
-# subtotals_tree [i, 1] = the closest subtotal attribute
-# " 0" means that this attribute's values are constants
-# "-1" means that this attribute is a root total
-# --------------------------------------------------------
-
-subtotals_tree = matrix (0.0, rows = num_attrs, cols = 1);
-
-subtotals_tree [ 1, 1] = 19 + zero; subtotals_tree [ 9, 1] = 19 + zero;
-subtotals_tree [ 2, 1] = 1 + zero; subtotals_tree [10, 1] = 9 + zero;
-subtotals_tree [ 3, 1] = 1 + zero; subtotals_tree [11, 1] = 9 + zero;
-subtotals_tree [ 4, 1] = 1 + zero; subtotals_tree [12, 1] = 9 + zero;
-subtotals_tree [ 5, 1] = 1 + zero; subtotals_tree [13, 1] = 9 + zero;
-subtotals_tree [ 6, 1] = 1 + zero; subtotals_tree [14, 1] = 9 + zero;
-subtotals_tree [ 7, 1] = 1 + zero; subtotals_tree [15, 1] = 9 + zero;
-subtotals_tree [ 8, 1] = 19 + zero; subtotals_tree [19, 1] = -1 + zero; # TOTAL
-
-if (is_GROUP_4_ENABLED == 1) {
- subtotals_tree [16, 1] = 19 + zero;
- subtotals_tree [17, 1] = 16 + zero;
- subtotals_tree [18, 1] = 16 + zero;
-}
-
-subtotals_tree [20, 1] = -1 + zero; # Auxiliary TOTAL
-subtotals_tree [21, 1] = 20 + zero;
-if (is_FLIPPING_ENABLED == 1) {subtotals_tree [22, 1] = 20 + zero;}
-if (is_QUARTERLY_ENABLED == 1) {subtotals_tree [23, 1] = 20 + zero;}
-if (is_OCTALLY_ENABLED == 1) {subtotals_tree [24, 1] = 20 + zero;}
-
-# -------------------------------------------------------------------
-# We have two full column-slots for every report: one slot for the
-# "hidden" report (# i) and one slot for the "observed" report
-# (# i + num_state_terms). Only the "hidden" part has degrees of
-# freedom associated with it; the "observed" part is kept constant.
-# We penalize most "hidden" values if they deviate too far from the
-# "observed" values. We also use this penalty to regularize
-# auxiliary attributes and/or predicted reports, in which case their
-# "observed" counterparts are set to zero.
-# -------------------------------------------------------------------
-
-num_terms = 2 * num_state_terms;
-
-initial_reports_unprocessed = read ($1);
-initial_reports = matrix (0.0, rows = num_attrs, cols = num_terms);
-initial_reports [1:num_observed_attrs, 1:num_known_terms] =
- initial_reports_unprocessed [1:num_observed_attrs, 1:num_known_terms];
-initial_reports [1:num_observed_attrs, (num_state_terms + 1) : (num_state_terms + num_known_terms)] =
- initial_reports_unprocessed [1:num_observed_attrs, 1:num_known_terms];
-
-disabled_known_values_extended = matrix (0.0, rows = num_attrs, cols = num_state_terms);
-disabled_known_values_extended [1:num_observed_attrs, 1:num_known_terms] = disabled_known_values;
-disabled_known_values = disabled_known_values_extended;
-
-# ---------------------------------------------------------
-# GENERATE AN AFFINE MAP FROM FREE VARIABLES TO THE REPORTS
-# AFFINE MAP = LINEAR MAP + INITIAL (DEFAULT) REPORTS
-# All free variables are mapped to the "HIDDEN" reports
-# ---------------------------------------------------------
-
-is_free = matrix (1.0, rows = num_attrs, cols = 1);
-for (i in 1:num_attrs) {
- j = castAsScalar (subtotals_tree [i, 1]);
- if (j > 0.0) {
- is_free [j, 1] = 0.0 + zero;
- } else {
- if (j == 0.0) {
- is_free [i, 1] = 0.0 + zero;
-} } }
-num_frees_per_term = sum (is_free);
-num_frees = num_state_terms * num_frees_per_term;
-
-CReps_block = matrix (0.0, rows = num_attrs, cols = num_frees_per_term);
-index_free = 0;
-for (i in 1:num_attrs) {
- if (castAsScalar (is_free [i, 1]) == 1.0) {
- index_free = index_free + 1;
- j = i;
- while (j > 0.0) {
- CReps_block [j, index_free] = 1.0 + zero;
- j = castAsScalar (subtotals_tree [j, 1]);
-} } }
-
-CReps = matrix (0.0, rows = (num_terms * num_attrs), cols = num_frees);
-for (t in 1:num_state_terms)
-{
- dt = (t-1) * num_attrs;
- df = (t-1) * num_frees_per_term;
- CReps [(dt + 1) : (dt + num_attrs), (df + 1) : (df + num_frees_per_term)] = CReps_block;
-}
-
-
-# ---------------------------------------------------------
-# GENERATE AN AFFINE MAP FROM REPORTS TO REGRESSION FACTORS
-# AFFINE MAP = LINEAR MAP + A VECTOR OF DEFAULTS
-# ---------------------------------------------------------
-
-# We have one regression equation per time-term for each attribute, plus a few special
-# regularization "regression" equations. There are three types of regressions:
-# 1. For "hidden" reports:
-# x[t] ~ subtotal[t], x[t-1], (x[t-1] - x[t-2])
-# (TOTAL[t] - TOTAL[t-1]) ~ (TOTAL[t-1] - TOTAL[t-2]), aux_1[t] (with coeff. 1)
-# where aux_1[t] = aux_2[t] + ... + aux_5[t] (implemented as hard constraint)
-# 2. For "observed" reports:
-# y[t] ~ x[t] (with coefficient 1)
-# 3. For all parameters: regularization equations.
-# All regressions follow the 4-factor pattern.
-
-num_factors = 4;
-num_params = 18 * 3 + 1;
-num_reg_eqs = num_terms * num_attrs + num_params;
-
-# All regression equations for the same attribute share the same parameters, regardless
-# of the term; some parameters may be shared across multiple attributes, (those attributes
-# whose behavior is believed to be similar) as specified in the table below:
-
-# NON-TOTAL OBSERVED ATTRIBUTE REGRESSION EQUATIONS:
-#
-# Factors: (x[t-1] -
-# -x[t] agg[t] x[t-1] x[t-2])
-# -----------------------------------------------------------------------------
-# Row #i = 1...18: 1.0 prm[3*i-1] prm[3*i] prm[3*i+1]
-# (Must have: agg = subtotals_tree [i, 1] > 0.0)
-# -----------------------------------------------------------------------------
-
-# TOTAL AND AUXILIARY ATTRIBUTE REGRESSION EQUATIONS:
-#
-# Factors: -(x[t] - (x[t-1] -
-# x[t-1]) x[t-2]) x[t-1] aux_1[t]
-# -----------------------------------------------------------------------------
-# TOTAL (Row #19): 1.0 prm[1] 0.0 1.0
-# aux_1 (Row #20): 0.0 0.0 0.0 0.0
-# aux_2 (Row #21): 1.0 1.0 0.0 0.0 "steady"
-# aux_3 (Row #22): 1.0 1.0 -4.0 0.0 "flipping"
-# aux_4 (Row #23): 1.0 1.0 -2.0 0.0 "quarterly"
-# aux_5 (Row #24): 1.0 1.0 sqrt(2)-2 0.0 "octally"
-# -----------------------------------------------------------------------------
-
-# THE LAST REGULARIZATION "REGRESSION" EQUATIONS:
-#
-# Factors: -1.0 1.0 0.0 0.0
-# -----------------------------------------------------------------------------
-# For prm[1]: prm[1] 0.0 ? 0.0 0.0 ???
-# For i = 1...18: prm[3*i-1] 0.0 0.0 0.0 if subtotals_tree [i, 1] == 0.0
-# prm[3*i] 1.0 0.0 0.0
-# prm[3*i+1] 0.0 0.0 0.0
-# For all others: 0.0 0.0 0.0 0.0
-# -----------------------------------------------------------------------------
-
-RegresValueMap = matrix (0.0, rows = (num_reg_eqs * num_factors), cols = (num_terms * num_attrs));
-RegresFactorDefault = matrix (0.0, rows = (num_reg_eqs * num_factors), cols = 1);
-
-# --------------------------------------------------------------
-# FIRST, AN AFFINE MAP FROM HIDDEN REPORTS TO REGRESSION FACTORS
-# --------------------------------------------------------------
-
-for (t in 2 : num_state_terms) {
- for (i in 1 : num_attrs) {
- reg_index = ((t-1) * num_attrs - 1 + i) * num_factors;
- agg = castAsScalar (subtotals_tree [i, 1]);
- if (i <= 18 & agg > 0)
- {
- RegresValueMap [reg_index + 1, (t-1) * num_attrs + i ] = -1.0 + zero; # 1st factor: -x[t]
- RegresValueMap [reg_index + 2, (t-1) * num_attrs + agg] = 1.0 + zero; # 2nd factor: agg[t]
- RegresValueMap [reg_index + 3, (t-2) * num_attrs + i ] = 1.0 + zero; # 3rd factor: x[t-1]
- if (t == 2) {
- RegresValueMap [reg_index + 4, (t-1) * num_attrs + i] = 1.0 + zero; # 4th factor:
- RegresValueMap [reg_index + 4, (t-2) * num_attrs + i] = -1.0 + zero; # x[t] - x[t-1]
- } else {
- RegresValueMap [reg_index + 4, (t-2) * num_attrs + i] = 1.0 + zero; # 4th factor:
- RegresValueMap [reg_index + 4, (t-3) * num_attrs + i] = -1.0 + zero; # x[t-1] - x[t-2]
- }
-### RegresFactorDefault [reg_index + 4, 1] = 1.0 + zero; # 4th factor: Intercept
- }
- if ((i == 19 | i >= 21) & t >= 3 & agg != 0)
- {
- reg_index = ((t-1) * num_attrs - 1 + i) * num_factors;
- RegresValueMap [reg_index + 1, (t-1) * num_attrs + i] = -1.0 + zero; # 1st factor:
- RegresValueMap [reg_index + 1, (t-2) * num_attrs + i] = 1.0 + zero; # - x[t] + x[t-1]
- RegresValueMap [reg_index + 2, (t-2) * num_attrs + i] = 1.0 + zero; # 2nd factor:
- RegresValueMap [reg_index + 2, (t-3) * num_attrs + i] = -1.0 + zero; # x[t-1] - x[t-2]
- RegresValueMap [reg_index + 3, (t-2) * num_attrs + i] = 1.0 + zero; # 3rd factor: x[t-1]
- RegresValueMap [reg_index + 4, (t-1) * num_attrs + 20] = 1.0 + zero; # 4th factor: aux_1[t]
-} } }
-
-# ----------------------------------------------------------------------------------------
-# SECOND, AN AFFINE MAP FROM OBSERVED REPORTS TO REGRESSION FACTORS FOR HIDDEN-TO-OBSERVED
-# REPORTS MATCHING AND/OR REPORT VALUE REGULARIZATION
-# NOTE THAT WE REGULARIZE AUXILIARY ATTRIBUTES BY MATCHING THEM TO ZEROS!
-# ----------------------------------------------------------------------------------------
-
-for (t1 in (num_state_terms + 1) : num_terms) {
- t2 = t1 - num_state_terms;
- for (i in 1 : num_attrs) {
- if ((i <= num_observed_attrs & t2 <= num_known_terms & castAsScalar (disabled_known_values [i, t2]) == 0.0) |
- (i > num_observed_attrs & castAsScalar (subtotals_tree [i, 1]) > 0.0))
- {
- reg_index = ((t1 - 1) * num_attrs - 1 + i) * num_factors;
- RegresValueMap [reg_index + 1, (t1 - 1) * num_attrs + i] = -1.0 + zero; # 1st factor: -y[t]
- RegresValueMap [reg_index + 2, (t2 - 1) * num_attrs + i] = 1.0 + zero; # 2nd factor: x[t]
-} } }
-
-# -----------------------------------------------------------------------
-# THIRD, AN AFFINE MAP THAT COVERS PARAMETER REGULARIZATION "REGRESSIONS"
-# -----------------------------------------------------------------------
-
-reg_index_base = num_terms * num_attrs * num_factors;
-for (param in 1:num_params)
-{
- reg_index = reg_index_base + (param - 1) * num_factors;
- RegresFactorDefault [reg_index + 1, 1] = -1.0 + zero;
- RegresFactorDefault [reg_index + 2, 1] = 1.0 + zero;
-}
-
-
-# ----------------------------------------------------------
-# GENERATE AN AFFINE MAP FROM PARAMETERS TO THE COEFFICIENTS
-# AT REGRESSION FACTORS: A LINEAR MAP + A VECTOR OF DEFAULTS
-# ----------------------------------------------------------
-
-RegresParamMap = matrix (0.0, rows = (num_reg_eqs * num_factors), cols = num_params);
-RegresCoeffDefault = matrix (0.0, rows = (num_reg_eqs * num_factors), cols = 1);
-
-# -----------------------------------------------------------
-# FIRST, AN AFFINE MAP THAT COVERS HIDDEN REPORTS REGRESSIONS
-# -----------------------------------------------------------
-
-for (t in 2 : num_state_terms) {
- for (i in 1 : num_observed_attrs) {
- if (castAsScalar (subtotals_tree [i, 1]) > 0.0) {
- param_1 = 3 * i - 1;
- param_2 = 3 * i;
- param_3 = 3 * i + 1;
- reg_index = ((t-1) * num_attrs - 1 + i) * num_factors;
- RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero;
- RegresParamMap [reg_index + 2, param_1] = 1.0 + zero;
- RegresParamMap [reg_index + 3, param_2] = 1.0 + zero;
- RegresParamMap [reg_index + 4, param_3] = 1.0 + zero;
- } }
-
- reg_index = ((t-1) * num_attrs - 1 + 19) * num_factors;
- RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero;
- RegresParamMap [reg_index + 2, 1] = 1.0 + zero; # prm[1]
- RegresCoeffDefault [reg_index + 4, 1] = 1.0 + zero;
-
- for (i in (num_observed_attrs + 1) : num_attrs) {
- if (castAsScalar (subtotals_tree [i, 1]) > 0.0) {
- reg_index = ((t-1) * num_attrs - 1 + i) * num_factors;
- RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero;
- RegresCoeffDefault [reg_index + 2, 1] = 1.0 + zero;
- if (i == 22) {
- RegresCoeffDefault [reg_index + 3, 1] = -4.0 + zero;
- }
- if (i == 23) {
- RegresCoeffDefault [reg_index + 3, 1] = -2.0 + zero;
- }
- if (i == 24) {
- RegresCoeffDefault [reg_index + 3, 1] = sqrt (2.0) - 2.0 + zero;
-} } } }
-
-# -----------------------------------------------------------------------
-# SECOND, AN AFFINE MAP THAT COVERS HIDDEN-TO-OBSERVED REPORTS MATCHING
-# AND/OR REPORT VALUE REGULARIZATION
-# NOTE THAT WE REGULARIZE AUXILIARY ATTRIBUTES BY MATCHING THEM TO ZEROS!
-# -----------------------------------------------------------------------
-
-for (t1 in (num_state_terms + 1) : num_terms) {
- t2 = t1 - num_state_terms;
- for (i in 1 : num_attrs) {
- if ((i <= num_observed_attrs & t2 <= num_known_terms & castAsScalar (disabled_known_values [i, t2]) == 0.0) |
- (i > num_observed_attrs & castAsScalar (subtotals_tree [i, 1]) > 0.0))
- {
- reg_index = ((t1 - 1) * num_attrs - 1 + i) * num_factors;
- RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero;
- RegresCoeffDefault [reg_index + 2, 1] = 1.0 + zero;
-} } }
-
-# -----------------------------------------------------------------------
-# THIRD, AN AFFINE MAP THAT COVERS PARAMETER REGULARIZATION "REGRESSIONS"
-# -----------------------------------------------------------------------
-
-reg_index_base = num_terms * num_attrs * num_factors;
-
-param = 1;
-
-reg_index = reg_index_base + (param - 1) * num_factors;
-RegresParamMap [reg_index + 1, param] = 1.0 + zero;
-RegresCoeffDefault [reg_index + 2, 1 ] = 0.0 + zero;
-
-for (i in 1 : num_observed_attrs) {
- agg = castAsScalar (subtotals_tree [i, 1]);
- if (agg >= 0.0)
- {
- param = 3 * i - 1;
-
- if (agg == 0.0) {
- reg_index = reg_index_base + (param - 1) * num_factors;
- RegresParamMap [reg_index + 1, param] = 1.0 + zero;
- RegresCoeffDefault [reg_index + 2, 1 ] = 0.0 + zero;
- }
-
- param = 3 * i;
-
- reg_index = reg_index_base + (param - 1) * num_factors;
- RegresParamMap [reg_index + 1, param] = 1.0 + zero;
- RegresCoeffDefault [reg_index + 2, 1 ] = 1.0 + zero;
-
- param = 3 * i + 1;
-
- reg_index = reg_index_base + (param - 1) * num_factors;
- RegresParamMap [reg_index + 1, param] = 1.0 + zero;
- RegresCoeffDefault [reg_index + 2, 1 ] = 0.0 + zero;
- }
-}
-
-
-# ----------------------------------------------------------
-# GENERATE A VECTOR OF SCALE MULTIPLIERS, ONE PER REGRESSION
-# ----------------------------------------------------------
-
-RegresScaleMult = matrix (1.0, rows = num_reg_eqs, cols = 1);
-
-global_weight = 0.5 + zero;
-
-attribute_size = rowMeans (abs (initial_reports [1:num_observed_attrs, 1:num_known_terms]));
-max_attr_size = max (attribute_size);
-difference_size = rowMeans (abs (initial_reports [1:num_observed_attrs, 2:num_known_terms]
- - initial_reports [1:num_observed_attrs, 1:(num_known_terms-1)]));
-max_diff_size = max (difference_size);
-
-for (i in 1 : num_attrs)
-{
- scale_factor = 1.0;
- if (i <= num_observed_attrs) {
- ### CORRECTION FOR OBSERVED ATTRIBUTES:
- attribute_size_i = castAsScalar (attribute_size [i, 1]);
- scale_factor = sqrt (attribute_size_i / max_attr_size) * 0.999 + 0.001;
- }
- for (t in 1 : num_terms) {
- if (t <= num_state_terms) {
- ### HIDDEN-STATE RECURRENCE REGRESSIONS
- if (i <= num_observed_attrs) {
- ### RECURRENCES FOR OBSERVED ATTRIBUTES:
- acceptable_drift = scale_factor * max_attr_size * 0.0005;
- } else {
- ### RECURRENCES FOR AUXILIARY ATTRIBUTES:
- acceptable_drift = scale_factor * max_diff_size * 0.0005;
- }
- } else {
- ### MATCHING AND REGULARIZATION
- if (i <= num_observed_attrs) {
- ### MATCHING OF HIDDEN WITH OBSERVED ATTRIBUTES:
- acceptable_drift = scale_factor * max_attr_size * 0.001;
- } else {
- ### REGULARIZATION OF AUXILIARY ATTRIBUTES:
- acceptable_drift = scale_factor * max_diff_size * 0.1;
- } }
- regeqn = (t-1) * num_attrs + i;
- RegresScaleMult [regeqn, 1] = global_weight / (acceptable_drift^2);
- }
-}
-
-for (i in 1 : num_params) {
- regeqn = num_terms * num_attrs + i;
- acceptable_drift = 0.05;
- if (i == 1) {
- acceptable_drift = 0.01; # 0.005;
- }
- RegresScaleMult [regeqn, 1] = global_weight / (acceptable_drift^2);
-}
-
-# --------------------------------
-# WRITE OUT ALL GENERATED MATRICES
-# --------------------------------
-
-write (initial_reports, $2, format="text");
-write (CReps, $3, format="text");
-write (RegresValueMap, $4, format="text");
-write (RegresFactorDefault,$5, format="text");
-write (RegresParamMap, $6, format="text");
-write (RegresCoeffDefault, $7, format="text");
-write (RegresScaleMult, $8, format="text");
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# 2013-10-08: THIS IS THE ATTEMPT TO IMPLEMENT HIDDEN STATE AS "HIDDEN REPORTS"
+# THE FIRST TERMS IN THE REPORTS MATRIX ARE THE HIDDEN REPORTS, THE LAST ARE THE KNOWN REPORTS
+#
+# THIS VERSION IS WITH "DIFFERENTIAL" REGRESSIONS & AUXILIARY ATTRIBUTES
+#
+# hadoop jar SystemML.jar -f test/scripts/applications/impute/wfundInputGenerator2.dml -exec singlenode
+# -args
+# test/scripts/applications/impute/initial_reports_unprocessed
+# test/scripts/applications/impute/initial_reports_preprocessed
+# test/scripts/applications/impute/CReps
+# test/scripts/applications/impute/RegresValueMap
+# test/scripts/applications/impute/RegresFactorDefault
+# test/scripts/applications/impute/RegresParamMap
+# test/scripts/applications/impute/RegresCoeffDefault
+# test/scripts/applications/impute/RegresScaleMult
+
+num_observed_attrs = 19; # The number of attributes in the report
+num_auxiliary_attrs = 5; # The number of extra attributes used to decompose the observed ones
+num_attrs = num_observed_attrs + num_auxiliary_attrs; # Total attribute rows per term: observed first, then auxiliary
+zero = matrix (0.0, rows = 1, cols = 1); # 1x1 zero matrix; added to scalars (e.g. "1.0 + zero") before they are stored into matrix cells
+
+# -------------------------------------------
+# FEEL FREE / DON'T FORGET TO CHANGE THESE:
+# -------------------------------------------
+
+is_GROUP_4_ENABLED = 0; # = 1 or 0
+is_FLIPPING_ENABLED = 0; # = 1 or 0 DISABLE THIS!
+is_QUARTERLY_ENABLED = 1; # = 1 or 0 (enabled for sabesp)
+is_OCTALLY_ENABLED = 0; # = 1 or 0 DISABLE THIS!
+
+num_known_terms = 20; # The number of known term reports
+num_predicted_terms = 1; # The number of predicted term reports
+num_state_terms = num_known_terms + num_predicted_terms; # Number of "hidden" report slots: known + predicted
+
+# Indicator matrix to show which report values should NOT be penalized
+# because of their difference between "observed" and "hidden" reports:
+
+disabled_known_values = matrix (0.0, rows = num_observed_attrs, cols = num_known_terms); # all 0.0: no value is exempt by default
+# disabled_known_values [4, 3] = 1.0 + zero;
+# disabled_known_values [5, 3] = 1.0 + zero;
+# disabled_known_values [6, 3] = 1.0 + zero;
+# disabled_known_values [7, 3] = 1.0 + zero;
+
+
+# --------------------------------------------------------
+# subtotals_tree [i, 1] = the closest subtotal attribute
+# " 0" means that this attribute's values are constants
+# "-1" means that this attribute is a root total
+# --------------------------------------------------------
+
+subtotals_tree = matrix (0.0, rows = num_attrs, cols = 1); # entries not assigned below stay 0 (= constant attribute)
+
+subtotals_tree [ 1, 1] = 19 + zero; subtotals_tree [ 9, 1] = 19 + zero; # 1 and 9 are subtotals directly under TOTAL (19)
+subtotals_tree [ 2, 1] = 1 + zero; subtotals_tree [10, 1] = 9 + zero; # 2-7 roll up into 1; 10-15 roll up into 9
+subtotals_tree [ 3, 1] = 1 + zero; subtotals_tree [11, 1] = 9 + zero;
+subtotals_tree [ 4, 1] = 1 + zero; subtotals_tree [12, 1] = 9 + zero;
+subtotals_tree [ 5, 1] = 1 + zero; subtotals_tree [13, 1] = 9 + zero;
+subtotals_tree [ 6, 1] = 1 + zero; subtotals_tree [14, 1] = 9 + zero;
+subtotals_tree [ 7, 1] = 1 + zero; subtotals_tree [15, 1] = 9 + zero;
+subtotals_tree [ 8, 1] = 19 + zero; subtotals_tree [19, 1] = -1 + zero; # TOTAL
+
+if (is_GROUP_4_ENABLED == 1) { # otherwise 16-18 keep the initial 0
+ subtotals_tree [16, 1] = 19 + zero;
+ subtotals_tree [17, 1] = 16 + zero;
+ subtotals_tree [18, 1] = 16 + zero;
+}
+
+subtotals_tree [20, 1] = -1 + zero; # Auxiliary TOTAL
+subtotals_tree [21, 1] = 20 + zero; # 21 is always attached under the auxiliary total
+if (is_FLIPPING_ENABLED == 1) {subtotals_tree [22, 1] = 20 + zero;} # 22-24 are attached under 20 only when their switch is on
+if (is_QUARTERLY_ENABLED == 1) {subtotals_tree [23, 1] = 20 + zero;}
+if (is_OCTALLY_ENABLED == 1) {subtotals_tree [24, 1] = 20 + zero;}
+
+# -------------------------------------------------------------------
+# We have two full column-slots for every report: one slot for the
+# "hidden" report (# i) and one slot for the "observed" report
+# (# i + num_state_terms). Only the "hidden" part has degrees of
+# freedom associated with it; the "observed" part is kept constant.
+# We penalize most "hidden" values if they deviate too far from the
+# "observed" values. We also use this penalty to regularize
+# auxiliary attributes and/or predicted reports, in which case their
+# "observed" counterparts are set to zero.
+# -------------------------------------------------------------------
+
+num_terms = 2 * num_state_terms; # hidden slots followed by the same number of observed slots
+
+initial_reports_unprocessed = read ($1); # $1: first positional script argument
+initial_reports = matrix (0.0, rows = num_attrs, cols = num_terms); # cells not copied into below stay 0.0
+initial_reports [1:num_observed_attrs, 1:num_known_terms] =
+ initial_reports_unprocessed [1:num_observed_attrs, 1:num_known_terms]; # hidden slots start out equal to the input values
+initial_reports [1:num_observed_attrs, (num_state_terms + 1) : (num_state_terms + num_known_terms)] =
+ initial_reports_unprocessed [1:num_observed_attrs, 1:num_known_terms]; # observed slots receive the same input values
+
+disabled_known_values_extended = matrix (0.0, rows = num_attrs, cols = num_state_terms); # mask padded to all attributes and all state terms
+disabled_known_values_extended [1:num_observed_attrs, 1:num_known_terms] = disabled_known_values;
+disabled_known_values = disabled_known_values_extended; # from here on the name refers to the padded mask
+
+# ---------------------------------------------------------
+# GENERATE AN AFFINE MAP FROM FREE VARIABLES TO THE REPORTS
+# AFFINE MAP = LINEAR MAP + INITIAL (DEFAULT) REPORTS
+# All free variables are mapped to the "HIDDEN" reports
+# ---------------------------------------------------------
+
+# An attribute stays "free" unless it is the parent of some other attribute
+# (its value is then the sum of its children) or its own tree entry is 0.0.
+is_free = matrix (1.0, rows = num_attrs, cols = 1);
+for (i in 1:num_attrs) {
+    parent = castAsScalar (subtotals_tree [i, 1]);
+    if (parent > 0.0) {
+        is_free [parent, 1] = 0.0 + zero;
+    }
+    if (parent == 0.0) {
+        is_free [i, 1] = 0.0 + zero;
+    }
+}
+num_frees_per_term = sum (is_free);
+num_frees = num_state_terms * num_frees_per_term;
+
+# One column per free attribute; the column marks the attribute itself and every
+# ancestor reached by following the parent pointers until a non-positive entry.
+CReps_block = matrix (0.0, rows = num_attrs, cols = num_frees_per_term);
+index_free = 0;
+for (i in 1:num_attrs) {
+    if (castAsScalar (is_free [i, 1]) == 1.0) {
+        index_free = index_free + 1;
+        node = i;
+        while (node > 0.0) {
+            CReps_block [node, index_free] = 1.0 + zero;
+            node = castAsScalar (subtotals_tree [node, 1]);
+        }
+    }
+}
+
+# Block-diagonal replication over the hidden terms only; the rows belonging to
+# the observed terms remain zero.
+CReps = matrix (0.0, rows = (num_terms * num_attrs), cols = num_frees);
+for (t in 1:num_state_terms) {
+    row_offset = (t-1) * num_attrs;
+    col_offset = (t-1) * num_frees_per_term;
+    CReps [(row_offset + 1) : (row_offset + num_attrs), (col_offset + 1) : (col_offset + num_frees_per_term)] = CReps_block;
+}
+
+
+# ---------------------------------------------------------
+# GENERATE AN AFFINE MAP FROM REPORTS TO REGRESSION FACTORS
+# AFFINE MAP = LINEAR MAP + A VECTOR OF DEFAULTS
+# ---------------------------------------------------------
+
+# We have one regression equation per time-term for each attribute, plus a few special
+# regularization "regression" equations. There are three types of regressions:
+# 1. For "hidden" reports:
+# x[t] ~ subtotal[t], x[t-1], (x[t-1] - x[t-2])
+# (TOTAL[t] - TOTAL[t-1]) ~ (TOTAL[t-1] - TOTAL[t-2]), aux_1[t] (with coeff. 1)
+# where aux_1[t] = aux_2[t] + ... + aux_5[t] (implemented as hard constraint)
+# 2. For "observed" reports:
+# y[t] ~ x[t] (with coefficient 1)
+# 3. For all parameters: regularization equations.
+# All regressions follow the 4-factor pattern.
+
+num_factors = 4; # every regression equation has exactly 4 factors
+num_params = 18 * 3 + 1; # prm[1] plus three parameters (prm[3*i-1], prm[3*i], prm[3*i+1]) for each row i = 1...18
+num_reg_eqs = num_terms * num_attrs + num_params; # one equation per (term, attribute) slot, plus one regularization equation per parameter
+
+# All regression equations for the same attribute share the same parameters, regardless
+# of the term; some parameters may be shared across multiple attributes (those attributes
+# whose behavior is believed to be similar), as specified in the tables below:
+
+# NON-TOTAL OBSERVED ATTRIBUTE REGRESSION EQUATIONS:
+#
+# Factors: (x[t-1] -
+# -x[t] agg[t] x[t-1] x[t-2])
+# -----------------------------------------------------------------------------
+# Row #i = 1...18: 1.0 prm[3*i-1] prm[3*i] prm[3*i+1]
+# (Must have: agg = subtotals_tree [i, 1] > 0.0)
+# -----------------------------------------------------------------------------
+
+# TOTAL AND AUXILIARY ATTRIBUTE REGRESSION EQUATIONS:
+#
+# Factors: -(x[t] - (x[t-1] -
+# x[t-1]) x[t-2]) x[t-1] aux_1[t]
+# -----------------------------------------------------------------------------
+# TOTAL (Row #19): 1.0 prm[1] 0.0 1.0
+# aux_1 (Row #20): 0.0 0.0 0.0 0.0
+# aux_2 (Row #21): 1.0 1.0 0.0 0.0 "steady"
+# aux_3 (Row #22): 1.0 1.0 -4.0 0.0 "flipping"
+# aux_4 (Row #23): 1.0 1.0 -2.0 0.0 "quarterly"
+# aux_5 (Row #24): 1.0 1.0 sqrt(2)-2 0.0 "octally"
+# -----------------------------------------------------------------------------
+
+# THE LAST REGULARIZATION "REGRESSION" EQUATIONS:
+#
+# Factors: -1.0 1.0 0.0 0.0
+# -----------------------------------------------------------------------------
+# For prm[1]: prm[1] 0.0 0.0 0.0 (0.0 is the value the code below uses)
+# For i = 1...18: prm[3*i-1] 0.0 0.0 0.0 if subtotals_tree [i, 1] == 0.0
+# prm[3*i] 1.0 0.0 0.0
+# prm[3*i+1] 0.0 0.0 0.0
+# For all others: 0.0 0.0 0.0 0.0
+# -----------------------------------------------------------------------------
+
+# RegresValueMap is the linear part and RegresFactorDefault the constant part of the
+# affine map from the flattened reports (num_terms * num_attrs entries, attribute i of
+# term t at position (t-1) * num_attrs + i) to the factor values. Equation number e
+# owns the num_factors consecutive rows (e-1) * num_factors + 1 ... e * num_factors.
+RegresValueMap = matrix (0.0, rows = (num_reg_eqs * num_factors), cols = (num_terms * num_attrs));
+RegresFactorDefault = matrix (0.0, rows = (num_reg_eqs * num_factors), cols = 1);
+
+# --------------------------------------------------------------
+# FIRST, AN AFFINE MAP FROM HIDDEN REPORTS TO REGRESSION FACTORS
+# --------------------------------------------------------------
+
+# The loop starts at t = 2 because every recurrence refers to the previous term.
+for (t in 2 : num_state_terms) {
+ for (i in 1 : num_attrs) {
+ # Zero-based row offset of the factors of equation (t-1) * num_attrs + i
+ reg_index = ((t-1) * num_attrs - 1 + i) * num_factors;
+ agg = castAsScalar (subtotals_tree [i, 1]);
+ # Rows 1...18 that roll up into an aggregate: x[t] ~ agg[t], x[t-1], (x[t-1] - x[t-2])
+ if (i <= 18 & agg > 0)
+ {
+ RegresValueMap [reg_index + 1, (t-1) * num_attrs + i ] = -1.0 + zero; # 1st factor: -x[t]
+ RegresValueMap [reg_index + 2, (t-1) * num_attrs + agg] = 1.0 + zero; # 2nd factor: agg[t]
+ RegresValueMap [reg_index + 3, (t-2) * num_attrs + i ] = 1.0 + zero; # 3rd factor: x[t-1]
+ # For t == 2 there is no term t-2 (the offset (t-3) * num_attrs would be negative),
+ # so the difference x[t] - x[t-1] is used in place of x[t-1] - x[t-2].
+ if (t == 2) {
+ RegresValueMap [reg_index + 4, (t-1) * num_attrs + i] = 1.0 + zero; # 4th factor:
+ RegresValueMap [reg_index + 4, (t-2) * num_attrs + i] = -1.0 + zero; # x[t] - x[t-1]
+ } else {
+ RegresValueMap [reg_index + 4, (t-2) * num_attrs + i] = 1.0 + zero; # 4th factor:
+ RegresValueMap [reg_index + 4, (t-3) * num_attrs + i] = -1.0 + zero; # x[t-1] - x[t-2]
+ }
+### RegresFactorDefault [reg_index + 4, 1] = 1.0 + zero; # 4th factor: Intercept
+ }
+ # TOTAL (row 19) and auxiliary rows 21 and up, only from t = 3 on (term t-2 is needed)
+ # and only when the tree entry is non-zero (-1 for TOTAL, 20 for enabled auxiliaries).
+ if ((i == 19 | i >= 21) & t >= 3 & agg != 0)
+ {
+ # Same value as computed at the top of the loop body
+ reg_index = ((t-1) * num_attrs - 1 + i) * num_factors;
+ RegresValueMap [reg_index + 1, (t-1) * num_attrs + i] = -1.0 + zero; # 1st factor:
+ RegresValueMap [reg_index + 1, (t-2) * num_attrs + i] = 1.0 + zero; # - x[t] + x[t-1]
+ RegresValueMap [reg_index + 2, (t-2) * num_attrs + i] = 1.0 + zero; # 2nd factor:
+ RegresValueMap [reg_index + 2, (t-3) * num_attrs + i] = -1.0 + zero; # x[t-1] - x[t-2]
+ RegresValueMap [reg_index + 3, (t-2) * num_attrs + i] = 1.0 + zero; # 3rd factor: x[t-1]
+ RegresValueMap [reg_index + 4, (t-1) * num_attrs + 20] = 1.0 + zero; # 4th factor: aux_1[t]
+} } }
+
+# ----------------------------------------------------------------------------------------
+# SECOND, AN AFFINE MAP FROM OBSERVED REPORTS TO REGRESSION FACTORS FOR HIDDEN-TO-OBSERVED
+# REPORTS MATCHING AND/OR REPORT VALUE REGULARIZATION
+# NOTE THAT WE REGULARIZE AUXILIARY ATTRIBUTES BY MATCHING THEM TO ZEROS!
+# ----------------------------------------------------------------------------------------
+
+# t1 walks the "observed" column-slots; t2 is the matching "hidden" term.
+# An equation -y[t] + x[t] is generated either for an observed attribute whose known
+# value is not disabled, or for an auxiliary attribute with a positive tree entry
+# (its observed counterpart is zero, so this regularizes it towards zero).
+for (t1 in (num_state_terms + 1) : num_terms) {
+    t2 = t1 - num_state_terms;
+    for (i in 1 : num_attrs) {
+        if ((i <= num_observed_attrs & t2 <= num_known_terms & castAsScalar (disabled_known_values [i, t2]) == 0.0) |
+            (i > num_observed_attrs & castAsScalar (subtotals_tree [i, 1]) > 0.0))
+        {
+            observed_slot = (t1 - 1) * num_attrs + i;   # also the equation number
+            hidden_slot   = (t2 - 1) * num_attrs + i;
+            reg_index = (observed_slot - 1) * num_factors;
+            RegresValueMap [reg_index + 1, observed_slot] = -1.0 + zero;  # 1st factor: -y[t]
+            RegresValueMap [reg_index + 2, hidden_slot]   =  1.0 + zero;  # 2nd factor: x[t]
+        }
+    }
+}
+
+# -----------------------------------------------------------------------
+# THIRD, AN AFFINE MAP THAT COVERS PARAMETER REGULARIZATION "REGRESSIONS"
+# -----------------------------------------------------------------------
+
+# The parameter regularization equations follow all (term, attribute) equations.
+# Their factors are the constants (-1.0, 1.0, 0.0, 0.0), independent of the reports.
+reg_index_base = num_terms * num_attrs * num_factors;
+for (param in 1:num_params) {
+    first_factor_row = reg_index_base + (param - 1) * num_factors + 1;
+    RegresFactorDefault [first_factor_row,     1] = -1.0 + zero;
+    RegresFactorDefault [first_factor_row + 1, 1] =  1.0 + zero;
+}
+
+
+# ----------------------------------------------------------
+# GENERATE AN AFFINE MAP FROM PARAMETERS TO THE COEFFICIENTS
+# AT REGRESSION FACTORS: A LINEAR MAP + A VECTOR OF DEFAULTS
+# ----------------------------------------------------------
+
+# RegresParamMap is the linear part and RegresCoeffDefault the constant part of the
+# affine map from the num_params parameters to the coefficient of every factor of
+# every regression equation (same row layout as RegresValueMap).
+RegresParamMap = matrix (0.0, rows = (num_reg_eqs * num_factors), cols = num_params);
+RegresCoeffDefault = matrix (0.0, rows = (num_reg_eqs * num_factors), cols = 1);
+
+# -----------------------------------------------------------
+# FIRST, AN AFFINE MAP THAT COVERS HIDDEN REPORTS REGRESSIONS
+# -----------------------------------------------------------
+
+for (t in 2 : num_state_terms) {
+ # Observed attributes that roll up into an aggregate:
+ # coefficients (1.0, prm[3*i-1], prm[3*i], prm[3*i+1]), shared across all terms t
+ for (i in 1 : num_observed_attrs) {
+ if (castAsScalar (subtotals_tree [i, 1]) > 0.0) {
+ param_1 = 3 * i - 1;
+ param_2 = 3 * i;
+ param_3 = 3 * i + 1;
+ reg_index = ((t-1) * num_attrs - 1 + i) * num_factors;
+ RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero;
+ RegresParamMap [reg_index + 2, param_1] = 1.0 + zero;
+ RegresParamMap [reg_index + 3, param_2] = 1.0 + zero;
+ RegresParamMap [reg_index + 4, param_3] = 1.0 + zero;
+ } }
+
+ # TOTAL (row 19): coefficients (1.0, prm[1], 0.0, 1.0)
+ # NOTE(review): the value map only fills the factors of row 19 and of rows 21 and up
+ # for t >= 3, so for t == 2 the coefficients set here multiply all-zero factors.
+ reg_index = ((t-1) * num_attrs - 1 + 19) * num_factors;
+ RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero;
+ RegresParamMap [reg_index + 2, 1] = 1.0 + zero; # prm[1]
+ RegresCoeffDefault [reg_index + 4, 1] = 1.0 + zero;
+
+ # Auxiliary attributes with a positive tree entry: fixed coefficients (1.0, 1.0, c, 0.0),
+ # where c stays 0.0 unless the row is 22, 23 or 24
+ for (i in (num_observed_attrs + 1) : num_attrs) {
+ if (castAsScalar (subtotals_tree [i, 1]) > 0.0) {
+ reg_index = ((t-1) * num_attrs - 1 + i) * num_factors;
+ RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero;
+ RegresCoeffDefault [reg_index + 2, 1] = 1.0 + zero;
+ if (i == 22) {
+ RegresCoeffDefault [reg_index + 3, 1] = -4.0 + zero; # "flipping"
+ }
+ if (i == 23) {
+ RegresCoeffDefault [reg_index + 3, 1] = -2.0 + zero; # "quarterly"
+ }
+ if (i == 24) {
+ RegresCoeffDefault [reg_index + 3, 1] = sqrt (2.0) - 2.0 + zero; # "octally"
+} } } }
+
+# -----------------------------------------------------------------------
+# SECOND, AN AFFINE MAP THAT COVERS HIDDEN-TO-OBSERVED REPORTS MATCHING
+# AND/OR REPORT VALUE REGULARIZATION
+# NOTE THAT WE REGULARIZE AUXILIARY ATTRIBUTES BY MATCHING THEM TO ZEROS!
+# -----------------------------------------------------------------------
+
+# Same equation selection as for the factor values of the "observed" slots:
+# both active factors (-y[t] and x[t]) get the fixed coefficient 1.0.
+for (t1 in (num_state_terms + 1) : num_terms) {
+    t2 = t1 - num_state_terms;
+    for (i in 1 : num_attrs) {
+        if ((i <= num_observed_attrs & t2 <= num_known_terms & castAsScalar (disabled_known_values [i, t2]) == 0.0) |
+            (i > num_observed_attrs & castAsScalar (subtotals_tree [i, 1]) > 0.0))
+        {
+            equation_no = (t1 - 1) * num_attrs + i;
+            reg_index = (equation_no - 1) * num_factors;
+            RegresCoeffDefault [reg_index + 1, 1] = 1.0 + zero;
+            RegresCoeffDefault [reg_index + 2, 1] = 1.0 + zero;
+        }
+    }
+}
+
+# -----------------------------------------------------------------------
+# THIRD, AN AFFINE MAP THAT COVERS PARAMETER REGULARIZATION "REGRESSIONS"
+# -----------------------------------------------------------------------
+
+# Each regularization equation has the parameter itself as the coefficient of its
+# 1st factor (-1.0) and a constant as the coefficient of its 2nd factor (1.0).
+reg_index_base = num_terms * num_attrs * num_factors;
+
+# prm[1]: constant 0.0
+param = 1;
+reg_index = reg_index_base + (param - 1) * num_factors;
+RegresParamMap [reg_index + 1, param] = 1.0 + zero;
+RegresCoeffDefault [reg_index + 2, 1 ] = 0.0 + zero;
+
+for (i in 1 : num_observed_attrs) {
+    agg = castAsScalar (subtotals_tree [i, 1]);
+
+    # prm[3*i-1] is regularized (constant 0.0) only when the tree entry is exactly 0.0
+    if (agg == 0.0) {
+        param = 3 * i - 1;
+        reg_index = reg_index_base + (param - 1) * num_factors;
+        RegresParamMap [reg_index + 1, param] = 1.0 + zero;
+        RegresCoeffDefault [reg_index + 2, 1 ] = 0.0 + zero;
+    }
+
+    # prm[3*i] (constant 1.0) and prm[3*i+1] (constant 0.0) for every non-root attribute
+    if (agg >= 0.0) {
+        param = 3 * i;
+        reg_index = reg_index_base + (param - 1) * num_factors;
+        RegresParamMap [reg_index + 1, param] = 1.0 + zero;
+        RegresCoeffDefault [reg_index + 2, 1 ] = 1.0 + zero;
+
+        param = 3 * i + 1;
+        reg_index = reg_index_base + (param - 1) * num_factors;
+        RegresParamMap [reg_index + 1, param] = 1.0 + zero;
+        RegresCoeffDefault [reg_index + 2, 1 ] = 0.0 + zero;
+    }
+}
+
+
+# ----------------------------------------------------------
+# GENERATE A VECTOR OF SCALE MULTIPLIERS, ONE PER REGRESSION
+# ----------------------------------------------------------
+
+# Every regression equation gets the weight global_weight / acceptable_drift^2.
+RegresScaleMult = matrix (1.0, rows = num_reg_eqs, cols = 1);
+
+global_weight = 0.5 + zero;
+
+# Typical magnitudes of the known observed values and of their term-to-term changes
+attribute_size = rowMeans (abs (initial_reports [1:num_observed_attrs, 1:num_known_terms]));
+max_attr_size = max (attribute_size);
+later_known   = initial_reports [1:num_observed_attrs, 2:num_known_terms];
+earlier_known = initial_reports [1:num_observed_attrs, 1:(num_known_terms-1)];
+difference_size = rowMeans (abs (later_known - earlier_known));
+max_diff_size = max (difference_size);
+
+for (i in 1 : num_attrs) {
+    # The two drifts depend only on the attribute, so they are computed once per row
+    if (i <= num_observed_attrs) {
+        ### OBSERVED ATTRIBUTES: scale by relative attribute size
+        attribute_size_i = castAsScalar (attribute_size [i, 1]);
+        scale_factor = sqrt (attribute_size_i / max_attr_size) * 0.999 + 0.001;
+        drift_hidden   = scale_factor * max_attr_size * 0.0005;  # recurrence regressions
+        drift_observed = scale_factor * max_attr_size * 0.001;   # hidden-to-observed matching
+    } else {
+        ### AUXILIARY ATTRIBUTES: no size correction
+        scale_factor = 1.0;
+        drift_hidden   = scale_factor * max_diff_size * 0.0005;  # recurrence regressions
+        drift_observed = scale_factor * max_diff_size * 0.1;     # regularization
+    }
+    for (t in 1 : num_terms) {
+        if (t <= num_state_terms) {
+            acceptable_drift = drift_hidden;
+        } else {
+            acceptable_drift = drift_observed;
+        }
+        regeqn = (t-1) * num_attrs + i;
+        RegresScaleMult [regeqn, 1] = global_weight / (acceptable_drift^2);
+    }
+}
+
+# Parameter regularization equations: prm[1] is held tighter than the rest
+for (i in 1 : num_params) {
+    regeqn = num_terms * num_attrs + i;
+    if (i == 1) {
+        acceptable_drift = 0.01; # 0.005;
+    } else {
+        acceptable_drift = 0.05;
+    }
+    RegresScaleMult [regeqn, 1] = global_weight / (acceptable_drift^2);
+}
+
+# --------------------------------
+# WRITE OUT ALL GENERATED MATRICES
+# --------------------------------
+
+# $1 was read above as the unprocessed reports; $2 ... $8 are the output locations,
+# all written in "text" format.
+write (initial_reports, $2, format="text");
+write (CReps, $3, format="text");
+write (RegresValueMap, $4, format="text");
+write (RegresFactorDefault,$5, format="text");
+write (RegresParamMap, $6, format="text");
+write (RegresCoeffDefault, $7, format="text");
+write (RegresScaleMult, $8, format="text");