/* * *Programmed by Donald E. Ramirez; *Department of Mathematics *University of Virginia *Charlottesville, VA 22904-4137 *USA * *der@virginia.edu *http://www.math.virginia.edu/~der/home.html * *SAS code; *Case Studies of Normal Diagnostics in Regression Using Recovered Errors; *Proceedings of the Interface 2000 Conference; *Version: April 16, 2000; * *Applesauce Data from D. R. Jensen; * *Program computes P-values for tests of normality for oridinary residuals * and recovered errors; * *Columns of X are 1=x0 x1 x2 x3 y1 y2; *NX = 4 *NY = 2 *N = 48 * *infile = applesauce.txt * *This SAS code will run for models with NX = 4 and NY = 2; *The title (*01), N (*02) and infile name (*03) will need to be modified; * *For most problems the scaling matrix D (*04) is the identity matrix *To handle X'X when ill-conditioned, we scale to improved numerical accuracy * *For other dimensions of NX or NY, 26 lines of code (*01) to (*26) need to be modified; * *To reduce the output file, the following proc commands may be commented out; * proc capability (for the q-q plots); * proc autoreg (for the BLUE residuals); * *The P-values of interest are : * *P-Values from the ordinary residuals, * denoted e in paper and y_resid1 y_resid2 in SAS code * 0.4114 and .4568 for this example *P-Values from the unrotated recovered errors from eigenvectors of H, * denoted R_sub_H in paper and qpy1 qpy2 in SAS code * 0.7114 and 0.6919 for this example *P-Values from the unrotated recovered errors from eigenvectors of B, * denoted R_sub_B in paper and ppy1 ppy2 in SAS code * 0.9610 and 0.8022 *P-Values from the rotated recovered errors from eigenvectors of H, * denoted A'_sub_H*Y in paper and rqpy1 rqpy2 in SAS code * 0.2501 and 0.1219 *P-Values from the rotated recovered errors from eigenvectors of B, * denoted A'_sub_B*Y in paper and rppy1 rppy2 in SAS code * 0.4389 and 0.0262 in this example * *For testing joint normality, the we recommend Mardia's multivariate skewness test * */ *01: CHANGE TITLE FOR NEW DATA SET; title "Applesauce Data from D. R. Jensen - Rows have been ranked by the leverages"; *02: CHANGE DATA VALUES FOR NEW DATA SET; data values; N = 48 ; NX = 4 ; NY = 2 ; run; *Warning: the first NX rows in X must be linearly independent to compute BLUS; *The ordering used in applesause.txt does not have the first NX rows in X independent; *Thus the BLUE residuals can not be computed with this ordering of X; *This is a standard problem with the BLUE residuals; *03: CHANGE FILE NAME AND VARIABLE NAMES FOR NEW DATA SET; data in_data; infile '.\applesauce.txt' ; input x0 x1 x2 x3 y1 y2 ; run; proc iml; use values; read all; use in_data; read all into XY; print XY; X = XY(| , 1 : NX |); *04: CHANGE NAMES OF VARIABLES FOR NEW DATA SET; names = {'x0' 'x1' 'x2' 'x3' 'y1' 'y2'}; create data_XY from XY(|colname=names|) ; append from XY; XY_bot = XY(| NX + 1 : N , |); create data_bot from XY_bot(|colname=names|) ; append from XY_bot; XY_top = XY(| 1 : NX , |); create data_top from XY_top(|colname=names|) ; append from XY_top; evals = eigval(inv(X`*X)); print 'Eigvals for inv(X`*X) from the original data'; print evals; *05: IF REQUIRED, SCALE X by XD = X*D TO IMPROVE CALCULATION OF H = X*INV(X'X)*X' ; *USUALLY D = I(N); /* Improve calculations by improving the condition number of X'X; */ D = { 1 0 0 0, 0 .01 0 0, 0 0 1 0, 0 0 0 .01}; XD =X*D; evals_XD = eigval(inv(XD`*XD)); print "Eigvals for inv(XD`*XD) from the scaled data"; print evals_XD; /* End of scaling block; */ H = XD*inv(XD`*XD)*XD`; diag_H = 1:N; do i=1 to N; diag_H[i] = H[i,i]; end; print diag_H; Q = eigvec(H); QpXY = Q`*XY; *06: CHANGE NAMES OF VARIABLES FOR NEW DATA SET; names={'qpx0' 'qpx1' 'qpx2' 'qpx3' 'qpy1' 'qpy2'}; /* create Qp_data from QpXY(|colname=names|); append from QpXY; */ /* Remove the top r = rank_X = NX observations from the eigenvectors of H */ Qp_r = Q`(| NX + 1 : N , |); create Qp_r_ev from Qp_r; append from Qp_r; Qp_rXY = Qp_r*XY; names={'qpx0' 'qpx1' 'qpx2' 'qpx3' 'qpy1' 'qpy2'}; create Qp_r_dat from Qp_rXY(|colname=names|) ; append from Qp_rXY; Idn = I(N); B = Idn - H; P = eigvec(B); /* Remove the bottom r = rank_X observations from the eigenvectors of B */ Pp_r = P`(| 1 : N - NX , |); create Pp_r_ev from Pp_r; append from Pp_r; Pp_rXY = Pp_r*XY; names={'ppx0' 'ppx1' 'ppx2' 'ppx3' 'ppy1' 'ppy2'}; create Pp_r_dat from Pp_rXY(|colname=names|) ; append from Pp_rXY; PpXY = P`*XY; *07: CHANGE NAMES OF VARIABLES FOR NEW DATA SET; names={'ppx0' 'ppx1' 'ppx2' 'ppx3' 'ppy1' 'ppy2'}; /* create P_data from PpXY(|colname=names|); append from PpXY; */ /* end of iml */ proc means data=data_XY; run; /* Normal fit using the singular residuals ; */ /* P-VALUES FROM ORDINARY RESIDUALS; */ *08: CHANGE PROC MODEL FOR NEW DATA SET; proc model data=data_XY ; parm b01 b11 b21 b31 b02 b12 b22 b32 ; y1 = b01*x0 + b11*x1 + b21*x2 + b31*x3 ; y2 = b02*x0 + b12*x1 + b22*x2 + b32*x3 ; fit y1 y2 /normal; *09: CHANGE PROC GLM FOR NEW DATA SET; proc glm data = data_XY; model y1 y2 = x1 x2 x3 ; output out=dat_res r = y_resid1-y_resid2; run; *10: CHANGE PROC CAPABILITY FOR NEW DATA SET; proc capability data = dat_res; qqplot y_resid1 y_resid2; run; /* Normal fit using the residuals after transforming by the eigenvector matrix of H Note: Qp*X = 0 for the observation with nonzero residuals Note: Sample size = N - NX ; */ /* P-VALUES FROM UNROTATED RECOVERED ERRORS FROM EIGENVECTORS OF H, DENOTED R_SUB_H IN PAPER; */ *11: CHANGE PROC MODEL FOR NEW DATA SET; proc model data=Qp_r_dat ; parm b1 b2 ; qpy1 = b1 ; qpy2 = b2 ; fit qpy1 qpy2 /normal; run; *12: CHANGE PROC CAPABILITY FOR NEW DATA SET; proc capability data = Qp_r_dat ; qqplot qpy1 qpy2 ; run; /* Normal fit using the residuals after transforming by the eigenvector matrix of B = I - H Note: Pp*X = 0 for the observation with nonzero residuals Note: Sample size = N - NX */ /* P-VALUES FROM UNROTATED RECOVERED ERRORS FROM EIGENVECTORS OF B, DENOTED R_SUB_B IN PAPER; */ *13: CHANGE PROC MODEL FOR NEW DATA SET; proc model data=Pp_r_dat ; parm b1 b2 ; ppy1 = b1 ; ppy2 = b2 ; fit ppy1 ppy2 /normal; run; *14: CHANGE PROC CAPABILITY FOR NEW DATA SET; proc capability data = Pp_r_dat ; qqplot ppy1 ppy2 ; run; /* Switch the top of XY to the bottom */ data data_rev; set data_bot data_top; run; /* Create data sets to store the BLUS residuals */ data data_blu; set data_XY ; run; data dat_bl_r; set data_rev; run; /* Use autoreg to find Theil's BLUS residuals ; */ /* This model should be similar to the one using Qp */ *15: CHANGE PROC AUTOREG FOR NEW DATA SET (NY times); proc autoreg data=data_blu; model y1 = x1 x2 x3; output out=data_blu blus = y1_blus; run; proc autoreg data=data_blu; model y2 = x1 x2 x3; output out=data_blu blus = y2_blus; run; /* Normal fit for the BLUS residuals ; */ *16: CHANGE PROC DATA FOR NEW DATA SET; proc model data=data_blu ; parm b1 b2; y1_blus = b1 ; y2_blus = b2 ; fit y1_blus y2_blus /normal; run; *17: CHANGE PROC CAPABILITY FOR NEW DATA SET; proc capability data = data_blu ; qqplot y1_blus y2_blus ; run; /* Use autoreg to find Theil's BLUS residuals ; */ /* This model should be similar to the one using Pp */ *18: CHANGE PROC AUTOREG FOR NEW DATA SET (NY TIMES); proc autoreg data=dat_bl_r; model y1 = x1 x2 x3; output out=dat_bl_r blus = y1_bl_r; run; proc autoreg data=dat_bl_r ; model y2 = x1 x2 x3; output out=dat_bl_r blus = y2_bl_r; run; /* Normal fit for the BLUS residuals ; */ *19: CHANGE PROC DATA FOR NEW DATA SET; proc model data=dat_bl_r ; parm b1 b2; y1_bl_r = b1 ; y2_bl_r = b2 ; fit y1_bl_r y2_bl_r /normal; run; *20: CHANGE PROC CAPABILITY FOR NEW DATA SET; proc capability data = dat_bl_r ; qqplot y1_bl_r y2_bl_r ; run; /* Add label for the name of the factors to Qp_r */ /* Array will be N - rank_X : N + 1 */ data Q_in ; format _name_ $varying5. rownum $char2. rowname $char1. ; set Qp_r_ev ; rownum= _n_ ; rowname = 'f'; _name_ = rowname || rownum ; drop rownum rowname ; run ; /* Create a factor pattern matrix */ /* Array will be N - rank_X : N + 2 */ data Q_pat(type = factor); _type_ ='PATTERN'; set Q_in ; run; proc factor data = Q_pat rotate = varimax norm = none target = Q_pat nocorr outstat = Q_varmax ; /* Create the varimax rotated matrix of Qp_r */ data rotate_Q; set Q_varmax; if _type_ = "PATTERN" ; drop _type_ _name_ ; run; proc iml ; use rotate_Q ; read all var _num_ into RQp ; use data_XY ; read all var _num_ into XY ; use values; read all ; RQpXY = RQp*XY; *21: CHANGE NAMES OF VARIABLES FOR NEW DATA SET; names={ 'rqpx0' 'rqpx1' 'rqpx2' 'rqpx3' 'rqpy1' 'rqpy2'}; create RQp_data from RQpXY(|colname=names|); append from RQpXY; /* Normal fit for the varimax rotation using the eigenvectors for H */ /* Note: RQpX = 0 */ /* P-VALUES FROM ROTATED RECOVERED ERRORS FROM EIGENVECTORS OF H, DENOTED A'_SUB_H*Y IN PAPER; */ *22: CHANGE PROC DATA FOR NEW DATA SET; proc model data=RQp_data; parm b1 b2 ; rqpy1 = b1 ; rqpy2 = b2 ; fit rqpy1 rqpy2 /normal; run; *23: CHANGE PROC CAPABILITY FOR NEW DATA SET; proc capability data = RQp_data ; qqplot rqpy1 rqpy2 ; run; /* Add a label for the name of the factors ; */ data P_in ; format _name_ $varying5. rownum $char2. rowname $char1. ; set Pp_r_ev ; rownum= _n_ ; rowname = 'f' ; _name_ = rowname || rownum ; drop rownum rowname ; run ; /* Create a factor pattern matrix ; */ data P_pat(type = factor); _type_ ='PATTERN'; set P_in ; run; proc factor data = P_pat rotate = varimax norm = none target = P_pat nocorr outstat = P_varmax ; run; /* Create the varimax rotated matrix */ data rotate_P; set P_varmax; if _type_ = "PATTERN" ; drop _type_ _name_ ; run; proc iml ; use rotate_p ; read all var _num_ into RPp ; use data_XY ; read all var _num_ into XY ; use values; read all; *24: CHANGE NAMES OF VARIABLES FOR NEW DATA SET; RPpXY = RPp*XY; names={ 'rppx0' 'rppx1' 'rppx2' 'rppx3' 'rppy1' 'rppy2'}; create RPp_data from RPpXY(|colname=names|); append from RPpXY; /* P-VALUES FROM ROTATED RECOVERED ERRORS FROM EIGENVECTORS OF B, DENOTED A'_SUB_B*Y IN PAPER; */ *25: CHANGE PROC DATA FOR NEW DATA SET; proc model data=RPp_data; parm b1 b2 ; rppy1 = b1 ; rppy2 = b2 ; fit rppy1 rppy2 /normal; run; *26: CHANGE PROC CAPABILITY FOR NEW DATA SET; proc capability data = RPp_data ; qqplot rppy1 rppy2 ; run;