#!/usr/bin/perl -w package Statistics::Regression; use strict; use warnings; use constant DEBUGGING => 1; our $NAME= "Statistics::Regression"; our $VERSION = '0.05'; our $usage= "Please read the pod for proper usage of Statistics::Regression.pm.\n"; ################################################################ =pod =head1 NAME Regression - weighted linear regression package (line+plane fitting) =head1 SYNOPSIS use Regression; my $reg= Statistics::Regression->new( "debug regression", [ "someX", "someY" ], $constanthandling ); $reg->include( 2.0, [ 3.0, -1.0 ] ); $reg->include( 1.0, [ 5.0, 2.0 ] ); $reg->include( 20.0, [ 31.0, 0.0 ] ); $reg->include( 15.0, [ 11.0, 2.0 ] ); $reg->print(); my $coefs= $reg->theta(); my $coefs_T= $reg->thetaT(); my $rsq= $reg->rsq(); The number of variables is determined from the number of variable names provided. =head1 EXECUTION Until DEBUGGING is turned off by the user, Regression.pm executes with a sample data set to check results. After DEBUGGING is turned off, Regression.pm checks if it has been invoked as an executable by itself. If so, it acts as a filter. It assumes that the first column is Y, other columns are X's. =head1 DESCRIPTION Regression.pm is a multivariate linear regression package. That is, it estimates the c coefficients for a line-fit of the type y= c(0)*x(0) + c(1)*x1 + c(2)*x2 + ... + c(k)*xk given a data set of N observations, each with k independent x variables and one dependent y variable. Naturally, N must be greater than k---and preferably considerably greater. Any reasonable undergraduate statistics book will explain what a regression is. There are three methods for intercept handling: AUTO(-9) automatically include a constant as the first variable. debugged! CHECK(1) confirm that the user has really included a '1' as the first coefficient not debugged ALLOW(-1) do not include a constant, AND allow users to run regressions without a constant not debugged Most of the output is self-explanatory (and demonstrated by print() ). ESig is unusual, though: it is the percent of the y-variable's standard deviation that the coefficient implies to be explained by the x-variable's standard deviation. It is one measure of "economic significance." =head2 EXPORT All methods are exported by default. =head2 Original Algorithm (ALGOL-60): W. M. Gentleman, University of Waterloo, "Basic Description For Large, Sparse Or Weighted Linear Least Squares Problems (Algorithm AS 75)," Applied Statistics (1974) Vol 23; No. 3 =head2 INTERNALS R=Rbar is an upperright triangular matrix, kept in normalized form with implicit 1's on the diagonal. D is a diagonal scaling matrix. These correspond to "standard Regression usage" as X' X = R' D R A backsubsitution routine (in thetacov) allows to invert the R matrix (the inverse is upper-right triangular, too!). Call this matrix H, that is H=R^(-1). (X' X)^(-1) = [(R' D^(1/2)') (D^(1/2) R)]^(-1) = [ R^-1 D^(-1/2) ] [ R^-1 D^(-1/2) ]' =head2 Remark This algorithm is the statistical "standard." Insertion of a new observation can be done one obs at any time (WITH A WEIGHT!), and still only takes a low quadratic time. The storage space requirement is of quadratic order (in the indep variables). A practically infinite number of observations can easily be processed! =head2 Remark The offset count of variables, names, etc., is all a big hack. The original algorithm had vectors beginning at 1, rather than 0, and I was too lazy to translate it properly. So, in order to have the external interface (include, new) begin with the appropriate numbers, I have to add and subtract indices in odd places. Probably somewhat inconsistently, too. =head1 Remark Data Check: To check the output, the above example with 4 observations produces the following SAS output: Sum of Mean Source DF Squares Square F Value Pr > F Model 2 217.40191 108.70096 2.11 0.4380 Error 1 51.59809 51.59809 Corrected Total 3 269.00000 Root MSE 7.18318 R-Square 0.8082 Dependent Mean 9.50000 Adj R-Sq 0.4246 Coeff Var 75.61243 Parameter Standard Variable DF Estimate Error t Value Pr > |t| Intercept 1 0.29503 6.05125 0.05 0.9690 x1 1 0.67227 0.32776 2.05 0.2888 x2 1 1.06878 2.79545 0.38 0.7675 =head1 AUTHOR Naturally, Gentleman invented this algorithm. Adaptation by ivo welch. Alan Miller (alan\@dmsmelb.mel.dms.CSIRO.AU) pointed out nicer ways to compute the R^2. =head1 REVISIONS 09/21/2002 Major additions, sdv's. =head1 SEE ALSO perl(1). =cut ################################################################ use constant TINY => 1e-8; use constant CONSTAUTO => (-9); use constant CONSTCHECK => (1); use constant CONSTALLOW => (0); ################################################################ my $nan= "NaN"; sub isNaN { if ($_[0] !~ /[0-9nan]/) { die "$0:$NAME: definitely not a number in NaN: '$_[0]'"; } return ($_[0]=~ /NaN/i) || ($_[0] != $_[0]); } ################################################################ =pod =head2 new( $name, [ 'varname1', 'varname2', ... , 'varnameN' ], optional:consthandling ) receives the number of variables on each observations (i.e., an integer) and returns the blessed data structure. Also takes an optional name for this regression to remember, as well as a reference to a k*1 array of names for the X coefficients. =cut ################################################################ sub new { my $classname= shift(@_); my $regname= shift(@_) || "with no name"; my $varnames= shift(@_) or die "$0:$NAME: Regression->new needs variable names to determine number of variables. $usage.\n"; (scalar(ref($varnames))eq"ARRAY") or die "$0:$NAME: Variable names must be passed as an array reference. $usage.\n"; my $K= scalar @{ $varnames }; my $allownonconst= shift(@_) || CONSTAUTO; (($allownonconst==(CONSTCHECK))||($allownonconst==(CONSTALLOW))||($allownonconst==(CONSTAUTO))) or die "$0:$NAME: Parameter 4 must be CHECK, ALLOW, AUTO, not $allownonconst\n"; if ($allownonconst==(CONSTAUTO)) { unshift( @{ $varnames }, "constant" ); # push( @{ $record{xnames} }, "y-var" ); ++$K; } ($K>1) or die "$0:$NAME: Cannot run a regression without at least two variables."; my @keys= qw(k regname xnames n sse syy sy wghtn d thetabar rbarsize rbar neverabort theta sigmasq rsq adjrsq sst xpxinv thetaT thetase consthandling xmean xsdv ymean ysdv eqweight); my %record; # use Tie::Hash::FixedKeys; # emphasis on safety of use, not ultimate speed; # tie %record, 'Tie::Hash::FixedKeys', @keys; %record = ( # initialized as such k => $K, regname => $regname, xnames => $varnames, neverabort => 0, # constantly updated: n => 0, sse => 0, syy => 0, sy => 0, wghtn => 0, d => zerovec($K), thetabar => zerovec($K), rbarsize => ($K+1)*$K/2+1, rbar => zerovec(($K+1)*$K/2+1), consthandling => $allownonconst, eqweight => 1 ); sub zerovec { my @rv; for (my $i=0; $i<=$_[0]; ++$i) { $rv[$i]=0; } return \@rv; } clearcomputed( \%record ); return bless \%record, $classname; } ################################################################ # this function is called everytime a new observation is included; sub clearcomputed { my $this= shift(@_); $this->{sst}= $this->{sigmasq}=undef; $this->{rsq}= $this->{adjrsq}= $this->{xpxinv} = undef; $this->{theta}= $this->{thetaT}= $this->{thetase}= undef; $this->{xmean}= $this->{xsdv}= $this->{ysdv}= $this->{ymean}= undef; return $this; } ################################################################ =pod =head2 include( $y, [ $x1, $x2, ... $xn ] , optional: $weight ) receives one new observation. Call is $blessedregr->include( $yvariable, [ $x1, $x2, $x3 ... $xk ], 1.0 ); where 1.0 is an (optional) weight. Note that inclusion with a weight of -1 can be used to delete an observation. =cut ################################################################ sub include { my $this = shift(); my $yelement= shift(); my $xrow= shift(); my $weight= shift() || 1.0; ($weight==1.0) or $this->{eqweight}=0; # omit observations with missing observations; defined($yelement) or die "$0:$NAME: Method Error: yelement is undef!"; defined($xrow) or die "$0:$NAME: Method Error: xrow is undef!"; (!isNaN($yelement)) or return $this->{n}; for (my $i=0; $i< ($this->{k})-(($this->{consthandling}==CONSTAUTO)?1:0); ++$i) { (defined($xrow->[$i])) or die "$0:$NAME: Call Error: xrow [ $i ] for obs $this->{obscnt} is undef"; (!(isNaN($xrow->[$i]))) or return $this->{n}; } my @xcopy= @$xrow; if ($this->{consthandling}==CONSTAUTO) { unshift(@xcopy, "1.0"); } elsif ($this->{consthandling}==CONSTCHECK) { ($xcopy[1]==1.0) or die "$0:$NAME: Call Error: First var is not 1 in @xcopy at obs $this->{obscnt}\n"; } unshift( @xcopy, undef ); # $xcopy[$i]= $xrow->[$i-1]; $this->{syy}+= ($weight*($yelement*$yelement)); $this->{sy}+= ($weight*($yelement)); if ($weight>=0.0) { ++$this->{n}; } else { --$this->{n}; } $this->{wghtn}+= $weight; for (my $i=1; $i<=$this->{k};++$i) { if ($weight==0.0) { return $this->{n}; } if (abs($xcopy[$i])>(TINY)) { my $xi=$xcopy[$i]; my $di=$this->{d}->[$i]; my $dprimei=$di+$weight*($xi*$xi); my $cbar= $di/$dprimei; my $sbar= $weight*$xi/$dprimei; $weight*=($cbar); $this->{d}->[$i]=$dprimei; my $nextr=int( (($i-1)*( (2.0*$this->{k}-$i))/2.0+1) ); if (!($nextr<=$this->{rbarsize}) ) { die "$0:$NAME: Internal Error 2"; } my $xk; for (my $kc=$i+1;$kc<=$this->{k};++$kc) { $xk=$xcopy[$kc]; $xcopy[$kc]=$xk-$xi*$this->{rbar}->[$nextr]; $this->{rbar}->[$nextr]= $cbar * $this->{rbar}->[$nextr]+$sbar*$xk; ++$nextr; } $xk=$yelement; $yelement-= $xi*$this->{thetabar}->[$i]; $this->{thetabar}->[$i]= $cbar*$this->{thetabar}->[$i]+$sbar*$xk; } } $this->{sse}+=$weight*($yelement*$yelement); $this->clearcomputed(); # indicate that intermediate results have become garbage now # emphasis on safety of use, not ultimate speed; return $this; } ################################################################ =pod =head2 theta() estimates and returns the vector of coefficients. =cut ################################################################ sub theta { my $this= shift(); (!defined($this->{theta})) or return $this->{theta}; # we already know it if ($this->{n} < $this->{k}) { return undef; } for (my $i=($this->{k}); $i>=1; --$i) { $this->{theta}->[$i]= $this->{thetabar}->[$i]; my $nextr= int (($i-1)*((2.0*$this->{k}-$i))/2.0+1); if (!($nextr<=$this->{rbarsize})) { die "$0:$NAME: Internal Error 3"; } for (my $kc=$i+1;$kc<=$this->{k};++$kc) { $this->{theta}->[$i]-=($this->{rbar}->[$nextr]*$this->{theta}->[$kc]); ++$nextr; } } my $ref= $this->{theta}; shift(@{$this->{theta}}); # we want to return theta's starting from 0 if (wantarray()) { return @{ $this->{theta} }; } return $this->{theta}; } ################################################################ =pod =head2 thetaT() estimates and returns the T-statistic of the coefficients. updates {thetaT}. In the process, this calls thetacov(), which in turn updates a lot of other information. =cut ################################################################ sub thetaT { my $this= shift(); $this->thetacov(); for (my $i=0; $i<($this->{k}); ++$i) { $this->{thetaT}->[$i] = $this->{theta}->[$i] / $this->{thetase}->[$i]; } if (wantarray()) { return @{ $this->{thetaT} }; } return $this->{thetaT}; } ################################################################ =pod =head2 predict() see Theil, p.122: predict returns the predicted value at xcoordinates x, as well as the standard error of one observation and the standard error of the mean. =cut ################ sub predict { my $this= shift(@_); my @x= @_; # make sure regression is computed! perhaps check for se1 or semean if (!defined($this->{theta})) { $this->theta(); } my $predval=0.0; for (my $j=1; $j<=$this->{k}; ++$j) { $predval+= $x[$j]*$this->{theta}->[$j]; } if (!wantarray()) { return $predval; } if (!defined($this->{xpxinv})) { $this->thetacov(); } my @xtxpxinv; for (my $j=1; $j<=$this->{k}; ++$j) { $xtxpxinv[$j]=0.0; for (my $i=1; $i<=($this->{k}); ++$i) { $xtxpxinv[$j]+= $x[$i]* ($this->{xpxinv}->[$i]->[$j]); } } my $xxpxm1x=0.0; for (my $j=1; $j<=$this->{k}; ++$j) { $xxpxm1x+= $xtxpxinv[$j]*$x[$j]; } # now we have "X* (X' X)^{-1} X*" in xxpxm1 $this->sigmasq(); return ( $predval, sqrt($this->{sigmasq}*$xxpxm1x), sqrt($this->{sigmasq}*(1.0+$xxpxm1x)) ); } ################################################################ =pod =head2 rsq, adjrsq, sigmasq, ymean, sst, k, n These functions provide common auxiliary information. rsq, adjrsq, sigmasq, sst, and ymean have not been checked but are likely correct. The results are stored for later usage, although this is somewhat unnecessary because the computation is so simple anyway. =cut ################################################################ sub rsq { my $this= shift(); return $this->{rsq}= 1.0- $this->{sse} / $this->sst(); } sub adjrsq { my $this= shift(); return $this->{adjrsq}= 1.0- (1.0- $this->rsq())*($this->{n}-1)/($this->{n} - $this->{k}); } sub sigmasq { my $this= shift(); return $this->{sigmasq}= ($this->{n}<=$this->{k}) ? "Inf" : ($this->{sse}/($this->{n} - $this->{k})); } sub ymean { my $this= shift(); return $this->{ymean}= $this->{sy}/$this->{wghtn}; } sub sst { my $this= shift(); return $this->{sst}= ($this->{syy} - $this->{wghtn}*($this->ymean())**2); } sub k { my $this= shift(); return $this->{k}; } sub n { my $this= shift(); return $this->{n}; } ################################################################ =pod =head2 print() prints the regression information. Equally important, it demonstrates how to obtain and use of Regression output. =cut ################################################################ sub print { my $this= $_[0]; if (!defined($this->{theta})) { $this->theta(); } if (!defined($this->{thetase})) { $this->thetaT(); } my $printunivar= (($this->{consthandling}==CONSTAUTO)||($this->{consthandling}==CONSTCHECK))&&($this->{eqweight}); $printunivar and $this->varunis(); # compute the univariate statistics, if it makes sense; print "************************************************************************\n"; printf("'$this->{regname}': R^2=%.1f%%, s^2=%.3f, N=$this->{n}", 100.0*$this->rsq(), $this->{sigmasq}); $printunivar and printf(" Y-MEAN=%.3f,Y-SDV=%.3f", $this->{ymean}, $this->{ysdv}); print "\n"; my $maxnamelength=0; for (my $i=0; $i< $this->{k}-1; ++$i) { if (($this->{xnames}->[$i]) && (length($this->{xnames}->[$i])>$maxnamelength)) { $maxnamelength= length($this->{xnames}->[$i]); } } print "--------------------------------------------------------------------\n"; printf sprintf("%-".(${maxnamelength}+3)."s", "Coefficient")."\t Theta\t S.E.\t T-Stat\t\t"; $printunivar and print "[X-Mean,X-Sdv,EcnSig]"; print "\n"; print "--------------------------------------------------------------------\n"; for (my $i=0; $i< $this->{k}; ++$i) { my $name= ($this->{xnames}->[$i]); $name= substr($name, 0, $maxnamelength); print "$i:$name:\t". sprintf("%7.4f", $this->{theta}->[$i])."\t". sprintf("%7.3f", $this->{thetase}->[$i])."\t". sprintf("%7.2f", $this->{thetaT}->[$i]); $printunivar and printf("\t\t[%6.3f,%6.3f,%3.1f%%]", $this->{xmean}->[$i], $this->{xsdv}->[$i], 100.0*($this->{theta}->[$i]*$this->{xsdv}->[$i])/$this->{ysdv}); print "\n"; } print "************************************************************************\n"; # we should print warnings here if two variables are correlated at >90%. # this can in principle be determined from xpx, which is computed in varunis(). return $this; } ################################################################ =pod =head1 Internal Functions The two main internal functions compute the (X' X)^(-1) (function thetacov()) and (X' X) (function varunis()) matrices. These functions are called in order to assess the statistical and economic significances of the X variables. =head2 thetacov() this routine produces the (X' X)^{-1} matrix. =head3 EXPLANATION The stored variables correspond to "standard gregression usage" as X' X = R' D R A backsubsitution routine (in thetacov) allows to invert the R matrix (the inverse is upper-right triangular, too!). Call this matrix H, that is H=R^(-1). (X' X)^(-1) = [(R' D^(1/2)') (D^(1/2) R)]^(-1) = [ R^-1 D^(-1/2) ] [ R^-1 D^(-1/2) ]' =head3 NOTE This routine could be substantially smartened up. I do not take advantage of the fact that the inverse of R in U is an upper-right triangular matrix with 1s on the diagonal, too, etc. =head3 NOTE This routine computes {xpxinv}, {thetase}, {sigmasq}. =cut ################################################################ sub thetacov { my $this= shift(@_); sub rbr { my $this= shift(@_); return ($_[0] == $_[1]) ? 1 : ($_[0]>$_[1]) ? 0 : $this->{rbar}->[ int( ((($_[0]-1)*(2.0*$this->{k}-$_[0])*0.5+1.0) + $_[1] - 1 - $_[0] )) ]; } my @u; for (my $i=1;$i<=$this->{k};++$i) { for (my $j=1;$j<=$this->{k};++$j) { $u[$i][$j]=0.0; } } for (my $j=$this->{k}; $j>=1; --$j) { $u[$j][$j]= 1.0/($this->rbr($j,$j)); for (my $k=$j-1; $k>=1; --$k) { $u[$k][$j]=0; for (my $i=$k+1; $i<=$j; ++$i) { $u[$k][$j]+= $this->rbr($k,$i)*$u[$i][$j]; } $u[$k][$j]*= (-1.0)/$this->rbr($k,$k); } } # now we have the inverse in u[], let's multiply it by D (-1/2) for (my $i=1;$i<=$this->{k};++$i) { for (my $j=1;$j<=$this->{k};++$j) { if (abs($this->{d}->[$j]){d}->[$j])==0.0) { if ($this->{neverabort}) { $u[$i][$j]= $nan; } else { my_abort("gregr:gregr_thetacov", "cannot compute the theta-covariance matrix for variable j=$j (d[j]=$this->d[$j])"); } } if (++$complaints>=100) { warn "[$0:$NAME: Further THETACOV warnings will be suppressed]\n"; } else { warn "[$0:$NAME: THETACOV Warning $this->d[$j] on $j]\n"; } } else { $u[$i][$j]*= sqrt(1.0/$this->{d}->[$j]); } } } # now we have inverse, multiplied by d^{1/2}, in u $this->sigmasq(); for (my $i=1;$i<=$this->{k}; ++$i) { for (my $j=1;$j<=$this->{k};++$j) { $this->{xpxinv}->[$i][$j]=0.0; for (my $k=1;$k<=$this->{k};++$k) { $this->{xpxinv}->[$i][$j] += $u[$i][$k] * $u[$j][$k]; } } } for (my $i=0; $i<($this->{k}); ++$i) { $this->{thetase}->[$i]= sqrt($this->{xpxinv}->[$i+1][$i+1]*$this->{sigmasq}); } return $this; } ################################################################ =pod =head2 varunis() this routine produces the (X' X) matrix, used to compute means and standard deviations on the X variables. It also computes the y-variable's mean and standard deviation. the regression package has enough information to also compute the means and standard deviations of all variables. This is straightforward if all observations have equal weights and a constant was included---as is the normal case. =cut ################################################################ sub varunis { my $this= shift(@_); ((($this->{consthandling}==CONSTAUTO)||($this->{consthandling}==CONSTCHECK)) && ($this->{eqweight})) or warn "[no means and stddevs computed, because const was not auto-included or obs have unequal weights]\n"; my $xpxsqrtd; # compute r'.sqrt{d} for (my $i=1;$i<=$this->{k};++$i) { for (my $j=1;$j<=$this->{k};++$j) { $xpxsqrtd->[$i]->[$j]+= $this->rbr($i,$j)*sqrt($this->{d}->[$i]); } } my $xpx; # compute [r'sqrt{d}]^2 = x'x for (my $i=1;$i<=$this->{k};++$i) { for (my $j=1;$j<=$this->{k};++$j) { $xpx->[$i]->[$j]=0.0; for (my $k=1;$k<=$this->{k};++$k) { $xpx->[$i]->[$j] += $xpxsqrtd->[$k]->[$j] * $xpxsqrtd->[$k]->[$i]; } } } # in the example, the result is {[4,50,3],[50,1116,29],[3,29,9]} for (my $i=1; $i<=$this->{k}; ++$i) { $this->{xmean}->[$i-1]= $xpx->[$i]->[1]/$this->{n}; $this->{xsdv}->[$i-1]= sqrt(($xpx->[$i]->[$i]-$xpx->[$i]->[1]*$xpx->[$i]->[1]/$this->{n})/($this->{n}-1)); } # now do the same for y $this->{ymean}= $this->{sy}/$this->{wghtn}; $this->{ysdv}= sqrt(($this->{syy}-$this->{ymean}*$this->{ymean}*$this->{n})/($this->{n}-1)); return $this; } ################################################################ =pod =head1 BUGS/PROBLEMS =over 4 =item Perl Problem perl is unaware of IEEE number representations. This makes it a pain to test whether an observation contains any missing variables (coded as 'NaN' in Regression.pm). =item Others I am a novice perl programmer, so this is probably ugly code. However, it does seem to work, and I could not find anything equivalent on cpan. =back =head1 INSTALLATION and DOCUMENTATION Installation consists of moving the file 'Regression.pm' into a subdirectory Statistics of your modules path (e.g., /usr/lib/perl5/site_perl/5.6.0/). The documentation was produced from the module: pod2html -noindex -title "perl weighted least squares regression package" Regression.pm > Regression.html =head1 LICENSE This module is released for free public use under a GPL license. (C) Ivo Welch, 2001. =cut ################################################################ if (DEBUGGING) { package main; print STDERR "\n[Regression.pm] In Debugging (Demonstration) Mode.\n\n"; my $reg= Statistics::Regression->new("debug regression", [ "X1name", "X2name" ] ); $reg->include( 2.0, [ 3.0, -1.0 ] ); $reg->include( 1.0, [ 5.0, 2.0 ] ); $reg->include( 20.0, [ 31.0, 0.0 ] ); $reg->include( 15.0, [ 11.0, 2.0 ] ); $reg->print(); # or: my $coefs= $reg->theta(); print @coefs; print $reg->rsq; my @theta= @{ $reg->theta() }; # theta() returns a reference, so we need to deref print "\nThe theta vector is '@theta'\n"; print STDERR "\nTo use Statistics::Regression.pm as a module, turn off the constant 'DEBUGGING'\n\n"; exit 0; } elsif ($0 =~ /regression.pm/i) { package main; # this program is used on the command line! print STDERR "[Regression.pm] Invoked From Command Line. Acting as Filter.\n"; my $reg; my $lastnumvars; while (<>) { if ($_ =~ /^\#/) { next; } my @fields= split(/\t/, $_); if (!defined($reg)) { $lastnumvars= $#fields; my @names; my $cnt=0; foreach my $f (@fields) { push(@names, sprintf("x%d-var",$cnt++)); } shift(@names); $reg= Statistics::Regression->new("regression", [@names] ); } ($lastnumvars == $#fields) or die "The number of columns in the file is not constant (1+$lastnumvars vs. 1+$#fields).\n"; my $y= shift(@fields); $reg->include( $y, [@fields] ); } print "\n\n"; $reg->print(); # or: my $coefs= $reg->theta(); print @coefs; print $reg->rsq; }