Files
lazarus-ccr/applications/lazstats/source/units/regressionunit.pas
2020-10-18 13:52:00 +00:00

381 lines
12 KiB
ObjectPascal

unit RegressionUnit;
{$mode objfpc}{$H+}
{$modeswitch advancedrecords}
interface
uses
Classes, SysUtils,
MatrixUnit;
type
ERegression = class(Exception);
type
TRegressionError = (regOK, regTooFewValues);
TBivariateRegressionResults = record
public
Slope, Intercept: Double;
XMean, YMean: Double;
XVariance, YVariance: Double;
XStdDev, YStdDev: Double;
StdErrorPredicted: Double;
R: Double;
t: Double;
SXX, SXY, SYY: Double;
Count, DF: Integer;
function ConfidenceLimits(x: Double; Upper: Boolean): Double;
end;
procedure BivariateRegression(const xData, yData: TDblVector;
AConfLevel: Double; out AResults: TBivariateRegressionResults);
type
TMultipleRegressionResults = record
private
procedure WriteMatrixReport(AReport: TStrings; AMatrix: TDblMatrix;
ATitle: String; AVarNames: TStringArray);
public
Coeffs: TDblVector; // Coefficients of the factors
CoeffStdErrors: TDblVector; // Standard errors of the coefficients
Beta: TDblVector; // Standardized coefficients
VarCovar: TDblMatrix; // Variance-covariance matrix
Correlations: TDblMatrix; // Correlations matrix
SStotal, SSreg, SSresid: Double; // Sum of squares
NumCases, NumDepVars: Integer; // number of cases, number of dependent variables
R2: Double; // coefficient of determination
AdjR2: Double; // Adjusted coefficient of determination
StdErrorEstimate: Double; // standard error of estimate
t: TDblVector;
p: TDblVector; // Probability for > t
F: Double;
Probability: Double; // Probability for > F
Means: TDblVector;
Variances: TDblVector;
StdDevs: TDblVector;
(*
XMeans: TDblVector; // Means of the x columns
XVariances: TDblVector; // Variances of the x columns
XStdDevs: TDblVector; // Std Deviations of the x columns
*)
MeanY: Double; // Mean of y column
VarianceY: Double; // Variance of y column
StdDevY: Double; // Std Deviation of y column
procedure WriteANOVAReport(AReport: TStrings);
procedure WriteCoeffsReport(AReport: TStrings; AVarNames: TStringArray);
procedure WriteCorrelationReport(AReport: TStrings; AVarNames: TStringArray);
procedure WriteVarCovarReport(AReport: TStrings; AVarNames: TStringArray);
end;
function MultipleRegression(const xData: TDblMatrix; const yData: TDblVector;
AConfLevel: Double; out AResults: TMultipleRegressionResults): TRegressionError;
implementation
uses
StrUtils,
MathUnit;
{===============================================================================
Bivariate regression
Calculates a bivariate regression y = a x + b
x is a vector of independent observations, y of dependent observations.
Both vectors must have the same length.
It is assumed that xData and yData contain at least 3 elements and
have the same count of elements.
===============================================================================}
procedure BivariateRegression(const xData, yData: TDblVector;
AConfLevel: Double; out AResults: TBivariateRegressionResults);
var
i: Integer;
begin
with AResults do
begin
Count := Length(xData);
// Calculate means, variances, stddevs
VecMeanVarStdDevSS(xData, XMean, XVariance, XStdDev, SXX);
VecMeanVarStdDevSS(yData, YMean, YVariance, YStdDev, SYY);
SXY := 0;
for i := 0 to Count-1 do
SXY := SXY + xData[i] * yData[i];
R := (SXY - XMean * YMean * Count) / ((Count - 1) * XStdDev * YStdDev);
StdErrorPredicted := sqrt(1.0 - sqr(R)) * YStdDev * sqrt((Count - 1) / (Count - 2));
Slope := R * YStdDev / XStdDev;
Intercept := YMean - Slope * XMean;
DF := Count - 2;
t := InverseT(AConfLevel, DF);
end;
end;
function TBivariateRegressionResults.ConfidenceLimits(x: Double; Upper: Boolean): Double;
var
yPred, seData: Double;
begin
yPred := Intercept + Slope * x;
seData := StdErrorPredicted * sqrt(1.0 + 1/Count + sqr(x - XMean)/SXX);
if Upper then
Result := yPred + t*seData
else
Result := yPred - t*seData;
end;
{===============================================================================
Multiple regression
===============================================================================}
function MultipleRegression(const xData: TDblMatrix; const yData: TDblVector;
AConfLevel: Double; out AResults: TMultipleRegressionResults): TRegressionError;
var
i, j, nx, mx, ny: Integer;
X, XT, XT_X, inv_XT_X: TDblMatrix;
XT_Y: TDblVector;
SY, SSY: Double;
DFresid: Integer;
begin
MatCheck(xData);
MatSize(xData, nx, mx);
ny := Length(yData);
if nx <> ny then
raise ERegression.Create('Dimension error.');
if nx < 2 then begin
Result := regTooFewValues;
exit;
end;
// Augmented x matrix: append ones vector for intercept
X := MatAppendColVector(xData, VecOnes(ny));
// Transposed augmented matrix
XT := MatTransposed(X);
// Product of augmented matrix with augmented Y
XT_Y := XT * yData;
// Product of transposed augmented with augmented matrix
XT_X := XT * X;
// Product of inverse of XT_X with XT_Y yields the coefficients. Intercept is last.
inv_XT_X := MatInverse(XT_X);
AResults.Coeffs := inv_XT_X * XT_Y;
// THe XT_X matrix contains the sum of squares (diagonal elements), sum of
// cross-products (off-diagonal elements and the sum the variable values
// (augmented column).
SetLength(AResults.Means, mx+1);
SetLength(AResults.Variances, mx+1);
SetLength(AResults.StdDevs, mx+1);
for i := 0 to mx do
begin
AResults.Means[i] := XT_X[i, mx] / nx;
AResults.Variances[i] := (XT_X[i, i] - sqr(AResults.Means[i]) * nx) / (nx - 1);
AResults.StdDevs[i] := sqrt(AResults.Variances[i]);
{
if AResults.StdDevs[i] = 0.0 then
begin
Result := regStdDevZero;
exit;
end;
}
end;
// Variance-covariance matrix
SetLength(AResults.VarCovar, mx, mx);
SetLength(AResults.Correlations, mx, mx);
for i := 0 to mx-1 do
for j := 0 to mx-1 do
begin
AResults.VarCovar[i, j] := (XT_X[i, j] - XT_X[i, mx] * XT_X[j, mx] / nx) / (nx - 1);
AResults.Correlations[i, j] := AResults.VarCovar[i, j] / (AResults.StdDevs[i] * AResults.StdDevs[j]);
end;
with AResults do
begin
NumCases := nx;
NumDepVars := mx;
DFresid := NumCases - NumDepVars - 1;
// Calculate column means of xData
// MatColMeanVarStdDev(X, XMeans, XVariances, XStdDevs);
// Basic stats of y data column
VecSumSS(yData, SY, SSY);
MeanY := SY / NumCases;
SStotal := SSY - sqr(MeanY) * NumCases; // needed for ANOVA
VarianceY := SStotal / (NumCases - 1);
StdDevY := sqrt(VarianceY);
{
if StdDevY = 0 then
begin
Result := regStdDevZero;
exit;
end;
}
// Standardized coefficients
Beta := VecMultiply(Coeffs, StdDevs) * (1.0/StdDevY);
// Get standard errors, squared multiple correlation, tests of significance
SetLength(CoeffStdErrors, mx+1);
SetLength(t, mx+1);
SetLength(p, mx+1);
SSresid := SSY - Coeffs * XT_Y;
SSreg := SStotal - SSresid;
StdErrorEstimate := sqrt(SSresid / DFresid);
R2 := SSreg / SStotal;
AdjR2 := 1.0 - (1.0 - R2) * (NumCases - 1) / DFresid;
F := (SSreg / NumDepVars) / (SSresid / DFresid);
Probability := ProbF(F, NumDepVars, DFresid);
for i := 0 to mx do
begin
CoeffStdErrors[i] := StdErrorEstimate * sqrt(inv_XT_X[i, i]);
t[i] := Coeffs[i] / CoeffStdErrors[i];
p[i] := ProbT(t[i], DFresid);
end;
end;
Result := regOK;
end;
{ Writes the ANOVA of the regression to a "report". }
procedure TMultipleRegressionResults.WriteANOVAReport(AReport: TStrings);
var
DFreg, DFresid, DFtotal: Integer;
begin
DFreg := NumDepVars;
DFtotal := NumCases - 1;
DFresid := DFtotal - DFreg;
AReport.Add('ANOVA');
AReport.Add('------------------------------------------------------------------------------');
AReport.Add('SOURCE DF Sum of Squares Mean Square F Probability');
AReport.Add('---------- --- ---------------- ---------------- ------------ -----------');
AReport.Add('Regression %3d %16.3f %16.3f %12.4f %10.4f', [DFreg, SSreg, SSreg/DFreg, F, Probability]);
AReport.Add('Residual %3d %16.3f %16.3f', [DFresid, SSresid, SSresid/DFresid]);
AReport.Add('Total %3d %16.3f', [DFtotal, SStotal]);
AReport.Add('------------------------------------------------------------------------------');
AReport.Add('');
AReport.Add('R2: %10.4f', [R2]);
AReport.Add('Adjusted R2: %10.4f', [AdjR2]);
AReport.Add('F: %10.2f (with %d and %d degrees of freedom)', [F, DFreg, DFresid]);
AReport.Add('Probability > F: %10.4f', [Probability]);
AReport.Add('Standard Error of Estimate: %10.2f', [StdErrorEstimate]);
end;
{ Writes the determined coefficients and their statistics to a "report".
The names of the independent variables are needed in the array AVarNames. }
procedure TMultipleRegressionResults.WriteCoeffsReport(AReport: TStrings;
AVarNames: TStringArray);
var
varName: String;
i, first, last: Integer;
begin
AReport.Add('COEFFICIENTS');
AReport.Add('------------------------------------------------------------------------------');
AReport.Add(' Variable Unstandardized Std.Error Standardized t p ');
AReport.Add('------------ -------------- ------------ ------------ --------- ---------');
first := 0;
last := High(Coeffs);
for i := first to last do
begin
if i = last then
varName := '(Intercept)'
else
if i <= High(AVarNames) then
varName := AVarNames[i]
else
varName := Format('VAR.%d', [i+1]);
AReport.Add('%12s %14.3f %12.3f %12.3f %9.4f %9.4f',
[varName, Coeffs[i], CoeffStdErrors[i], Beta[i], t[i], p[i]]);
end;
AReport.Add('------------------------------------------------------------------------------');
end;
{ Writes the correlation matrix to a "report".
The names of the independent variables are needed in the array AVarNames. }
procedure TMultipleRegressionResults.WriteCorrelationReport(AReport: TStrings;
AVarNames: TStringArray);
begin
WriteMatrixReport(AReport, Correlations, 'CORRELATION MATRIX', AVarNames);
end;
procedure TMultipleRegressionResults.WriteMatrixReport(AReport: TStrings;
AMatrix: TDblMatrix; ATitle: String; AVarNames: TStringArray);
function GetVarName(i: Integer): String;
begin
if i < High(AVarNames) then
Result := AVarNames[i]
else
Result := Format('VAR.%d', [i+1]);
end;
const
SPACE = ' ';
var
i, j, first, last: Integer;
s: String;
begin
first := 0;
last := High(AMatrix);
AReport.Add(ATitle);
s := DupeString(' ', 12);
for j := first to last do
s := s + SPACE + Format('%12s', [GetVarName(j)]);
AReport.Add(DupeString('-', Length(s)));
AReport.Add(s);
s := DupeString(' ', 12);
for j := first to last do
s := s + SPACE + DupeString('-', 12);
AReport.Add(s);
for i := first to last do
begin
s := Format('%12s', [GetVarName(i)]);
for j := first to last do
s := s + SPACE + Format('%12.3f', [AMatrix[i, j]]);
AReport.Add(s);
end;
AReport.Add(DupeString('-', Length(s)));
end;
{ Writes the variance/covariance matrix to a "report".
The names of the independent variables are needed in the array AVarNames. }
procedure TMultipleRegressionResults.WriteVarCovarReport(AReport: TStrings;
AVarNames: TStringArray);
begin
WriteMatrixReport(AReport, VarCovar, 'VARIANCE-COVARIANCE MATRIX', AVarNames);
end;
end.