LazStats: Massive refactoring of LSMRegUnit re-using general regression routines.

git-svn-id: https://svn.code.sf.net/p/lazarus-ccr/svn@7782 8e941d3f-bd1b-0410-a28a-d453659cc2b4
This commit is contained in:
wp_xxyyzz
2020-10-19 08:24:46 +00:00
parent dfffbcf6f8
commit b93c6936bf
8 changed files with 737 additions and 370 deletions

View File

@ -43,8 +43,10 @@ type
CoeffStdErrors: TDblVector; // Standard errors of the coefficients
Beta: TDblVector; // Standardized coefficients
XT_X: TDblMatrix; // Cross-products matrix
VarCovar: TDblMatrix; // Variance-covariance matrix
Correlations: TDblMatrix; // Correlations matrix
InvIndepCorrs: TDblMatrix; // Inverse of the INDEPENDENT correlation matrix
SStotal, SSreg, SSresid: Double; // Sum of squares
NumCases, NumDepVars: Integer; // number of cases, number of dependent variables
@ -59,24 +61,29 @@ type
Means: TDblVector;
Variances: TDblVector;
StdDevs: TDblVector;
(*
XMeans: TDblVector; // Means of the x columns
XVariances: TDblVector; // Variances of the x columns
XStdDevs: TDblVector; // Std Deviations of the x columns
*)
MeanY: Double; // Mean of y column
VarianceY: Double; // Variance of y column
StdDevY: Double; // Std Deviation of y column
YMean: Double; // Mean of y column
YVariance: Double; // Variance of y column
YStdDev: Double; // Std Deviation of y column
procedure WriteANOVAReport(AReport: TStrings);
procedure WriteCoeffsReport(AReport: TStrings; AVarNames: TStringArray);
procedure WriteCorrelationReport(AReport: TStrings; AVarNames: TStringArray);
procedure WriteCrossProductsReport(AReport: TStrings; AVarNames: TStringArray);
procedure WriteVarCovarReport(AReport: TStrings; AVarNames: TStringArray);
end;
function MultipleRegression(const xData: TDblMatrix; const yData: TDblVector;
AConfLevel: Double; out AResults: TMultipleRegressionResults): TRegressionError;
procedure PredictMR(const xData: TDblMatrix; const yData: TDblVector;
const ARegressionResults: TMultipleRegressionResults;
out zPred, zResid, RawPred, RawResid, StdErrPred, Hi95, Lo95: TDblVector);
implementation
@ -171,6 +178,7 @@ begin
// Product of transposed augmented with augmented matrix
XT_X := XT * X;
AResults.XT_X := Matcopy(XT_X);
// Product of inverse of XT_X with XT_Y yields the coefficients. Intercept is last.
inv_XT_X := MatInverse(XT_X);
@ -187,24 +195,20 @@ begin
AResults.Means[i] := XT_X[i, mx] / nx;
AResults.Variances[i] := (XT_X[i, i] - sqr(AResults.Means[i]) * nx) / (nx - 1);
AResults.StdDevs[i] := sqrt(AResults.Variances[i]);
{
if AResults.StdDevs[i] = 0.0 then
begin
Result := regStdDevZero;
exit;
end;
}
end;
// Variance-covariance matrix
SetLength(AResults.VarCovar, mx, mx);
SetLength(AResults.Correlations, mx, mx);
for i := 0 to mx-1 do
for j := 0 to mx-1 do
begin
AResults.VarCovar[i, j] := (XT_X[i, j] - XT_X[i, mx] * XT_X[j, mx] / nx) / (nx - 1);
AResults.Correlations[i, j] := AResults.VarCovar[i, j] / (AResults.StdDevs[i] * AResults.StdDevs[j]);
end;
with AResults do
begin
SetLength(VarCovar, mx, mx);
SetLength(Correlations, mx, mx);
for i := 0 to mx-1 do
for j := 0 to mx-1 do
begin
VarCovar[i, j] := (XT_X[i, j] - XT_X[i, mx] * XT_X[j, mx] / nx) / (nx - 1);
Correlations[i, j] := VarCovar[i, j] / (StdDevs[i] * StdDevs[j]);
end;
end;
with AResults do
begin
@ -213,24 +217,17 @@ begin
DFresid := NumCases - NumDepVars - 1;
// Calculate column means of xData
// MatColMeanVarStdDev(X, XMeans, XVariances, XStdDevs);
MatColMeanVarStdDev(xData, xMeans, xVariances, xStdDevs);
// Basic stats of y data column
VecSumSS(yData, SY, SSY);
MeanY := SY / NumCases;
SStotal := SSY - sqr(MeanY) * NumCases; // needed for ANOVA
VarianceY := SStotal / (NumCases - 1);
StdDevY := sqrt(VarianceY);
{
if StdDevY = 0 then
begin
Result := regStdDevZero;
exit;
end;
}
yMean := SY / NumCases;
SStotal := SSY - sqr(yMean) * NumCases; // needed for ANOVA
yVariance := SStotal / (NumCases - 1);
yStdDev := sqrt(yVariance);
// Standardized coefficients
Beta := VecMultiply(Coeffs, StdDevs) * (1.0/StdDevY);
Beta := VecMultiply(Coeffs, StdDevs) * (1.0/yStdDev);
// Get standard errors, squared multiple correlation, tests of significance
SetLength(CoeffStdErrors, mx+1);
@ -251,6 +248,8 @@ begin
t[i] := Coeffs[i] / CoeffStdErrors[i];
p[i] := ProbT(t[i], DFresid);
end;
InvIndepCorrs := MatInverse(SubMatrix(Correlations, 0, 0, mx-1, mx-1));
end;
Result := regOK;
@ -266,13 +265,13 @@ begin
DFtotal := NumCases - 1;
DFresid := DFtotal - DFreg;
AReport.Add('ANOVA');
AReport.Add('------------------------------------------------------------------------------');
AReport.Add('SOURCE DF Sum of Squares Mean Square F Probability');
AReport.Add('---------- --- ---------------- ---------------- ------------ -----------');
AReport.Add('Regression %3d %16.3f %16.3f %12.4f %10.4f', [DFreg, SSreg, SSreg/DFreg, F, Probability]);
AReport.Add('Residual %3d %16.3f %16.3f', [DFresid, SSresid, SSresid/DFresid]);
AReport.Add('Total %3d %16.3f', [DFtotal, SStotal]);
AReport.Add('------------------------------------------------------------------------------');
AReport.Add('--------------------------------------------------------------------------');
AReport.Add('SOURCE DF Sum of Squares Mean Square F Probability');
AReport.Add('---------- --- ---------------- ---------------- -------- -----------');
AReport.Add('Regression %4d %16.3f %16.3f %8.3f %10.3f', [DFreg, SSreg, SSreg/DFreg, F, Probability]);
AReport.Add('Residual %4d %16.3f %16.3f', [DFresid, SSresid, SSresid/DFresid]);
AReport.Add('Total %4d %16.3f', [DFtotal, SStotal]);
AReport.Add('--------------------------------------------------------------------------');
AReport.Add('');
AReport.Add('R2: %10.4f', [R2]);
AReport.Add('Adjusted R2: %10.4f', [AdjR2]);
@ -291,9 +290,9 @@ var
i, first, last: Integer;
begin
AReport.Add('COEFFICIENTS');
AReport.Add('------------------------------------------------------------------------------');
AReport.Add(' Variable Unstandardized Std.Error Standardized t p ');
AReport.Add('------------ -------------- ------------ ------------ --------- ---------');
AReport.Add('--------------------------------------------------------------------------');
AReport.Add(' Variable Unstandardized Std.Error Standardized t p ');
AReport.Add('------------ -------------- ------------ ------------ ------- -------');
first := 0;
last := High(Coeffs);
@ -307,10 +306,10 @@ begin
else
varName := Format('VAR.%d', [i+1]);
AReport.Add('%12s %14.3f %12.3f %12.3f %9.4f %9.4f',
AReport.Add('%12s %14.3f %12.3f %12.3f %7.3f %7.3f',
[varName, Coeffs[i], CoeffStdErrors[i], Beta[i], t[i], p[i]]);
end;
AReport.Add('------------------------------------------------------------------------------');
AReport.Add('--------------------------------------------------------------------------');
end;
@ -323,12 +322,26 @@ begin
end;
{ Writes the cross-product matrix to a "report"
The names of the independent variables are neede in the array AVarNames. }
procedure TMultipleRegressionResults.WriteCrossProductsReport(AReport: TStrings;
AVarNames: TStringArray);
var
saved: String;
begin
saved := AVarNames[High(AVarNames)];
AVarNames[High(AVarNames)] := '(Intercept)';
WriteMatrixReport(AReport, XT_X, 'CROSS-PRODUCT MATRIX', AVarNames);
AVarNames[High(AVarNames)] := saved;
end;
procedure TMultipleRegressionResults.WriteMatrixReport(AReport: TStrings;
AMatrix: TDblMatrix; ATitle: String; AVarNames: TStringArray);
function GetVarName(i: Integer): String;
begin
if i < High(AVarNames) then
if i <= High(AVarNames) then
Result := AVarNames[i]
else
Result := Format('VAR.%d', [i+1]);
@ -376,5 +389,90 @@ begin
end;
procedure PredictMR(const xData: TDblMatrix; const yData: TDblVector;
const ARegressionResults: TMultipleRegressionResults;
out zPred, zResid, RawPred, RawResid, StdErrPred, Hi95, Lo95: TDblVector);
var
i, j, k, n, m: Integer;
x, x1, x2, y: Double;
term1, term2, t: Double;
begin
zPred := nil; // to silence the compiler
zResid := nil;
RawPred := nil;
RawResid := nil;
StdErrPred := nil;
Hi95 := nil;
Lo95 := nil;
MatSize(xData, n,m);
// z predicted and zResidual
SetLength(zPred, n);
SetLength(zResid, n);
for i := 0 to n-1 do
begin
zPred[i] := 0;
for j := 0 to m-1 do
begin
x := (xData[i, j] - ARegressionResults.xMeans[j]) / ARegressionResults.xStdDevs[j];
zPred[i] := zPred[i] + ARegressionResults.Beta[j] * x;
end;
y := (yData[i] - ARegressionResults.yMean) / ARegressionResults.yStdDev;
zResid[i] := y - zPred[i];
end;
// Raw predicted and raw residuals
SetLength(RawPred, n);
SetLength(RawResid, n);
for i := 0 to n-1 do
begin
RawPred[i] := ARegressionResults.Coeffs[m]; // intercept
for j := 0 to m-1 do
RawPred[i] := RawPred[i] + ARegressionResults.Coeffs[j] * xData[i, j];
RawResid[i] := yData[i] - RawPred[i];
end;
// Calculate confidence interval for raw predicted score
SetLength(StdErrPred, n);
SetLength(Hi95, n);
SetLength(Lo95, n);
for i := 0 to n-1 do
begin
// Get term1 of the std. err. prediction
term1 := 0.0;
for j := 0 to m-1 do
begin
with ARegressionResults do
begin
x := (xData[i, j] - xMeans[j]) / xStdDevs[j];
term1 := term1 + x * x * InvIndepCorrs[j, j];;
end;
end;
// Get term2 of the std err. of prediction
term2 := 0.0;
for j := 0 to m-1 do
begin
for k := j + 1 to m-1 do
begin
with ARegressionResults do
begin
x1 := (xData[i, j] - xMeans[j]) / xStdDevs[j];
x2 := (xData[i, k] - xMeans[k]) / xStdDevs[k];
term2 := term2 + x1 * x2 * InvIndepCorrs[j, k];
end;
end;
end;
term2 := 2.0 * Term2;
stdErrPred[i] := sqrt(n + 1 + term1 + term2);
stdErrPred[i] := (ARegressionResults.StdErrorEstimate / sqrt(n)) * stdErrPred[i];
t := InverseT(0.975, n - m - 1);
Lo95[i] := RawPred[i] - t*StdErrPred[i];
Hi95[i] := RawPred[i] + t*StdErrPred[i];
end;
end;
end.