unit RegressionUnit; {$mode objfpc}{$H+} {$modeswitch advancedrecords} interface uses Classes, SysUtils, MatrixUnit; type ERegression = class(Exception); type TRegressionError = (regOK, regTooFewValues); TBivariateRegressionResults = record public Slope, Intercept: Double; XMean, YMean: Double; XVariance, YVariance: Double; XStdDev, YStdDev: Double; StdErrorPredicted: Double; R: Double; t: Double; SXX, SXY, SYY: Double; Count, DF: Integer; function ConfidenceLimits(x: Double; Upper: Boolean): Double; end; procedure BivariateRegression(const xData, yData: TDblVector; AConfLevel: Double; out AResults: TBivariateRegressionResults); type TMultipleRegressionResults = record private procedure WriteMatrixReport(AReport: TStrings; AMatrix: TDblMatrix; ATitle: String; AVarNames: TStringArray); public Coeffs: TDblVector; // Coefficients of the factors CoeffStdErrors: TDblVector; // Standard errors of the coefficients Beta: TDblVector; // Standardized coefficients XT_X: TDblMatrix; // Cross-products matrix VarCovar: TDblMatrix; // Variance-covariance matrix Correlations: TDblMatrix; // Correlations matrix InvIndepCorrs: TDblMatrix; // Inverse of the INDEPENDENT correlation matrix SStotal, SSreg, SSresid: Double; // Sum of squares NumCases, NumDepVars: Integer; // number of cases, number of dependent variables R2: Double; // coefficient of determination AdjR2: Double; // Adjusted coefficient of determination StdErrorEstimate: Double; // standard error of estimate t: TDblVector; p: TDblVector; // Probability for > t F: Double; Probability: Double; // Probability for > F Means: TDblVector; Variances: TDblVector; StdDevs: TDblVector; XMeans: TDblVector; // Means of the x columns XVariances: TDblVector; // Variances of the x columns XStdDevs: TDblVector; // Std Deviations of the x columns YMean: Double; // Mean of y column YVariance: Double; // Variance of y column YStdDev: Double; // Std Deviation of y column procedure WriteANOVAReport(AReport: TStrings); procedure WriteCoeffsReport(AReport: TStrings; AVarNames: TStringArray); procedure WriteCorrelationReport(AReport: TStrings; AVarNames: TStringArray); procedure WriteCrossProductsReport(AReport: TStrings; AVarNames: TStringArray); procedure WriteVarCovarReport(AReport: TStrings; AVarNames: TStringArray); end; function MultipleRegression(const xData: TDblMatrix; const yData: TDblVector; AConfLevel: Double; out AResults: TMultipleRegressionResults): TRegressionError; procedure PredictMR(const xData: TDblMatrix; const yData: TDblVector; const ARegressionResults: TMultipleRegressionResults; out zPred, zResid, RawPred, RawResid, StdErrPred, Hi95, Lo95: TDblVector); implementation uses StrUtils, MathUnit; {=============================================================================== Bivariate regression Calculates a bivariate regression y = a x + b x is a vector of independent observations, y of dependent observations. Both vectors must have the same length. It is assumed that xData and yData contain at least 3 elements and have the same count of elements. ===============================================================================} procedure BivariateRegression(const xData, yData: TDblVector; AConfLevel: Double; out AResults: TBivariateRegressionResults); var i: Integer; begin with AResults do begin Count := Length(xData); // Calculate means, variances, stddevs VecMeanVarStdDevSS(xData, XMean, XVariance, XStdDev, SXX); VecMeanVarStdDevSS(yData, YMean, YVariance, YStdDev, SYY); SXY := 0; for i := 0 to Count-1 do SXY := SXY + xData[i] * yData[i]; R := (SXY - XMean * YMean * Count) / ((Count - 1) * XStdDev * YStdDev); StdErrorPredicted := sqrt(1.0 - sqr(R)) * YStdDev * sqrt((Count - 1) / (Count - 2)); Slope := R * YStdDev / XStdDev; Intercept := YMean - Slope * XMean; DF := Count - 2; t := InverseT(AConfLevel, DF); end; end; function TBivariateRegressionResults.ConfidenceLimits(x: Double; Upper: Boolean): Double; var yPred, seData: Double; begin yPred := Intercept + Slope * x; seData := StdErrorPredicted * sqrt(1.0 + 1/Count + sqr(x - XMean)/SXX); if Upper then Result := yPred + t*seData else Result := yPred - t*seData; end; {=============================================================================== Multiple regression ===============================================================================} function MultipleRegression(const xData: TDblMatrix; const yData: TDblVector; AConfLevel: Double; out AResults: TMultipleRegressionResults): TRegressionError; var i, j, nx, mx, ny: Integer; X, XT, XT_X, inv_XT_X: TDblMatrix; XT_Y: TDblVector; SY, SSY: Double; DFresid: Integer; begin MatCheck(xData); MatSize(xData, nx, mx); ny := Length(yData); if nx <> ny then raise ERegression.Create('Dimension error.'); if nx < 2 then begin Result := regTooFewValues; exit; end; // Augmented x matrix: append ones vector for intercept X := MatAppendColVector(xData, VecOnes(ny)); // Transposed augmented matrix XT := MatTransposed(X); // Product of augmented matrix with augmented Y XT_Y := XT * yData; // Product of transposed augmented with augmented matrix XT_X := XT * X; AResults.XT_X := MatCopy(XT_X); // Product of inverse of XT_X with XT_Y yields the coefficients. Intercept is last. inv_XT_X := MatInverse(XT_X); AResults.Coeffs := inv_XT_X * XT_Y; // THe XT_X matrix contains the sum of squares (diagonal elements), sum of // cross-products (off-diagonal elements and the sum of the variable values // (augmented column). SetLength(AResults.Means, mx+1); SetLength(AResults.Variances, mx+1); SetLength(AResults.StdDevs, mx+1); for i := 0 to mx do begin AResults.Means[i] := XT_X[i, mx] / nx; AResults.Variances[i] := (XT_X[i, i] - sqr(AResults.Means[i]) * nx) / (nx - 1); AResults.StdDevs[i] := sqrt(AResults.Variances[i]); end; // Variance-covariance matrix with AResults do begin SetLength(VarCovar, mx, mx); SetLength(Correlations, mx, mx); for i := 0 to mx-1 do for j := 0 to mx-1 do begin VarCovar[i, j] := (XT_X[i, j] - XT_X[i, mx] * XT_X[j, mx] / nx) / (nx - 1); if StdDevs[i] <> 0 then Correlations[i, j] := VarCovar[i, j] / (StdDevs[i] * StdDevs[j]) else Correlations[i, j] := 0.0; end; end; with AResults do begin NumCases := nx; NumDepVars := mx; DFresid := NumCases - NumDepVars - 1; // Calculate column means of xData MatColMeanVarStdDev(xData, xMeans, xVariances, xStdDevs); // Basic stats of y data column VecSumSS(yData, SY, SSY); yMean := SY / NumCases; SStotal := SSY - sqr(yMean) * NumCases; // needed for ANOVA yVariance := SStotal / (NumCases - 1); yStdDev := sqrt(yVariance); // Standardized coefficients Beta := VecMultiply(Coeffs, StdDevs) * (1.0/yStdDev); // Get standard errors, squared multiple correlation, tests of significance SetLength(CoeffStdErrors, mx+1); SetLength(t, mx+1); SetLength(p, mx+1); SSresid := SSY - Coeffs * XT_Y; SSreg := SStotal - SSresid; StdErrorEstimate := sqrt(SSresid / DFresid); R2 := SSreg / SStotal; AdjR2 := 1.0 - (1.0 - R2) * (NumCases - 1) / DFresid; F := (SSreg / NumDepVars) / (SSresid / DFresid); Probability := ProbF(F, NumDepVars, DFresid); for i := 0 to mx do begin CoeffStdErrors[i] := StdErrorEstimate * sqrt(inv_XT_X[i, i]); t[i] := Coeffs[i] / CoeffStdErrors[i]; p[i] := ProbT(t[i], DFresid); end; InvIndepCorrs := MatInverse(SubMatrix(Correlations, 0, 0, mx-1, mx-1)); end; Result := regOK; end; { Writes the ANOVA of the regression to a "report". } procedure TMultipleRegressionResults.WriteANOVAReport(AReport: TStrings); var DFreg, DFresid, DFtotal: Integer; begin DFreg := NumDepVars; DFtotal := NumCases - 1; DFresid := DFtotal - DFreg; AReport.Add('ANOVA'); AReport.Add('--------------------------------------------------------------------------'); AReport.Add('SOURCE DF Sum of Squares Mean Square F Probability'); AReport.Add('---------- --- ---------------- ---------------- -------- -----------'); AReport.Add('Regression %4d %16.3f %16.3f %8.3f %10.3f', [DFreg, SSreg, SSreg/DFreg, F, Probability]); AReport.Add('Residual %4d %16.3f %16.3f', [DFresid, SSresid, SSresid/DFresid]); AReport.Add('Total %4d %16.3f', [DFtotal, SStotal]); AReport.Add('--------------------------------------------------------------------------'); AReport.Add(''); AReport.Add('R2: %10.4f', [R2]); AReport.Add('Adjusted R2: %10.4f', [AdjR2]); AReport.Add('F: %10.2f (with %d and %d degrees of freedom)', [F, DFreg, DFresid]); AReport.Add('Probability > F: %10.4f', [Probability]); AReport.Add('Standard Error of Estimate: %10.2f', [StdErrorEstimate]); end; { Writes the determined coefficients and their statistics to a "report". The names of the independent variables are needed in the array AVarNames. } procedure TMultipleRegressionResults.WriteCoeffsReport(AReport: TStrings; AVarNames: TStringArray); var varName: String; i, first, last: Integer; begin AReport.Add('COEFFICIENTS'); AReport.Add('--------------------------------------------------------------------------'); AReport.Add(' Variable Unstandardized Std.Error Standardized t p '); AReport.Add('------------ -------------- ------------ ------------ ------- -------'); first := 0; last := High(Coeffs); for i := first to last do begin if i = last then varName := '(Intercept)' else if i <= High(AVarNames) then varName := AVarNames[i] else varName := Format('VAR.%d', [i+1]); AReport.Add('%12s %14.3f %12.3f %12.3f %7.3f %7.3f', [varName, Coeffs[i], CoeffStdErrors[i], Beta[i], t[i], p[i]]); end; AReport.Add('--------------------------------------------------------------------------'); end; { Writes the correlation matrix to a "report". The names of the independent variables are needed in the array AVarNames. } procedure TMultipleRegressionResults.WriteCorrelationReport(AReport: TStrings; AVarNames: TStringArray); begin WriteMatrixReport(AReport, Correlations, 'CORRELATION MATRIX', AVarNames); end; { Writes the cross-product matrix to a "report" The names of the independent variables are neede in the array AVarNames. } procedure TMultipleRegressionResults.WriteCrossProductsReport(AReport: TStrings; AVarNames: TStringArray); var saved: String; begin saved := AVarNames[High(AVarNames)]; AVarNames[High(AVarNames)] := '(Intercept)'; WriteMatrixReport(AReport, XT_X, 'CROSS-PRODUCT MATRIX', AVarNames); AVarNames[High(AVarNames)] := saved; end; procedure TMultipleRegressionResults.WriteMatrixReport(AReport: TStrings; AMatrix: TDblMatrix; ATitle: String; AVarNames: TStringArray); function GetVarName(i: Integer): String; begin if i <= High(AVarNames) then Result := AVarNames[i] else Result := Format('VAR.%d', [i+1]); end; const SPACE = ' '; var i, j, first, last: Integer; s: String; begin first := 0; last := High(AMatrix); AReport.Add(ATitle); s := DupeString(' ', 12); for j := first to last do s := s + SPACE + Format('%12s', [GetVarName(j)]); AReport.Add(DupeString('-', Length(s))); AReport.Add(s); s := DupeString(' ', 12); for j := first to last do s := s + SPACE + DupeString('-', 12); AReport.Add(s); for i := first to last do begin s := Format('%12s', [GetVarName(i)]); for j := first to last do s := s + SPACE + Format('%12.3f', [AMatrix[i, j]]); AReport.Add(s); end; AReport.Add(DupeString('-', Length(s))); end; { Writes the variance/covariance matrix to a "report". The names of the independent variables are needed in the array AVarNames. } procedure TMultipleRegressionResults.WriteVarCovarReport(AReport: TStrings; AVarNames: TStringArray); begin WriteMatrixReport(AReport, VarCovar, 'VARIANCE-COVARIANCE MATRIX', AVarNames); end; procedure PredictMR(const xData: TDblMatrix; const yData: TDblVector; const ARegressionResults: TMultipleRegressionResults; out zPred, zResid, RawPred, RawResid, StdErrPred, Hi95, Lo95: TDblVector); var i, j, k, n, m: Integer; x, x1, x2, y: Double; term1, term2, t: Double; begin zPred := nil; // to silence the compiler zResid := nil; RawPred := nil; RawResid := nil; StdErrPred := nil; Hi95 := nil; Lo95 := nil; MatSize(xData, n,m); // z predicted and zResidual SetLength(zPred, n); SetLength(zResid, n); for i := 0 to n-1 do begin zPred[i] := 0; for j := 0 to m-1 do begin x := (xData[i, j] - ARegressionResults.xMeans[j]) / ARegressionResults.xStdDevs[j]; zPred[i] := zPred[i] + ARegressionResults.Beta[j] * x; end; y := (yData[i] - ARegressionResults.yMean) / ARegressionResults.yStdDev; zResid[i] := y - zPred[i]; end; // Raw predicted and raw residuals SetLength(RawPred, n); SetLength(RawResid, n); for i := 0 to n-1 do begin RawPred[i] := ARegressionResults.Coeffs[m]; // intercept for j := 0 to m-1 do RawPred[i] := RawPred[i] + ARegressionResults.Coeffs[j] * xData[i, j]; RawResid[i] := yData[i] - RawPred[i]; end; // Calculate confidence interval for raw predicted score SetLength(StdErrPred, n); SetLength(Hi95, n); SetLength(Lo95, n); for i := 0 to n-1 do begin // Get term1 of the std. err. prediction term1 := 0.0; for j := 0 to m-1 do begin with ARegressionResults do begin x := (xData[i, j] - xMeans[j]) / xStdDevs[j]; term1 := term1 + x * x * InvIndepCorrs[j, j];; end; end; // Get term2 of the std err. of prediction term2 := 0.0; for j := 0 to m-1 do begin for k := j + 1 to m-1 do begin with ARegressionResults do begin x1 := (xData[i, j] - xMeans[j]) / xStdDevs[j]; x2 := (xData[i, k] - xMeans[k]) / xStdDevs[k]; term2 := term2 + x1 * x2 * InvIndepCorrs[j, k]; end; end; end; term2 := 2.0 * Term2; stdErrPred[i] := sqrt(n + 1 + term1 + term2); stdErrPred[i] := (ARegressionResults.StdErrorEstimate / sqrt(n)) * stdErrPred[i]; t := InverseT(0.975, n - m - 1); Lo95[i] := RawPred[i] - t*StdErrPred[i]; Hi95[i] := RawPred[i] + t*StdErrPred[i]; end; end; end.