From 791746e7efd8b31f3492f745327d1c3d766a5ae4 Mon Sep 17 00:00:00 2001 From: wp_xxyyzz Date: Sat, 17 Oct 2020 22:42:41 +0000 Subject: [PATCH] LazStats: Massive refactoring of WLSUnit avoiding overwriting of grid data. Fix crash when accessing non-existing cells in DictionaryFrm. git-svn-id: https://svn.code.sf.net/p/lazarus-ccr/svn@7780 8e941d3f-bd1b-0410-a28a-d453659cc2b4 --- applications/lazstats/source/LazStats.lpi | 1 - .../analysis/descriptive/descriptiveunit.pas | 4 +- .../forms/analysis/descriptive/plotxyunit.pas | 4 +- .../descriptive/resistancelineunit.pas | 4 +- .../analysis/descriptive/stemleafunit.pas | 2 +- .../analysis/descriptive/xvsmultyunit.pas | 4 +- .../analysis/multiple_regression/wlsunit.lfm | 56 +- .../analysis/multiple_regression/wlsunit.pas | 618 +++++++++++------- .../lazstats/source/forms/mainunit.pas | 3 + .../lazstats/source/units/gridprocs.pas | 56 +- .../lazstats/source/units/matrixlib.pas | 11 +- .../lazstats/source/units/matrixunit.pas | 241 ++++++- .../lazstats/source/units/regressionunit.pas | 282 +++++++- 13 files changed, 964 insertions(+), 322 deletions(-) diff --git a/applications/lazstats/source/LazStats.lpi b/applications/lazstats/source/LazStats.lpi index 06442da67..af33e2c08 100644 --- a/applications/lazstats/source/LazStats.lpi +++ b/applications/lazstats/source/LazStats.lpi @@ -47,7 +47,6 @@ - diff --git a/applications/lazstats/source/forms/analysis/descriptive/descriptiveunit.pas b/applications/lazstats/source/forms/analysis/descriptive/descriptiveunit.pas index 54fd7463a..92c5110b1 100644 --- a/applications/lazstats/source/forms/analysis/descriptive/descriptiveunit.pas +++ b/applications/lazstats/source/forms/analysis/descriptive/descriptiveunit.pas @@ -146,9 +146,9 @@ begin FColIndex := AColIndex; if Length(FColsSelected) = 0 then - FValues := CollectValues(FDataGrid, AColIndex, FColsSelected) + FValues := CollectVecValues(FDataGrid, AColIndex, FColsSelected) else - FValues := CollectValues(FDataGrid, AColIndex); + FValues := CollectVecValues(FDataGrid, AColIndex); FNumCases := Length(FValues); SortOnX(FValues); diff --git a/applications/lazstats/source/forms/analysis/descriptive/plotxyunit.pas b/applications/lazstats/source/forms/analysis/descriptive/plotxyunit.pas index d92f8c53d..abb02ff4b 100644 --- a/applications/lazstats/source/forms/analysis/descriptive/plotxyunit.pas +++ b/applications/lazstats/source/forms/analysis/descriptive/plotxyunit.pas @@ -267,8 +267,8 @@ begin ColNoSelected[0] := xCol; ColNoSelected[1] := yCol; - XData := CollectValues(ADataGrid, xCol, colNoSelected); - YData := CollectValues(ADataGrid, ycol, colNoSelected); + XData := CollectVecValues(ADataGrid, xCol, colNoSelected); + YData := CollectVecValues(ADataGrid, ycol, colNoSelected); N := Length(XData); if N < 3 then begin diff --git a/applications/lazstats/source/forms/analysis/descriptive/resistancelineunit.pas b/applications/lazstats/source/forms/analysis/descriptive/resistancelineunit.pas index 313dcdec0..73a4daa4a 100644 --- a/applications/lazstats/source/forms/analysis/descriptive/resistancelineunit.pas +++ b/applications/lazstats/source/forms/analysis/descriptive/resistancelineunit.pas @@ -361,8 +361,8 @@ begin ColNoSelected[0] := xCol; ColNoSelected[1] := yCol; - XData := CollectValues(ADataGrid, xCol, colNoSelected); - YData := CollectValues(ADataGrid, ycol, colNoSelected); + XData := CollectVecValues(ADataGrid, xCol, colNoSelected); + YData := CollectVecValues(ADataGrid, ycol, colNoSelected); N := Length(XData); if N < 3 then begin diff --git a/applications/lazstats/source/forms/analysis/descriptive/stemleafunit.pas b/applications/lazstats/source/forms/analysis/descriptive/stemleafunit.pas index 9c77021dd..92457f73e 100644 --- a/applications/lazstats/source/forms/analysis/descriptive/stemleafunit.pas +++ b/applications/lazstats/source/forms/analysis/descriptive/stemleafunit.pas @@ -128,7 +128,7 @@ begin lReport.Add(''); // Stores values of the variable - values := CollectValues(OS3MainFrm.DataGrid, k); + values := CollectVecValues(OS3MainFrm.DataGrid, k); nCases := Length(values); SetLength(valueString, nCases); diff --git a/applications/lazstats/source/forms/analysis/descriptive/xvsmultyunit.pas b/applications/lazstats/source/forms/analysis/descriptive/xvsmultyunit.pas index 77b33f657..c7ef9e486 100644 --- a/applications/lazstats/source/forms/analysis/descriptive/xvsmultyunit.pas +++ b/applications/lazstats/source/forms/analysis/descriptive/xvsmultyunit.pas @@ -119,10 +119,10 @@ begin end; // Extract x and y values - xValues := CollectValues(OS3MainFrm.DataGrid, xCol, selected); + xValues := CollectVecValues(OS3MainFrm.DataGrid, xCol, selected); SetLength(yValues, nY); for i := 0 to nY-1 do - yValues[i] := CollectValues(OS3MainFrm.DataGrid, selected[i], selected); + yValues[i] := CollectVecValues(OS3MainFrm.DataGrid, selected[i], selected); // Make sure that all y columns have the same length N := Length(yValues[0]); diff --git a/applications/lazstats/source/forms/analysis/multiple_regression/wlsunit.lfm b/applications/lazstats/source/forms/analysis/multiple_regression/wlsunit.lfm index 59ab55183..55585fa91 100644 --- a/applications/lazstats/source/forms/analysis/multiple_regression/wlsunit.lfm +++ b/applications/lazstats/source/forms/analysis/multiple_regression/wlsunit.lfm @@ -42,26 +42,24 @@ inherited WLSFrm: TWLSFrm AnchorSideLeft.Control = ParamsPanel AnchorSideBottom.Control = ButtonBevel Left = 0 - Height = 147 - Top = 265 + Height = 131 + Top = 281 Width = 266 Anchors = [akLeft, akBottom] AutoSize = True Caption = 'Options' - ClientHeight = 127 + ClientHeight = 111 ClientWidth = 262 TabOrder = 10 object SaveChk: TCheckBox - AnchorSideLeft.Control = OptionsGroup - AnchorSideTop.Control = OptionsBevel + AnchorSideLeft.Control = WeightChk + AnchorSideTop.Control = OriginChk AnchorSideTop.Side = asrBottom - Left = 12 + Left = 40 Height = 19 - Top = 100 + Top = 42 Width = 180 - BorderSpacing.Left = 12 - BorderSpacing.Top = 8 - BorderSpacing.Bottom = 8 + BorderSpacing.Left = 24 Caption = 'Save estimated weights in grid' TabOrder = 0 end @@ -83,8 +81,9 @@ inherited WLSFrm: TWLSFrm AnchorSideTop.Side = asrBottom Left = 40 Height = 19 - Top = 65 + Top = 84 Width = 119 + BorderSpacing.Bottom = 8 Caption = 'Through the origin' TabOrder = 2 end @@ -104,32 +103,17 @@ inherited WLSFrm: TWLSFrm end object UserWeightsChk: TRadioButton AnchorSideLeft.Control = WeightChk - AnchorSideTop.Control = OriginChk + AnchorSideTop.Control = SaveChk AnchorSideTop.Side = asrBottom Left = 16 Height = 19 - Top = 46 + Top = 65 Width = 156 BorderSpacing.Top = 4 Caption = 'Use weights from column' OnChange = UserWeightsChkChange TabOrder = 4 end - object OptionsBevel: TBevel - AnchorSideLeft.Control = OptionsGroup - AnchorSideTop.Control = Origin2Chk - AnchorSideTop.Side = asrBottom - AnchorSideRight.Control = OptionsGroup - AnchorSideRight.Side = asrBottom - Left = 12 - Height = 8 - Top = 84 - Width = 238 - Anchors = [akTop, akLeft, akRight] - BorderSpacing.Left = 12 - BorderSpacing.Right = 12 - Shape = bsBottomLine - end end object Label1: TLabel[6] AnchorSideLeft.Control = ParamsPanel @@ -169,7 +153,7 @@ inherited WLSFrm: TWLSFrm AnchorSideBottom.Control = WeightVarEdit Left = 161 Height = 15 - Top = 205 + Top = 221 Width = 77 Anchors = [akLeft, akBottom] BorderSpacing.Bottom = 2 @@ -184,7 +168,7 @@ inherited WLSFrm: TWLSFrm AnchorSideRight.Control = DepInBtn AnchorSideBottom.Control = OptionsGroup Left = 0 - Height = 240 + Height = 256 Top = 17 Width = 119 Anchors = [akTop, akLeft, akRight, akBottom] @@ -266,7 +250,7 @@ inherited WLSFrm: TWLSFrm AnchorSideBottom.Control = WeightOutBtn Left = 127 Height = 26 - Top = 201 + Top = 217 Width = 26 Anchors = [akLeft, akBottom] BorderSpacing.Bottom = 4 @@ -283,7 +267,7 @@ inherited WLSFrm: TWLSFrm AnchorSideBottom.Side = asrBottom Left = 127 Height = 26 - Top = 231 + Top = 247 Width = 26 Anchors = [akLeft, akBottom] Images = MainDataModule.ImageList @@ -320,7 +304,7 @@ inherited WLSFrm: TWLSFrm AnchorSideRight.Side = asrBottom AnchorSideBottom.Control = WeightInBtn Left = 161 - Height = 63 + Height = 79 Top = 122 Width = 119 Anchors = [akTop, akLeft, akRight, akBottom] @@ -342,7 +326,7 @@ inherited WLSFrm: TWLSFrm AnchorSideBottom.Side = asrBottom Left = 161 Height = 23 - Top = 222 + Top = 238 Width = 119 Anchors = [akLeft, akRight, akBottom] BorderSpacing.Left = 8 @@ -362,13 +346,13 @@ inherited WLSFrm: TWLSFrm Height = 453 Top = 8 Width = 715 - ActivePage = OLSPage + ActivePage = ResidualsRegPage Align = alClient BorderSpacing.Left = 4 BorderSpacing.Top = 8 BorderSpacing.Right = 8 BorderSpacing.Bottom = 8 - TabIndex = 0 + TabIndex = 1 TabOrder = 2 object OLSPage: TTabSheet Caption = 'OLS Regression' diff --git a/applications/lazstats/source/forms/analysis/multiple_regression/wlsunit.pas b/applications/lazstats/source/forms/analysis/multiple_regression/wlsunit.pas index 743e7371c..4d7bda5b7 100644 --- a/applications/lazstats/source/forms/analysis/multiple_regression/wlsunit.pas +++ b/applications/lazstats/source/forms/analysis/multiple_regression/wlsunit.pas @@ -15,7 +15,6 @@ type { TWLSFrm } TWLSFrm = class(TBasicStatsParamsForm) - OptionsBevel: TBevel; DepInBtn: TBitBtn; DepOutBtn: TBitBtn; IndInBtn: TBitBtn; @@ -58,34 +57,47 @@ type ResidualsRegReportFrame: TReportFrame; WLSReportFrame: TReportFrame; - procedure AddPredictedStuffToGrid(AIndepCols: IntDyneVec; ANumIndepCols: Integer; - BWeights: DblDyneVec); + procedure AddVariable(AVarName: String; AData: DblDyneVec; ANumFormat: String); + + procedure AddWeightsToGrid(const ASqrPredictedResiduals, AWeights: DblDyneVec); + + procedure CalcWeights(xValues: DblDyneMat; ACoeffs: DblDyneVec; + out ASquaredPredictedResiduals: DblDyneVec; out AWeights: DblDyneVec); procedure CreateOrGetChartFrame(AColIndex: Integer; AVarName: String; out AMemo: TMemo; out AChartFrame: TChartFrame); function GetPageCaption(AVarName: String): String; - procedure PlotSquaredResiduals(AIndepCols: IntDyneVec; - ANumIndepCols, ADepCol: Integer; AConfLevel: Double); + procedure PlotSquaredResiduals(AIndepCols: IntDyneVec; ADepCol: Integer; + const AIndepValues: DblDyneMat; const ADepValues: DblDyneVec); procedure PlotXY(AChartFrame: TChartFrame; const XPoints, YPoints: DblDyneVec; const ARegressionResults: TBivariateRegressionResults; const XLabel, YLabel: String); - procedure PredictIt(ColNoSelected: IntDyneVec; NoVars: integer; - Means, StdDevs, BetaWeights: DblDyneVec; - StdErrEst: double; NoIndepVars: integer); + procedure Predict(const xData: DblDyneMat; const yData: DblDyneVec; + ARegressionResults: TMultipleRegressionResults); - function PrepareData(out ADepCol, ANumIndepCols: Integer; - out AIndepCols: IntDyneVec; out AWeightCol: Integer; - out ARowLabels: StrDyneVec): Boolean; + function PrepareData(out AIndepCols: IntDyneVec; out ADepCol: Integer; + out AWeightCol: Integer; out ARowLabels: StrDyneVec; + out xValues: DblDyneMat; out yValues: DblDyneVec): Boolean; - function Process_OLSRegression(AIndepCols: IntDyneVec; ANumIndepCols, ADepCol: Integer; - ARowLabels: StrDyneVec; ANumCases: Integer; PrintAll: Boolean): Boolean; + function Process_OLSRegression(AIndepCols: IntDyneVec; ADepCol: Integer; + const ARowLabels: StrDyneVec; const xValues: DblDyneMat; + const yValues: DblDyneVec): Boolean; function Process_SquaredResidualsRegression(AIndepCols: IntDyneVec; - ANumIndepCols, ADepCol: Integer; ARowLabels: StrDyneVec; - BWeights: DblDyneVec; ANumCases: Integer; PrintAll: Boolean): Boolean; + const ARowLabels: StrDyneVec; const xValues: DblDyneMat; + out AWeights: DblDyneVec): Boolean; + + function Process_WeightedRegression(AIndepCols: IntDyneVec; + const ARowLabels: StrDyneVec; const xValues: DblDyneMat; + const yValues: DblDyneVec; const AWeights: DblDyneVec; + SubtractMeans: Boolean): Boolean; + + function RegressionAndReport(const ARowLabels: StrDyneVec; + const xValues: DblDyneMat; const yValues: DblDyneVec; + out ARegressionResults: TMultipleRegressionResults; AReport: TStrings): Boolean; procedure WriteDescriptiveReport(AMemo: TMemo; const ARegressionResults: TBivariateRegressionResults; @@ -116,6 +128,9 @@ uses TAChartUtils, TAChartAxisUtils, TALegend, TASources, TACustomSeries, Utils, MatrixUnit, GridProcs; +const + CONF_LEVEL = DEFAULT_CONFIDENCE_LEVEL_PERCENT / 100.0; + { TWLSFrm } constructor TWLSFrm.Create(AOwner: TComponent); @@ -152,50 +167,70 @@ begin WLSReportFrame.BorderSpacing.Bottom := 0; WLSReportFrame.BorderSpacing.Right := 0; InitToolbar(WLSReportFrame.ReportToolbar, tpRight); + + PageControl.ActivePageIndex := 0; end; -{ Get predicted squared residuals and save recipricols to grid as weights } -procedure TWLSFrm.AddPredictedStuffToGrid(AIndepCols: IntDyneVec; - ANumIndepCols: Integer; BWeights: DblDyneVec); +{ Adds a new variable names AColTitle after the last grid column, + and writes the specified data to the grid (in the specified number format). } +procedure TWLSFrm.AddVariable(AVarName: String; AData: DblDyneVec; ANumFormat: String); var - col: Integer; - i, j: Integer; - X, predicted: Double; + i, colIndex: Integer; begin - col := NoVariables + 1; - OS3MainFrm.NoVarsEdit.Text := IntToStr(NoVariables); - DictionaryFrm.NewVar(col); - DictionaryFrm.DictGrid.Cells[1, col] := 'PredResid2'; - OS3MainFrm.DataGrid.Cells[col, 0] := 'PredResid2'; - - col := NoVariables + 1; - OS3MainFrm.NoVarsEdit.Text := IntToStr(NoVariables); - DictionaryFrm.NewVar(col); - DictionaryFrm.DictGrid.Cells[1, col] := 'WEIGHT'; - OS3MainFrm.DataGrid.Cells[col, 0] := 'WEIGHT'; - OS3MainFrm.NoVarsEdit.Text := IntToStr(NoVariables); - - for i := 1 to NoCases do + colIndex := GetVariableIndex(OS3MainFrm.DataGrid, AVarname); + if colIndex = -1 then begin - if (DataProcs.ValidValue(i, col-2)) then // do we have a valid squared OLS residual? - begin - predicted := 0.0; - for j := 0 to ANumIndepCols - 1 do - begin - X := StrToFloat(OS3MainFrm.DataGrid.Cells[AIndepCols[j], i]); - predicted := predicted + BWeights[j] * X; - end; - predicted := predicted + BWeights[ANumIndepCols]; - predicted := abs(predicted); - OS3MainFrm.DataGrid.Cells[col-1, i] := Format('%.3f', [predicted]); - if (predicted > 0.0) then - predicted := 1.0 / sqrt(predicted) - else - predicted := 0.0; - OS3MainFrm.DataGrid.Cells[col, i] := Format('%.3f', [predicted]); - end; + colIndex := NoVariables + 1; + DictionaryFrm.NewVar(colIndex); + DictionaryFrm.DictGrid.Cells[1, colIndex] := AVarName; + DictionaryFrm.DictGrid.Cells[7, colIndex] := 'R'; + OS3MainFrm.DataGrid.Cells[colIndex, 0] := AVarName; + OS3MainFrm.NoVarsEdit.Text := IntToStr(NoVariables); end; + for i := 0 to High(AData) do + OS3MainFrm.DataGrid.Cells[colIndex, i+1] := Format(ANumFormat, [AData[i]]); +end; + + +{ Calculate predicted squared residuals and save recipricols to grid as weights } +procedure TWLSFrm.AddWeightsToGrid(const ASqrPredictedResiduals, AWeights: DblDyneVec); +begin + // Create new variables and add to grid + AddVariable('Pred SqrResid', ASqrPredictedResiduals, '%.3f'); + AddVariable('WEIGHTS', AWeights, '%.3f'); +end; + + +{ Calculate predicted values of the squared residuals, as well as the weights } +procedure TWLSFrm.CalcWeights(xValues: DblDyneMat; ACoeffs: DblDyneVec; + out ASquaredPredictedResiduals: DblDyneVec; out AWeights: DblDyneVec); +var + i, j, n, m: Integer; + sum: Double; +begin + ASquaredPredictedResiduals := nil; + AWeights := nil; + + MatSize(xValues, n,m); + SetLength(ASquaredPredictedResiduals, n); + SetLength(AWeights, n); + + sum := 0; + for i := 0 to n-1 do + begin + ASquaredPredictedResiduals[i] := ACoeffs[m]; // intercept value + for j := 0 to m-1 do + ASquaredPredictedResiduals[i] += abs(xValues[i, j] * ACoeffs[j]); + if ASquaredPredictedResiduals[i] <> 0 then + AWeights[i] := 1 / ASquaredPredictedResiduals[i] + else + AWeights[i] := 0; + sum := sum + AWeights[i]; + end; + + // Normalize weights to 1.0 + AWeights := AWeights * (1.0 / sum); end; @@ -212,9 +247,7 @@ end; procedure TWLSFrm.Compute; var - i, j, noIndep, depCol, weightCol, oldDepCol, NCases, pos, col: integer; - IndepCols: IntDyneVec = nil; - RowLabels: StrDyneVec = nil; + i, j, noIndep, NCases, pos, col: integer; X, weight: double; Means: DblDyneVec = nil; Variances: DblDyneVec = nil; @@ -229,6 +262,15 @@ var R2: Double = 0.0; errorcode: Boolean = false; PrintDesc: boolean = true; + + indepCols: IntDyneVec = nil; + rowLabels: StrDyneVec = nil; + weights: DblDyneVec = nil; + xValues: DblDyneMat = nil; + yValues: DblDyneVec = nil; + depCol: Integer; + weightCol: Integer = -1; + useOrigin: Boolean; begin SetLength(Means, NoVariables + 2); SetLength(Variances, NoVariables + 2); @@ -244,31 +286,38 @@ begin NCases := NoCases; // Get column indexes and do some validation checks. - // NOTE that the Length(indepCols) is different from NoIndep. - if not PrepareData(depCol, noIndep, indepCols, weightCol, RowLabels) then + if not PrepareData(indepCols, depCol, weightCol, RowLabels, xValues, yValues) then exit; - // Save dependent column so we can re-use DepCol - oldDepCol := depCol; + // Do the OLS regression + if not Process_OLSRegression(indepCols, depCol, RowLabels, xValues, yValues) then + exit; - // *** Get OLS regression *** - Process_OLSRegression(indepCols, noIndep, depCol, RowLabels, nCases, printDesc); + // Regress the squared residuals on the predictors + if WeightChk.Checked then + begin + if not Process_SquaredResidualsRegression(indepCols, RowLabels, xValues, weights) then + exit; + useOrigin := OriginChk.Checked; + end else + begin + // Read the weights from the user column + weights := CollectVecValues(OS3MainFrm.DataGrid, weightCol, indepCols); + useOrigin := Origin2Chk.Checked; + end; + + // Do the weighted regression, finally + Process_WeightedRegression(indepCols, RowLabels, xValues, yValues, weights, useOrigin); + + + exit; - // *** Regress the squared residuals on the predictors *** - depCol := NoVariables; - Process_SquaredResidualsRegression(indepCols, noIndep, depCol, RowLabels, - BWeights, nCases, printDesc); - if SaveChk.Checked then //WeightChk.Checked then - AddPredictedStuffToGrid(indepCols, noIndep, BWeights); - // *** Display squared residuals for each independent variable *** - // NOTE: depCol points to the squared residuals column here - PlotSquaredResiduals(IndepCols, NoIndep, depCol, 0.95); if WeightChk.Checked then begin // Weight variables and do OLS regression on weighted variables - DepCol := olddepcol; +// DepCol := olddepcol; IndepCols[Noindep] := DepCol; for i := 1 to NoCases do begin @@ -326,7 +375,7 @@ begin if UserWeightsChk.Checked then begin // Weight variables and do OLS regression on weighted variables - depCol := olddepcol; +// depCol := olddepcol; indepCols[Noindep] := depCol; // wp: CALCULATION SHOULD NORMALIZE USER WEIGHTS HERE !!! for i := 1 to NoCases do begin @@ -536,7 +585,45 @@ end; procedure TWLSFrm.PlotSquaredResiduals(AIndepCols: IntDyneVec; - ANumIndepCols, ADepCol: Integer; AConfLevel: Double); + ADepCol: Integer; const AIndepValues: DblDyneMat; const ADepValues: DblDyneVec); +var + x, y: DblDyneVec; + i, xCol, yCol: Integer; + regressionRes: TBivariateRegressionResults; + memo: TMemo; + chartFrame: TChartFrame; + xLabel, yLabel: String; + numIndepCols: Integer; +begin + // We will plot the selected vector of the independent values vertically, + // and the dependent values horizontally. + + xCol := ADepCol; + x := VecCopy(ADepValues); + xLabel := OS3MainFrm.DataGrid.Cells[xCol, 0]; + numIndepCols := Length(AIndepCols); + + for i := 0 to numIndepCols-1 do + begin + yCol := AIndepCols[i]; + yLabel := OS3MainFrm.DataGrid.Cells[yCol, 0]; + y := MatColVector(AIndepValues, yCol-1); + SortOnX(x, y); + + // Regression + BivariateRegression(x, y, CONF_LEVEL, regressionRes); + + // Create tab with chart and report controls + CreateOrGetChartFrame(yCol-1, yLabel, memo, chartFrame); // -1 because yCol i is in grid units + + // Plot + PlotXY(chartFrame, x, y, regressionRes, xLabel, yLabel); + + // Print the descriptive statistics + WriteDescriptiveReport(memo, regressionRes, xLabel, yLabel); + end; +end; +(* var xCol, yCol: Integer; xLabel, yLabel: String; @@ -559,8 +646,8 @@ begin colNoSelected[1] := yCol; xLabel := OS3MainFrm.DataGrid.Cells[xCol, 0]; yLabel := OS3MainFrm.DataGrid.Cells[yCol, 0]; - xPoints := CollectValues(OS3MainFrm.DataGrid, xCol, colNoSelected); - yPoints := CollectValues(OS3MainFrm.DataGrid, yCol, colNoSelected); + xPoints := CollectVecValues(OS3MainFrm.DataGrid, xCol, colNoSelected); + yPoints := CollectVecValues(OS3MainFrm.DataGrid, yCol, colNoSelected); SortOnX(xPoints, yPoints); // Regression @@ -576,7 +663,7 @@ begin WriteDescriptiveReport(memo, regressionRes, xLabel, yLabel); end; end; - + *) procedure TWLSFrm.PlotXY(AChartFrame: TChartFrame; const XPoints, YPoints: DblDyneVec; const ARegressionResults: TBivariateRegressionResults; const XLabel, YLabel: String); @@ -643,52 +730,42 @@ begin end; end; - { Routine obtains predicted raw and standardized scores and their residuals. It is assumed that the dependent variable is last in the list of variable column pointers stored in the ColNoSelected vector. Get the z predicted score and its residual } -procedure TWLSFrm.PredictIt(ColNoSelected: IntDyneVec; NoVars: integer; - Means, StdDevs, BetaWeights: DblDyneVec; StdErrEst: double; NoIndepVars: integer); +procedure TWLSFrm.Predict(const xData: DblDyneMat; const yData: DblDyneVec; + ARegressionResults: TMultipleRegressionResults); var - col1, col2, col3, col4, col5, i, j, k, Index: integer; - predicted, zpredicted, z1, z2, resid, residsqr: double; - colName: String; + means, stddevs, variances: DblDyneVec; + i, j, n, m: Integer; + zPred: DblDyneVec = nil; +// zResid: DblDyneVec = nil; + rawPred: DblDyneVec = nil; + rawResid: DblDyneVec = nil; + sqrResid: DblDyneVec = nil; begin - colName := 'Pred.z.'; - col1 := GetVariableIndex(OS3MainFrm.DataGrid, colName); - if col1 = -1 then - begin - col1 := NoVariables + 1; - OS3MainFrm.NoVarsEdit.Text := IntToStr(NoVariables); - DictionaryFrm.DictGrid.ColCount := 8; - DictionaryFrm.NewVar(col1); - OS3MainFrm.DataGrid.Cells[col1, 0] := colName; - DictionaryFrm.DictGrid.Cells[1, col1] := colName; - end; + MatSize(xData, n, m); + MatColMeanVarStdDev(xData, means, variances, stddevs); - colName := 'z Resid.'; - col2 := GetVariableIndex(OS3MainFrm.DataGrid, colName); - if col2 = -1 then - begin - col2 := NoVariables + 1; - DictionaryFrm.NewVar(col2); - OS3MainFrm.NoVarsEdit.Text := IntToStr(NoVariables); - OS3MainFrm.DataGrid.Cells[col2, 0] := colName; - DictionaryFrm.DictGrid.Cells[1, col2] := colName; - end; + SetLength(zPred, n); +// SetLength(zResid, n); + SetLength(rawPred, n); + SetLength(rawResid, n); + SetLength(sqrResid, n); - for i := 1 to NoCases do + for i := 0 to n-1 do begin - zpredicted := 0.0; - for j := 0 to NoIndepVars - 1 do - begin - k := ColNoSelected[j]; - z1 := (StrToFloat(OS3MainFrm.DataGrid.Cells[k,i]) - Means[j]) / StdDevs[j]; - zpredicted := zpredicted + (z1 * BetaWeights[j]); - end; - OS3MainFrm.DataGrid.Cells[col1, i] := Format('%.4f',[zpredicted]); + zPred[i] := 0; + for j := 0 to m-1 do + zPred[i] := zPred[i] + (xData[i, j] - means[j]) / stdDevs[j] * ARegressionResults.Beta[j]; + { + zResid[i] := (yData[i] - ARegressionResults.MeanY) / ARegressionResults.StdDevY; + + w: THIS IS NOT CORRECT. Remove above line because it is not needed. + + This is the code used by the original routine if StdDevs[NoVars-1] <> 0.0 then begin Index := ColNoSelected[NoVars-1]; @@ -696,79 +773,50 @@ begin z2 := (z2 - Means[NoVars-1]) / StdDevs[NoVars-1]; // z score OS3MainFrm.DataGrid.Cells[col2, i] := Format('%.4f',[z2 - zpredicted]); // z residual end; + } + + rawPred[i] := ARegressionResults.Coeffs[m]; // intercept + for j := 0 to m-1 do + rawPred[i] := rawPred[i] + ARegressionResults.Coeffs[j] * xData[i, j]; + + rawResid[i] := rawPred[i] - yData[i]; + sqrResid[i] := sqr(rawResid[i]); end; - // Get raw predicted and residuals - colName := 'Pred.Raw'; - col3 := GetVariableIndex(OS3MainFrm.DataGrid, colName); - if col3 = -1 then - begin - col3 := NoVariables + 1; - OS3MainFrm.NoVarsEdit.Text := IntToStr(NoVariables); - DictionaryFrm.NewVar(col3); - DictionaryFrm.DictGrid.Cells[1, col3] := colName; - OS3MainFrm.DataGrid.Cells[col3, 0] := colName; - OS3MainFrm.DataGrid.ColCount := OS3MainFrm.DataGrid.ColCount + 1; - end; - - // calculate raw predicted scores and store in DataGrid at col - for i := 1 to NoCases do - begin // predicted raw obtained from previously predicted z score - predicted := StrToFloat(OS3MainFrm.DataGrid.Cells[col1, i]) * StdDevs[NoVars-1] + Means[NoVars-1]; - OS3MainFrm.DataGrid.Cells[col3, i] := Format('%.3f',[predicted]); - end; - - // Calculate residuals of predicted raw scores end; - colName := 'Raw Resid.'; - col4 := GetVariableIndex(OS3MainFrm.DataGrid, colName); - if col4 = -1 then - begin - col4 := NoVariables + 1; - OS3MainFrm.NoVarsEdit.Text := IntToStr(NoVariables); - DictionaryFrm.NewVar(col4); - DictionaryFrm.DictGrid.Cells[1, col4] := colName; - OS3MainFrm.DataGrid.Cells[col4, 0] := colName; - OS3MainFrm.DataGrid.ColCount := OS3MainFrm.DataGrid.ColCount + 1; - end; - for i := 1 to NoCases do - begin - Index := ColNoSelected[NoVars-1]; - resid := StrToFloat(OS3MainFrm.DataGrid.Cells[col3, i]) - StrToFloat(OS3MainFrm.DataGrid.Cells[Index, i]); - OS3MainFrm.DataGrid.Cells[col4, i] := Format('%.3f', [resid]); - end; - - // get square of raw residuals - colName := 'Resid Sqr'; - col5 := GetVariableIndex(OS3MainFrm.DataGrid, colName); - if col5 = -1 then - begin - col5 := NoVariables + 1; - DictionaryFrm.NewVar(col5); - OS3MainFrm.NoVarsEdit.Text := IntToStr(NoVariables); - DictionaryFrm.DictGrid.Cells[1, col5] := colName; - OS3MainFrm.DataGrid.Cells[col5, 0] := colName; - OS3MainFrm.DataGrid.ColCount := OS3MainFrm.DataGrid.ColCount + 1; - end; - for i := 1 to NoCases do - begin - residsqr := StrToFloat(OS3MainFrm.DataGrid.Cells[col4,i]); - residsqr := residsqr * residsqr; - OS3MainFrm.DataGrid.Cells[col5,i] := Format('%.3f',[residsqr]); - end; + AddVariable('z Pred', zPred, '%.4f'); +// AddGridColumn('z Resid', zResid, '%.4f'); + AddVariable('Raw Pred', rawPred, '%.3f'); + AddVariable('Raw Resid', rawResid, '%.3f'); + AddVariable('Sqr Resid', sqrResid, '%.3f'); end; -function TWLSFrm.PrepareData(out ADepCol, ANumIndepCols: Integer; - out AIndepCols: IntDyneVec; out AWeightCol: Integer; - out ARowLabels: StrDyneVec): Boolean; +{ Prepares the data for the analysis by extracting all needed data from the + grid: + - AIndepCols: integer array containing the grid column indexes of the + independent variables to be used + - ADepCol: grid column index of the dependent variable to be used + - AWeightCol: optional grid column index of the weight data to be used + - ARowLabels: string array containing the names of the independent variables + as well of the dependent variable (last) + - xValues: matrix with all independent values. The columns of the matrix + correspond to the variables, the row correspond to the cases. + - yValues: vector with the dependent variable values +} +function TWLSFrm.PrepareData(out AIndepCols: IntDyneVec; out ADepCol: Integer; + out AWeightCol: Integer; out ARowLabels: StrDyneVec; + out xValues: DblDyneMat; out yValues: DblDyneVec): Boolean; var i: Integer; msg: String; C: TWinControl; + numIndepCols: Integer; begin Result := false; AIndepCols := nil; ARowLabels := nil; + xValues := nil; + yvalues := nil; if not Validate(msg, C) then begin @@ -777,18 +825,14 @@ begin exit; end; - ANumIndepCols := IndVarList.Items.Count; + numIndepCols := IndVarList.Items.Count; ADepCol := GetVariableIndex(OS3MainFrm.DataGrid, DepVarEdit.Text); AWeightCol := GetVariableIndex(OS3MainFrm.DataGrid, WeightVarEdit.Text); - // The IndepCols store also other variables, in addition to the "real" - // independent variables. Until I know how this works, this array must be - // over-dimensions. - // ARowLabels alike. - SetLength(AIndepCols, NoVariables + 2); - SetLength(ARowLabels, NoVariables); + SetLength(AIndepCols, numIndepCols); + SetLength(ARowLabels, numIndepCols + 1); // +1 to add independent column label - for i := 0 to ANumIndepCols-1 do + for i := 0 to numIndepCols-1 do begin AIndepCols[i] := GetVariableIndex(OS3MainFrm.DataGrid, IndVarList.Items[i]); if AIndepCols[i] = -1 then @@ -798,9 +842,7 @@ begin end; ARowLabels[i] := IndVarList.Items[i]; end; - - // Append dependent column index to the independent columns vector. - AIndepCols[ANumIndepCols] := ADepCol; + ARowLabels[numIndepCols] := DepVarEdit.Text; // Check variable types: all of them must be numeric (float or integer) if not IsNumericCol(ADepCol) then @@ -809,7 +851,7 @@ begin exit; end; - for i := 0 to ANumIndepCols-1 do + for i := 0 to numIndepCols-1 do if not IsNumericCol(AIndepCols[i]) then begin ErrorMsg('Incorrect data type of independent variable "%s"', [ARowLabels[i]]); @@ -822,56 +864,45 @@ begin exit; end; + xValues := CollectMatValues(OS3MainFrm.DataGrid, AIndepCols); + yValues := CollectVecValues(OS3MainFrm.DataGrid, ADepCol); + Result := true; end; { Runs the ordinary least squares regression on the grid data } -function TWLSFrm.Process_OLSRegression(AIndepCols: IntDyneVec; ANumIndepCols, ADepCol: Integer; - ARowLabels: StrDyneVec; ANumCases: Integer; PrintAll: Boolean): Boolean; +function TWLSFrm.Process_OLSRegression(AIndepCols: IntDyneVec; + ADepCol: Integer; const ARowLabels: StrDyneVec; + const xValues: DblDyneMat; const yValues: DblDyneVec): Boolean; var lReport: TStrings; - means: DblDyneVec = nil; - variances: DblDyneVec = nil; - stdDevs: DblDyneVec = nil; - BWeights: DblDyneVec = nil; - BetaWeights: DblDyneVec = nil; - BStdErrs: DblDyneVec = nil; - BtTests: DblDyneVec = nil; - tProbs: DblDyneVec = nil; - R2, stdErrEst: Double; - error: Boolean; + regressionRes: TMultipleRegressionResults; + i: Integer; + numIndepCols: Integer; begin Result := false; + numIndepCols := Length(AIndepCols); lReport := TStringList.Create; try lReport.Add('ORDINARY LEAST SQUARES (OLS) REGRESSION RESULTS'); lReport.Add(''); + lReport.Add('Dependent variable: '); + lReport.Add(' ' + OS3MainFrm.DataGrid.Cells[ADepCol, 0]); + lReport.Add('Independent variables:'); + for i := 0 to numIndepCols-1 do + lReport.Add(' ' + ARowLabels[i]); + lReport.Add(''); - SetLength(means, NoVariables + 2); - SetLength(variances, NoVariables + 2); - SetLength(stdDevs, NoVariables + 2); - SetLength(BWeights, NoVariables + 2); - SetLength(BetaWeights, NoVariables + 2); - SetLength(BStdErrs, NoVariables + 2); - SetLength(Bttests, NoVariables + 2); - SetLength(tprobs, NoVariables + 2); + Result := RegressionAndReport(ARowLabels, xValues, yValues, regressionRes, lReport); - MReg(ANumIndepCols, AIndepCols, ADepCol, ARowLabels, Means, Variances, StdDevs, - BWeights, BetaWeights, BStdErrs, Bttests, tprobs, R2, stdErrEst, - ANumCases, error, PrintAll, lReport); + if Result then + begin + Predict(xValues, yValues, regressionRes); + OLSReportFrame.DisplayReport(lReport); + end; - // if error then // wp: Why does MReg exit with error??? - // exit; - - // Get predicted z score, residual z score, predicted raw score, - // residual raw score and squared raw residual score. Place in the DataGrid - PredictIt(AIndepCols, ANumIndepCols+1, means, stdDevs, BetaWeights, stdErrEst, ANumIndepCols); - - OLSReportFrame.DisplayReport(lReport); - - Result := true; finally lReport.Free; end; @@ -879,44 +910,137 @@ end; function TWLSFrm.Process_SquaredResidualsRegression(AIndepCols: IntDyneVec; - ANumIndepCols, ADepCol: Integer; ARowLabels: StrDyneVec; BWeights: DblDyneVec; - ANumCases: Integer; PrintAll: Boolean): Boolean; + const ARowLabels: StrDyneVec; const xValues: DblDyneMat; out AWeights: DblDyneVec): Boolean; var lReport: TStrings; - means: DblDyneVec = nil; - variances: DblDyneVec = nil; - stdDevs: DblDyneVec = nil; - BetaWeights: DblDyneVec = nil; - BStdErrs: DblDyneVec = nil; - BtTests: DblDyneVec = nil; - tProbs: DblDyneVec = nil; - R2, stdErrEst: Double; - error: Boolean; + sqrResiduals: DblDyneVec; + predSqrResiduals: DblDyneVec; + regressionRes: TMultipleRegressionResults; + i, depCol, numIndepCols: Integer; begin + AWeights := nil; + ResidualsRegPage.TabVisible := WeightChk.Checked; + + if not WeightChk.Checked then + exit; + + numIndepCols := Length(AIndepCols); + + // The last grid column (added by Process_ODSRegression) contains the + // squared residuals which will be fitted here. + depCol := NoVariables; + sqrResiduals := CollectVecValues(OS3MainFrm.DataGrid, depCol); + lReport := TStringList.Create; try lReport.Add('REGRESSION OF SQUARED RESIDUALS ON INDEPENDENT VARIABLES'); lReport.Add(''); + lReport.Add('Dependent variable: '); + lReport.Add(' ' + ARowLabels[numIndepCols]); + lReport.Add('Independent variables:'); + for i := 0 to numIndepCols-1 do + lReport.Add(' ' + ARowLabels[i]); + lReport.Add(''); - SetLength(means, NoVariables + 2); - SetLength(variances, NoVariables + 2); - SetLength(stdDevs, NoVariables + 2); - SetLength(BetaWeights, NoVariables + 2); - SetLength(BStdErrs, NoVariables + 2); - SetLength(Bttests, NoVariables + 2); - SetLength(tprobs, NoVariables + 2); + Result := RegressionAndReport(ARowLabels, xValues, sqrResiduals, regressionRes, lReport); - MReg(ANumIndepCols, AIndepCols, ADepCol, ARowLabels, Means, Variances, StdDevs, - BWeights, BetaWeights, BStdErrs, Bttests, tprobs, R2, stdErrEst, - ANumCases, error, PrintAll, lReport); + if Result then + begin + // Display the results + ResidualsRegReportFrame.DisplayReport(lReport); + + // Calculate weights and store them in the grid + CalcWeights(xValues, regressionRes.Coeffs, predSqrResiduals, AWeights); + + // Display squared residuals for each independent variable + PlotSquaredResiduals(AIndepCols, depCol, xValues, sqrResiduals); + + // Store weights to the grid + if SaveChk.Checked then + AddWeightsToGrid(predSqrResiduals, AWeights); + end; - ResidualsRegReportFrame.DisplayReport(lReport); finally lReport.Free; end; end; +function TWLSFrm.Process_WeightedRegression(AIndepCols: IntDyneVec; + const ARowLabels: StrDyneVec; const xValues: DblDyneMat; + const yValues: DblDyneVec; const AWeights: DblDyneVec; SubtractMeans: Boolean): Boolean; +var + i, j, n, m: Integer; + regressionRes: TMultipleRegressionResults; + lReport: TStrings; + means: DblDyneVec; +begin + MatSize(xValues, n, m); + + for i :=0 to n-1 do + for j := 0 to m-1 do + xValues[i, j] := xValues[i, j] * AWeights[i]; + + if SubtractMeans then + begin + means := MatRowMeans(xValues); + for i := 0 to n-1 do + for j := 0 to m-1 do + xValues[i, j] := xValues[i, j] - means[i]; + end; + + lReport := TStringList.Create; + try + lReport.Add('WEIGHTED LEAST SQUARES (WLS) REGRESSION RESULTS'); + lReport.Add(''); + lReport.Add('Dependent variable: '); + lReport.Add(' ' + ARowLabels[m]); + lReport.Add('Independent variables:'); + for i := 0 to m-1 do + lReport.Add(' ' + ARowLabels[i]); + lReport.Add(''); + + Result := RegressionAndReport(ARowLabels, xValues, yValues, regressionRes, lReport); + + if Result then + WLSReportFrame.DisplayReport(lReport); + + finally + lReport.Free; + end; +end; + + +function TWLSFrm.RegressionAndReport(const ARowLabels: StrDyneVec; + const xValues: DblDyneMat; const yValues: DblDyneVec; + out ARegressionResults: TMultipleRegressionResults; AReport: TStrings): Boolean; +var + err: TRegressionError; +begin + err := MultipleRegression(xValues, yValues, CONF_LEVEL, ARegressionResults); + case err of + regOK: ; + regTooFewValues: ErrorMsg('At least two values required for regression.'); + regStdDevZero: ErrorMsg('Standard deviation is zero.'); + end; + Result := (err = regOK); + + ARegressionResults.WriteCoeffsReport(AReport, ARowLabels); + AReport.Add(''); + AReport.Add(''); + + ARegressionResults.WriteANOVAReport(AReport); + AReport.Add(''); + AReport.Add(''); + + ARegressionResults.WriteVarCovarReport(AReport, ARowLabels); + AReport.Add(''); + AReport.Add(''); + + ARegressionResults.WriteCorrelationReport(AReport, ARowLabels); +end; + + procedure TWLSFrm.Reset; var i: integer; @@ -1073,20 +1197,12 @@ var begin lReport := TStringList.Create; try - { not needed - requires too much space - lReport.Add('Data file: %s', [OS3MainFrm.FileNameEdit.Text]); - lReport.Add(''); - lReport.Add('Variables:'); - lReport.Add(' X: %s', [xLabel]); - lReport.Add(' Y: %s', [yLabel]); - lReport.Add(''); - } - lReport.Add('Variable Mean Variance Std.Dev.'); - lReport.Add('---------- -------- -------- --------'); + lReport.Add(' Variable Mean Variance Std.Dev. '); + lReport.Add('------------ ------------ ------------ ------------'); with ARegressionResults do begin - lReport.Add('%-10s %8.2f %8.2f %8.2f', [xLabel, XMean, XVariance, XStdDev]); - lReport.Add('%-10s %8.2f %8.2f %8.2f', [yLabel, YMean, YVariance, YStdDev]); + lReport.Add('%12s %12.2f %12.2f %12.2f', [xLabel, XMean, XVariance, XStdDev]); + lReport.Add('%12s %12.2f %12.2f %12.2f', [yLabel, YMean, YVariance, YStdDev]); lReport.Add(''); lReport.Add('Regression:'); lReport.Add(' Correlation: %8.3f', [R]); diff --git a/applications/lazstats/source/forms/mainunit.pas b/applications/lazstats/source/forms/mainunit.pas index 738232217..a28f4354f 100644 --- a/applications/lazstats/source/forms/mainunit.pas +++ b/applications/lazstats/source/forms/mainunit.pas @@ -989,6 +989,9 @@ begin if not (Sender = DataGrid) then exit; + if aCol >= DictionaryFrm.DictGrid.RowCount then + exit; + ts := DataGrid.Canvas.TextStyle; justif := DictionaryFrm.DictGrid.Cells[7, aCol]; if justif = '' then justif := 'L'; diff --git a/applications/lazstats/source/units/gridprocs.pas b/applications/lazstats/source/units/gridprocs.pas index 2a6a22263..01b079fbc 100644 --- a/applications/lazstats/source/units/gridprocs.pas +++ b/applications/lazstats/source/units/gridprocs.pas @@ -8,9 +8,11 @@ uses Classes, SysUtils, Grids, Globals, DictionaryUnit; -function CollectValues(AGrid: TStringGrid; AColIndex: Integer; +function CollectVecValues(AGrid: TStringGrid; AColIndex: Integer; AColCheck: IntDyneVec = nil): DblDyneVec; +function CollectMatValues(AGrid: TStringGrid; AColIndices: IntDyneVec): DblDyneMat; + procedure GetMinMax(AGrid: TStringGrid; AColIndex: Integer; const AColCheck: IntDyneVec; out AMin, AMax: Double); @@ -43,7 +45,7 @@ uses Non-numeric values in the considered cell will raise an exception. NOTE: AColCheck must not be overdimensioned! } -function CollectValues(AGrid: TStringGrid; AColIndex: Integer; AColCheck: IntDyneVec): DblDyneVec; +function CollectVecValues(AGrid: TStringGrid; AColIndex: Integer; AColCheck: IntDyneVec): DblDyneVec; var row, n: Integer; val: Double; @@ -70,6 +72,33 @@ begin end; +{ Extracts the grid values from the columns with indices given by AColIndices + and puts them into the columns of the result matrix. + This means: The result matrix contains the variables as columns and the + cases as rows. } +function CollectMatValues(AGrid: TStringGrid; AColIndices: IntDyneVec): DblDyneMat; +var + nr, r, c, i, j: Integer; + val: Double; +begin + SetLength(Result, AGrid.RowCount, Length(AColIndices)); + nr := 0; + for r:= 1 to AGrid.RowCount-1 do + begin + if not GoodRecord(AGrid, r, AColIndices) then Continue; + i := r - 1; + for j := 0 to High(AColIndices) do + begin + c := AColIndices[j]; + if TryStrToFloat(trim(AGrid.Cells[c, r]), val) then + Result[i, j] := val; + end; + inc(nr); // count the number of rows in the matrix. + end; + SetLength(Result, nr); +end; + + { Determines the minimum and maximum of the values in the specified column of the grid. Rows with "invalid" data are ignored. If AColCheck contains other column indices these cells must be "valid", too. } @@ -101,7 +130,10 @@ end; the grid. } function GetVariableIndex(AGrid: TStringGrid; const AVarName: String): Integer; begin - Result := AGrid.Rows[0].IndexOf(AVarName); + if AVarName <> '' then + Result := AGrid.Rows[0].IndexOf(AVarName) + else + Result := -1; end; @@ -151,9 +183,13 @@ var missingCode: String; value: String; begin - missingCode := Trim(DictionaryFrm.DictGrid.Cells[6, ACol]); - value := Trim(AGrid.Cells[ACol, ARow]); - Result := (value = missingCode); + if ACol < DictionaryFrm.DictGrid.RowCount then + begin + missingCode := Trim(DictionaryFrm.DictGrid.Cells[6, ACol]); + value := Trim(AGrid.Cells[ACol, ARow]); + Result := (value = missingCode); + end else + Result := false; end; @@ -163,8 +199,12 @@ function IsNumericCol(AColIndex: Integer): Boolean; var typeCode: String; begin - typeCode := Trim(DictionaryFrm.DictGrid.Cells[4, AColIndex]); - Result := (typeCode = 'F') or (typeCode = 'I'); + if AColIndex < DictionaryFrm.DictGrid.RowCount then + begin + typeCode := Trim(DictionaryFrm.DictGrid.Cells[4, AColIndex]); + Result := (typeCode = 'F') or (typeCode = 'I'); + end else + Result := false; end; diff --git a/applications/lazstats/source/units/matrixlib.pas b/applications/lazstats/source/units/matrixlib.pas index b0e98c478..deadbb5d5 100644 --- a/applications/lazstats/source/units/matrixlib.pas +++ b/applications/lazstats/source/units/matrixlib.pas @@ -655,7 +655,6 @@ var SSY, SSres, resvar, SSreg: double; title: string; deplabel: string; - errcode: boolean = false; begin Assert(OS3MainFrm <> nil); @@ -668,7 +667,7 @@ begin SetLength(ColLabels, NCases); // initialize - errcode := false; + ErrorCode := false; for i := 0 to NCases do begin for j := 0 to NoIndep do X[i, j] := 0; @@ -721,7 +720,7 @@ begin deplabel := OS3MainFrm.DataGrid.Cells[DepCol,0]; RowLabels[NoIndep] := 'Intercept'; - VarY := SSY - (MeanY * MeanY / NCases); + VarY := SSY - sqr(MeanY) / NCases; VarY := VarY / (NCases - 1); SDY := sqrt(VarY); @@ -870,13 +869,15 @@ begin //AReport.Add('Standard Error of Estimate = %8.2f', [stderrest]); end; + // Needed for calculation of VIF and TOL + Correlations(NoIndep, IndepCols, XTX, Means, Variances, StdDevs, ErrorCode, NCases); + SVDinverse(XTX, NoIndep); + AReport.Add(DIVIDER_SMALL); AReport.Add(''); RowLabels[N-1] := 'Intercept'; AReport.Add(' Variable Beta B Std.Err. t prob VIF TOL'); AReport.Add('---------- ---------- ---------- ---------- ---------- ---------- --------- ----------'); - Correlations(NoIndep, IndepCols, XTX, Means, Variances, StdDevs, errcode, NCases); - SVDinverse(XTX, NoIndep); for i := 0 to NoIndep do begin VIF := XTX[i,i]; diff --git a/applications/lazstats/source/units/matrixunit.pas b/applications/lazstats/source/units/matrixunit.pas index e68a09dd9..c40885bd5 100644 --- a/applications/lazstats/source/units/matrixunit.pas +++ b/applications/lazstats/source/units/matrixunit.pas @@ -18,6 +18,18 @@ type // Vectors +operator + (A, B: TDblVector): TDblVector; +operator - (A, B: TDblVector): TDblVector; +operator * (A, B: TDblVector): Double; +operator * (A: TDblVector; b: Double): TDblVector; +operator * (a: Double; B: TDblVector): TDblVector; + +procedure VecCheck(A, B: TDblVector; out n: Integer); +function VecCopy(A: TDblVector): TDblVector; +function VecMultiply(A, B: TDblVector): TDblVector; +function VecOnes(n: Integer): TDblVector; +procedure VecSize(A: TDblVector; out n: Integer); + procedure VecMaxMin(const AData: TDblVector; out AMax, AMin: Double); procedure VecMeanStdDev(const AData: TDblVector; @@ -36,6 +48,7 @@ function VecMedian(const AData: TDblVector): Double; operator + (A, B: TDblMatrix): TDblMatrix; operator - (A, B: TDblMatrix): TDblMatrix; operator * (A, B: TDblMatrix): TDblMatrix; +operator * (A: TDblMatrix; v: TDblVector): TDblVector; { NOTE: Indices follow math convention: - 1st index is the row index, i.e. runs vertically @@ -44,6 +57,9 @@ operator * (A, B: TDblMatrix): TDblMatrix; function MatAppendColVector(A: TDblMatrix; v: TDblVector): TDblMatrix; procedure MatCheck(A: TDblMatrix); procedure MatCheckSquare(A: TDblMatrix; out n: Integer); +procedure MatColMeanVarStdDev(A: TDblMatrix; out AMeans, AVariances, AStdDevs: TDblVector); +function MatColMeans(A: TDblMatrix): TDblVector; +function MatColVector(A: TDblMatrix; AColIndex: Integer): TDblVector; function MatCopy(A: TDblMatrix): TDblMatrix; function MatDeterminant(A: TDblMatrix): double; function MatEqualSize(A, B: TDblMatrix): Boolean; @@ -53,9 +69,10 @@ function MatInverse(A: TDblMatrix): TDblMatrix; function MatIsSquare(A: TDblMatrix; out n: Integer): Boolean; function MatNumCols(A: TDblMatrix): Integer; function MatNumRows(A: TDblMatrix): Integer; +function MatRowMeans(A: TDblMatrix): TDblVector; +function MatRowVector(A: TDblMatrix; ARowIndex: Integer): TDblVector; procedure MatSize(A: TDblMatrix; out n, m: Integer); - -function MatTranspose(A: TDblMatrix): TDblMatrix; +function MatTransposed(A: TDblMatrix): TDblMatrix; // Sorting procedure Exchange(var a, b: Double); overload; @@ -88,6 +105,110 @@ uses Math; +operator + (A, B: TDblVector): TDblVector; +var + i, n: Integer; +begin + VecCheck(A, B, n); + SetLength(Result, n); + for i := 0 to n-1 do + Result[i] := A[i] + B[i]; +end; + + +operator - (A, B: TDblVector): TDblVector; +var + i, n: Integer; +begin + VecCheck(A, B, n); + SetLength(Result, n); + for i := 0 to n-1 do + Result[i] := A[i] - B[i]; +end; + + +// Vector dot product +operator * (A, B: TDblVector): Double; +var + i, n: Integer; +begin + VecCheck(A, B, n); + Result := 0; + for i := 0 to n-1 do + Result := Result + A[i] * B[i]; +end; + + +// Multiplication of a vector by a scalar +operator * (A: TDblVector; b: Double): TDblVector; +var + i, n: Integer; +begin + n := Length(A); + SetLength(Result, n); + for i := 0 to n-1 do + Result[i] := A[i] * b; +end; + +operator * (a: Double; B: TDblVector): TDblVector; +var + i, n: Integer; +begin + n := Length(B); + SetLength(Result, n); + for i := 0 to n-1 do + Result[i] := a * B[i]; +end; + + +procedure VecCheck(A, B: TDblVector; out n: Integer); +var + na, nb: Integer; +begin + na := Length(A); + nb := Length(B); + if na <> nb then + raise EMatrix.Create('Dimension error.') + else + n := na; +end; + + +function VecCopy(A: TDblVector): TDblVector; +var + i: Integer; +begin + SetLength(Result, Length(A)); + for i := 0 to High(A) do Result[i] := A[i]; +end; + + +function VecMultiply(A, B: TDblVector): TDblVector; +var + i, n: Integer; +begin + VecCheck(A, B, n); + SetLength(Result, n); + for i := 0 to n-1 do + Result[i] := A[i] * B[i]; +end; + + +function VecOnes(n: Integer): TDblVector; +var + i: Integer; +begin + SetLength(Result, n); + for i := 0 to n-1 do Result[i] := 1; +end; + + +procedure VecSize(A: TDblVector; out n: Integer); +begin + n := Length(A); +end; + + {=============================================================================== Statistical vector operations ===============================================================================} @@ -134,7 +255,7 @@ begin if n = 1 then exit; - AVariance := ((ss - sqr(AMean)) / n) / (n - 1); + AVariance := (ss - sqr(AMean) * n) / (n - 1); AStdDev := sqrt(AVariance); end; @@ -229,6 +350,7 @@ begin end; +{ Product of two matrices } operator * (A, B: TDblMatrix): TDblMatrix; var na, ma, nb, mb, i, j, k: Integer; @@ -251,6 +373,28 @@ begin end; +{ Product of an n x m matrix with an m vector } +operator * (A: TDblMatrix; v: TDblVector): TDblVector; +var + na, ma, nv, i, j: Integer; + sum: Double; +begin + MatSize(A, na, ma); + VecSize(v, nv); + if ma <> nv then + raise EMatrix.Create('Dimension error.'); + + SetLength(Result, na); + for i := 0 to na-1 do + begin + sum := 0; + for j := 0 to ma-1 do + sum := sum + A[i, j] * v[j]; + Result[i] := sum; + end; +end; + + { Adds the elements of the vector v to the rows of the matrix A, i.e. the number of columns increases by 1 } function MatAppendColVector(A: TDblMatrix; v: TDblVector): TDblMatrix; @@ -288,6 +432,59 @@ begin end; +procedure MatColMeanVarStdDev(A: TDblMatrix; out AMeans, AVariances, AStdDevs: TDblVector); +var + n, m, i, j: Integer; + s, ss: Double; +begin + MatSize(A, n, m); + SetLength(AMeans, m); + SetLength(AVariances, m); + SetLength(AStdDevs, m); + for j := 0 to m-1 do + begin + s := 0; + ss := 0; + for i := 0 to n-1 do + begin + s := s + A[i, j]; + ss := ss + sqr(A[i, j]); + end; + AMeans[j] := s / n; + AVariances[j] := (ss - sqr(AMeans[j]) * n) / (n - 1); + AStdDevs[j] := sqrt(AVariances[j]); + end; +end; + + +function MatColMeans(A: TDblMatrix): TDblVector; +var + i, j, n, m: Integer; + sum: Double; +begin + MatSize(A, n, m); + SetLength(Result, m); + for j := 0 to m-1 do + begin + sum := 0; + for i := 0 to n-1 do + sum := sum + A[i, j]; + Result[j] := sum / n; + end; +end; + + +function MatColVector(A: TDblMatrix; AColIndex: Integer): TDblVector; +var + i, n, m: Integer; +begin + MatSize(A, n,m); + SetLength(Result, n); + for i := 0 to n-1 do + Result[i] := A[i, AColIndex]; +end; + + function MatCopy(A: TDblMatrix): TDblMatrix; var n, m, i, j: Integer; @@ -405,6 +602,34 @@ begin end; +function MatRowMeans(A: TDblMatrix): TDblVector; +var + i, j, n, m: Integer; + sum: Double; +begin + MatSize(A, n,m); + SetLength(Result, n); + for i := 0 to n-1 do + begin + sum := 0; + for j := 0 to m-1 do + sum := sum + A[i, j]; + Result[i] := sum / m; + end; +end; + + +function MatRowVector(A: TDblMatrix; ARowIndex: Integer): TDblVector; +var + j, n, m: Integer; +begin + MatSize(A, n,m); + SetLength(Result, m); + for j := 0 to m-1 do + Result[j] := A[ARowIndex, j]; +end; + + procedure MatSize(A: TDblMatrix; out n, m: Integer); begin n := Length(A); @@ -412,15 +637,15 @@ begin end; -function MatTranspose(A: TDblMatrix): TDblMatrix; +function MatTransposed(A: TDblMatrix): TDblMatrix; var - n, i, j: Integer; + n, m, i, j: Integer; begin MatCheck(A); - MatCheckSquare(A, n); - SetLength(Result, n, n); + MatSize(A, n, m); + SetLength(Result, m, n); for i := 0 to n-1 do - for j := 0 to n-1 do + for j := 0 to m-1 do Result[j, i] := A[i, j]; end; diff --git a/applications/lazstats/source/units/regressionunit.pas b/applications/lazstats/source/units/regressionunit.pas index 031c6f5eb..ad4efb9ac 100644 --- a/applications/lazstats/source/units/regressionunit.pas +++ b/applications/lazstats/source/units/regressionunit.pas @@ -9,8 +9,15 @@ uses Classes, SysUtils, MatrixUnit; + type + ERegression = class(Exception); + +type + TRegressionError = (regOK, regTooFewValues, regStdDevZero); + TBivariateRegressionResults = record + public Slope, Intercept: Double; XMean, YMean: Double; XVariance, YVariance: Double; @@ -28,15 +35,53 @@ procedure BivariateRegression(const xData, yData: TDblVector; type TMultipleRegressionResults = record + private + procedure WriteMatrixReport(AReport: TStrings; AMatrix: TDblMatrix; + ATitle: String; AVarNames: TStringArray); + public + Coeffs: TDblVector; // Coefficients of the factors + CoeffStdErrors: TDblVector; // Standard errors of the coefficients + + Beta: TDblVector; // Standardized coefficients + VarCovar: TDblMatrix; // Variance-covariance matrix + Correlations: TDblMatrix; // Correlations matrix + + SStotal, SSreg, SSresid: Double; // Sum of squares + NumCases, NumDepVars: Integer; // number of cases, number of dependent variables + R2: Double; // coefficient of determination + AdjR2: Double; // Adjusted coefficient of determination + StdErrorEstimate: Double; // standard error of estimate + t: TDblVector; + p: TDblVector; // Probability for > t + F: Double; + Probability: Double; // Probability for > F + + Means: TDblVector; + Variances: TDblVector; + StdDevs: TDblVector; + (* + XMeans: TDblVector; // Means of the x columns + XVariances: TDblVector; // Variances of the x columns + XStdDevs: TDblVector; // Std Deviations of the x columns + *) + MeanY: Double; // Mean of y column + VarianceY: Double; // Variance of y column + StdDevY: Double; // Std Deviation of y column + + procedure WriteANOVAReport(AReport: TStrings); + procedure WriteCoeffsReport(AReport: TStrings; AVarNames: TStringArray); + procedure WriteCorrelationReport(AReport: TStrings; AVarNames: TStringArray); + procedure WriteVarCovarReport(AReport: TStrings; AVarNames: TStringArray); end; -procedure MultipleRegression(const xData: TDblMatrix; const yData: TDblVector; - AConfLevel: Double; out AResults: TMultipleRegressionResults); +function MultipleRegression(const xData: TDblMatrix; const yData: TDblVector; + AConfLevel: Double; out AResults: TMultipleRegressionResults): TRegressionError; implementation uses + StrUtils, MathUnit; {=============================================================================== @@ -95,12 +140,241 @@ end; {=============================================================================== Multiple regression ===============================================================================} -procedure MultipleRegression(const xData: TDblMatrix; const yData: TDblVector; - AConfLevel: Double; out AResults: TMultipleRegressionResults); +function MultipleRegression(const xData: TDblMatrix; const yData: TDblVector; + AConfLevel: Double; out AResults: TMultipleRegressionResults): TRegressionError; +var + i, j, nx, mx, ny: Integer; + X, XT, XT_X, inv_XT_X: TDblMatrix; + XT_Y: TDblVector; + SY, SSY: Double; + DFresid: Integer; begin + MatCheck(xData); + MatSize(xData, nx, mx); + ny := Length(yData); + if nx <> ny then + raise ERegression.Create('Dimension error.'); + + if nx < 2 then begin + Result := regTooFewValues; + exit; + end; + + // Augmented x matrix: append ones vector for intercept + X := MatAppendColVector(xData, VecOnes(ny)); + + // Transposed augmented matrix + XT := MatTransposed(X); + + // Product of augmented matrix with augmented Y + XT_Y := XT * yData; + + // Product of transposed augmented with augmented matrix + XT_X := XT * X; + + // Product of inverse of XT_X with XT_Y yields the coefficients. Intercept is last. + inv_XT_X := MatInverse(XT_X); + AResults.Coeffs := inv_XT_X * XT_Y; + + // THe XT_X matrix contains the sum of squares (diagonal elements), sum of + // cross-products (off-diagonal elements and the sum the variable values + // (augmented column). + SetLength(AResults.Means, mx+1); + SetLength(AResults.Variances, mx+1); + SetLength(AResults.StdDevs, mx+1); + for i := 0 to mx do + begin + AResults.Means[i] := XT_X[i, mx] / nx; + AResults.Variances[i] := (XT_X[i, i] - sqr(AResults.Means[i]) * nx) / (nx - 1); + AResults.StdDevs[i] := sqrt(AResults.Variances[i]); + { + if AResults.StdDevs[i] = 0.0 then + begin + Result := regStdDevZero; + exit; + end; + } + end; + + // Variance-covariance matrix + SetLength(AResults.VarCovar, mx, mx); + SetLength(AResults.Correlations, mx, mx); + for i := 0 to mx-1 do + for j := 0 to mx-1 do + begin + AResults.VarCovar[i, j] := (XT_X[i, j] - XT_X[i, mx] * XT_X[j, mx] / nx) / (nx - 1); + AResults.Correlations[i, j] := AResults.VarCovar[i, j] / (AResults.StdDevs[i] * AResults.StdDevs[j]); + end; + + with AResults do + begin + NumCases := nx; + NumDepVars := mx; + DFresid := NumCases - NumDepVars - 1; + + // Calculate column means of xData + // MatColMeanVarStdDev(X, XMeans, XVariances, XStdDevs); + + // Basic stats of y data column + VecSumSS(yData, SY, SSY); + MeanY := SY / NumCases; + SStotal := SSY - sqr(MeanY) * NumCases; // needed for ANOVA + VarianceY := SStotal / (NumCases - 1); + StdDevY := sqrt(VarianceY); + { + if StdDevY = 0 then + begin + Result := regStdDevZero; + exit; + end; + } + + // Standardized coefficients + Beta := VecMultiply(Coeffs, StdDevs) * (1.0/StdDevY); + + // Get standard errors, squared multiple correlation, tests of significance + SetLength(CoeffStdErrors, mx+1); + SetLength(t, mx+1); + SetLength(p, mx+1); + + SSresid := SSY - Coeffs * XT_Y; + SSreg := SStotal - SSresid; + StdErrorEstimate := sqrt(SSresid / DFresid); + R2 := SSreg / SStotal; + AdjR2 := 1.0 - (1.0 - R2) * (NumCases - 1) / DFresid; + F := (SSreg / NumDepVars) / (SSresid / DFresid); + Probability := ProbF(F, NumDepVars, DFresid); + + for i := 0 to mx do + begin + CoeffStdErrors[i] := StdErrorEstimate * sqrt(inv_XT_X[i, i]); + t[i] := Coeffs[i] / CoeffStdErrors[i]; + p[i] := ProbT(t[i], DFresid); + end; + end; + + Result := regOK; end; +{ Writes the ANOVA of the regression to a "report". } +procedure TMultipleRegressionResults.WriteANOVAReport(AReport: TStrings); +var + DFreg, DFresid, DFtotal: Integer; +begin + DFreg := NumDepVars; + DFtotal := NumCases - 1; + DFresid := DFtotal - DFreg; + AReport.Add('ANOVA'); + AReport.Add('------------------------------------------------------------------------------'); + AReport.Add('SOURCE DF Sum of Squares Mean Square F Probability'); + AReport.Add('---------- --- ---------------- ---------------- ------------ -----------'); + AReport.Add('Regression %3d %16.3f %16.3f %12.4f %10.4f', [DFreg, SSreg, SSreg/DFreg, F, Probability]); + AReport.Add('Residual %3d %16.3f %16.3f', [DFresid, SSresid, SSresid/DFresid]); + AReport.Add('Total %3d %16.3f', [DFtotal, SStotal]); + AReport.Add('------------------------------------------------------------------------------'); + AReport.Add(''); + AReport.Add('R2: %10.4f', [R2]); + AReport.Add('Adjusted R2: %10.4f', [AdjR2]); + AReport.Add('F: %10.2f (with %d and %d degrees of freedom)', [F, DFreg, DFresid]); + AReport.Add('Probability > F: %10.4f', [Probability]); + AReport.Add('Standard Error of Estimate: %10.2f', [StdErrorEstimate]); +end; + + +{ Writes the determined coefficients and their statistics to a "report". + The names of the independent variables are needed in the array AVarNames. } +procedure TMultipleRegressionResults.WriteCoeffsReport(AReport: TStrings; + AVarNames: TStringArray); +var + varName: String; + i, first, last: Integer; +begin + AReport.Add('COEFFICIENTS'); + AReport.Add('------------------------------------------------------------------------------'); + AReport.Add(' Variable Unstandardized Std.Error Standardized t p '); + AReport.Add('------------ -------------- ------------ ------------ --------- ---------'); + + first := 0; + last := High(Coeffs); + for i := first to last do + begin + if i = last then + varName := '(Intercept)' + else + if i <= High(AVarNames) then + varName := AVarNames[i] + else + varName := Format('VAR.%d', [i+1]); + + AReport.Add('%12s %14.3f %12.3f %12.3f %9.4f %9.4f', + [varName, Coeffs[i], CoeffStdErrors[i], Beta[i], t[i], p[i]]); + end; + AReport.Add('------------------------------------------------------------------------------'); +end; + + +{ Writes the correlation matrix to a "report". + The names of the independent variables are needed in the array AVarNames. } +procedure TMultipleRegressionResults.WriteCorrelationReport(AReport: TStrings; + AVarNames: TStringArray); +begin + WriteMatrixReport(AReport, Correlations, 'CORRELATION MATRIX', AVarNames); +end; + + +procedure TMultipleRegressionResults.WriteMatrixReport(AReport: TStrings; + AMatrix: TDblMatrix; ATitle: String; AVarNames: TStringArray); + + function GetVarName(i: Integer): String; + begin + if i < High(AVarNames) then + Result := AVarNames[i] + else + Result := Format('VAR.%d', [i+1]); + end; + +const + SPACE = ' '; +var + i, j, first, last: Integer; + s: String; +begin + first := 0; + last := High(AMatrix); + + AReport.Add(ATitle); + + s := DupeString(' ', 12); + for j := first to last do + s := s + SPACE + Format('%12s', [GetVarName(j)]); + AReport.Add(DupeString('-', Length(s))); + AReport.Add(s); + + s := DupeString(' ', 12); + for j := first to last do + s := s + SPACE + DupeString('-', 12); + AReport.Add(s); + + for i := first to last do + begin + s := Format('%12s', [GetVarName(i)]); + for j := first to last do + s := s + SPACE + Format('%12.3f', [AMatrix[i, j]]); + AReport.Add(s); + end; + AReport.Add(DupeString('-', Length(s))); +end; + + +{ Writes the variance/covariance matrix to a "report". + The names of the independent variables are needed in the array AVarNames. } +procedure TMultipleRegressionResults.WriteVarCovarReport(AReport: TStrings; + AVarNames: TStringArray); +begin + WriteMatrixReport(AReport, VarCovar, 'VARIANCE-COVARIANCE MATRIX', AVarNames); +end; + end.