LazStats: Complete refactoring of CompareDistUnit.

git-svn-id: https://svn.code.sf.net/p/lazarus-ccr/svn@7723 8e941d3f-bd1b-0410-a28a-d453659cc2b4
This commit is contained in:
wp_xxyyzz
2020-09-30 21:05:32 +00:00
parent 0e50c16f6b
commit 10049ca6b0
3 changed files with 382 additions and 571 deletions

View File

@ -65,7 +65,7 @@ function factorial(x : integer) : integer;
implementation
uses
MathUnit;
Utils, MathUnit;
function chisquaredprob(X : double; k : integer) : double;
var
@ -1629,28 +1629,22 @@ begin
end;
Result := NoGood;
end;
//-------------------------------------------------------------------
{ split a value into the whole and fractional parts}
function WholeValue(value : double) : double;
{ split a value into the whole and fractional parts}
VAR
whole : double;
begin
whole := Floor(value);
Result := whole;
Result := Floor(value);
end;
//---------------------------------------------------------------------------
function FractionValue(value : double) : double;
{ split a value into the whole and fractional parts }
VAR
fraction : double;
begin
fraction := value - Floor(value);
Result := fraction;
end;
//---------------------------------------------------------------------------
Function Quartiles(TypeQ : integer; pcntile : double; N : integer;
{ split a value into the whole and fractional parts }
function FractionValue(value : double) : double;
begin
Result := value - Floor(value);
end;
function Quartiles(TypeQ : integer; pcntile : double; N : integer;
VAR values : DblDyneVec) : double;
VAR
whole, fraction, Myresult, np, avalue, avalue1 : double;
@ -1768,173 +1762,176 @@ begin
result := p;
end;
function KolmogorovTest(na : integer; const a: DblDyneVec; nb: integer;
{ Statistical test whether two one-dimensional sets of points are compatible
with coming from the same parent distribution, using the Kolmogorov test.
That is, it is used to compare two experimental distributions of unbinned data.
Input:
a,b: One-dimensional arrays of length na, nb, respectively.
The elements of a and b must be given in ascending order.
option is a character string to specify options
"D" Put out a line of "Debug" printout
"M" Return the Maximum Kolmogorov distance instead of prob
Output:
The returned value prob is a calculated confidence level which gives a
statistical test for compatibility of a and b.
Values of prob close to zero are taken as indicating a small probability
of compatibility. For two point sets drawn randomly from the same parent
distribution, the value of prob should be uniformly distributed between
zero and one.
in case of error the function return -1
If the 2 sets have a different number of points, the minimum of
the two sets is used.
Method:
The Kolmogorov test is used. The test statistic is the maximum deviation
between the two integrated distribution functions, multiplied by the
normalizing factor (rdmax*sqrt(na*nb/(na+nb)).
Code adapted by Rene Brun from CERNLIB routine TKOLMO (Fred James)
(W.T. Eadie, D. Drijard, F.E. James, M. Roos and B. Sadoulet,
Statistical Methods in Experimental Physics, (North-Holland,
Amsterdam 1971) 269-271)
Method Improvement by Jason A Detwiler (JADetwiler@lbl.gov)
-----------------------------------------------------------
The nuts-and-bolts of the TMath::KolmogorovTest() algorithm is a for-loop
over the two sorted arrays a and b representing empirical distribution
functions. The for-loop handles 3 cases: when the next points to be
evaluated satisfy a>b, a<b, or a=b:
For the last case, a=b, the algorithm advances each array by one index in an
attempt to move through the equality. However, this is incorrect when one or
the other of a or b (or both) have a repeated value, call it x. For the KS
statistic to be computed properly, rdiff needs to be calculated after all of
the a and b at x have been tallied (this is due to the definition of the
empirical distribution function; another way to convince yourself that the
old CERNLIB method is wrong is that it implies that the function defined as the
difference between a and b is multi-valued at x -- besides being ugly, this
would invalidate Kolmogorov's theorem).
NOTE1
A good description of the Kolmogorov test can be seen at:
http://www.itl.nist.gov/div898/handbook/eda/section3/eda35g.htm
}
function KolmogorovTest(na: integer; const a: DblDyneVec; nb: integer;
const b: DblDyneVec; option: String; AReport: TStrings): double;
VAR
prob : double;
opt : string;
rna : double; // = na;
rnb : double; // = nb;
sa : double; // = 1./rna;
sb : double; // = 1./rnb;
rdiff, rdmax, x, z : double;
i, ia, ib : integer;
ok : boolean;
var
prob: double;
opt: string;
rna: double; // = na;
rnb: double; // = nb;
sa: double; // = 1./rna;
sb: double; // = 1./rnb;
rdiff, rdmax, x, z: double;
i, ia, ib: integer;
ok: boolean;
begin
// Statistical test whether two one-dimensional sets of points are compatible
// with coming from the same parent distribution, using the Kolmogorov test.
// That is, it is used to compare two experimental distributions of unbinned data.
//
// Input:
// a,b: One-dimensional arrays of length na, nb, respectively.
// The elements of a and b must be given in ascending order.
// option is a character string to specify options
// "D" Put out a line of "Debug" printout
// "M" Return the Maximum Kolmogorov distance instead of prob
//
// Output:
// The returned value prob is a calculated confidence level which gives a
// statistical test for compatibility of a and b.
// Values of prob close to zero are taken as indicating a small probability
// of compatibility. For two point sets drawn randomly from the same parent
// distribution, the value of prob should be uniformly distributed between
// zero and one.
//
// in case of error the function return -1
// If the 2 sets have a different number of points, the minimum of
// the two sets is used.
//
// Method:
// The Kolmogorov test is used. The test statistic is the maximum deviation
// between the two integrated distribution functions, multiplied by the
// normalizing factor (rdmax*sqrt(na*nb/(na+nb)).
//
// Code adapted by Rene Brun from CERNLIB routine TKOLMO (Fred James)
// (W.T. Eadie, D. Drijard, F.E. James, M. Roos and B. Sadoulet,
// Statistical Methods in Experimental Physics, (North-Holland,
// Amsterdam 1971) 269-271)
//
// Method Improvement by Jason A Detwiler (JADetwiler@lbl.gov)
// -----------------------------------------------------------
// The nuts-and-bolts of the TMath::KolmogorovTest() algorithm is a for-loop
// over the two sorted arrays a and b representing empirical distribution
// functions. The for-loop handles 3 cases: when the next points to be
// evaluated satisfy a>b, a<b, or a=b:
// For the last case, a=b, the algorithm advances each array by one index in an
// attempt to move through the equality. However, this is incorrect when one or
// the other of a or b (or both) have a repeated value, call it x. For the KS
// statistic to be computed properly, rdiff needs to be calculated after all of
// the a and b at x have been tallied (this is due to the definition of the
// empirical distribution function; another way to convince yourself that the
// old CERNLIB method is wrong is that it implies that the function defined as the
// difference between a and b is multi-valued at x -- besides being ugly, this
// would invalidate Kolmogorov's theorem).
//
// NOTE1
// A good description of the Kolmogorov test can be seen at:
// http://www.itl.nist.gov/div898/handbook/eda/section3/eda35g.htm
opt := option;
prob := -1;
opt := option;
// opt.ToUpper();
{ Require at least two points in each graph }
if (na <= 2) or (nb <= 2) then
begin
ErrorMsg('KolmogorovTest - Sets must have more than 2 points');
exit;
end;
prob := -1;
// Require at least two points in each graph
if (na <= 2) or (nb <= 2) then
begin
ShowMessage('KolmogorovTest - Sets must have more than 2 points');
exit;
end;
// Constants needed
rna := na;
rnb := nb;
sa := 1./rna;
sb := 1. / rnb;
// Starting values for main loop
if (a[0] < b[0]) then
begin
rdiff := -sa;
ia := 2;
ib := 1;
end
else
begin
rdiff := sb;
ib := 2;
ia := 1;
end;
rdmax := Abs(rdiff);
{ Constants needed }
rna := na;
rnb := nb;
sa := 1./rna;
sb := 1. / rnb;
// Main loop over point sets to find max distance
// rdiff is the running difference, and rdmax the max.
ok := FALSE;
for i := 0 to na + nb - 1 do
begin
if (a[ia-1] < b[ib-1]) then
{ Starting values for main loop }
if (a[0] < b[0]) then
begin
rdiff := -sa;
ia := 2;
ib := 1;
end
else
begin
rdiff := sb;
ib := 2;
ia := 1;
end;
rdmax := Abs(rdiff);
{ Main loop over point sets to find max distance
rdiff is the running difference, and rdmax the max. }
ok := false;
for i := 0 to na + nb - 1 do
begin
if (a[ia-1] < b[ib-1]) then
begin
rdiff := rdiff - sa;
ia := ia + 1;
if (ia > na) then
begin
rdiff := rdiff - sa;
ia := ia + 1;
if (ia > na) then
begin
ok := TRUE;
break;
end;
end
else if (a[ia-1] > b[ib-1]) then
begin
rdiff := rdiff + sb;
ib := ib + 1;
if (ib > nb) then
begin
ok := TRUE;
break;
end;
end
else
begin
x := a[ia-1];
while((a[ia-1] = x) and (ia <= na)) do
begin
rdiff := rdiff - sa;
ia := ia + 1;
end;
while ((b[ib-1] = x) and (ib <= nb)) do
begin
rdiff := rdiff + sb;
ib := ib + 1;
end;
if (ia > na) then
begin
ok := TRUE;
break;
end;
if (ib > nb) then
begin
ok := TRUE;
break;
end;
ok := true;
break;
end;
rdmax := Max(rdmax,Abs(rdiff));
end;
// Should never terminate this loop with ok = kFALSE!
end
else if (a[ia-1] > b[ib-1]) then
begin
rdiff := rdiff + sb;
ib := ib + 1;
if (ib > nb) then
begin
ok := true;
break;
end;
end
else
begin
x := a[ia-1];
while((a[ia-1] = x) and (ia <= na)) do
begin
rdiff := rdiff - sa;
ia := ia + 1;
end;
while ((b[ib-1] = x) and (ib <= nb)) do
begin
rdiff := rdiff + sb;
ib := ib + 1;
end;
if (ia > na) then
begin
ok := true;
break;
end;
if (ib > nb) then
begin
ok := true;
break;
end;
end;
rdmax := Max(rdmax, abs(rdiff));
end;
if (ok) then
begin
rdmax := Max(rdmax,Abs(rdiff));
z := rdmax * Sqrt(rna * rnb / (rna + rnb));
prob := KolmogorovProb(z);
end;
{ Should never terminate this loop with ok = FALSE! }
if ok then
begin
rdmax := Max(rdmax, abs(rdiff));
z := rdmax * Sqrt(rna * rnb / (rna + rnb));
prob := KolmogorovProb(z);
end;
// debug printout
if (opt = 'D') then
AReport.Add(' Kolmogorov Probability: %g, Max Dist: %g', [prob, rdmax]);
if(opt = 'M') then
result := rdmax
else
result := prob;
{ debug printout }
if (opt = 'D') then
AReport.Add(' Kolmogorov Probability: %.5f, Max Dist: %.5f', [prob, rdmax]);
if (opt = 'M') then
Result := rdmax
else
Result := prob;
end;
(* wp: moved to MathUnit for easier testing
procedure poisson_cdf ( x : integer; a : double; VAR cdf : double );
VAR
i : integer;