LazStats: Complete refactoring of CompareDistUnit.

git-svn-id: https://svn.code.sf.net/p/lazarus-ccr/svn@7723 8e941d3f-bd1b-0410-a28a-d453659cc2b4
2020-09-30 21:05:32 +00:00
parent 0e50c16f6b
commit 10049ca6b0
3 changed files with 382 additions and 571 deletions
--- a/applications/lazstats/source/units/functionslib.pas
+++ b/applications/lazstats/source/units/functionslib.pas
@ -65,7 +65,7 @@ function factorial(x : integer) : integer;
 implementation

 uses
-  MathUnit;
+  Utils, MathUnit;

 function chisquaredprob(X : double; k : integer) : double;
 var
@ -1629,28 +1629,22 @@ begin
     end;
     Result := NoGood;
 end;
-//-------------------------------------------------------------------

+
+{ split a value into the whole and fractional parts}
 function WholeValue(value : double) : double;
-        { split a value into the whole and fractional parts}
-VAR
-        whole : double;
 begin
-        whole := Floor(value);
-        Result := whole;
+  Result := Floor(value);
 end;
-//---------------------------------------------------------------------------
-function FractionValue(value : double) : double;
-        { split a value into the whole and fractional parts }
-VAR
-        fraction : double;
-begin
-        fraction := value - Floor(value);
-        Result := fraction;
-end;
-//---------------------------------------------------------------------------

-Function Quartiles(TypeQ : integer; pcntile : double; N : integer;
+{ split a value into the whole and fractional parts }
+function FractionValue(value : double) : double;
+begin
+  Result := value - Floor(value);
+end;
+
+
+function Quartiles(TypeQ : integer; pcntile : double; N : integer;
         VAR values : DblDyneVec) : double;
 VAR
        whole, fraction, Myresult, np, avalue, avalue1 : double;
@ -1768,173 +1762,176 @@ begin
   result := p;
 end;

-function KolmogorovTest(na : integer; const a: DblDyneVec; nb: integer;
+{ Statistical test whether two one-dimensional sets of points are compatible
+  with coming from the same parent distribution, using the Kolmogorov test.
+  That is, it is used to compare two experimental distributions of unbinned data.
+
+  Input:
+    a,b: One-dimensional arrays of length na, nb, respectively.
+         The elements of a and b must be given in ascending order.
+    option is a character string to specify options
+         "D" Put out a line of "Debug" printout
+         "M" Return the Maximum Kolmogorov distance instead of prob
+
+  Output:
+    The returned value prob is a calculated confidence level which gives a
+    statistical test for compatibility of a and b.
+    Values of prob close to zero are taken as indicating a small probability
+    of compatibility. For two point sets drawn randomly from the same parent
+    distribution, the value of prob should be uniformly distributed between
+    zero and one.
+
+    in case of error the function return -1
+
+  If the 2 sets have a different number of points, the minimum of
+  the two sets is used.
+
+  Method:
+    The Kolmogorov test is used. The test statistic is the maximum deviation
+    between the two integrated distribution functions, multiplied by the
+    normalizing factor (rdmax*sqrt(na*nb/(na+nb)).
+
+    Code adapted by Rene Brun from CERNLIB routine TKOLMO (Fred James)
+     (W.T. Eadie, D. Drijard, F.E. James, M. Roos and B. Sadoulet,
+        Statistical Methods in Experimental Physics, (North-Holland,
+        Amsterdam 1971) 269-271)
+
+  Method Improvement by Jason A Detwiler (JADetwiler@lbl.gov)
+  -----------------------------------------------------------
+   The nuts-and-bolts of the TMath::KolmogorovTest() algorithm is a for-loop
+   over the two sorted arrays a and b representing empirical distribution
+   functions. The for-loop handles 3 cases: when the next points to be
+   evaluated satisfy a>b, a<b, or a=b:
+
+   For the last case, a=b, the algorithm advances each array by one index in an
+   attempt to move through the equality. However, this is incorrect when one or
+   the other of a or b (or both) have a repeated value, call it x. For the KS
+   statistic to be computed properly, rdiff needs to be calculated after all of
+   the a and b at x have been tallied (this is due to the definition of the
+   empirical distribution function; another way to convince yourself that the
+   old CERNLIB method is wrong is that it implies that the function defined as the
+   difference between a and b is multi-valued at x -- besides being ugly, this
+   would invalidate Kolmogorov's theorem).
+
+  NOTE1
+    A good description of the Kolmogorov test can be seen at:
+      http://www.itl.nist.gov/div898/handbook/eda/section3/eda35g.htm
+}
+function KolmogorovTest(na: integer; const a: DblDyneVec; nb: integer;
         const b: DblDyneVec; option: String; AReport: TStrings): double;
-VAR
-   prob : double;
-   opt : string;
-   rna : double; // = na;
-   rnb : double; // = nb;
-   sa : double;  // = 1./rna;
-   sb : double;  // = 1./rnb;
-   rdiff, rdmax, x, z : double;
-   i, ia, ib : integer;
-   ok : boolean;
+var
+  prob: double;
+  opt: string;
+  rna: double; // = na;
+  rnb: double; // = nb;
+  sa: double;  // = 1./rna;
+  sb: double;  // = 1./rnb;
+  rdiff, rdmax, x, z: double;
+  i, ia, ib: integer;
+  ok: boolean;
 begin
-//  Statistical test whether two one-dimensional sets of points are compatible
-//  with coming from the same parent distribution, using the Kolmogorov test.
-//  That is, it is used to compare two experimental distributions of unbinned data.
-//
-//  Input:
-//  a,b: One-dimensional arrays of length na, nb, respectively.
-//       The elements of a and b must be given in ascending order.
-//  option is a character string to specify options
-//         "D" Put out a line of "Debug" printout
-//         "M" Return the Maximum Kolmogorov distance instead of prob
-//
-//  Output:
-//  The returned value prob is a calculated confidence level which gives a
-//  statistical test for compatibility of a and b.
-//  Values of prob close to zero are taken as indicating a small probability
-//  of compatibility. For two point sets drawn randomly from the same parent
-//  distribution, the value of prob should be uniformly distributed between
-//  zero and one.
-//
-//  in case of error the function return -1
-//  If the 2 sets have a different number of points, the minimum of
-//  the two sets is used.
-//
-//  Method:
-//  The Kolmogorov test is used. The test statistic is the maximum deviation
-//  between the two integrated distribution functions, multiplied by the
-//  normalizing factor (rdmax*sqrt(na*nb/(na+nb)).
-//
-//  Code adapted by Rene Brun from CERNLIB routine TKOLMO (Fred James)
-//   (W.T. Eadie, D. Drijard, F.E. James, M. Roos and B. Sadoulet,
-//      Statistical Methods in Experimental Physics, (North-Holland,
-//      Amsterdam 1971) 269-271)
-//
-//  Method Improvement by Jason A Detwiler (JADetwiler@lbl.gov)
-//  -----------------------------------------------------------
-//   The nuts-and-bolts of the TMath::KolmogorovTest() algorithm is a for-loop
-//   over the two sorted arrays a and b representing empirical distribution
-//   functions. The for-loop handles 3 cases: when the next points to be
-//   evaluated satisfy a>b, a<b, or a=b:
-//   For the last case, a=b, the algorithm advances each array by one index in an
-//   attempt to move through the equality. However, this is incorrect when one or
-//   the other of a or b (or both) have a repeated value, call it x. For the KS
-//   statistic to be computed properly, rdiff needs to be calculated after all of
-//   the a and b at x have been tallied (this is due to the definition of the
-//   empirical distribution function; another way to convince yourself that the
-//   old CERNLIB method is wrong is that it implies that the function defined as the
-//   difference between a and b is multi-valued at x -- besides being ugly, this
-//   would invalidate Kolmogorov's theorem).
-//
-//  NOTE1
-//  A good description of the Kolmogorov test can be seen at:
-//    http://www.itl.nist.gov/div898/handbook/eda/section3/eda35g.htm
+  opt := option;
+  prob := -1;

-   opt := option;
-//   opt.ToUpper();
+  { Require at least two points in each graph }
+  if (na <= 2) or (nb <= 2) then
+  begin
+    ErrorMsg('KolmogorovTest - Sets must have more than 2 points');
+    exit;
+  end;

-   prob := -1;
-//      Require at least two points in each graph
-   if (na <= 2) or (nb <= 2) then
-   begin
-      ShowMessage('KolmogorovTest - Sets must have more than 2 points');
-      exit;
-   end;
-//     Constants needed
-   rna := na;
-   rnb := nb;
-   sa  := 1./rna;
-   sb  := 1. / rnb;
-//     Starting values for main loop
-   if (a[0] < b[0]) then
-   begin
-      rdiff := -sa;
-      ia := 2;
-      ib := 1;
-   end
-   else
-   begin
-      rdiff := sb;
-      ib := 2;
-      ia := 1;
-   end;
-   rdmax := Abs(rdiff);
+  { Constants needed }
+  rna := na;
+  rnb := nb;
+  sa  := 1./rna;
+  sb  := 1. / rnb;

-//    Main loop over point sets to find max distance
-//    rdiff is the running difference, and rdmax the max.
-   ok := FALSE;
-   for i := 0 to na + nb - 1 do
-   begin
-      if (a[ia-1] < b[ib-1]) then
+  { Starting values for main loop }
+  if (a[0] < b[0]) then
+  begin
+    rdiff := -sa;
+    ia := 2;
+    ib := 1;
+  end
+  else
+  begin
+    rdiff := sb;
+    ib := 2;
+    ia := 1;
+  end;
+  rdmax := Abs(rdiff);
+
+  { Main loop over point sets to find max distance
+    rdiff is the running difference, and rdmax the max. }
+  ok := false;
+  for i := 0 to na + nb - 1 do
+  begin
+    if (a[ia-1] < b[ib-1]) then
+    begin
+      rdiff := rdiff - sa;
+      ia := ia + 1;
+      if (ia > na) then
      begin
-         rdiff := rdiff - sa;
-         ia := ia + 1;
-         if (ia > na) then
-         begin
-              ok := TRUE;
-              break;
-         end;
-      end
-      else if (a[ia-1] > b[ib-1]) then
-      begin
-         rdiff := rdiff + sb;
-         ib := ib + 1;
-         if (ib > nb) then
-         begin
-              ok := TRUE;
-              break;
-         end;
-      end
-      else
-      begin
-         x := a[ia-1];
-         while((a[ia-1] = x) and (ia <= na)) do
-         begin
-            rdiff := rdiff - sa;
-            ia := ia + 1;
-         end;
-         while ((b[ib-1] = x) and (ib <= nb)) do
-         begin
-            rdiff := rdiff + sb;
-            ib := ib + 1;
-         end;
-         if (ia > na) then
-         begin
-              ok := TRUE;
-              break;
-         end;
-         if (ib > nb) then
-         begin
-              ok := TRUE;
-              break;
-         end;
+        ok := true;
+        break;
      end;
-      rdmax := Max(rdmax,Abs(rdiff));
-   end;
-//    Should never terminate this loop with ok = kFALSE!
+    end
+    else if (a[ia-1] > b[ib-1]) then
+    begin
+      rdiff := rdiff + sb;
+      ib := ib + 1;
+      if (ib > nb) then
+      begin
+        ok := true;
+        break;
+      end;
+    end
+    else
+    begin
+      x := a[ia-1];
+      while((a[ia-1] = x) and (ia <= na)) do
+      begin
+        rdiff := rdiff - sa;
+        ia := ia + 1;
+      end;
+      while ((b[ib-1] = x) and (ib <= nb)) do
+      begin
+        rdiff := rdiff + sb;
+        ib := ib + 1;
+      end;
+      if (ia > na) then
+      begin
+        ok := true;
+        break;
+      end;
+      if (ib > nb) then
+      begin
+        ok := true;
+        break;
+      end;
+    end;
+    rdmax := Max(rdmax, abs(rdiff));
+  end;

-   if (ok) then
-   begin
-      rdmax := Max(rdmax,Abs(rdiff));
-      z := rdmax * Sqrt(rna * rnb / (rna + rnb));
-      prob := KolmogorovProb(z);
-   end;
+  { Should never terminate this loop with ok = FALSE! }
+  if ok then
+  begin
+    rdmax := Max(rdmax, abs(rdiff));
+    z := rdmax * Sqrt(rna * rnb / (rna + rnb));
+    prob := KolmogorovProb(z);
+  end;

-      // debug printout
-   if (opt = 'D') then
-      AReport.Add(' Kolmogorov Probability: %g, Max Dist: %g', [prob, rdmax]);
-   if(opt = 'M') then
-     result := rdmax
-   else
-     result := prob;
+  { debug printout }
+  if (opt = 'D') then
+    AReport.Add(' Kolmogorov Probability: %.5f, Max Dist: %.5f', [prob, rdmax]);
+
+  if (opt = 'M') then
+    Result := rdmax
+  else
+    Result := prob;
 end;

 (*     wp: moved to MathUnit for easier testing
-
 procedure poisson_cdf ( x : integer; a : double; VAR cdf : double );
 VAR
  i : integer;