You've already forked lazarus-ccr
906 lines
27 KiB
ObjectPascal
906 lines
27 KiB
ObjectPascal
![]() |
unit KMeansUnit;
|
||
|
|
||
|
{$mode objfpc}{$H+}
|
||
|
|
||
|
interface
|
||
|
|
||
|
uses
|
||
|
Classes, SysUtils, FileUtil, LResources, Forms, Controls, Graphics, Dialogs,
|
||
|
StdCtrls, Buttons, ExtCtrls,
|
||
|
MainUnit, Globals, DataProcs, OutputUnit, ContextHelpUnit;
|
||
|
|
||
|
type
|
||
|
|
||
|
{ TKMeansFrm }
|
||
|
|
||
|
TKMeansFrm = class(TForm)
|
||
|
Bevel1: TBevel;
|
||
|
DescChkBox: TCheckBox;
|
||
|
HelpBtn: TButton;
|
||
|
Panel1: TPanel;
|
||
|
Panel2: TPanel;
|
||
|
VarInBtn: TBitBtn;
|
||
|
VarOutBtn: TBitBtn;
|
||
|
AllBtn: TBitBtn;
|
||
|
ResetBtn: TButton;
|
||
|
CancelBtn: TButton;
|
||
|
ComputeBtn: TButton;
|
||
|
ReturnBtn: TButton;
|
||
|
StdChkBox: TCheckBox;
|
||
|
RepChkBox: TCheckBox;
|
||
|
GroupBox1: TGroupBox;
|
||
|
ItersEdit: TEdit;
|
||
|
Label2: TLabel;
|
||
|
Label3: TLabel;
|
||
|
Label4: TLabel;
|
||
|
ListBox1: TListBox;
|
||
|
VarList: TListBox;
|
||
|
NoClustersEdit: TEdit;
|
||
|
Label1: TLabel;
|
||
|
procedure AllBtnClick(Sender: TObject);
|
||
|
procedure ComputeBtnClick(Sender: TObject);
|
||
|
procedure FormActivate(Sender: TObject);
|
||
|
procedure FormCreate(Sender: TObject);
|
||
|
procedure FormShow(Sender: TObject);
|
||
|
procedure HelpBtnClick(Sender: TObject);
|
||
|
procedure ResetBtnClick(Sender: TObject);
|
||
|
procedure VarInBtnClick(Sender: TObject);
|
||
|
procedure VarOutBtnClick(Sender: TObject);
|
||
|
private
|
||
|
{ private declarations }
|
||
|
FAutoSized: Boolean;
|
||
|
procedure KMNS(VAR A : DblDyneMat; M, N : integer;
|
||
|
VAR C : DblDyneMat; K : integer; VAR IC1 : IntDyneVec;
|
||
|
VAR IC2 : IntDyneVec; VAR NC : IntDyneVec;
|
||
|
VAR AN1 : DblDyneVec; VAR AN2 : DblDyneVec;
|
||
|
VAR NCP : IntDyneVec; VAR D : DblDyneVec;
|
||
|
VAR ITRAN : IntDyneVec; VAR LIVE : IntDyneVec;
|
||
|
ITER : integer; VAR WSS : DblDyneVec; IFAULT : integer);
|
||
|
procedure OPTRA(VAR A : DblDyneMat; M, N : integer;
|
||
|
VAR C : DblDyneMat; K : integer;
|
||
|
VAR IC1 : IntDyneVec; VAR IC2 : IntDyneVec;
|
||
|
VAR NC : IntDyneVec; VAR AN1 : DblDyneVec;
|
||
|
VAR AN2 : DblDyneVec; VAR NCP : IntDyneVec;
|
||
|
VAR D : DblDyneVec; VAR ITRAN : IntDyneVec;
|
||
|
VAR LIVE : IntDyneVec; INDX : integer);
|
||
|
procedure QTRAN(VAR A : DblDyneMat; M, N : integer;
|
||
|
VAR C : DblDyneMat; K : integer;
|
||
|
VAR IC1 : IntDyneVec; VAR IC2 : IntDyneVec;
|
||
|
VAR NC : IntDyneVec; VAR AN1 : DblDyneVec;
|
||
|
VAR AN2 : DblDyneVec; VAR NCP : IntDyneVec;
|
||
|
VAR D : DblDyneVec; VAR ITRAN : IntDyneVec;
|
||
|
INDX : integer);
|
||
|
|
||
|
public
|
||
|
{ public declarations }
|
||
|
end;
|
||
|
|
||
|
var
|
||
|
KMeansFrm: TKMeansFrm;
|
||
|
|
||
|
implementation
|
||
|
|
||
|
uses
|
||
|
Math;
|
||
|
|
||
|
{ TKMeansFrm }
|
||
|
|
||
|
procedure TKMeansFrm.ResetBtnClick(Sender: TObject);
|
||
|
VAR cellstring : string;
|
||
|
i : integer;
|
||
|
begin
|
||
|
VarList.Clear;
|
||
|
ListBox1.Clear;
|
||
|
for i := 1 to NoVariables do
|
||
|
begin
|
||
|
cellstring := OS3MainFrm.DataGrid.Cells[i,0];
|
||
|
VarList.Items.Add(cellstring);
|
||
|
end;
|
||
|
RepChkBox.Checked := false;
|
||
|
StdChkBox.Checked := true;
|
||
|
VarOutBtn.Enabled := false;
|
||
|
DescChkBox.Checked := false;
|
||
|
NoClustersEdit.Text := '';
|
||
|
ItersEdit.Text := '100';
|
||
|
end;
|
||
|
|
||
|
procedure TKMeansFrm.VarInBtnClick(Sender: TObject);
|
||
|
VAR
|
||
|
index, count, i : integer;
|
||
|
cellstring : string;
|
||
|
begin
|
||
|
count := 0;
|
||
|
index := ListBox1.Items.Count;
|
||
|
for i := 0 to index - 1 do
|
||
|
begin
|
||
|
if (VarList.Selected[i]) then
|
||
|
begin
|
||
|
cellstring := VarList.Items.strings[i];
|
||
|
ListBox1.Items.Add(cellstring);
|
||
|
count := count + 1;
|
||
|
end;
|
||
|
end;
|
||
|
|
||
|
while (count > 0) do
|
||
|
begin
|
||
|
for i := 0 to VarList.Items.Count - 1 do
|
||
|
begin
|
||
|
if (VarList.Selected[i]) then
|
||
|
begin
|
||
|
VarList.Items.Delete(i);
|
||
|
count := count - 1;
|
||
|
end;
|
||
|
end;
|
||
|
end;
|
||
|
VarOutBtn.Enabled := true;
|
||
|
end;
|
||
|
|
||
|
procedure TKMeansFrm.VarOutBtnClick(Sender: TObject);
|
||
|
VAR index : integer;
|
||
|
cellstring : string;
|
||
|
begin
|
||
|
index := ListBox1.ItemIndex;
|
||
|
cellstring := ListBox1.Items.Strings[index];
|
||
|
VarList.Items.Add(cellstring);
|
||
|
ListBox1.Items.Delete(index);
|
||
|
end;
|
||
|
|
||
|
procedure TKMeansFrm.FormActivate(Sender: TObject);
|
||
|
var
|
||
|
w: Integer;
|
||
|
begin
|
||
|
if FAutoSized then
|
||
|
exit;
|
||
|
|
||
|
w := MaxValue([HelpBtn.Width, ResetBtn.Width, CancelBtn.Width, ComputeBtn.Width, ReturnBtn.Width]);
|
||
|
HelpBtn.Constraints.MinWidth := w;
|
||
|
ResetBtn.Constraints.MinWidth := w;
|
||
|
CancelBtn.Constraints.MinWidth := w;
|
||
|
ComputeBtn.Constraints.MinWidth := w;
|
||
|
ReturnBtn.Constraints.MinWidth := w;
|
||
|
|
||
|
Constraints.MinWidth := Width;
|
||
|
Constraints.MinHeight := Height;
|
||
|
|
||
|
FAutoSized := true;
|
||
|
end;
|
||
|
|
||
|
procedure TKMeansFrm.FormCreate(Sender: TObject);
|
||
|
begin
|
||
|
Assert(OS3MainFrm <> nil);
|
||
|
if OutputFrm = nil then
|
||
|
Application.CreateForm(TOutputFrm, OutputFrm);
|
||
|
end;
|
||
|
|
||
|
procedure TKMeansFrm.FormShow(Sender: TObject);
|
||
|
begin
|
||
|
ResetBtnClick(self);
|
||
|
end;
|
||
|
|
||
|
procedure TKMeansFrm.HelpBtnClick(Sender: TObject);
|
||
|
begin
|
||
|
if ContextHelpForm = nil then
|
||
|
Application.CreateForm(TContextHelpForm, ContextHelpForm);
|
||
|
ContextHelpForm.HelpMessage((Sender as TButton).Tag);
|
||
|
end;
|
||
|
|
||
|
procedure TKMeansFrm.AllBtnClick(Sender: TObject);
|
||
|
VAR
|
||
|
index, noitems : integer;
|
||
|
cellstring : string;
|
||
|
begin
|
||
|
noitems := VarList.Items.Count;
|
||
|
for index := 0 to noitems - 1 do
|
||
|
begin
|
||
|
cellstring := VarList.Items.Strings[index];
|
||
|
ListBox1.Items.Add(cellstring);
|
||
|
end;
|
||
|
VarList.Clear;
|
||
|
VarOutBtn.Enabled := true;
|
||
|
end;
|
||
|
|
||
|
procedure TKMeansFrm.ComputeBtnClick(Sender: TObject);
|
||
|
VAR
|
||
|
i, j, L, Ncols, N, M, K,IFAULT, ITER, col : integer;
|
||
|
center, itemp : integer;
|
||
|
IC1, IC2, NC, NCP, ITRAN, LIVE, ColSelected : IntDyneVec;
|
||
|
A, C : DblDyneMat;
|
||
|
D, AN1, AN2, WSS, DT : DblDyneVec;
|
||
|
cellstring: string;
|
||
|
outline : string;
|
||
|
strval : string;
|
||
|
varlabels, rowlabels : StrDyneVec;
|
||
|
Mean, stddev : double;
|
||
|
label cleanup;
|
||
|
|
||
|
begin
|
||
|
Ncols := ListBox1.Items.Count;
|
||
|
if (Ncols <= 0) then
|
||
|
begin
|
||
|
ShowMessage('ERROR! No variables selected to cluster.');
|
||
|
exit;
|
||
|
end;
|
||
|
|
||
|
N := Ncols;
|
||
|
M := NoCases;
|
||
|
K := StrToInt(NoClustersEdit.Text);
|
||
|
IFAULT := 0;
|
||
|
ITER := StrToInt(ItersEdit.Text);
|
||
|
|
||
|
SetLength(varlabels,Ncols);
|
||
|
SetLength(rowlabels,NoCases);
|
||
|
SetLength(ColSelected,Ncols);
|
||
|
SetLength(A,M+1,N+1);
|
||
|
SetLength(C,K+1,N+1);
|
||
|
SetLength(D,M+1);
|
||
|
SetLength(AN1,K+1);
|
||
|
SetLength(AN2,K+1);
|
||
|
SetLength(WSS,K+1);
|
||
|
SetLength(DT,3);
|
||
|
SetLength(IC1,M+1);
|
||
|
SetLength(IC2,M+1);
|
||
|
SetLength(NC,K+1);
|
||
|
SetLength(NCP,K+1);
|
||
|
SetLength(ITRAN,K+1);
|
||
|
SetLength(LIVE,K+1);
|
||
|
|
||
|
if (K <= 0) then
|
||
|
begin
|
||
|
ShowMessage('ERROR! You must enter the desired number of clusters.');
|
||
|
goto cleanup;
|
||
|
end;
|
||
|
|
||
|
// initialize arrays
|
||
|
for i := 1 to K do
|
||
|
begin
|
||
|
AN1[i] := 0.0;
|
||
|
AN2[i] := 0.0;
|
||
|
WSS[i] := 0.0;
|
||
|
NC[i] := 0;
|
||
|
NCP[i] := 0;
|
||
|
ITRAN[i] := 0;
|
||
|
LIVE[i] := 0;
|
||
|
for j := 1 to N do C[i,j] := 0.0;
|
||
|
end;
|
||
|
for i := 1 to M do
|
||
|
begin
|
||
|
IC1[i] := 0;
|
||
|
IC2[i] := 0;
|
||
|
D[i] := 0.0;
|
||
|
end;
|
||
|
|
||
|
//Get labels and columns of selected variables
|
||
|
for i := 0 to Ncols - 1 do
|
||
|
begin
|
||
|
cellstring := ListBox1.Items.Strings[i];
|
||
|
for j := 0 to NoVariables - 1 do
|
||
|
begin
|
||
|
if (cellstring = OS3MainFrm.DataGrid.Cells[j+1,0]) then
|
||
|
begin
|
||
|
varlabels[i] := cellstring;
|
||
|
ColSelected[i] := j+1;
|
||
|
end;
|
||
|
end;
|
||
|
end;
|
||
|
|
||
|
// Get labels of rows
|
||
|
for i := 0 to NoCases - 1 do rowlabels[i] := OS3MainFrm.DataGrid.Cells[0,i+1];
|
||
|
|
||
|
// read the data
|
||
|
for i := 1 to M do
|
||
|
begin
|
||
|
if (NOT GoodRecord(i,N,ColSelected)) then continue;
|
||
|
for j := 1 to N do
|
||
|
begin
|
||
|
col := ColSelected[j-1];
|
||
|
A[i,j] := StrToFloat(OS3MainFrm.DataGrid.Cells[col,i]);
|
||
|
end;
|
||
|
end;
|
||
|
|
||
|
OutputFrm.RichEdit.Clear;
|
||
|
OutputFrm.RichEdit.Lines.Add('K-Means Clustering. Adapted from AS 136 APPL. STATIST. (1979) VOL.28, NO.1');
|
||
|
OutputFrm.RichEdit.Lines.Add('');
|
||
|
outline := format('File := %s',[OS3MainFrm.FileNameEdit.Text]);
|
||
|
OutputFrm.RichEdit.Lines.Add(outline);
|
||
|
outline := format('No. Cases := %d, No. Variables := %d, No. Clusters := %d',[M,N,K]);
|
||
|
OutputFrm.RichEdit.Lines.Add(outline);
|
||
|
OutputFrm.RichEdit.Lines.Add('');
|
||
|
|
||
|
// transform to z scores if needed
|
||
|
if (StdChkBox.Checked = true) then
|
||
|
begin
|
||
|
for j := 1 to N do
|
||
|
begin
|
||
|
Mean := 0.0;
|
||
|
stddev := 0.0;
|
||
|
for i := 1 to M do
|
||
|
begin
|
||
|
Mean := Mean + A[i,j];
|
||
|
stddev := stddev + (A[i,j] * A[i,j]);
|
||
|
end;
|
||
|
stddev := stddev - Mean * Mean / M;
|
||
|
stddev := stddev / (M - 1);
|
||
|
Mean := Mean / M;
|
||
|
if (DescChkBox.Checked) then
|
||
|
begin
|
||
|
outline := format('Mean := %8.3f, Std.Dev. := %8.3f for %s',[Mean,stddev,varlabels[j-1]]);
|
||
|
OutputFrm.RichEdit.Lines.Add(outline);
|
||
|
end;
|
||
|
for i := 1 to M do
|
||
|
begin
|
||
|
A[i,j] := (A[i,j] - Mean) / stddev;
|
||
|
if (RepChkBox.Checked = true) then
|
||
|
begin
|
||
|
col := ColSelected[j-1];
|
||
|
outline := format('%8.5f',[A[i,j]]);
|
||
|
OS3MainFrm.DataGrid.Cells[col,i] := outline;
|
||
|
end;
|
||
|
end;
|
||
|
end;
|
||
|
end;
|
||
|
|
||
|
// Now enter initial points
|
||
|
for L := 1 to K do
|
||
|
begin
|
||
|
center := 1 + (L-1) * (M div K); // initial cluster center
|
||
|
for j := 1 to N do C[L,j] := A[center,j];
|
||
|
end;
|
||
|
|
||
|
// do analysis
|
||
|
KMNS(A,M,N,C,K,IC1,IC2,NC,AN1,AN2,NCP,D,ITRAN,LIVE,ITER,WSS,IFAULT);
|
||
|
|
||
|
// show results
|
||
|
|
||
|
// sort subjects by cluster
|
||
|
for i := 1 to M do IC2[i] := i; // store ids in here
|
||
|
for i := 1 to M - 1 do
|
||
|
begin
|
||
|
for j := i+1 to M do
|
||
|
begin
|
||
|
if (IC1[i] > IC1[j]) then // swap these clusters and ids
|
||
|
begin
|
||
|
itemp := IC1[i];
|
||
|
IC1[i] := IC1[j];
|
||
|
IC1[j] := itemp;
|
||
|
itemp := IC2[i];
|
||
|
IC2[i] := IC2[j];
|
||
|
IC2[j] := itemp;
|
||
|
end;
|
||
|
end;
|
||
|
end;
|
||
|
|
||
|
OutputFrm.RichEdit.Lines.Add('');
|
||
|
OutputFrm.RichEdit.Lines.Add('NUMBER OF SUBJECTS IN EACH CLUSTER');
|
||
|
for i := 1 to K do
|
||
|
begin
|
||
|
outline := format('Cluster := %d with %d cases.',[i,NC[i]]);
|
||
|
OutputFrm.RichEdit.Lines.Add(outline);
|
||
|
end;
|
||
|
|
||
|
OutputFrm.RichEdit.Lines.Add('');
|
||
|
OutputFrm.RichEdit.Lines.Add('PLACEMENT OF SUBJECTS IN CLUSTERS');
|
||
|
OutputFrm.RichEdit.Lines.Add('CLUSTER SUBJECT');
|
||
|
for i := 1 to M do
|
||
|
begin
|
||
|
outline := format(' %3d %3d',[IC1[i],IC2[i]]);
|
||
|
OutputFrm.RichEdit.Lines.Add(outline);
|
||
|
end;
|
||
|
|
||
|
OutputFrm.RichEdit.Lines.Add('');
|
||
|
OutputFrm.RichEdit.Lines.Add('AVERAGE VARIABLE VALUES BY CLUSTER');
|
||
|
outline := ' VARIABLES';
|
||
|
OutputFrm.RichEdit.Lines.Add(outline);
|
||
|
outline := 'CLUSTER';
|
||
|
for j := 1 to N do
|
||
|
begin
|
||
|
strval := format(' %3d ',[j]);
|
||
|
outline := outline + strval;
|
||
|
end;
|
||
|
OutputFrm.RichEdit.Lines.Add(outline);
|
||
|
OutputFrm.RichEdit.Lines.Add(' ');
|
||
|
for i := 1 to K do
|
||
|
begin
|
||
|
outline := format(' %3d ',[i]);
|
||
|
for j := 1 to N do
|
||
|
begin
|
||
|
strval := format('%5.2f ',[C[i,j]]);
|
||
|
outline := outline + strval;
|
||
|
end;
|
||
|
OutputFrm.RichEdit.Lines.Add(outline);
|
||
|
end;
|
||
|
OutputFrm.RichEdit.Lines.Add('');
|
||
|
OutputFrm.RichEdit.Lines.Add('WITHIN CLUSTER SUMS OF SQUARES');
|
||
|
for i := 1 to K do
|
||
|
begin
|
||
|
outline := format('Cluster %d := %6.3f',[i,WSS[i]]);
|
||
|
OutputFrm.RichEdit.Lines.Add(outline);
|
||
|
end;
|
||
|
|
||
|
OutputFrm.ShowModal;
|
||
|
|
||
|
// cleanup
|
||
|
cleanup:
|
||
|
LIVE := nil;
|
||
|
ITRAN := nil;
|
||
|
NCP := nil;
|
||
|
NC := nil;
|
||
|
IC2 := nil;
|
||
|
IC1 := nil;
|
||
|
DT := nil;
|
||
|
WSS := nil;
|
||
|
AN2 := nil;
|
||
|
AN1 := nil;
|
||
|
D := nil;
|
||
|
C := nil;
|
||
|
A := nil;
|
||
|
ColSelected := nil;
|
||
|
rowlabels := nil;
|
||
|
varlabels := nil;
|
||
|
end;
|
||
|
|
||
|
procedure TKMeansFrm.KMNS(VAR A : DblDyneMat; M, N : integer;
|
||
|
VAR C : DblDyneMat; K : integer; VAR IC1 : IntDyneVec;
|
||
|
VAR IC2 : IntDyneVec; VAR NC : IntDyneVec;
|
||
|
VAR AN1 : DblDyneVec; VAR AN2 : DblDyneVec;
|
||
|
VAR NCP : IntDyneVec; VAR D : DblDyneVec;
|
||
|
VAR ITRAN : IntDyneVec; VAR LIVE : IntDyneVec;
|
||
|
ITER : integer; VAR WSS : DblDyneVec; IFAULT : integer);
|
||
|
VAR
|
||
|
DT : array[0..2] of double;
|
||
|
BIG : double;
|
||
|
ZERO : double;
|
||
|
ONE : double;
|
||
|
DA, DB, DC, TEMP, AA : double;
|
||
|
L, II, INDX, I, J, IL, IJ : integer;
|
||
|
label cont50, cont40, cont150;
|
||
|
|
||
|
begin
|
||
|
// SUBROUTINE KMNS(A, M, N, C, K, IC1, IC2, NC, AN1, AN2, NCP, D,
|
||
|
// * ITRAN, LIVE, ITER, WSS, IFAULT)
|
||
|
//
|
||
|
// ALGORITHM AS 136 APPL. STATIST. (1979) VOL.28, NO.1
|
||
|
// Divide M points in N-dimensional space into K clusters so that
|
||
|
// the within cluster sum of squares is minimized.
|
||
|
//
|
||
|
// INTEGER IC1(M), IC2(M), NC(K), NCP(K), ITRAN(K), LIVE(K)
|
||
|
// REAL A(M,N), D(M), C(K,N), AN1(K), AN2(K), WSS(K), DT(2)
|
||
|
// REAL ZERO, ONE
|
||
|
//
|
||
|
// Define BIG to be a very large positive number
|
||
|
//
|
||
|
// DATA BIG /1.E30/, ZERO /0.0/, ONE /1.0/
|
||
|
//
|
||
|
BIG := 1.0e30;
|
||
|
ZERO := 0.0;
|
||
|
ONE := 1.0;
|
||
|
IFAULT := 3;
|
||
|
if ((K <= 1) or (K >= M)) then
|
||
|
begin
|
||
|
ShowMessage('The no. of clusters must be less than the no. of variables.');
|
||
|
exit;
|
||
|
end;
|
||
|
|
||
|
// For each point I, find its two closest centres, IC1(I) and
|
||
|
// IC2(I). Assign it to IC1(I).
|
||
|
//
|
||
|
for I := 1 to M do
|
||
|
begin
|
||
|
IC1[I] := 1;
|
||
|
IC2[I] := 2;
|
||
|
for IL := 1 to 2 do
|
||
|
begin
|
||
|
DT[IL] := ZERO;
|
||
|
for J := 1 to N do
|
||
|
begin
|
||
|
DA := A[I,J] - C[IL,J];
|
||
|
DT[IL] := DT[IL] + (DA * DA); //(squared difference for this comparison)
|
||
|
end; // 10 CONTINUE
|
||
|
end; // 10 CONTINUE
|
||
|
if (DT[1] > DT[2]) then // THEN swap
|
||
|
begin
|
||
|
IC1[I] := 2;
|
||
|
IC2[I] := 1;
|
||
|
TEMP := DT[1];
|
||
|
DT[1] := DT[2];
|
||
|
DT[2] := TEMP;
|
||
|
end; // END IF
|
||
|
for L := 3 to K do // (remaining clusters)
|
||
|
begin
|
||
|
DB := ZERO;
|
||
|
for J := 1 to N do // (variables)
|
||
|
begin
|
||
|
DC := A[I,J] - C[L,J];
|
||
|
DB := DB + DC * DC;
|
||
|
if (DB >= DT[2]) then goto cont50;
|
||
|
end;
|
||
|
if (DB < DT[1]) then goto cont40;
|
||
|
DT[2] := DB;
|
||
|
IC2[I] := L;
|
||
|
goto cont50;
|
||
|
cont40: DT[2] := DT[1];
|
||
|
IC2[I] := IC1[I];
|
||
|
DT[1] := DB;
|
||
|
IC1[I] := L;
|
||
|
cont50: end;
|
||
|
end; // 50 CONTINUE (next case)
|
||
|
|
||
|
// Update cluster centres to be the average of points contained
|
||
|
// within them.
|
||
|
//
|
||
|
for L := 1 to K do // (clusters)
|
||
|
begin
|
||
|
NC[L] := 0;
|
||
|
for J := 1 to N do C[L,J] := ZERO; //(initialize clusters)
|
||
|
end;
|
||
|
for I := 1 to M do // (subjects)
|
||
|
begin
|
||
|
L := IC1[I]; // which cluster the Ith case is in
|
||
|
NC[L] := NC[L] + 1; // no. in the cluster L
|
||
|
for J := 1 to N do C[L,J] := C[L,J] + A[I,J]; // sum of var. values in the cluster L
|
||
|
end;
|
||
|
|
||
|
// Check to see if there is any empty cluster at this stage
|
||
|
//
|
||
|
for L := 1 to K do
|
||
|
begin
|
||
|
if (NC[L] = 0) then
|
||
|
begin
|
||
|
IFAULT := 1;
|
||
|
exit;
|
||
|
end;
|
||
|
AA := NC[L];
|
||
|
for J := 1 to N do C[L,J] := C[L,J] / AA; // average the values in the cluster
|
||
|
|
||
|
// Initialize AN1, AN2, ITRAN & NCP
|
||
|
// AN1(L) := NC(L) / (NC(L) - 1)
|
||
|
// AN2(L) := NC(L) / (NC(L) + 1)
|
||
|
// ITRAN(L) := 1 if cluster L is updated in the quick-transfer stage,
|
||
|
// := 0 otherwise
|
||
|
// In the optimal-transfer stage, NCP(L) stores the step at which
|
||
|
// cluster L is last updated.
|
||
|
// In the quick-transfer stage, NCP(L) stores the step at which
|
||
|
// cluster L is last updated plus M.
|
||
|
//
|
||
|
AN2[L] := AA / (AA + ONE);
|
||
|
AN1[L] := BIG;
|
||
|
if (AA > ONE) then AN1[L] := AA / (AA - ONE);
|
||
|
ITRAN[L] := 1;
|
||
|
NCP[L] := -1;
|
||
|
end;
|
||
|
INDX := 0;
|
||
|
for IJ := 1 to ITER do
|
||
|
begin
|
||
|
//
|
||
|
// In this stage, there is only one pass through the data. Each
|
||
|
// point is re-allocated, if necessary, to the cluster that will
|
||
|
// induce the maximum reduction in within-cluster sum of squares.
|
||
|
//
|
||
|
OPTRA(A, M, N, C, K, IC1, IC2, NC, AN1, AN2, NCP, D, ITRAN, LIVE, INDX);
|
||
|
//
|
||
|
// Stop if no transfer took place in the last M optimal transfer
|
||
|
// steps.
|
||
|
//
|
||
|
if (INDX = M) then goto cont150;
|
||
|
//
|
||
|
// Each point is tested in turn to see if it should be re-allocated
|
||
|
// to the cluster to which it is most likely to be transferred,
|
||
|
// IC2(I), from its present cluster, IC1(I). Loop through the
|
||
|
// data until no further change is to take place.
|
||
|
//
|
||
|
QTRAN(A, M, N, C, K, IC1, IC2, NC, AN1, AN2, NCP, D, ITRAN, INDX);
|
||
|
//
|
||
|
// If there are only two clusters, there is no need to re-enter the
|
||
|
// optimal transfer stage.
|
||
|
//
|
||
|
if (K = 2) then goto cont150;
|
||
|
//
|
||
|
// NCP has to be set to 0 before entering OPTRA.
|
||
|
//
|
||
|
for L := 1 to K do NCP[L] := 0;
|
||
|
end;
|
||
|
//
|
||
|
// Since the specified number of iterations has been exceeded, set
|
||
|
// IFAULT := 2. This may indicate unforeseen looping.
|
||
|
//
|
||
|
IFAULT := 2;
|
||
|
//
|
||
|
// Compute within-cluster sum of squares for each cluster.
|
||
|
//
|
||
|
cont150:
|
||
|
for L := 1 to K do
|
||
|
begin
|
||
|
WSS[L] := ZERO;
|
||
|
for J := 1 to N do C[L,J] := ZERO;
|
||
|
end;
|
||
|
for I := 1 to M do
|
||
|
begin
|
||
|
II := IC1[I];
|
||
|
for J := 1 to N do C[II,J] := C[II,J] + A[I,J];
|
||
|
end;
|
||
|
for J := 1 to N do
|
||
|
begin
|
||
|
for L := 1 to K do C[L,J] := C[L,J] / (NC[L]);
|
||
|
for I := 1 to M do
|
||
|
begin
|
||
|
II := IC1[I];
|
||
|
DA := A[I,J] - C[II,J];
|
||
|
WSS[II] := WSS[II] + DA * DA;
|
||
|
end;
|
||
|
end; // 190 CONTINUE
|
||
|
end;
|
||
|
|
||
|
|
||
|
procedure TKMeansFrm.OPTRA(VAR A : DblDyneMat; M, N : integer;
|
||
|
VAR C : DblDyneMat; K : integer;
|
||
|
VAR IC1 : IntDyneVec; VAR IC2 : IntDyneVec;
|
||
|
VAR NC : IntDyneVec; VAR AN1 : DblDyneVec;
|
||
|
VAR AN2 : DblDyneVec; VAR NCP : IntDyneVec;
|
||
|
VAR D : DblDyneVec; VAR ITRAN : IntDyneVec;
|
||
|
VAR LIVE : IntDyneVec; INDX : integer);
|
||
|
VAR
|
||
|
ZERO, ONE, BIG,DE, DF, DD, DC, DB, DA, R2, RR, AL1, AL2, ALT, ALW : double;
|
||
|
I, J, L, L1, L2, LL : integer;
|
||
|
label cont30, cont60, cont70, cont90;
|
||
|
|
||
|
begin
|
||
|
// SUBROUTINE OPTRA(A, M, N, C, K, IC1, IC2, NC, AN1, AN2, NCP, D,
|
||
|
// * ITRAN, LIVE, INDX)
|
||
|
//
|
||
|
// ALGORITHM AS 136.1 APPL. STATIST. (1979) VOL.28, NO.1
|
||
|
//
|
||
|
// This is the optimal transfer stage.
|
||
|
//
|
||
|
// Each point is re-allocated, if necessary, to the cluster that
|
||
|
// will induce a maximum reduction in the within-cluster sum of
|
||
|
// squares.
|
||
|
//
|
||
|
// INTEGER IC1(M), IC2(M), NC(K), NCP(K), ITRAN(K), LIVE(K)
|
||
|
// REAL A(M,N), D(M), C(K,N), AN1(K), AN2(K), ZERO, ONE
|
||
|
//
|
||
|
// Define BIG to be a very large positive number.
|
||
|
//
|
||
|
// DATA BIG /1.0E30/, ZERO /0.0/, ONE/1.0/
|
||
|
//
|
||
|
// If cluster L is updated in the last quick-transfer stage, it
|
||
|
// belongs to the live set throughout this stage. Otherwise, at
|
||
|
// each step, it is not in the live set if it has not been updated
|
||
|
// in the last M optimal transfer steps.
|
||
|
//
|
||
|
|
||
|
ZERO := 0.0;
|
||
|
ONE := 1.0;
|
||
|
BIG := 1.0e30;
|
||
|
|
||
|
for L := 1 to K do
|
||
|
begin
|
||
|
if (ITRAN[L] = 1) then LIVE[L] := M + 1;
|
||
|
end; // 10 CONTINUE
|
||
|
|
||
|
for I := 1 to M do
|
||
|
begin
|
||
|
INDX := INDX + 1;
|
||
|
L1 := IC1[I];
|
||
|
L2 := IC2[I];
|
||
|
LL := L2;
|
||
|
//
|
||
|
// If point I is the only member of cluster L1, no transfer.
|
||
|
//
|
||
|
if (NC[L1] = 1) then goto cont90; // GO TO 90
|
||
|
//
|
||
|
// If L1 has not yet been updated in this stage, no need to
|
||
|
// re-compute D(I).
|
||
|
//
|
||
|
if (NCP[L1] = 0) then goto cont30; // GO TO 30
|
||
|
DE := ZERO;
|
||
|
for J := 1 to N do
|
||
|
begin
|
||
|
DF := A[I,J] - C[L1,J];
|
||
|
DE := DE + DF * DF;
|
||
|
end;
|
||
|
D[I] := DE * AN1[L1];
|
||
|
//
|
||
|
// Find the cluster with minimum R2.
|
||
|
//
|
||
|
cont30:
|
||
|
DA := ZERO;
|
||
|
for J := 1 to N do
|
||
|
begin
|
||
|
DB := A[I,J] - C[L2,J];
|
||
|
DA := DA + DB * DB;
|
||
|
end;
|
||
|
R2 := DA * AN2[L2];
|
||
|
for L := 1 to K do
|
||
|
begin
|
||
|
//
|
||
|
// If I >:= LIVE(L1), then L1 is not in the live set. If this is
|
||
|
// true, we only need to consider clusters that are in the live set
|
||
|
// for possible transfer of point I. Otherwise, we need to consider
|
||
|
// all possible clusters.
|
||
|
//
|
||
|
if ((I >= LIVE[L1]) and (I >= LIVE[L]) or (L = L1) or (L = LL)) then goto cont60;
|
||
|
RR := R2 / AN2[L];
|
||
|
DC := ZERO;
|
||
|
for J := 1 to N do
|
||
|
begin
|
||
|
DD := A[I,J] - C[L,J];
|
||
|
DC := DC + DD * DD;
|
||
|
if (DC >= RR) then goto cont60;
|
||
|
end;
|
||
|
R2 := DC * AN2[L];
|
||
|
L2 := L;
|
||
|
cont60:
|
||
|
end; // 60 CONTINUE
|
||
|
if (R2 < D[I]) then goto cont70;
|
||
|
//
|
||
|
// If no transfer is necessary, L2 is the new IC2(I).
|
||
|
//
|
||
|
IC2[I] := L2;
|
||
|
goto cont90; // GO TO 90
|
||
|
//
|
||
|
// Update cluster centres, LIVE, NCP, AN1 & AN2 for clusters L1 and
|
||
|
// L2, and update IC1(I) & IC2(I).
|
||
|
//
|
||
|
cont70:
|
||
|
INDX := 0;
|
||
|
LIVE[L1] := M + I;
|
||
|
LIVE[L2] := M + I;
|
||
|
NCP[L1] := I;
|
||
|
NCP[L2] := I;
|
||
|
AL1 := NC[L1];
|
||
|
ALW := AL1 - ONE;
|
||
|
AL2 := NC[L2];
|
||
|
ALT := AL2 + ONE;
|
||
|
for J := 1 to N do
|
||
|
begin
|
||
|
C[L1,J] := (C[L1,J] * AL1 - A[I,J]) / ALW;
|
||
|
C[L2,J] := (C[L2,J] * AL2 + A[I,J]) / ALT;
|
||
|
end;
|
||
|
NC[L1] := NC[L1] - 1;
|
||
|
NC[L2] := NC[L2] + 1;
|
||
|
AN2[L1] := ALW / AL1;
|
||
|
AN1[L1] := BIG;
|
||
|
if (ALW > ONE) then AN1[L1] := ALW / (ALW - ONE);
|
||
|
AN1[L2] := ALT / AL2;
|
||
|
AN2[L2] := ALT / (ALT + ONE);
|
||
|
IC1[I] := L2;
|
||
|
IC2[I] := L1;
|
||
|
cont90:
|
||
|
// 90 CONTINUE
|
||
|
if (INDX = M) then exit;
|
||
|
end; // 100 CONTINUE
|
||
|
for L := 1 to K do
|
||
|
begin
|
||
|
//
|
||
|
// ITRAN(L) := 0 before entering QTRAN. Also, LIVE(L) has to be
|
||
|
// decreased by M before re-entering OPTRA.
|
||
|
//
|
||
|
ITRAN[L] := 0;
|
||
|
LIVE[L] := LIVE[L] - M;
|
||
|
end; // 110 CONTINUE
|
||
|
end;
|
||
|
|
||
|
procedure TKMeansFrm.QTRAN(VAR A : DblDyneMat; M, N : integer;
|
||
|
VAR C : DblDyneMat; K : integer;
|
||
|
VAR IC1 : IntDyneVec; VAR IC2 : IntDyneVec;
|
||
|
VAR NC : IntDyneVec; VAR AN1 : DblDyneVec;
|
||
|
VAR AN2 : DblDyneVec; VAR NCP : IntDyneVec;
|
||
|
VAR D : DblDyneVec; VAR ITRAN : IntDyneVec;
|
||
|
INDX : integer);
|
||
|
VAR
|
||
|
BIG, ZERO, ONE, DA, DB, DE, DD, R2, AL1, ALW, AL2, ALT : double;
|
||
|
I, J, ICOUN, ISTEP, L1, L2 : integer;
|
||
|
label cont10, cont30, cont60;
|
||
|
|
||
|
begin
|
||
|
// SUBROUTINE QTRAN(A, M, N, C, K, IC1, IC2, NC, AN1, AN2, NCP, D,
|
||
|
// * ITRAN, INDX)
|
||
|
//
|
||
|
// ALGORITHM AS 136.2 APPL. STATIST. (1979) VOL.28, NO.1
|
||
|
//
|
||
|
// This is the quick transfer stage.
|
||
|
// IC1(I) is the cluster which point I belongs to.
|
||
|
// IC2(I) is the cluster which point I is most likely to be
|
||
|
// transferred to.
|
||
|
// For each point I, IC1(I) & IC2(I) are switched, if necessary, to
|
||
|
// reduce within-cluster sum of squares. The cluster centres are
|
||
|
// updated after each step.
|
||
|
//
|
||
|
// INTEGER IC1(M), IC2(M), NC(K), NCP(K), ITRAN(K)
|
||
|
// REAL A(M,N), D(M), C(K,N), AN1(K), AN2(K), ZERO, ONE
|
||
|
//
|
||
|
// Define BIG to be a very large positive number
|
||
|
//
|
||
|
// DATA BIG /1.0E30/, ZERO /0.0/, ONE /1.0/
|
||
|
//
|
||
|
// In the optimal transfer stage, NCP(L) indicates the step at which
|
||
|
// cluster L is last updated. In the quick transfer stage, NCP(L)
|
||
|
// is equal to the step at which cluster L is last updated plus M.
|
||
|
//
|
||
|
BIG := 1.0e30;
|
||
|
ZERO := 0.0;
|
||
|
ONE := 1.0;
|
||
|
ICOUN := 0;
|
||
|
ISTEP := 0;
|
||
|
cont10:
|
||
|
for I := 1 to M do
|
||
|
begin
|
||
|
ICOUN := ICOUN + 1;
|
||
|
ISTEP := ISTEP + 1;
|
||
|
L1 := IC1[I];
|
||
|
L2 := IC2[I];
|
||
|
//
|
||
|
// If point I is the only member of cluster L1, no transfer.
|
||
|
//
|
||
|
if (NC[L1] = 1) then goto cont60;
|
||
|
//
|
||
|
// If ISTEP > NCP(L1), no need to re-compute distance from point I to
|
||
|
// cluster L1. Note that if cluster L1 is last updated exactly M
|
||
|
// steps ago, we still need to compute the distance from point I to
|
||
|
// cluster L1.
|
||
|
//
|
||
|
if (ISTEP > NCP[L1]) then goto cont30;
|
||
|
DA := ZERO;
|
||
|
for J := 1 to N do
|
||
|
begin
|
||
|
DB := A[I,J] - C[L1,J];
|
||
|
DA := DA + DB * DB;
|
||
|
end;
|
||
|
D[I] := DA * AN1[L1];
|
||
|
//
|
||
|
// If ISTEP >:= both NCP(L1) & NCP(L2) there will be no transfer of
|
||
|
// point I at this step.
|
||
|
//
|
||
|
cont30:
|
||
|
if ((ISTEP >= NCP[L1]) and (ISTEP >= NCP[L2])) then goto cont60;
|
||
|
R2 := D[I] / AN2[L2];
|
||
|
DD := ZERO;
|
||
|
for J := 1 to N do
|
||
|
begin
|
||
|
DE := A[I,J] - C[L2,J];
|
||
|
DD := DD + DE * DE;
|
||
|
if (DD >= R2) then goto cont60;
|
||
|
end; // 40 CONTINUE
|
||
|
//
|
||
|
// Update cluster centres, NCP, NC, ITRAN, AN1 & AN2 for clusters
|
||
|
// L1 & L2. Also update IC1(I) & IC2(I). Note that if any
|
||
|
// updating occurs in this stage, INDX is set back to 0.
|
||
|
//
|
||
|
ICOUN := 0;
|
||
|
INDX := 0;
|
||
|
ITRAN[L1] := 1;
|
||
|
ITRAN[L2] := 1;
|
||
|
NCP[L1] := ISTEP + M;
|
||
|
NCP[L2] := ISTEP + M;
|
||
|
AL1 := NC[L1];
|
||
|
ALW := AL1 - ONE;
|
||
|
AL2 := NC[L2];
|
||
|
ALT := AL2 + ONE;
|
||
|
for J := 1 to N do
|
||
|
begin
|
||
|
C[L1,J] := (C[L1,J] * AL1 - A[I,J]) / ALW;
|
||
|
C[L2,J] := (C[L2,J] * AL2 + A[I,J]) / ALT;
|
||
|
end; // 50 CONTINUE
|
||
|
NC[L1] := NC[L1] - 1;
|
||
|
NC[L2] := NC[L2] + 1;
|
||
|
AN2[L1] := ALW / AL1;
|
||
|
AN1[L1] := BIG;
|
||
|
if (ALW > ONE) then AN1[L1] := ALW / (ALW - ONE);
|
||
|
AN1[L2] := ALT / AL2;
|
||
|
AN2[L2] := ALT / (ALT + ONE);
|
||
|
IC1[I] := L2;
|
||
|
IC2[I] := L1;
|
||
|
//
|
||
|
// If no re-allocation took place in the last M steps, return.
|
||
|
//
|
||
|
cont60:
|
||
|
if (ICOUN = M) then exit;
|
||
|
end; // 70 CONTINUE
|
||
|
goto cont10;
|
||
|
end;
|
||
|
|
||
|
initialization
|
||
|
{$I kmeansunit.lrs}
|
||
|
|
||
|
end.
|
||
|
|