ECG-Kit 1.0
(7,164 bytes)
%EMCLUST Expectation-Maximization clustering
%
% [LABELS,W_EM] = EMCLUST (A,W_CLUST,K,LABTYPE)
%
% INPUT
% A Dataset, possibly labeled
% W_CLUST Cluster model mapping, untrained (default: nmc)
% K Number of clusters (default: number of classes in A)
% LABTYPE Label type: 'crisp' or 'soft' (default: label type of A)
%
% OUTPUT
% LABELS Integer labels for the objects in A pointing to their cluster
% W_EM EM clustering mapping
%
% DESCRIPTION
% The untrained classifier mapping W_CLUST is used to update an initially
% labeled dataset A by iterating the following two steps:
% 1. Train W : W_EM = A*W_CLUST
% 2. Relabel A : A = prdataset(A,labeld(A*W_EM*classc))
% This is repeated until the labeling does not change anymore. The final
% crisp labeling is returned in LABELS. W_EM may be used for assigning new
% objects; the final classification matrix can be obtained as B = A*W_EM.
%
% If K is given, a random initialisation for K clusters is made and labels
% of A are neglected. If K is omitted the given labeling is used as
% initialisation.
%
% LABTYPE determines the type of labeling: 'crisp' or 'soft'. Default: label
% type of A. It is assumed W_CLUST can handle the LABTYPE requested.
% Only in case LABTYPE is 'soft' the traditional EM algorithm is followed.
% In case LABTYPE is 'crisp' EMCLUST follows a generalised k-means
% algorithm.
%
% SEE ALSO (<a href="http://37steps.com/prtools">PRTools Guide</a>)
% MAPPINGS, DATASETS, PRKMEANS, PRPROGRESS
% Copyright: R.P.W. Duin, r.p.w.duin@37steps.com
% Faculty EWI, Delft University of Technology
% P.O. Box 5031, 2600 GA Delft, The Netherlands
% $Id: emclust.m,v 1.9 2009/02/03 21:07:26 duin Exp $
function [new_lab,w_em] = emclust (a,w_clust,n,type)
	% Main routine. An initial labeling of A is constructed (either from a
	% KCENTRES run on (a subset of) the data, or from the labels already in
	% A) and then refined by alternately training W_CLUST on the current
	% labels and relabeling A with the trained mapping.
	%
	%   a        Dataset, label type 'crisp' or 'soft'
	%   w_clust  Untrained mapping (default: nmc)
	%   n        Number of clusters (default: number of classes in A, or 2
	%            if A has a single class)
	%   type     Label type to impose on A before clustering (default: keep)
	%
	%   new_lab  Final labels of the objects in A
	%   w_em     Mapping trained in the last iteration

	n_ini   = 500;   % Maximum size of subset to use for initialisation.
	epsilon = 1e-6;  % Stop when average labeling change drops below this.

	% Check arguments.
	if (nargin < 4)
		prwarning(3,'No label type specified, using label type of dataset A.');
		type = [];
	end
	if (nargin < 3) || isempty(n)
		prwarning(3,'No number of clusters specified, using number of classes in A.');
		n = [];
	end
	if (nargin < 2) || isempty(w_clust)
		prwarning(2,'No clustering mapping specified, assuming NMC.');
		w_clust = nmc;
	end
	isuntrained(w_clust);  % Assert that clustering mapping is untrained.

	% Determine number of clusters N and initialisation method.
	a = testdatasize(a);
	islabtype(a,'crisp','soft');
	[m,k,c] = getsize(a);
	rand_init = 1;
	if (isempty(n))
		if (c == 1)        % For one class, find two clusters.
			n = 2;
		else
			n = c;
			rand_init = 0;   % Use given classification as initialisation.
		end
	end
	if (n < 1), error('Number of clusters should be at least one.'); end
	if (n == 1), prwarning(4,'Clustering with 1 cluster is trivial.'); end

	% Set label type, if given.
	if ~isempty(type), a = setlabtype(a,type); end
	a = setprior(a,[]);  % make sure that priors will be deleted

	% Initialise by performing KCENTRES on...
	prwaitbar(2,'EM Clustering, initialization');
	prwaitbar(2,1);
	if (rand_init)
		if (m > n_ini)     % ... a random subset of A.
			prwarning(2,'Initializing by performing KCENTRES on a subset of %d samples.', n_ini);
			a_ini = +gendat(+a,n_ini);
		else
			prwarning(2,'Initializing by performing KCENTRES on the training set.');
			a_ini = +a;      % ... the entire set A.
		end
		not_found = 1;
		itern = 0;
		while (not_found)
			% try to find an initialisation with all class sizes > 1
			itern = itern + 1;
			if itern > 100
				error('Not possible to find desired number of components')
			end
			% add some noise to data to avoid problems
			% 50 trials
			assign = kcentres(+distm(a_ini.*(ones(size(a_ini))+0.001*randn(size(a_ini)))),n,50);
			% Train initial classifier on labels generated by KCENTRES and find
			% initial hard labels. Use NMC instead of W_CLUST to make sure that we
			% always have enough data to estimate the parameters.
			a_ini = prdataset(a_ini,assign);
			a_ini = setprior(a_ini,getprior(a_ini,0));
			d = a*(a_ini*nmc*classc);
			if (islabtype(a,'soft'))
				new_lab = +d;
				not_found = 0;
			else
				new_lab = d*labeld;
				if all(classsizes(prdataset(d,new_lab)) > 1)
					not_found = 0;
				end
			end
		end
		lablist_org = [];
	else
		% Remember the original label list and work on integer labels 1..c;
		% the original labels are substituted back at the end.
		lablist_org = getlablist(a);
		a = setlablist(a,[1:c]');
		new_lab = getlabels(a);  % Use given labeling.
	end

	% Ready for the work.
	iter = 0;
	change = 1;
	prwaitbar(2,2,'EM Clustering, EM loop')
	prwaitbar(100,['using ' getname(w_clust)]);
	if (islabtype(a,'soft'))
		a = setlabels(a,new_lab);
		a = setprior(a,getprior(a,0));
		lab = new_lab;
		while (change > epsilon)  % EM loop, run until labeling is stable.
			prwaitbar(100,100-100*exp(-iter/10));
			w_em = a*w_clust;       % 1. Train classifier, density output.
			b = a*(w_em*classc);    % 2. Assign probability to training samples.
			a = settargets(a,b);    % 3. Insert probabilities as new labels.
			change = mean(mean((+b-lab).^2));
			lab = +b;               % Keep the previous labeling as a plain matrix.
			iter = iter+1;
			if iter > 500
				prwarning(1,'emclust stopped after 500 iterations')
				change = 0;           % Forces the loop condition to fail.
			end
		end
		w_em = a*w_clust;         % 1. Compute classifier, crisp output.
		b = a*w_em;               % 2. Classify training samples.
		new_lab = labeld(b);      % 3. Insert classification as new labels.
	else  % crisp labels
		% Start from NaN so that the comparison below is true for every object
		% and the loop body runs at least once. This guarantees that W_EM is
		% assigned even if the initial labeling consists of ones only (e.g.
		% when a single cluster is requested).
		lab = nan(m,1);
		while (any(lab ~= new_lab))  % EM loop, run until labeling is stable.
			prwaitbar(100,100-100*exp(-iter/10));
			a = setlabels(a,new_lab);        % 0. Set labels and store old labels.
			a = setprior(a,getprior(a,0));   % Set priors to class frequencies
			lab = new_lab;
			a = remclass(a,1);               % demand class sizes > 2 objects
			itern = 0;
			while getsize(a,3) < n           % increase number of classes if necessary
				itern = itern + 1;
				if itern > 100
					error('Not possible to find desired number of components')
				end
				laba = getlablist(a);
				labmax = max(laba);
				N = classsizes(a);
				[Nmax,cmax] = max(N);          % find largest class
				aa = seldat(a,cmax);           % select just that one
				new_lab_aa = prkmeans(aa,2);   % split it by kmeans
				N1 = sum(new_lab_aa == 1);
				N2 = sum(new_lab_aa == 2);
				if (N1 > 1 && N2 > 1)          % use it if both classes have more than one sample
					J = findlabels(a,laba(cmax,:));
					% New labels are offset by the current maximum label so they
					% do not collide with existing ones.
					a = setlabels(a,new_lab_aa + labmax,J);
				end
			end
			w_em = a*w_clust;     % 1. Compute classifier, crisp output.
			b = a*w_em;           % 2. Classify training samples.
			new_lab = labeld(b);  % 3. Insert classification as new labels.
			iter = iter+1;        %DXD Added also the iter for the crisp labels
			if iter > 50
				prwarning(1,'emclust stopped after 50 iterations')
				break;
			end
		end
	end
	prwaitbar(0)
	prwaitbar(0)

	if ~isempty(lablist_org)  % substitute original labels if desired
		% Select whole rows of the original label list: for a character label
		% list with more than one column, linear indexing would return single
		% characters instead of complete labels. For a numeric column list the
		% result is the same as before.
		new_lab = lablist_org(new_lab,:);
		wlab = getlabels(w_em);
		wlab = lablist_org(wlab,:);
		w_em = setlabels(w_em,wlab);
	end

return;