ECG-Kit 1.0

File: <base>/common/prtools/emclust.m (7,164 bytes)
%EMCLUST Expectation-Maximization clustering
%
%  [LABELS,W_EM] = EMCLUST (A,W_CLUST,K,LABTYPE)
%
% INPUT
%   A         Dataset, possibly labeled
%   W_CLUST   Cluster model mapping, untrained (default: nmc)
%   K         Number of clusters (default: number of classes in A)
%   LABTYPE   Label type: 'crisp' or 'soft' (default: label type of A)
%   
% OUTPUT
%   LABELS    Integer labels for the objects in A pointing to their cluster
%   W_EM      EM clustering mapping
%
% DESCRIPTION
% The untrained classifier mapping W_CLUST is used to update an initially
% labeled dataset A by iterating the following two steps:
%   1. Train W   :  W_EM = A*W_CLUST
%   2. Relabel A :  A    = prdataset(A,labeld(A*W_EM*classc))
% This is repeated until the labeling does not change anymore. The final
% classification matrix is returned in B. The final crisp labeling is returned
% in LABELS. W_EM may be used for assigning new objects.
%
% If K is given, a random initialisation for K clusters is made and labels
% of A are neglected. If K is omitted the given labeling is used as
% initailisation.
%
% LABTYPE determines the type of labeling: 'crisp' or 'soft'. Default: label
% type of A. It is assumed W_CLUST can handle the LABTYPE requested.
% Only in case LABTYPE is 'soft' the traditional EM algorithm is followed.
% In case LABTYPE is 'crisp' EMCLUST follows a generalised k-means
% algorithm.
%
% SEE ALSO (<a href="http://37steps.com/prtools">PRTools Guide</a>)
% MAPPINGS, DATASETS, PRKMEANS, PRPROGRESS

% Copyright: R.P.W. Duin, r.p.w.duin@37steps.com
% Faculty EWI, Delft University of Technology
% P.O. Box 5031, 2600 GA Delft, The Netherlands

% $Id: emclust.m,v 1.9 2009/02/03 21:07:26 duin Exp $

function [new_lab,w_em] = emclust (a,w_clust,n,type)

		n_ini		= 500;			% Maximum size of subset to use for initialisation.
	epsilon = 1e-6;			% Stop when average labeling change drops below this.

	% Check arguments.
	if (nargin < 5), fid = []; end % obsolete
	if (nargin < 4)
		prwarning(3,'No label type specified, using label type of dataset A.');
		type = []; 
	end
	if (nargin < 3) | isempty(n)
		prwarning(3,'No number of clusters specified, using number of classes in A.');
		n = []; 
	end
	if (nargin < 2) | isempty(w_clust)
		prwarning(2,'No clustering mapping specified, assuming NMC.');
		w_clust = nmc;   
	end

  isuntrained(w_clust);   % Assert that clustering mapping is untrained.

  % Determine number of clusters N and initialisation method.

	a = testdatasize(a); 
	islabtype(a,'crisp','soft');
	[m,k,c] = getsize(a); 
	rand_init = 1;
	if (isempty(n))
		if (c == 1)						% For one class, find two clusters.
			n = 2;
		else
			n = c;							
			rand_init = 0; 			% Use given classification as initialisation.
		end
	end

	if (n < 1),  error('Number of clusters should be at least one.'); end
	if (n == 1), prwarning(4,'Clustering with 1 cluster is trivial.'); end

	% Set label type, if given.

	if ~isempty(type), a = setlabtype(a,type); end
	a = setprior(a,[]); % make sure that priors will be deleted
	
	% Initialise by performing KCENTRES on...

	prwaitbar(2,'EM Clustering, initialization');
	prwaitbar(2,1);
	if (rand_init)
		if (m > n_ini)       						% ... a random subset of A.
      prwarning(2,'Initializing by performing KCENTRES on a subset of %d samples.', n_ini);
			a_ini = +gendat(+a,n_ini);	
		else
      prwarning(2,'Initializing by performing KCENTRES on the training set.');
			a_ini = +a;								% ... the entire set A.
		end
		not_found = 1;
		itern = 0;
		while(not_found)
			% try to find an initialisation with all class sizes > 1
			itern = itern + 1;
			if itern > 100
				error('Not possible to find desired number of components')
			end
			% add some noise to data to avoid problems
			% 50 trials
			assign  = kcentres(+distm(a_ini.*(ones(size(a_ini))+0.001*randn(size(a_ini)))),n,50);
		% Train initial classifier on labels generated by KCENTRES and find
		% initial hard labels. Use NMC instead of W_CLUST to make sure that we 
    % always have enough data to estimate the parameters.
			a_ini = prdataset(a_ini,assign); 
			a_ini = setprior(a_ini,getprior(a_ini,0));
			d = a*(a_ini*nmc*classc);
  		if (islabtype(a,'soft'))
				new_lab = +d;
				not_found = 0;
			else
				new_lab = d*labeld;
				if all(classsizes(prdataset(d,new_lab)) > 1)
					not_found = 0;
				end
			end
		end
		lablist_org = [];
	else
		lablist_org = getlablist(a);
		a = setlablist(a,[1:c]');
		new_lab = getlabels(a);		% Use given labeling.
	end

	% Ready for the work.
	iter = 0;
	change = 1;
	prwaitbar(2,2,'EM Clustering, EM loop')
	prwaitbar(100,['using ' getname(w_clust)]);
  if (islabtype(a,'soft'))
		a = setlabels(a,new_lab);
		a = setprior(a,getprior(a,0));
		laba = getlabels(a);
		lab = new_lab;
  	while (change > epsilon)       	% EM loop, run until labeling is stable.
			prwaitbar(100,100-100*exp(-iter/10));
  		w_em = a*w_clust;             % 1. Train classifier, density output.
  		b = a*(w_em*classc);          % 2. Assign probability to training samples.
  		a = settargets(a,b);          % 3. Insert probabilities as new labels.
  		change = mean(mean((+b-lab).^2)); lab = b;  
			iter = iter+1;
			if iter > 500
				prwarning(1,'emclust stopped after 500 iterations')
				change = 0;
			end
		end
  	w_em = a*w_clust;             % 1. Compute classifier, crisp output.
  	b = a*w_em;               		% 2. Classify training samples.
  	new_lab = labeld(b);      		% 3. Insert classification as new labels.
	else  % crisp labels
		lab = ones(m,1);
		while (any(lab ~= new_lab)) % EM loop, run until labeling is stable.
			prwaitbar(100,100-100*exp(-iter/10));
			a = setlabels(a,new_lab); 		% 0. Set labels and store old labels.
			a = setprior(a,getprior(a,0));%    Set priors to class frequencies
			lab = new_lab;								% 
			a = remclass(a,1);            %    demand class sizes > 2 objects
			itern = 0;
			while getsize(a,3) < n        %    increase number of classes if necessary
				itern = itern + 1;
				if itern > 100
					error('Not possible to find desired number of components')
				end
				laba = getlablist(a);
				labmax = max(laba);
				N = classsizes(a);
				[Nmax,cmax] = max(N);        % find largest class
				aa = seldat(a,cmax);         % select just that one
				new_lab_aa = prkmeans(aa,2);   % split it by kmeans
				N1 = sum(new_lab_aa == 1);   
				N2 = sum(new_lab_aa == 2);
				if (N1 > 1 & N2 > 1) % use it if both classes have more than one sample
					J = findlabels(a,laba(cmax,:));
					a = setlabels(a,new_lab_aa + labmax,J);
				end
			end
				
  		w_em = a*w_clust;             % 1. Compute classifier, crisp output.
  		b = a*w_em;               		% 2. Classify training samples.
  		new_lab = labeld(b);      		% 3. Insert classification as new labels.
			iter = iter+1;             %DXD Added also the iter for the crisp labels
			if iter > 50
				prwarning(1,'emclust stopped after 50 iterations')
				break;
			end
  	end
  end
	prwaitbar(0)
	prwaitbar(0)
	
	if ~isempty(lablist_org) % substitute original labels if desired
		new_lab = lablist_org(new_lab);
		wlab = getlabels(w_em);
		wlab = lablist_org(wlab);
		w_em = setlabels(w_em,wlab);
	end
		
return;