ECG-Kit 1.0

File: <base>/common/prtools_addins/emclust.m (10,616 bytes)
%EMCLUST Expectation-Maximization clustering
%
%  [LABELS,W_EM] = EMCLUST (A,W_CLUST,K,LABTYPE,FID)
%
% INPUT
%   A         Dataset, possibly labeled
%   W_CLUST   Cluster model mapping, untrained (default: nmc)
%   K         Number of clusters (default: 2)
%   LABTYPE   Label type: 'crisp' or 'soft' (default: label type of A)
%   FID       File ID to write progress to (default [], see PRPROGRESS)
%
% OUTPUT
%   LABELS    Integer labels for the objects in A pointing to their cluster
%   W_EM      EM clustering mapping
%
% DESCRIPTION
% The untrained classifier mapping W_CLUST is used to update an initially
% labeled dataset A by iterating the following two steps:
%   1. Train W   :  W_EM = A*W_CLUST
%   2. Relabel A :  A    = dataset(A,labeld(A*W_EM*classc))
% This is repeated until the labeling does not change anymore. The final
% classification matrix is returned in B. The final crisp labeling is returned
% in LABELS. W_EM may be used for assigning new objects.
%
% If K is given, a random initialisation for K clusters is made and labels
% of A are neglected. 
%
% LABTYPE determines the type of labeling: 'crisp' or 'soft'. Default: label
% type of A. It is assumed W_CLUST can handle the LABTYPE requested.
% Only in case LABTYPE is 'soft' the traditional EM algorithm is followed.
% In case LABTYPE is 'crisp' EMCLUST follows a generalised k-means
% algorithm.
%
% SEE ALSO
% MAPPINGS, DATASETS, KMEANS, PRPROGRESS

% Copyright: R.P.W. Duin, r.p.w.duin@prtools.org
% Faculty EWI, Delft University of Technology
% P.O. Box 5031, 2600 GA Delft, The Netherlands

% $Id: emclust.m,v 1.9 2009/02/03 21:07:26 duin Exp $

function [new_lab,w_em] = emclust (a,w_clust,n,type,fid)

	prtrace(mfilename);

	n_ini		= 500;			% Maximum size of subset to use for initialisation.
	epsilon = 1e-6;			% Stop when average labeling change drops below this.

	% Check arguments.
	if (nargin < 5), fid = []; end
	if (nargin < 4)
		prwarning(3,'No label type specified, using label type of dataset A.');
		type = []; 
	end
	if (nargin < 3) | isempty(n)
		prwarning(3,'No number of clusters specified, using number of classes in A.');
		n = []; 
	end
	if (nargin < 2) | isempty(w_clust)
		prwarning(2,'No clustering mapping specified, assuming NMC.');
		w_clust = nmc;   
	end

  isuntrained(w_clust);   % Assert that clustering mapping is untrained.

  % Determine number of clusters N and initialisation method.

	a = testdatasize(a); 
	islabtype(a,'crisp','soft');
	[m,k,c] = getsize(a); 
	rand_init = 1;
	if (isempty(n))
		if (c == 1)						% For one class, find two clusters.
			n = 2;
		else
			n = c;							
			rand_init = 0; 			% Use given classification as initialisation.
		end
	end

	if (n < 1),  error('Number of clusters should be at least one.'); end
	if (n == 1), prwarning(4,'Clustering with 1 cluster is trivial.'); end

	% Set label type, if given.

	if ~isempty(type), a = setlabtype(a,type); end
	a = setprior(a,[]); % make sure that priors will be deleted
	
	% Initialise by performing KCENTRES on...

	prwaitbar(2,'EM Clustering, initialization');
	prwaitbar(2,1);
	if (rand_init)
		
        not_found = 1;
		itern = 0;
		while(not_found)
			% try to find an initialisation with all class sizes > 1
            
            if (m > n_ini)       						% ... a random subset of A.
                prwarning(2,'Initializing by performing KCENTRES on a subset of %d samples.', n_ini);
    % 			a_ini = +gendat(+a,n_ini);	
                %sampleo uniformemente el feature-space
                a_ini = +a;
                a_ini = sum(abs(a_ini),2);
                [~, sort_idx ] = sort(a_ini);
                a_ini = +a(sort_idx(randsample(m, n_ini)),:);
            else
                prwarning(2,'Initializing by performing KCENTRES on the training set.');
                a_ini = +a;								% ... the entire set A.
            end
            
			itern = itern + 1;
			if itern > 3
				error('Not possible to find desired number of components')
			end
			% add some noise to data to avoid problems
			% 50 trials
            assign = zeros(1,n-1);
            std_a_ini = std(a_ini);
            jj = 1;
            iter_max = 50;
            
            while( jj <= iter_max )
                try
                    assign  = kcentres(+distm(a_ini.*(ones(size(a_ini))+ bsxfun(@times, max( [repmat(0.001,size(std_a_ini)); 0.01*std_a_ini] ), randn(size(a_ini))))),n,50);
                    if( length(unique(assign)) == n )
                        a_ini = prdataset(a_ini,assign); 
                        a_ini = setprior(a_ini,getprior(a_ini,0));
                        if( isvaldfile(a_ini,1,2) )
                            break 
                        end
                    end

                catch ME
%                     if( strcmpi(ME.message, 'kcentres fails as some objects are identical: add some noise'))
%                         std_a_ini = std_a_ini * 2;
%                     else
%                         rethrow(ME)
%                     end
                    error('Not possible to find desired number of components')
                end
                
                jj = jj + 1;
                
            end
            
            if( jj > iter_max )
                error('Not possible to find desired number of components')
            end

            
            % Train initial classifier on labels generated by KCENTRES and find
            % initial hard labels. Use NMC instead of W_CLUST to make sure that we 
            % always have enough data to estimate the parameters.
            
            try
                d = a*(a_ini*nmc);
            catch ME
                fprintf(2, 'Valid %d\n\n', isvaldfile(a_ini,1,2) )  
                error('Not possible to find desired number of components')
            end
            if (islabtype(a,'soft'))
				new_lab = +d;
				not_found = 0;
			else
				new_lab = d*labeld;
                cs_new_lab = classsizes(prdataset(d,new_lab));
				if length(cs_new_lab) == n && all( cs_new_lab > 1)
					not_found = 0;
				end
			end
		end
		lablist_org = [];
	else
		lablist_org = getlablist(a);
		a = setlablist(a,[1:c]');
		new_lab = getlabels(a);		% Use given labeling.
	end

	% Ready for the work.
	iter = 0;
	change = 1;
	prwaitbar(2,2,'EM Clustering, EM loop')
	prwaitbar(100,['using ' getname(w_clust)]);
  if (islabtype(a,'soft'))
		prprogress(fid,'emclust optim: iter, change (mse):\n');
		prprogress(fid,' %i, %f \n',0,0);
		a = setlabels(a,new_lab);
		a = setprior(a,getprior(a,0));
		laba = getlabels(a);
		lab = new_lab;
  	while (change > epsilon)       	% EM loop, run until labeling is stable.
			prwaitbar(100,100-100*exp(-iter/10));
  		w_em = a*w_clust;             % 1. Train classifier, density output.
  		b = a*(w_em*classc);          % 2. Assign probability to training samples.
  		a = settargets(a,b);          % 3. Insert probabilities as new labels.
  		change = mean(mean((+b-lab).^2)); lab = b;          
			prprogress(fid,' %i, %f \n',iter,change);
			iter = iter+1;
			if iter > 500
				prwarning(1,'emclust stopped after 500 iterations')
				change = 0;
			end
		end
	else  % crisp labels
		
        prprogress(fid,'emclust optim: iter, change (#obj), #clust:\n');
		prprogress(fid,' %i, %i %i \n',0,0,0);
		lab = ones(m,1);
        
        while (change ~= 0 && any(lab ~= new_lab)) % EM loop, run until labeling is stable.
            
            prwaitbar(100,100-100*exp(-iter/10));
            a = setlabels(a,new_lab); 		% 0. Set labels and store old labels.
            a = setprior(a,getprior(a,0));%    Set priors to class frequencies
            lab = new_lab;								% 
            a = remclass(a,1);            %    demand class sizes > 2 objects
            itern = 0;
            
            while getsize(a,3) < n        %    increase number of classes if necessary
                
                itern = itern + 1;
                
                if itern > 5
                    error('Not possible to find desired number of components')
                end
                laba = getlablist(a);
                labmax = max(laba);
                N = classsizes(a);
                [Nmax,cmax] = max(N);        % find largest class
                aa = seldat(a,cmax);         % select just that one

                std_aa = std(+aa);
                bContinue = true;
                while( bContinue )
                    try
                        new_lab_aa = kmeans(aa .* (ones(size(+aa))+ bsxfun(@times, max( [repmat(0.001,size(+aa)); 0.01*std_aa] ), randn(size(+aa)))) ,2);   % split it by kmeans
                        bContinue = false;
                    catch ME
                        %                         if( strcmpi(ME.message, 'kcentres fails as some objects are identical: add some noise'))
                        %                             std_aa = std_aa * 2;
                        %                         else
                        %                             rethrow(ME)
                        %                         end
                        error('Not possible to find desired number of components')
                    end

                end



                N1 = sum(new_lab_aa == 1);   
                N2 = sum(new_lab_aa == 2);
                if (N1 > 1 & N2 > 1) % use it if both classes have more than one sample
                    J = findlabels(a,laba(cmax,:));
                    a = setlabels(a,new_lab_aa + labmax,J);
                end
            end

            std_a = std(+a);
            w_em = (a .* (ones(size(+a))+ bsxfun(@times, max( [repmat(0.001,1,size(+a,2)); 1e-6*std_a] ), randn(size(+a))))) * w_clust;             % 1. Compute classifier, crisp output.
            b = a*w_em;               		% 2. Classify training samples.
            new_lab = labeld(b);      		% 3. Insert classification as new labels.
            prprogress(fid,' %i, %i %i \n', ...
            iter,length(find(lab ~= new_lab)),length(unique(new_lab)));
            iter = iter+1;             %DXD Added also the iter for the crisp labels
            if iter > 50
                prwarning(1,'emclust stopped after 50 iterations')
                change = 0;
            end
            
            %para ver la velocidad de convergencia.
%             disp(sum(lab ~= new_lab))
            
        end
    end
	prwaitbar(0)
	prwaitbar(0)
	
% 	if ~isempty(lablist_org) % substitute original labels if desired
% 		new_lab = lablist_org(new_lab);
%         confmat_new
% 		wlab = getlabels(w_em);
% 		wlab = lablist_org(wlab);
% 		w_em = setlabels(w_em,wlab);
% 	end
		
return;