% ECG-Kit 1.0
function clust_labels = cluster_data_em(data2clust, CantClusters, iter_times)
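% cluster_data_em: cluster the samples of a PRTools dataset by repeated
% EM-style clustering and fuse the labelings obtained in each repetition.
%
% Inputs:
%   data2clust   - PRTools dataset with the samples to cluster
%   CantClusters - desired number of clusters (default 5)
%   iter_times   - number of clustering repetitions whose labelings are fused (default 1)
% Output:
%   clust_labels - column vector with one integer cluster label per sample
%
% Illustrative call (the feature matrix below is hypothetical, not part of this file):
%   % features = randn(1000, 12);                      % hypothetical feature matrix
%   % labels   = cluster_data_em(dataset(features), 5, 10);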
if nargin < 3 || isempty(iter_times)
iter_times = 1;
end
if nargin < 2 || isempty(CantClusters)
CantClusters = 5;
end
cant_ejemplos = getsize(data2clust,1);
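% Normalize each feature by its standard deviation before clustering.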
std_train = std(+data2clust);
data2clust = setdata(data2clust, bsxfun(@rdivide, +data2clust, std_train ));
clustered_labels = repmat('0', cant_ejemplos, iter_times);
% for CantClusters = 2:5
for jj = 1:iter_times
%EMclust
ii = 0;
bContinuar = true;
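% If EM cannot produce the requested number of components, retry asking for one fewer.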
while(bContinuar && ii < CantClusters)
try
dbclear if caught error
clust_labels = em_loop(data2clust, CantClusters-ii );
dbstop if caught error
bContinuar = false;
catch ME
if(strcmpi(ME.message, 'Not possible to find desired number of components'))
ii = ii + 1;
else
rethrow(ME)
end
end
end
clustered_labels(:,jj) = char(97+clust_labels);
% jj = jj + 1;
end
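% Fusion stage: group together samples whose label patterns across the
% repetitions are (almost) identical.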
bClustered = true;
while(bClustered)
%analyze how the labelings coincide across the different repetitions
[~, sort_idx] = sort( cellstr(clustered_labels) );
[all_clusters, aux_location] = unique(clustered_labels(sort_idx,:), 'rows', 'first');
aux_location = [colvec(aux_location); cant_ejemplos+1];
cluster_sizes = diff(aux_location);
[~, clust_sort_idx] = sort(cluster_sizes, 'descend');
%group all sub-clusters that lie within a maximum distance max_distance of each other
max_distance = round(0.2*iter_times); %maximum number of disagreeing repetitions to still be considered the same cluster
cant_clusters = size(all_clusters,1);
cant_iter = size(all_clusters,2);
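% Pairwise Hamming distance between label patterns: the number of repetitions
% in which two patterns disagree.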
aux_1 = repmat( all_clusters, cant_clusters, 1);
aux_idx = colvec(repmat(1:cant_clusters,cant_clusters,1));
aux_2 = all_clusters(aux_idx,:);
distances = reshape( sum(aux_1 ~= aux_2,2), cant_clusters, cant_clusters );
remaining_clusters = 1:cant_clusters;
%merge all clusters that agree up to max_distance
for ii = rowvec(clust_sort_idx)
bClustered = false;
cluster2fusion_idx = find(distances(:,ii) <= max_distance);
cluster2fusion_idx = cluster2fusion_idx(cluster2fusion_idx ~= ii);
for jj = 1:length(cluster2fusion_idx)
aux_idx = find(strcmpi(cellstr(clustered_labels), cellstr(all_clusters(cluster2fusion_idx(jj),:))));
if( ~isempty(aux_idx) )
bClustered = true;
remaining_clusters( remaining_clusters == cluster2fusion_idx(jj)) = [];
clustered_labels( aux_idx ,:) = repmat(all_clusters(ii,:), length(aux_idx), 1 );
all_clusters(cluster2fusion_idx(jj),:) = all_clusters(ii,:);
end
end
if(bClustered)
remaining_clusters( remaining_clusters == ii ) = [];
%force the distances to be recomputed, so that everything is not merged at once
%and only the larger clusters absorb the smaller ones.
break
end
end
end
%re-analyze how the labelings coincide across the different repetitions
[~, sort_idx] = sort( cellstr(clustered_labels) );
[all_clusters, aux_location] = unique(clustered_labels(sort_idx,:), 'rows', 'first');
aux_location = [colvec(aux_location); cant_ejemplos+1];
cluster_sizes = diff(aux_location);
cant_clusters = size(all_clusters,1);
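% Map each distinct (fused) label pattern to a consecutive integer cluster label.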
clust_labels = nan(cant_ejemplos,1);
for ii = 1:cant_clusters
clust_labels(sort_idx(aux_location(ii):(aux_location(ii+1)-1))) = ii;
end
if(any(isnan(clust_labels)))
error('Not all samples were assigned a cluster label.')
end
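% em_loop: one repetition of EM-style clustering. Hard labels are initialised
% with k-centres plus an NMC classifier, and the train/relabel steps are then
% iterated until the labeling stops changing.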
function new_lab = em_loop(data2clust, CantClusters )
n_ini = 500; % Maximum size of subset to use for initialisation.
w_clust = nmc; % Untrained clustering mapping used in the EM loop; it is not defined elsewhere in this function, so NMC is assumed here, consistent with the initialisation below.
cant_ejemplos = size(data2clust,1);
not_found = true;
iter = 0;
while(not_found)
% try to find an initialisation with all class sizes > 1
if (cant_ejemplos > n_ini) % ... use a random subset of data2clust.
%sample the feature space uniformly
a_ini = sum(abs(+data2clust),2);
[~, sort_idx ] = sort(a_ini);
a_ini = +data2clust(sort_idx(randsample(cant_ejemplos, n_ini)),:);
else
a_ini = +data2clust; % ... use the entire set data2clust.
end
iter = iter + 1;
if iter > 3
error('Not possible to find desired number of components')
end
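% Repeatedly run k-centres on a slightly perturbed copy of the data until the
% assignment contains exactly CantClusters distinct clusters.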
% 50 trials
assign = zeros(1,CantClusters-1);
std_a_ini = std(a_ini);
while( length(unique(assign)) ~= CantClusters )
% add some noise to data to avoid problems
a_ini = a_ini.*(ones(size(a_ini))+ bsxfun(@times, max( [repmat(0.001,size(std_a_ini)); 0.01*std_a_ini] ), randn(size(a_ini))));
%construct the squared Euclidean distance matrix
% The order of operations below is good for the accuracy.
n_a = size(a_ini,1);
dist_mat = ones(n_a,1)*sum(a_ini'.^2,1);
dist_mat = dist_mat + sum(a_ini.^2,2)*ones(1,n_a);
dist_mat = dist_mat - 2 .* (a_ini)*(a_ini)';
% Check for a numerical inaccuracy.
dist_mat(dist_mat<0) = 0; % dist_mat should be nonnegative.
% force an exactly zero diagonal (symmetric distance matrix)
dist_mat(1:n_a+1:n_a*n_a) = 0;
try
assign = kcentres(dist_mat,CantClusters,50);
catch ME
error('Not possible to find desired number of components')
end
end
% Train initial classifier on labels generated by KCENTRES and find
% initial hard labels. Use NMC instead of W_CLUST to make sure that we
% always have enough data to estimate the parameters.
a_ini = dataset(a_ini,assign);
a_ini = setprior(a_ini,getprior(a_ini,0));
dist_mat = data2clust*(a_ini*nmc);
new_lab = dist_mat*labeld;
cs_new_lab = classsizes(dataset(dist_mat,new_lab));
if length(cs_new_lab) == CantClusters && all( cs_new_lab > 1)
not_found = 0;
end
end
lablist_org = [];
% Ready for the work.
iter = 0;
change = 1;
lab = ones(cant_ejemplos,1);
while (change ~= 0 && any(lab ~= new_lab)) % EM loop, run until labeling is stable.
prwaitbar(100,100-100*exp(-iter/10));
data2clust = setlabels(data2clust,new_lab); % 0. Set labels and store old labels.
data2clust = setprior(data2clust,getprior(data2clust,0));% Set priors to class frequencies
lab = new_lab; %
data2clust = remclass(data2clust,1); % demand class sizes of at least 2 objects
split_iter = 0;
while getsize(data2clust,3) < CantClusters % increase number of classes if necessary
split_iter = split_iter + 1;
if split_iter > 5
error('Not possible to find desired number of components')
end
laba = getlablist(data2clust);
labmax = max(laba);
class_sizes = classsizes(data2clust);
[Nmax,cmax] = max(class_sizes); % find largest class
aa = seldat(data2clust,cmax); % select just that one
std_aa = std(+aa);
bContinue = true;
while( bContinue )
try
new_lab_aa = kmeans(aa .* (ones(size(+aa))+ bsxfun(@times, max( [repmat(0.001,size(std_aa)); 0.01*std_aa] ), randn(size(+aa)))) ,2); % split it by kmeans
bContinue = false;
catch ME
% if( strcmpi(ME.message, 'kcentres fails as some objects are identical: add some noise'))
% std_aa = std_aa * 2;
% else
% rethrow(ME)
% end
error('Not possible to find desired number of components')
end
end
N1 = sum(new_lab_aa == 1);
N2 = sum(new_lab_aa == 2);
if (N1 > 1 && N2 > 1) % use it if both classes have more than one sample
J = findlabels(data2clust,laba(cmax,:));
data2clust = setlabels(data2clust,new_lab_aa + labmax,J);
end
end
std_a = std(+data2clust);
w_em = (data2clust .* (ones(size(+data2clust))+ bsxfun(@times, max( [repmat(0.001,size(std_a)); 0.01*std_a] ), randn(size(+data2clust))))) * w_clust; % 1. Compute classifier, crisp output.
b = data2clust*w_em; % 2. Classify training samples.
new_lab = labeld(b); % 3. Insert classification as new labels.
iter = iter+1; %DXD Added also the iter for the crisp labels
if iter > 50
change = 0;
end
%to check the convergence speed.
% disp(sum(lab ~= new_lab))
end
if ~isempty(lablist_org) % substitute original labels if desired
new_lab = lablist_org(new_lab);
wlab = getlabels(w_em);
wlab = lablist_org(wlab);
w_em = setlabels(w_em,wlab);
end
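% kcentres: k-centres clustering on a full distance matrix over itern random
% restarts. Returns the cluster label of each object, the indices of the centre
% objects, and the maximum distance of the objects in each cluster to their centre.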
function [labels, Jopt, dm] = kcentres( dist_mat, CantClusters, itern)
cant_ejemplos = size(dist_mat,1); % number of objects described by the distance matrix
if CantClusters == 1
% single cluster: the centre is the medoid (object with minimum total distance)
[~, Jopt] = min(sum(dist_mat,2));
dm = max(dist_mat(Jopt,:));
labels = ones(cant_ejemplos,1);
return
end
dmax = max(max(dist_mat));
dopt = inf;
for tri = 1:itern
M = randperm(cant_ejemplos); M = M(1:CantClusters); % Random initializations
J = zeros(1,CantClusters); % Center objects to be found.
% Iterate until J == M. See below.
while true
[dm,I] = min(dist_mat(M,:));
% Find CantClusters centers.
for ii = 1:CantClusters
JJ = find(I==ii);
if (isempty(JJ))
%JJ can be empty if two or more objects are in the same position of
% feature space in dataset
J(ii) = 0;
else
% Find objects closest to the object M(ii)
[dummy,j,dm] = kcentres(dist_mat(JJ,JJ),1,1);
J(ii) = JJ(j);
end
end
J(J==0) = [];
CantClusters = length(J);
if CantClusters == 0
error('kcentres fails as some objects are identical: add some noise')
end
if (length(M) == CantClusters) && all(M == J)
% CantClusters centers are found.
[dmin,labs] = min(dist_mat(J,:));
dmin = max(dmin);
break;
end
M = J;
end
% Store the optimal centers in JOPT.
if (dmin <= dopt)
dopt = dmin;
labels = labs';
Jopt = J;
end
end
prwaitbar(0)
% Determine the best centers over the N trials.
dm = zeros(1,CantClusters);
for ii=1:CantClusters
L = find(labels==ii);
dm(ii) = max(dist_mat(Jopt(ii),L));
end