ECG-Kit 1.0
(6,012 bytes)
function [idx,dm,mm,Ss] = kur_rce(x,mode)
%
% [idx,dm,mean,Cov] = kur_rce(x,mode)
%
% Outlier identification based on the study of univariate
% projections onto the directions maximizing and minimizing
% the kurtosis coefficient
%
% Inputs: observations, x, (a matrix having a row for each observation)
% mode, 0 use p maxizing and p minimizing directions
% iterate until no outliers are detected (default)
% >0 use p maxizing and p minimizing directions
% iterate "mode" times
% <0 use only maximizing directions
% iterate "mode" times
% Outputs: idx, zero/one vector with ones in the positions of the
% suspected outliers
% dm, robust Mahalanobis distances
% mean, robust mean estimator
% Ss, robust covariance matrix estimator
%
% Daniel Pena / Francisco J Prieto 23/5/00
[n,p] = size(x);
%Quite esta restricción por obsoleta
% if (n > 750)|(p > 50),
% disp('Data set too large for this code');
% return
% end
if nargin < 2,
mode = 0;
end
maxit = Inf;
if mode < 0,
maxit = abs(mode);
mode = -1;
elseif mode > 0,
maxit = mode;
mode = 1;
end
% Initialization of parameters
%% Cutoff points for projections
ctf1 = 3.05 + 0.324*p;
ctf2 = exp(1.522-0.268*log(p));
ctf3 = 3.5;
ctf4 = 3.0;
%% minimum numbers of observations
lmn1 = max(floor(0.5*(n+p+1)),2*p);
lmn2 = min(2*p,max(p+2,floor(0.25*n)));
% Removing suspect outliers
xx0 = [x (1:n)'];
xx = xx0;
nn = n;
it = 1;
while (it <= maxit),
xx1 = xx(:,1:p);
V = kur_nw(xx1,mode);
pr = xx1*V;
md = median(pr);
prn = abs(pr - ones(nn,1)*md);
% mad = median(prn);
%Problema con el estimador robusto de varianza (mad) al tener muchos valores en 0 o haber distribuciones con poca varianza.
%Solución: Si alguna dimension es muy pequeña, tomo la media de entre los percentiles 50 y 75 de prn en esa
%dimensión como estimacion del desvio std.
prn_percentiles = prctile(prn, [50 95]);
if( size(prn,2) == 1 )
prn_percentiles = prn_percentiles';
end
mad = prn_percentiles(1,:);
disp_ratio = prn_percentiles(2,:) ./ prn_percentiles(1,:);
if( any(disp_ratio > 5) )
% Para una distribucion normal, en 3 sigma encontramos el 99% de los
% puntos, por este motivos pusimos el umbral en 5 para ver cuando la
% estimacion es muy mala.
dim_with_low_var = find(mad == 0);
mad(dim_with_low_var) = mean( prctile( prn(:,dim_with_low_var), [50 75] ) );
if( any(mad(dim_with_low_var) == 0) )
% disp(['No se pudo estimar la varianza en las dimensiones ' num2str(dim_with_low_var) '. Posiblemente esos features estén mal calculados.' ]);
%No se indica ningun outlier en este caso.
idx = false(n,1);
return
end
end
tt = prn./(ones(nn,1)*mad);
if p > 2,
ctfo = ctf1 - (0:(p-2))*0.75*(ctf1-ctf2)/(p-2);
else
ctfo = ctf1;
end
if mode >= 0,
tctf = ones(nn,1)*[ ctfo ctf2 ctf3*ones(1,p-1) ctf4 ];
else
if( p > 1)
tctf = ones(nn,1)*[ ctfo ctf2 ];
else
tctf = ones(nn,1)*[ ctfo ];
end
end
taux = tt./tctf;
t = max(taux,[],2);
% in = find(t > 1);
% Meti una ampliación del nivel de rechazo para outliers ya que parecía muy
% sensible.
in = find(t > 5);
nn = length(in);
if nn == 0,
break
end
% inn = find(t <= 1);
% Meti una ampliación del nivel de rechazo para outliers ya que parecía muy
% sensible.
inn = find(t <= 5);
nu = length(inn);
if nu < lmn2,
[v,ix] = sort(t);
ixx = ix(1:lmn2);
xx = xx(ixx,:);
break
end
xx = xx(inn,:);
[nn,pp] = size(xx);
if nn <= lmn2,
break
end
it = it + 1;
end
%% Extracting the indices of the outliers
idx = ones(n,1);
[nn,pp] = size(xx);
idx(xx(:,pp)) = zeros(nn,1);
% recheck observations and relabel them
%% Cutoffs for Mahalanobis distances, based on the Hotelling-t2
%% Fixed at 0.99 . To modifay change the value of ctflvl
ctflvl = 0.99;
ctf = (p*(n-1)/(n-p))*finv(ctflvl,p,n-p);
%% Mahalanobis distances using center and scale estimators
%% based on good observations
sidx = sum(idx);
s1 = find(idx);
s2 = find(idx == 0);
if sidx > 0,
xx1 = xx0(s1,:);
xx1r = xx1(:,1:p);
end
xx2r = xx0(s2,1:p);
mm = mean(xx2r);
Ss = cov(xx2r);
if sidx > 0,
aux1 = xx1r - ones(sidx,1)*mm;
%Quito un poco del ruido que genera esta inversion de matriz
warning off
dd = Ss\aux1';
warning on
dms = sum((aux1.*dd')');
end
%% Ensure that at least lmn1 observations are considered good
ado = sidx + lmn1 - n;
if ado > 0,
[dmss,idms] = sort(dms);
idm1 = idms(1:ado);
ido = xx0(s1,pp);
s3 = ido(idm1);
idx(s3) = zeros(ado,1);
sidx = sum(idx);
s1 = find(idx);
s2 = find(idx == 0);
if sidx > 0,
xx1 = xx0(s1,:);
xx1r = xx1(:,1:p);
end
xx2r = xx0(s2,1:p);
mm = mean(xx2r);
Ss = cov(xx2r);
if sidx > 0,
aux1 = xx1r - ones(sidx,1)*mm;
dd = Ss\aux1';
dms = sum((aux1.*dd')');
end
end
%% Check remaining observations and relabel if appropriate
while sidx > 0,
s1 = find(dms <= ctf);
s2 = length(s1);
if s2 == 0,
break
end
s3 = xx1(s1,pp);
idx(s3) = zeros(s2,1);
sidx = sum(idx);
s1 = find(idx);
s2 = find(idx == 0);
if sidx > 0,
xx1 = xx0(s1,:);
xx1r = xx1(:,1:p);
end
xx2r = xx0(s2,1:p);
mm = mean(xx2r);
Ss = cov(xx2r);
if sidx > 0,
aux1 = xx1r - ones(sidx,1)*mm;
dd = Ss\aux1';
dms = sum((aux1.*dd')');
end
end
% Values to be returned
idx = idx == 1;
aux1 = x - ones(n,1)*mm;
%Quito un poco del ruido que genera esta inversion de matriz
warning off
dd = Ss\aux1';
warning on
dms = sum((aux1.*dd')');
dm = sqrt(dms);