ECG-Kit 1.0

File: <base>/common/LIBRA/rapca.m (14,137 bytes)
function result=rapca(data,varargin);

%RAPCA is a 'Reflection-based Algorithm for Principal Components Analysis'. 
% It is resistant to outliers in the data. The robust loadings are computed
% using projection-pursuit techniques and reflections. 
% Therefore RAPCA can be applied to both low and high-dimensional data sets.
% In low dimensions (at most 15), we recommend to use the MCD method instead
% (see mcdcov.m). 
%
% The RAPCA algorithm is described in 
%   Hubert, M., Rousseeuw, P.J., Verboven, S. (2002),
%   "A fast method for robust principal components with applications to chemometrics", 
%   Chemometrics and Intelligent Laboratory Systems, 60, 101-111.
%
% Required input arguments:
%         data : data matrix (observations in the rows, variables in the
%                columns)
%
% Optional input arguments:
%            k : number of principal components to compute
%    plots 0/1 : if equal to 1 a screeplot and an outlier map are drawn (default = 1)
%                else plots are suppressed 
%        labsd : the 'labsd' observations with largest score distance are
%                labeled on the outlier map (default = 3)
%        labod : the 'labod' observations with largest orthogonal distance are
%                labeled on the outlier map (default = 3)          
%	center 0/1 : if equal to 1 the data are centered around the L1-median (default = 1)
%                else the data are centered around the coordinatewise median
%                (not orthogonally equivariant, but faster)
%      classic : If equal to one, the classical PCA analysis will be performed
%                (see also cpca.m). (default = 0)
%
% If k is missing, or k = 0, a screeplot is drawn which allows you to select
% the number of principal components. If k = 0 and plots = 0, the algorithm itself 
% will determine the number of components. This is not recommended.
%
% I/O: result=rapca(x,'k',k,'plots',1,'labsd',3,'labod',3,'center',1,'classic',0);
%  The user should only give the input arguments that have to change their default value.
%  The name of the input arguments needs to be followed by their value.
%  The order of the input arguments is of no importance.
%  
% Examples: 
%    result=rapca(x,'k',3,'plots',0)
%    result=rapca(x,'labsd',5,'center',0)
%
% The output of RAPCA is a structure containing 
% 
%    result.P        : Robust loadings (eigenvectors)
%    result.L        : Robust eigenvalues       
%    result.M        : Robust center of the data
%    result.T        : Robust scores 
%    result.k        : Number of (chosen) principal components
%    result.sd       : Robust score distances within the robust PCA subspace
%    result.od       : Orthogonal distances to the robust PCA subspace 
%    result.cutoff   : Cutoff values for the robust score and orthogonal distances
%    result.flag     : The observations whose score distance is larger than result.cutoff.sd (==> result.flag.sd)
%                      or whose orthogonal distance is larger than result.cutoff.od (==> result.flag.od)
%                      can be considered as outliers and receive a flag equal to zero (result.flag.all).
%                      The regular observations receive a flag 1.
%    result.class    : 'RAPCA' 
%    result.classic  : If the input argument 'classic' is equal to one, this structure
%                      contains results of the classical PCA analysis (see also cpca.m). 
%
% Let n denote the number of observations, and p the number of original variables,
% then RAPCA finds a robust center (p x 1) of the data and a loading matrix P which 
% is (p x k) dimensional. Its columns are orthogonal and define a new coordinate
% system. The scores (n x k) are the coordinates of the centered observations with 
% respect to the loadings. The eigenvalues are the squared robust scales of the 
% observations projected on each of the loadings.
% Note that RAPCA also yields a robust covariance matrix (often singular) which
% can be computed as
%                         cov=result.P*result.L*result.P'
%
% The screeplot shows the eigenvalues and is helpful to select the number of 
% principal components.
% The outlier map visualizes the observations by plotting their orthogonal
% distance to the robust PCA subspace versus their robust distances 
% within the PCA subspace. This allows to classify the data points into 4 types:
% regular observations, good leverage points, bad leverage points and 
% orthogonal outliers. Remark that the RAPCA algorithm by construction passes
% through 'result.k' data points. The orthogonal distance of these data points is thus zero.
%
% The outlier map (or diagnostic plot) is described in
%    Hubert, M., Rousseeuw, P.J., Vanden Branden K. (2005),
%    "ROBPCA: a new approach to robust principal components analysis",  
%   Technometrics, 47, 64--79.
%  
%
% This function is part of LIBRA: the Matlab Library for Robust Analysis,
% available at: 
%              http://wis.kuleuven.be/stat/robust.html
%
% Written by Sabine Verboven and Mia Hubert 
%
% Last Update: 23/12/2003


[n,p]=size(data);
counter=1;
default=struct('k',0,'center',1,'plots',1,'labsd',3,'labod',3,'h',floor(0.75*n),'classic',0);
list=fieldnames(default);
options=default;
IN=length(list);
i=1;
%
if nargin>1
    %
    %placing inputfields in array of strings
    %
    for j=1:nargin-1
        if rem(j,2)~=0
            chklist{i}=varargin{j};
            i=i+1;
        end
    end 
    %
    %Checking which default parameters have to be changed
    % and keep them in the structure 'options'.
    %
    while counter<=IN 
        index=strmatch(list(counter,:),chklist,'exact');
        if ~isempty(index) %in case of similarity
            for j=1:nargin-2 %searching the index of the accompanying field
                if rem(j,2)~=0 %fieldnames are placed on odd index
                    if strcmp(chklist{index},varargin{j})
                        I=j;
                    end
                end
            end
            options=setfield(options,chklist{index},varargin{I+1});
            index=[];
        end
        counter=counter+1;
    end
end
k=options.k;
center=options.center;
plots=options.plots;

if k<0
    warning(['The number of principal components should be positive!']);
end

% First Step: classical SVD on the data 
% This step reduces the data space to the affine subspace 
% spanned by r=min(n-1,p) observations.
if n < p
   [loads,scores,lambda,r,centerX,clm]=kernelEVD(data);
else
   [loads,scores,lambda,r,centerX,clm]=classSVD(data);
end
X=scores;

% Second Step: Rstep on X 
% computes the robust eigenvectors and eigenvalues
[S,P,out.T,kmax,Rm]=rstep(X,k,center,r);
L=S'.^2;
out.P=loads*P;

if center==1
   % Retransforming the robust location to the original space
   out.M=clm+Rm*loads'; 
else
   out.M=median(data);
   datacentr=data-repmat(out.M,size(data,1),1);
   out.T=datacentr*out.P;
end

% Making screeplot to decide on the number of principal components
if plots==1 & k==0
    screeplot(L,'RAPCA');
    k=input(['How many principal components would you like to retain?\n']);
    k=max(min(k,kmax),1);
elseif plots==1    
    screeplot(L,'RAPCA');
    k=min(k,kmax);
elseif k~=0
    k=min(k,kmax)
else
    disp(['The number of principal components is defined by the algorithm.']); 
    disp(['It is set to ',num2str(kmax),'.']); 
    k=kmax;
end

% shrinking to k-dimensional subspace
out.P=out.P(:,1:k);
out.T=out.T(:,1:k);
out.L=L(1:k);
disp(['The outlier map is based on ',num2str(k),' principal component(s).'])
out.k=k; 
out.h=options.h;

% Computing distances 
% Robust score distances in robust PCA subspace
out.sd=sqrt(libra_mahalanobis(out.T,zeros(size(out.T,2),1),'cov',out.L))';
out.cutoff.sd=sqrt(chi2inv(0.975,out.k));
% Orthogonal distances to robust PCA subspace
XRc=data-repmat(out.M,n,1);
Xtilde=out.T*out.P';
Rdiff=XRc-Xtilde;
for i=1:n
    out.od(i,1)=norm(Rdiff(i,:));
end
% Robust cutoff-value for the orthogonal distance
if k~=r
    [m,s]=unimcd(out.od.^(2/3),out.h);
    out.cutoff.od = sqrt(norminv(0.975,m,s).^3); 
else
    out.cutoff.od=0;
end
% Classical analysis
if options.classic==1
    out.classic.P=loads(:,1:out.k);
    out.classic.L=lambda(1:out.k);
    out.classic.M=clm;
    out.classic.T=scores(:,1:out.k);
    out.classic.k=out.k;
    % Mahalanobis distance in classical PCA subspace
    Tclas=centerX*loads(:,1:out.k);
    out.classic.sd=sqrt(libra_mahalanobis(Tclas,zeros(size(Tclas,2),1),'cov',out.classic.L))';
    out.classic.cutoff.sd=sqrt(chi2inv(0.975,out.k));
    % Orthogonal distances to classical PCA subspace
    Xtilde=Tclas*loads(:,1:out.k)';
    Cdiff=centerX-Xtilde;
    for i=1:n
        out.classic.od(i,1)=norm(Cdiff(i,:));
    end
    % Classical cutoff-values
    if k~=r
        m=mean(out.classic.od.^(2/3));
        s=sqrt(var(out.classic.od.^(2/3)));
        out.classic.cutoff.od = sqrt(norminv(0.975,m,s)^3); 
    else
        out.classic.cutoff.od=0;
    end
    out.classic.cutoff.sd=sqrt(chi2inv(0.975,out.k));
    out.classic.flag.od=(out.classic.od<=out.classic.cutoff.od);
    out.classic.flag.sd=(out.classic.sd<=out.classic.cutoff.sd);
    out.classic.class='CPCA';
    out.classic.classic=1;
else
    out.classic=0;
end  

if k~=r
    out.flag.od=(out.od<=out.cutoff.od);
    out.flag.sd=(out.sd<=out.cutoff.sd);
    out.flag.all=(out.flag.od)&(out.flag.sd);
    if options.classic==1
        out.classic.flag.all=(out.classic.flag.od)&(out.classic.flag.sd);
    end
else
    out.flag.od=(out.od<=out.cutoff.od);
    out.flag.sd=(out.sd<=out.cutoff.sd);
    out.flag.all=out.flag.sd;
    if options.classic==1
        out.classic.flag.all=out.classic.flag.sd;
    end
end


% The output
result=struct('P',{out.P},'L',{out.L},'M',{out.M},'T',{out.T},'k',{out.k},...
    'sd', {out.sd},'od',{out.od},'cutoff',{out.cutoff},'flag',out.flag',...
    'class',{'RAPCA'},'classic',{out.classic});
% Making outlier map

try
    if plots & options.classic
        makeplot(result,'classic',1)
    elseif plots
        makeplot(result)
        %figure, scorediagplot(out.sd,out.od,out.k,out.cutoff.sd,out.cutoff.od,'RAPCA',options.labsd,options.labod)
    end
catch %output must be given even if plots are interrupted 
    %> delete(gcf) to get rid of the menu 
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function [S,P,t,kmax,med]= rstep(X,k,center,r); 

%RSTEP: this is an auxiliary function for 'rapca.m'. 
%
% This function is part of LIBRA: the Matlab Library for Robust Analysis,
% available at: 
%              http://wis.kuleuven.be/stat/robust.html
%
% Created by Sabine Verboven and Mia Hubert 
% Part of the code is based on S-PLUS code from C. Croux.
%
% Last Update: 01/22/2002
% 

warning on;
if nargin<2
   k=0;
end
if nargin<3
   center=1;
end
if nargin<4
   r=rank(X);
end
[n,p]=size(X);
if k==0
   p1=min(floor(n/2),r);
else 
   p1=min([k,r,floor(n/2)]);
end

if k==0 | k > p1  
      disp(['The maximum number of principal components is ',num2str(p1),'.'])
      disp(['This is the minimum of (number of data points/2) and the rank of the data matrix.'])
end
S=zeros(p1,1);
V=zeros(p,p1);
switch center
case 0
   med=median(X);
   Xcentr=X-repmat(med,n,1); 
case 1
   med=l1median(X);
   Xcentr=X-repmat(med,n,1);
end
Xnewcentr=Xcentr;
kmax=0;
Transfo=eye(p);
for l=1:p1, 
   B=Xnewcentr;
   Bnorm=zeros(n,1);
   for i=1:n 
     Bnorm(i)=norm(B(i,:),2);
   end
   Bnormr=Bnorm(Bnorm > 1.e-12);
   B=B(Bnorm > 1.e-12,:);
   %Searching in directions A
   A=diag(1./Bnormr)*B;
   if size(Xnewcentr,2)==1 %case l=p1
      V(1:l-1,l)=0;
      V(l:p,l)=1;
      Vorigin(:,l)=Transfo*V(:,p1); 
      t=Xcentr*Vorigin(:,p1); %last step needs extraction of scale directly in p-dim space
      if n>40
         S(p1)=A_scale(t);
      else
         S(p1)=qnm(t);
      end
      kmax=kmax+1;
      break
   else
      Y=Xnewcentr*A'; %projected points in columns
   end
   if n>40
       s=A_scale(Y);
   else
       s=qnm(Y);
   end
   [c,vj]=sort(s); 
   j=vj(length(s)); 
   S(l)=s(j);
   if (S(1)/S(l) > 10^3) &(kmax<p1)
       l=p1+1;
       break
   else
       kmax=kmax+1;
   end
   if l==1 
      V(:,l)=A(j,:)';
   else
      V(1:l-1,l)=0;
      V(l:p,l)=A(j,:)';
   end
   % EigenVectors = columns of V
   % Constructing Transformation
   Base=eye(p-l+1);
   U=[];
   ndiff=norm(Base(:,1)-V(l:p,l),inf); %max norm of the normal vector
   if ndiff> 1.e-12
      if  V(l:p,l)'*Base(:,1) < 0
         V(l:p,l)=(-1)*V(l:p,l);
      end   
      u=(1./norm(Base(:,1)-V(l:p,l)))*(Base(:,1)-V(l:p,l));
      U=Base-2*repmat(u'*Base,p-l+1,1).*repmat(u,1,p-l+1);
   else
      U=Base; 
   end
   % Transforming eigenvectors to the original pxp dimensional space
   if l==1 
      Vorigin(:,l)=V(:,l);
      Transfo=U;
   else
      Edge=eye(p);
      Edge(l:p,l:p)=U;
      Vorigin(:,l)=Transfo*V(:,l);
      Transfo=Transfo*Edge;
   end
   Xnewcentr=Xnewcentr*U; 		%Reflection of data 
   Xnewcentr=removal(Xnewcentr,0,1);
end
[S,I]=greatsort(real(S(1:kmax)));
P=Vorigin(:,I);
t=Xcentr*P; 

%--------------------------------------------------------------------------
function [A_est]=A_scale(Z)

% A_SCALE calculates the A estimate of scale of the columns of Z
% 
% I/O: [A_est]=A_scale(Z); 
% 

Z=Z'; 
U=(Z - repmat(median(Z,2),1,size(Z,2)))./(repmat(madc(Z')',1,size(Z,2)));
[n,p]=size(U);
for i=1:n
   Ui=U(i,:);
   if any(isnan(Ui))
      scale(i)=0;
   else
      Zi=Z(i,:); 
      med=median(Zi); 
      m=madc(Zi-med);
      Zi=Zi(abs(Ui)<3.85);
      Ui=Ui(abs(Ui)<3.85);
      Ti=sqrt(sum((Ui.^2).*((3.85^2-Ui.^2).^4)))*sqrt(p)*0.9471*m;
      Ni=abs(sum((3.85^2-Ui.^2).*(3.85^2-5*(Ui.^2))));
      scale(i)=Ti/Ni;
   end
end
A_est=scale;