ECG-Kit 1.0
(20,257 bytes)
function result=rpcr(x,y,varargin)
%RPCR is a 'Robust Principal Components Regression' method based on ROBPCA.
% It can be applied to both low and high-dimensional predictor variables x,
% and to one or multiple response variables y. It is resistant to outliers
% in the data. First, a robust principal components method is applied to the
% predictor variables x (see robpca.m). Then a robust regression is performed.
% For univariate y, the LTS regression is used (see ltsregres.m). When there are
% several response variables, the MCD-regression method is applied (see mcdregres.m).
%
% The RPCR method is described in:
% Hubert, M., Verboven, S. (2003),
% "A robust PCR method for high-dimensional regressors",
% Journal of Chemometrics, 17, 438-452.
%
% To select the number of components in the regression model, a robust RMSECV (root mean squared
% error of cross validation) curve is drawn, based on a fast algorithm for
% cross-validation. This approach is described in:
%
% Engelen, S., Hubert, M. (2005),
% "Fast model selection for robust calibration methods",
% Analytica Chimica Acta, 544, 219-228.
%
% Required input arguments:
% x : Data matrix of the explanatory variables
% (n observations in rows, p variables in columns)
% y : Data matrix of the response variables
% (n observations in rows, q variables in columns)
%
% Optional input arguments:
% k : Number of principal components to be used in the PCA step
% (default = min(rank(x),kmax)). If k is not specified,
% it can be selected using the option 'rmsecv'.
% kmax : Maximal number of principal components to be used (default = 10).
% If k is provided, kmax does not need to be specified, unless k is larger
% than 10.
% alpha : (1-alpha) measures the fraction of outliers the algorithm should
% resist. Any value between 0.5 and 1 may be specified. (default = 0.75)
% h : (n-h+1) measures the number of observations the algorithm should
% resist. Any value between n/2 and n may be specified. (default = 0.75*n)
% Alpha and h may not both be specified.
% rmsecv : If equal to zero and k is not specified, a robust R-squared curve
% is plotted and an optimal k value can be chosen. (default)
% If equal to one and k is not specified, a robust component selection-curve is plotted and
% an optimal k value can be chosen. This curve computes a combination of
% the robust cross-validated mean squared error and the robust residual
% sum of squares for k=1 to kmax. Rmsecv and k may not both
% be specified.
% rmsep : If equal to one, the robust RMSEP-value (root mean squared error of
% prediction) for the model with k components.
% (default = 0). This value is automatically given if rmsecv = 1.
% intadjust : If equal to one, the intercept adjustment for the
% LTS-regression will be calculated (default = 0). See ltsregres.m for
% details.
% plots : If equal to one, a menu is shown which allows to draw several plots,
% such as a score outlier map and a regression outlier map. (default)
% If the input argument 'classic' is equal to one, the classical
% plots are drawn as well.
% If 'plots' is equal to zero, all plots are suppressed.
% See also makeplot.m
% labsd : The 'labsd' observations with largest score distance are
% labeled on the diagnostic plots. (default = 3)
% labod : The 'labod' observations with largest orthogonal distance are
% labeled on the score outlier map. (default = 3)
% labresd : The 'labresd' observations with largest residual distance are
% labeled on the regression outlier map. (default = 3)
% classic : If equal to one, the classical PCR analysis will be performed as well
% (see also cpcr.m). (default = 0)
%
%
% I/O: result=rpcr(x,y,'k',0,'kmax',10,'alpha',0.75,'h',h,'rmsecv',0,'rmsep',0,...
% 'plots',1,'labsd',3,'labod',3,'labresd',3,'classic',0);
% The user should only give the input arguments that have to change their default value.
% The name of the input arguments needs to be followed by their value.
% The order of the input arguments is of no importance.
%
% Example: result=rpcr(x,y,'alpha',0.65,'k',5,'plots',0,'classic',1);
% result=rpcr(x,y,'kmax',5,'labresd',7);
%
% The output of RPCR is a structure containing:
%
% result.slope : Robust slope
% result.int : Robust intercept
% result.fitted : Robust prediction vector
% result.res : Robust residuals
% result.cov : Estimated variance-covariance matrix of the residuals
% result.rsquared : Robust R-squared value for the optimal k.
% result.rcs : Robust Component Selection Criterion:
% This is a matrix with kmax columns. The first row contains the approximate
% R-squared values (for k=1,...,kmax) and the second row the square root of the weighted
% residuals sum of squares. This is equal to the RCS-value with
% gamma = 0 (see Engelen and Hubert (2005) for the definition).
% If the input argument rmsecv = 1, the third row contains the RCS-value sfor
% gamma = 0.5 and the fourth for gamma = 1. The last one is equal to the robust
% cross-validated RMSE values.
% Note that all the entries in this matrix depend on the choice of kmax.
% result.rmsep : Robust RMSEP value
% result.k : Number of principal components
% result.h : The quantile h used throughout the algorithm
% result.sd : Robust score distances within the robust PCA subspace
% result.od : Robust orthogonal distances to the robust PCA subspace
% result.resd : Residual distances (when there are several response variables).
% If univariate regression is performed, it contains the standardized residuals.
% result.cutoff : Cutoff values for the score, orthogonal and residual distances
% result.flag : The observations whose score distance is larger than
% 'result.cutoff.sd' receive a flag 'result.flag.sd' equal
% to zero (good leverage points). Otherwise 'result.flag.sd'
% is equal to one.
% The components 'result.flag.od' and 'result.flag.resd' are
% defined analogously, and determine the orthogonal outliers,
% resp. the bad leverage points/vertical outliers.
% The observations with 'result.flag.od' and 'result.flag.resd'
% equal to zero, can be considered as calibration outliers and receive
% 'result.flag.all' equal to zero. The regular observations and the good leverage
% points have 'result.flag.all' equal to one.
% result.class : 'RPCR'
% result.classic : If the input argument 'classic' is equal to one, this structure
% contains results of the classical PCR analysis (see also cpcr.m).
% result.robpca : Full output of the robust PCA analysis (see robpca.m)
% result.lts : If there is one response variable: full output of the LTS regression.
% (see ltsregres.m)
% result.mcdreg : If there are several response variables: full output of the MCD regression.
% (see mcdregres.m)
%
% This function is part of LIBRA: the Matlab Library for Robust Analysis,
% available at:
% http://wis.kuleuven.be/stat/robust.html
%
% Written by Sabine Verboven
% Created on: 31/10/2000
% Last Update: 09/04/2004, 03/07/2006
% Last Revision: 04/08/2006
%
% checking input
%
if rem(nargin,2)~=0
error('Number of inputarguments must be even!');
end
%
% initialization with defaults
%
counter=1;
[n,p]=size(x);
r=rank(x);
[n2,q]=size(y);
if n~=n2
error('The response variables and the predictor variables have a different number of observations.')
end
kmax=min([10,floor(n/2),r]);
alfa=0.75;
k=0;
h=floor(2*floor((n+kmax+2)/2)-n+2*(n-floor((n+kmax+2)/2))*alfa);
default=struct('intadjust', 0, 'alpha',alfa,'h',h,'k',k,'kmax',kmax,'plots',1,...
'rmsecv',0,'classic',0,'labsd',3,'labod',3,'labresd',3,'rmsep',0);
list=fieldnames(default);
options=default;
IN=length(list);
i=1;
%
if nargin>2
%
% placing inputfields in array of strings
%
for j=1:nargin-2
if rem(j,2)~=0
chklist{i}=varargin{j};
i=i+1;
end
end
dummy=sum(strcmp(chklist,'h')+2*strcmp(chklist,'alpha'));
switch dummy
case 0 % defaultvalues should be taken
options.alpha=alfa;
options.h=h;
case 3
error('Both inputarguments alpha and h are provided. Only one is required.')
end
%
% Checking which default parameters have to be changed
% and keep them in the structure 'options'.
%
while counter<=IN
index=strmatch(list(counter,:),chklist,'exact');
if ~isempty(index) % in case of similarity
for j=1:nargin-2 % searching the index of the accompanying field
if rem(j,2)~=0 % fieldnames are placed on odd index
if strcmp(chklist{index},varargin{j})
I=j;
end
end
end
options=setfield(options,chklist{index},varargin{I+1});
index=[];
end
counter=counter+1;
end
options.h=floor(options.h);
options.kmax=floor(options.kmax);
options.k=floor(options.k);
options.labsd=max(0,min(floor(options.labsd),n));
options.labod=max(0,min(floor(options.labod),n));
options.labresd=max(0,min(floor(options.labresd),n));
kmax=min([options.kmax,floor(n/2),r]);
dummyh = strcmp(chklist,'h');
dummykmax = strcmp(chklist,'kmax');
if all(dummyh == 0) && any(dummykmax)
h = floor(2*floor((n+kmax+1)/2)-n+2*(n-floor((n+kmax+1)/2))*alfa);
end
if options.k < 0
options.k=0;
elseif options.k > kmax
mess=sprintf(['Attention (rpcr.m): The required number of principal components, k = ',num2str(options.k)...
,'\n is larger than kmax= ',num2str(kmax),'; k is set to ',num2str(kmax)]);
disp(mess)
options.k=kmax;
end
if dummy==1% checking inputvariable h
if options.h-floor(options.h)~=0
mess=sprintf(['Attention (rpcr.m): h must be an integer. \n']);
disp(mess)
end
if options.k==0
if options.h<floor((n+kmax+1)/2 )
options.h = quanf(0.5,n,kmax);
mess=sprintf(['Attention (rpcr.m): h should be larger than (n+2)/2.\n',...
'It is set to its minimum value ',num2str(options.h)]);
disp(mess)
end
options.alpha=options.h/n;
else
if options.h<floor((n+options.k+1)/2) % checking inputvariable h
options.h = quanf(0.5,n,k);
mess=sprintf(['Attention (rpcr.m): h should be larger than (n+2)/2.\n',...
'It is set to its minimum value ',num2str(options.h)]);
disp(mess)
end
options.alpha=options.h/n;
end
if options.h>n
options.alpha=0.75;
if options.k==0
options.h=floor(2*floor((n+kmax+2)/2)-n+2*(n-floor((n+kmax+2)/2))*options.alpha);
else
options.h=floor(2*floor((n+options.k+1)/2)-n+2*(n-floor((n+options.k+1)/2))*options.alpha);%k+1 ipv k?
end
mess=sprintf(['Attention (rpcr.m): h should be smaller than n. \n',...
'It is set to its default value ',num2str(options.h)]);
disp(mess)
end
elseif dummy==2
if options.alpha < 0.5
options.alpha=0.5;
mess=sprintf(['Attention (rpcr.m): Alpha should be larger than 0.5. \n',...
'It is set to 0.5.']);
disp(mess)
end
if options.alpha > 1
options.alpha=0.75;
mess=sprintf(['Attention (rpcr.m): Alpha should be smaller than 1.\n',...
'It is set to 0.75.']);
disp(mess)
end
if options.k==0
options.h=floor(2*floor((n+kmax+2)/2)-n+2*(n-floor((n+kmax+2)/2))*options.alpha);
else
options.h=floor(2*floor((n+options.k+1)/2)-n+2*(n-floor((n+options.k+1)/2))*options.alpha);%k+1 ipv k?
end
end
end
%
%
% MAIN PART
%
% If y is univariate, perform LTS regression, else MCD regression
if q==1
options.method='lts';
else
options.method='mcd';
end
% Default value for the number of latent variables k
if n<kmax+2
error('You need more data points to compute this estimator.')
end
%
% initializing and checking k
if options.k==0 % no optimal number of latent variables given by user
kfixed = 0;
if options.rmsecv==0 % no cross validation asked -> calculate R2
[R2,final]=rsquared(x,y,kmax,'RPCR',options.h);
rss = final.rss;
options.k=final.k;
outrobpca=robpca(x,'h',options.h,'k',options.k,'kmax',kmax,'plots',0,'classic',options.classic);
options.k=outrobpca.k;
final=regression(x,y,options.k,outrobpca,options);
if options.rmsep==1 % the rmsep is asked, without prior knowledge
rmse=rrmse(x,y,options.h,kmax,'RPCR',0,options.k);
final.rmsep=rmse.rmsep;
end
final.robpca=outrobpca;
final.R2 = R2;
final.rss = rss;
else % RMSE with cross validation
crosscv=rrmse(x,y,options.h,kmax,'RPCR');
options.k=crosscv.k;
rmsecv=crosscv.rmsecv;
pred=rrmse(x,y,options.h,kmax,'RPCR',0,options.k,crosscv.weight,crosscv.res); %calculate rmsep with rmsecv weights and residuals
outrobpca=robpca(x,'h',options.h,'k',options.k,'kmax',kmax,'plots',0,'classic',options.classic);
options.k=outrobpca.k;
final=regression(x,y,options.k,outrobpca,options);
final.robpca=outrobpca;
final.rmsecv=rmsecv;
final.rmsep=pred.rmsep;
final.R2 = crosscv.R2;
final.rss = crosscv.rss;
end
else % optimal number of latent variables is given by user
if options.rmsecv
error(['Both RMSECV and k were given.', ...
'Please rerun your analysis with one of these inputs. (see help file)'])
end
kfixed = 1;
k=min(r,min(options.k,kmax));
options.k=k;
outrobpca=robpca(x,'h',options.h,'k',options.k,'kmax',kmax,'plots',0,'classic',options.classic);
options.k=outrobpca.k;
final=regression(x,y,options.k,outrobpca,options);
if options.rmsep==1 % the rmsep is asked, without prior knowledge
rmse=rrmse(x,y,options.h,kmax,'RPCR',0,options.k);
final.rmsep=rmse.rmsep;
end
final.robpca=outrobpca;
end
final.h=options.h;
final.k=options.k;
final.alpha=options.alpha;
% scores-distances
final.sd=final.robpca.sd;
quan=chi2inv(0.975,final.k);
final.cutoff.sd=quan^0.5;
% orthogonal distances
final.od=final.robpca.od;
final.cutoff.od=final.robpca.cutoff.od;
final.x=x;
final.y=y;
if options.classic==1
final.classic=cpcr(x,y,'k',final.k,'plots',0);
else
final.classic=0;
end
if strmatch(options.method,'mcd','exact')
final.resd=final.mcdreg.resd;
final.cutoff.resd=final.mcdreg.cutoff.resd;
else
final.resd=final.lts.res/final.lts.scale;
final.cutoff.resd=sqrt(chi2inv(0.975,1));
end
final.class='RPCR';
%Computing flags
final.flag.od=final.od<=final.cutoff.od;
final.flag.resd=abs(final.resd)<=final.cutoff.resd;
final.flag.all=(final.flag.od & final.flag.resd);
% Assigning output
if kfixed == 0
final.rcs = [final.R2;sqrt(final.rss)];
else
final.rcs = 0;
end
if options.rmsecv~=0
gammahalf = 0.5*sqrt(final.rss) + 0.5*final.rmsecv;
final.rcs = [final.rcs;gammahalf;final.rmsecv];
rmsecv_ind = 1;
else
rmsecv_ind = 0;
end
if options.rmsep~=1 && rmsecv_ind == 0
final.rmsep=0;
end
if isfield(final,'lts')
result=struct('slope',{final.slope}, 'int',{final.int}, 'fitted',{final.fitted},'res',{final.res},...
'cov',{final.cov},'rsquared',{final.rsquared},'rcs',{final.rcs},'rmsep',...
{final.rmsep},'k',{final.k},'alpha',{final.alpha},'h',{final.h},'sd', {final.sd},'od',{final.od},'resd',{final.resd},...
'cutoff',{final.cutoff},'flag',{final.flag},'class',{final.class},'classic',{final.classic},...
'robpca',{final.robpca},'lts',{final.lts});
else
result=struct('slope',{final.slope}, 'int',{final.int}, 'fitted',{final.fitted},'res',{final.res},...
'cov',{final.cov},'rsquared',{final.rsquared},'rcs',{final.rcs},'rmsep',{final.rmsep},...
'k',{final.k},'alpha',{final.alpha},'h',{final.h},'sd', {final.sd},'od',{final.od},'resd',{final.resd},...
'cutoff',{final.cutoff},'flag',{final.flag},'class',{final.class},'classic',{final.classic},...
'robpca',{final.robpca},'mcdreg',{final.mcdreg});
end
if result.rcs == 0
result = rmfield(result,'rcs');
end
if result.rmsep==0
result=rmfield(result,'rmsep');
end
% Plots
try
if options.plots && options.classic
makeplot(result,'classic',1,'labsd',options.labsd,'labod',options.labod,'labresd',options.labresd)
elseif options.plots
makeplot(result,'labsd',options.labsd,'labod',options.labod,'labresd',options.labresd)
end
catch %output must be given even if plots are interrupted
%> delete(gcf) to get rid of the menu
end
%---------------------------------
function out=regression(x,y,d,pre_out,options)
switch options.method
case 'mcd' % MCD regression for multivariate responses
out.mcdreg=mcdregres(pre_out.T(:,1:d),y,'k',options.k,'h',options.h,'plots',0);
if ~isstruct(out.mcdreg)
return
end
%%coefficients in the original space
out.slope=pre_out.P(:,1:d)*out.mcdreg.slope;
out.int=out.mcdreg.int-pre_out.M*pre_out.P(:,1:d)*out.mcdreg.slope;
out.fitted=[pre_out.T(:,1:d) ones(size(pre_out.T(:,1:d),1),1)]*[out.mcdreg.slope; out.mcdreg.int];
out.res=y-out.fitted;
out.cov=out.mcdreg.cov; % covariance matrix of residuals (capital sigma)
out.name='Multivariate robust MCD-regression';
out.rsquared=out.mcdreg.rsquared;
case 'lts'
[out.lts,raw]=ltsregres(pre_out.T(:,1:d),y,'plots',0,'h',options.h,'intadjust',options.intadjust);
%%coefficients in the original space;
out.lts.weights=raw.wt;
out.slope=pre_out.P(:,1:d)*out.lts.slope;
out.int=out.lts.int-pre_out.M*pre_out.P(:,1:d)*out.lts.slope;
out.fitted=[pre_out.T(:,1:d) ones(size(pre_out.T(:,1:d),1),1)]*[out.lts.slope; out.lts.int];
out.res=y-out.fitted;
out.cov=out.lts.scale.^2;
if out.cov==0
mess=sprintf(['Attention (rpcr.m): No standardized residuals could be calculated \n',...
'because their robust scale is zero.']);
disp(mess)
out.stdres=NaN;
else
out.stdres=out.res/out.lts.scale;
end
out.rsquared=out.lts.rsquared;
out.name='Robust LTS regression';
end
%-----------------------------------------------------------------------------------------
function quan=quanf(alfa,n,rk)
quan=floor(2*floor((n+rk+1)/2)-n+2*(n-floor((n+rk+1)/2))*alfa);