ECG-Kit 1.0
(18,764 bytes)
function result = cvRpcr(x,y,kmax,rmsecv,h,k)
%CVRPCR calculates the robust RMSECV (root mean squared error of cross-validation) curve
% for RPCR or the robust RMSEP (root mean squared error of prediction) value in a fast way.
% The R-RMSECV curve can be used to make a selection of the optimal number of
% components to include in the regression model. The function is used in rpcr.m.
%
% Input arguments:
% x : the explanatory variables
% y : the response variables
% kmax : the maximal number of components to be considered in the model.
% rmsecv : Optional. If equal to 1 (default), the rmsecv is computed.
% Else, rmsecv = 0 and then the rss and R2 are computed.
% h : the quantile used in RPCR
% k : optional, if equal to zero (default), the RMSECV is calculated.
% If different from 0, the RMSEP value is calculated.
% Output:
% If rmsecv = 1
% result.rmsecv : the r-rmsecv values (only if the RMSECV is computed)
% result.rmsep : the rmsep value (only if RMSEP is computed)
% result.residu : the residuals for every k = 1,...,kmax
% result.outWeights : the fixed weights used in the robust version of the RMSE.
% result.R2 : the coefficient of determination for every k.
% result.rss : the RSS values for every k.
%
% This function is part of LIBRA: the Matlab Library for Robust Analysis,
% available at:
% http://wis.kuleuven.be/stat/robust.html
%
% Written by Sanne Engelen
% Last Update: 08/10/2004, 03/07/2006
% Last Revision: 03/07/2006
% some initialisations
n = size(x,1);
p = size(x,2);
q = size(y,2);
teller_if_lus = 0;
cutoffWeights = sqrt(chi2inv(0.975,q));
if nargin < 4
alfa=0.75;
h=floor(2*floor((n+kmax+2)/2)-n+2*(n-floor((n+kmax+2)/2))*alfa);
k = 0;
rmsecv = 1;
elseif nargin == 4
alfa=0.75;
h=floor(2*floor((n+kmax+2)/2)-n+2*(n-floor((n+kmax+2)/2))*alfa);
k = 0;
elseif nargin == 5
k = 0;
end
if q == 1
FixedWeights = weightscvLtsregres(x,y,kmax,h,k);
else
FixedWeights = weightscvMcdregres(x,y,kmax,h,k,cutoffWeights);
end
kmax=FixedWeights.kmax;
if rmsecv
weights = FixedWeights.weightsk;
resrob = FixedWeights.resrob;
if k == 0
w_min = FixedWeights.w_min;
end
if size(resrob.Hsubsets.H0,2)==1
resrob.Hsubsets.H0=resrob.Hsubsets.H0';
end
Hsets = [FixedWeights.Hsets;resrob.Hsubsets.H0;resrob.Hsubsets.H1;resrob.Hsubsets.Hfreq];
for i = 1:n
disp(['observation ',num2str(i),' is left out'])
x_min_i = removal(x,i,0);
y_min_i = removal(y,i,0);
same.value = 0;
if isempty(find(resrob.Hsubsets.H0 == i))
if teller_if_lus >= 1
same.value = 1;
end
teller_if_lus = teller_if_lus + 1;
end
% constructing Hsets of right size: h - 1.
Hsets_min_i = RemoveObsHsets(Hsets,i);
if k == 0
res = removeObsRobpca(x,i,kmax,Hsets_min_i,same,1);
else
res = removeObsRobpca(x,i,k,Hsets_min_i,same);
end
if isempty(find(resrob.Hsubsets.H0 == i))
same.res = res;
end
Prob_min_i = res.Pk_min_i;
murob_min_i = res.muk_min_i;
Tk_min_i = [];
Lk_min_i = [];
coeffk_min_i = [];
if k == 0
j_ind = kmax;
else
j_ind = k;
end
Tkmax_min_i = (x_min_i - repmat(murob_min_i,n-1,1))*Prob_min_i(:,1:j_ind);
if q == 1
[resLts2,rawLts2] = ltsregres(Tkmax_min_i,y_min_i,'plots',0,'Hsets',Hsets_min_i,'h',h-1);
else
if k == 0
Mcdres = mcdcov([Tkmax_min_i,y_min_i],'h',h-1,'plots',0,'Hsets',Hsets_min_i);
% the full mcdregres is not needed, we only need the results of mcdcov.
resMcd2.Mu = Mcdres.center';
resMcd2.Sigma = Mcdres.cov;
end
end
if k == 0
j_start1 = 1;
j_end1 = kmax;
else
j_start1 = k;
j_end1 = k;
end
for j = j_start1:j_end1
if k == 0
if q == 1
[Bk,intk,weightslts] = extractlts(rawLts2,x_min_i, y_min_i, murob_min_i, Prob_min_i,kmax,j,h-1,n-1);
else
Tk_min_i = (x_min_i - repmat(murob_min_i,n-1,1))*Prob_min_i(:,1:j);
[Bk,intk,sigmayykmaxrew_k,sigmattkmaxrew_k] = extractmcdregres(resMcd2,Tk_min_i,y_min_i,kmax,n-1,q,j,h-1,cutoffWeights);
coeffk = [Bk;intk];
end
else
if q == 1
Bk = resLts2.slope;
intk = resLts2.int;
weightslts = resLts2.flag;
else
resMcdregres = mcdregres(Tkmax_min_i,y_min_i,'Hsets',Hsets_min_i,'plots',0,'h',h-1);
Bk = resMcdregres.slope;
intk = resMcdregres.int;
coeffk = [Bk;intk];
end
end
finalB = Prob_min_i(:,1:j)*Bk;
finalInt = intk - murob_min_i*finalB;
yhat_min_i = x(i,:)*finalB + finalInt;
residk_min_i(i,(j - 1)*q + 1:j*q) = y(i,:) - yhat_min_i;
% calculation of the resd:
if q > 1
if k == 0
rewE2=sigmayykmaxrew_k-coeffk(1:j,1:q)'*sigmattkmaxrew_k*coeffk(1:j,1:q);
cen=zeros(q,1);
resd(i,j)=sqrt(libra_mahalanobis(residk_min_i(i,(j - 1)*q + 1:j*q),cen','cov',rewE2))'; %robust distances of residuals
else
resd(i,j) = sqrt(libra_mahalanobis(residk_min_i(i,(j - 1)*q + 1:j*q),zeros(q,1),'cov',resMcdregres.cov));
end
else
if k == 0
scale=sqrt(sum(weightslts.*residk_min_i(i,(j - 1)*q + 1:j*q).^2)/(sum(weightslts)-1));
resd(i,j) = residk_min_i(i,(j-1)*q + 1:j*q)/scale;
else
resd(i,j) = residk_min_i(i,(j-1)*q + 1:j*q)/resLts2.scale;
end
end
end
end
if k == 0
for j = 1:kmax
resk = residk_min_i(:,(j-1)*q + 1:j*q);
if q == 1
rmsecv_min(j) = sqrt(1/sum(w_min)*w_min*(resk).^2);
else
rmsecv_min(j) = sqrt(1/sum(w_min)*w_min*(mean((resk').^2))');
end
end
result.rmsecv = rmsecv_min;
result.residu = residk_min_i;
else
weights = weights(:,k);
if q == 1
rmsep = sqrt(1/sum(weights)*weights'*(residk_min_i(:,k)).^2);
else
rmsep = sqrt(1/sum(weights)*weights'*(mean((residk_min_i(:,(k-1)*q + 1:k*q)').^2))');
end
result.rmsep = rmsep;
result.residu = residk_min_i(:,(k-1)*q + 1:k*q);
end
end
result.outWeights = FixedWeights;
result.rss = FixedWeights.rss;
result.R2 = FixedWeights.R2;
%-------------------------------------------------------------------------
function Hsets_min_i = RemoveObsHsets(Hsets,i)
% Removes the correct index from the $h$-subsets in Hsets to
% obtain (h - 1)-subsets.
% Every h-set is put as a row in Hsets.
% i is the index of the observation that is removed from the whole data.
for r = 1:size(Hsets,1)
if ~isempty(find(Hsets(r,:)== i))
Hsets_min_i(r,:) = removal(Hsets(r,:),0,find(Hsets(r,:) == i));
else
Hsets_min_i(r,:) = Hsets(r,1:(end-1));
end
for j = 1:length(Hsets_min_i(r,:))
if Hsets_min_i(r,j) > i
Hsets_min_i(r,j) = Hsets_min_i(r,j) - 1;
end
end
end
%------------------------------------------------------------------------
function out = weightscvLtsregres(x,y,kmax,h,k)
% Computes the weights needed in the R-RMSECV function.
%
% Input arguments:
% x : the independent variables
% y : the response variables
% kmax : the maximal number of components to be considered.
% h : the number of observations on which the calculations are based.
% k : if zero (default), the kmax approach is used (for RMSECV).
% Else robpca is calculated for a specific k (for RMSEP).
%
% Output arguments:
% if k = 0 (RMSECV):
% out.w_min : the weights obtained by taking the minimum over all k
% out.weightsk : the weights for every observation and every k (n x kmax).
% out.resrob : the results of robpca on x for kmax components.
% if k ~= 0 (RMSEP):
% out.weightsk : the weights for a specific k
% out.resrob : the results of robpca on x for k components.
% out.R2 : the weighted Rsquared for each value of k
% out.rss : the weighted rss for each value of k
n = size(x,1);
p = size(x,2);
q = size(y,2);
if nargin < 5
k = 0;
end
if k == 0
ResRobWhole = robpca(x,'plots',0,'kmax',kmax,'h',h,'k',kmax);
else
ResRobWhole = robpca(x,'plots',0,'kmax',kmax,'h',h,'k',k);
end
kmax = ResRobWhole.kmax;
Trob = ResRobWhole.T;
Mrob = ResRobWhole.M;
Prob = ResRobWhole.P;
HsetsH0 = ResRobWhole.Hsubsets.H0;
HsetsH1 = ResRobWhole.Hsubsets.H1;
HsetsHfreqrob = ResRobWhole.Hsubsets.Hfreq;
[ResLtsWhole,RawLtsWhole]=ltsregres(Trob,y,'plots',0,'h',h);
HsetsHfreq = ResLtsWhole.Hsubsets.Hfreq;
if k == 0
j_start = 1;
j_end = kmax;
else
j_start = k;
j_end = k;
end
for j = j_start:j_end
if k == 0
if j ~= kmax
[Bk,intk,weightslts,HsetsHopt(j,:)] = extractlts(RawLtsWhole,x,y,Mrob,Prob,kmax,j,h,n);
else
[Bk,intk,weightslts] = extractlts(RawLtsWhole,x,y,Mrob,Prob,kmax,j,h,n);
HsetsHopt(kmax,:) = ResLtsWhole.Hsubsets.Hopt';
end
else
Bk = ResLtsWhole.slope;
intk = ResLtsWhole.int;
weightslts = ResLtsWhole.flag;
HsetsHopt = ResLtsWhole.Hsubsets.Hopt';
end
coeffk=[Bk;intk];
finalB = Prob(:,1:j)*Bk;
finalInt = intk - Mrob*finalB;
fitted=x*finalB + repmat(finalInt,n,1);
residuals(:,j)=y-fitted;
scale=sqrt(sum(weightslts.*residuals(:,j).^2)/(sum(weightslts)-1));
s0=scale;
weightsk(:,j)=abs(residuals(:,j)/scale)<=sqrt(chi2inv(0.975,1));
end
% Determining fixed weights.
if k == 0
if kmax == 1
w_min = weightsk';
else
w_min = min(weightsk');
end
out.w_min = w_min;
yw = mean(y(w_min == 1,:));
y2=(y-repmat(yw,n,1)).^2;
R = residuals.^2;
D=sum(y2(w_min==1,:));
for j = 1:kmax
R1=R(w_min==1,j);
rss(j) = sum(sum(R1));
R2(j)=1-rss(j)/sum(D);
end
out.rss = 1/sum(w_min)*rss;
out.R2 = R2;
else
yw = mean(y(weightsk(:,k) == 1,:));
y2=(y-repmat(yw,n,1)).^2;
R = residuals.^2;
D=sum(y2(weightsk(:,k)==1,:));
R1=R(weightsk(:,k)==1,:);
rss(k) = sum(sum(R1));
R2(k)=1-rss(k)/sum(D);
out.rss = 1/sum(weightsk(:,k))*rss(k);
out.R2 = R2(k);
end
out.weightsk = weightsk;
out.resrob = ResRobWhole;
if size(HsetsH0,2)==1
HsetsH0=HsetsH0';
end
out.kmax=kmax;
out.Hsets = [HsetsHopt;HsetsHfreq;HsetsH0;HsetsH1;HsetsHfreqrob];
%-----------------------------------------------------------------------------
function [Bk,intk,weightslts,Hopt] = extractlts(rawltskmax, x_min_i, y_min_i, murob_min_i, Prob_min_i,kmax,k,h,n)
% This function extracts lts-regression coefficients for a certain k based on
% the slope and intercept of the regression with kmax coefficients.
%
% Input arguments:
% rawltskmax : the raw results of ltsregression with kmax components.
% x_min_i : the regressors without observation i.
% y_min_i : the dependent variable without observation i.
% murob_min_i : the center estimated using robpca on the data minus observation i.
% Prob_min_i : the loadings matrix estimated using robpca on the data minus observation i.
% k : the number of components
%
% Output arguments:
% Bk : the slope for k components.
% intk : the intercept for k components.
% weightslts : the weights needed for the reweighting.
% Hopt : the optimal h-subset. Not availabe for kmax.
j = k;
coeffkraw_min_i = rawltskmax.coefficients;
sloperaw = coeffkraw_min_i(1:j,:);
intraw = coeffkraw_min_i(end,:);
Tk_min_i = (x_min_i - repmat(murob_min_i,n,1))*Prob_min_i(:,1:j);
% perform csteps on the raw estimates of the parameters:
prevobj = 0;
if j ~= kmax
for noCsteps = 1:10
residu = y_min_i - Tk_min_i*sloperaw - repmat(intraw,n,1);
[sortresid,indsr] = sort(residu.^2);
obj = sum(sortresid(1:h));
[Q,R] = qr([Tk_min_i(indsr(1:h),:) ones(h,1)],0);
z = R\(Q'*y_min_i(indsr(1:h),1));
sloperaw = z(1:j,:);
intraw = z(j+1,:);
if noCsteps >= 20 | abs(obj - prevobj) < 10^(-4)
if noCsteps >= 20
disp('no convergence in Csteps')
end
break
end
prevobj = obj;
end
Hopt = indsr(1:h)';
end
% reweighting:
factor =rawconsfactorlts(h,n);
residu = y_min_i - Tk_min_i*sloperaw - repmat(intraw,n,1);
[sortresid,indsr] = sort(residu.^2);
sh0 = sqrt(1/(h)*sum(sortresid(1:h)));
s0 = sh0*factor;
m = 2*(n)/asvarscalekwad((h),(n));
quantile = tinv(0.9875,m);
weightslts = abs(residu/s0) <= quantile;
[Q,R] = qr([Tk_min_i(weightslts == 1,:) ones(sum(weightslts),1)],0);
z = R\(Q'*y_min_i(weightslts == 1));
Bk = z(1:j,:);
intk = z(j+1,:);
%----------------------------------------------------------------------------------------------------------------------
function out = weightscvMcdregres(x,y,kmax,h,k,cutoffWeights)
% computes the weights needed in the R-PRESS function.
%
% Input arguments:
% x : the independent variables
% y : the response variables
% kmax : the maximal number of components to be considered.
% h : the number of observations on which the calculations are based.
% k : if zero (default), the kmax approach is used then (for RMSECV).
% Else robpca is calculated for a specific k (for RMSEP).
%
% Output arguments:
% if k = 0 (RMSECV):
% out.w_min : the weights obtained by taking the minimum over all k
% out.weightsk : the weights for every observation and every k (n x kmax).
% out.resrob : the results of robpca on [X,Y] for kmax components.
% if k ~= 0 (RMSEP):
% out.weightsk : the weights for a specific k
% out.resrob : the results of robpca on [X,Y] for k components.
% out.R2 : the weighted Rsquared for each value of k
% out.rss : the weighted rss for each value of k
n = size(x,1);
p = size(x,2);
q = size(y,2);
if nargin < 5
k = 0;
end
if k == 0
ResRobWhole = robpca(x,'plots',0,'k',kmax,'kmax',kmax,'h',h);
else
ResRobWhole = robpca(x,'plots',0,'k',k,'kmax',kmax,'h',h);
end
kmax=ResRobWhole.kmax;
Trob = ResRobWhole.T;
Prob = ResRobWhole.P;
Mrob = ResRobWhole.M;
HsetsH0 = ResRobWhole.Hsubsets.H0;
HsetsH1 = ResRobWhole.Hsubsets.H1;
HsetsHfreqrob = ResRobWhole.Hsubsets.Hfreq;
ResMCDWhole = mcdregres(Trob,y,'h',h,'plots',0);
HsetsHfreq = ResMCDWhole.Hsubsets.Hfreq;
if k == 0
for j = 1:kmax
if j ~= kmax
if k == 0
Tk = (x - repmat(Mrob,n,1))*Prob(:,1:j);
[Bk,intk,sigmayykmaxrew_k,sigmattkmaxrew_k,HsetsHopt(j,:)] = extractmcdregres(ResMCDWhole,Tk,y,kmax,n,q,j,h,cutoffWeights);
else
Tk = (x - repmat(Mrob,n,1))*Prob(:,1:j);
[Bk,intk,sigmayykmaxrew_k,sigmattkmaxrew_k,HsetsHopt(j,:)] = extractmcdregres(ResMCDWhole,Tk,y,j,n,q,j,h,cutoffWeights);
end
else
[Bk,intk,sigmayykmaxrew_k,sigmattkmaxrew_k] = extractmcdregres(ResMCDWhole,Trob,y,kmax,n,q,j,h,cutoffWeights);
HsetsHopt(kmax,:) = ResMCDWhole.Hsubsets.Hopt;
end
coeffk = [Bk;intk];
finalB = Prob(:,1:j)*Bk;
finalInt = intk - Mrob*finalB;
yhat = x*finalB + repmat(finalInt,n,1);
residu(:,(j-1)*q+1:j*q) = y - yhat;
cen=zeros(q,1)';
cov=sigmayykmaxrew_k - coeffk(1:j,1:q)'*sigmattkmaxrew_k*coeffk(1:j,1:q);
[nn,pp]=size(residu(:,(j-1)*q+1:j*q));
resd = sqrt(libra_mahalanobis(residu(:,(j-1)*q+1:j*q),cen,'cov',cov))';
weightsk(:,j) = (abs(resd)<=cutoffWeights);
end
else
ResMCDWhole = mcdregres(Trob(:,1:k),y,'h',h,'plots',0);
if size(ResMCDWhole.flag,2)~=1
ResMCDWhole.flag = ResMCDWhole.flag';
end
weightsk(:,k) = ResMCDWhole.flag;
HsetsHfreq = ResMCDWhole.Hsubsets.Hfreq;
HsetsHopt = ResMCDWhole.Hsubsets.Hopt;
end
% Determining fixed weights.
if k == 0
if kmax == 1
w_min = weightsk';
else
w_min = min(weightsk');
end
out.w_min = w_min;
yw = mean(y(w_min == 1,:));
y2=(y-repmat(yw,n,1)).^2;
R = residu.^2;
D=sum(y2(w_min==1,:));
for j = 1:kmax
R1=R(w_min==1,(j-1)*q+1:j*q);
rss(j) = sum(sum(R1));
R2(j)=1-rss(j)/sum(D);
end
out.rss = 1/(q*sum(w_min))*rss;
out.R2 = R2;
else
yw = mean(y(weightsk(:,k) == 1,:));
y2=(y-repmat(yw,n,1)).^2;
R = ResMCDWhole.res.^2;
D=sum(y2(weightsk(:,k)==1,:));
R1=R(weightsk(:,k)==1,:);
rss = sum(sum(R1));
R2=1-rss/sum(D);
out.rss = 1/(q*sum(weightsk(:,k)))*rss;
out.R2 = R2;
end
out.weightsk = weightsk;
out.resrob = ResRobWhole;
if size(HsetsH0,2)==1
HsetsH0=HsetsH0';
end
out.kmax=kmax;
out.Hsets = [HsetsHopt;HsetsHfreq;HsetsH0;HsetsH1;HsetsHfreqrob];
%---------------------------------------------------------------
function rawconsfaclts=rawconsfactorlts(quan,n)
rawconsfaclts=(1/sqrt(1-((2*n)/(quan*(1/norminv((quan+n)/(2*n)))))*...
normpdf(1/(1/(norminv((quan+n)/(2*n)))))));
%--------------------------------------------------------------
function asvar=asvarscalekwad(quan,n)
alfa=quan/n;
alfa=1-alfa;
qalfa=chi2inv(1-alfa,1);
c2=gamcdf(qalfa/2,1/2+1);
c1=1/c2;
c3=3*gamcdf(qalfa/2,1/2+2);
asvar=qalfa*(1-alfa)-c2;
asvar=asvar^2;
asvar=(c3-2*qalfa*c2+(1-alfa)*(qalfa^2))-asvar;
asvar=c1^2*asvar;