ECG-Kit 1.0

File: <base>/common/LIBRA/cvRpcr.m (18,764 bytes)
function result = cvRpcr(x,y,kmax,rmsecv,h,k)

%CVRPCR calculates the robust RMSECV (root mean squared error of cross-validation) curve
% for RPCR or the robust RMSEP (root mean squared error of prediction) value in a fast way. 
% The R-RMSECV curve can be used to make a selection of the optimal number of 
% components to include in the regression model. The function is used in rpcr.m. 
%
% Input arguments:
%          x      : the explanatory variables
%          y      : the response variables
%          kmax   : the maximal number of components to be considered in the model.
%          rmsecv : Optional. If equal to 1 (default), the rmsecv is computed. 
%                   Else, rmsecv = 0 and then the rss and R2 are computed. 
%          h      : the quantile used in RPCR
%          k      : optional, if equal to zero (default), the RMSECV is calculated. 
%                   If different from 0, the RMSEP value is calculated.
% Output:
%  If rmsecv = 1
%     result.rmsecv  : the r-rmsecv values (only if the RMSECV is computed)
%     result.rmsep   : the rmsep value (only if RMSEP is computed)
%     result.residu  : the residuals for every k = 1,...,kmax
%  result.outWeights : the fixed weights used in the robust version of the RMSE. 
%  result.R2         : the coefficient of determination for every k.
%  result.rss        : the RSS values for every k.
%
% This function is part of LIBRA: the Matlab Library for Robust Analysis,
% available at: 
%              http://wis.kuleuven.be/stat/robust.html
%
% Written by Sanne Engelen 
% Last Update: 08/10/2004, 03/07/2006
% Last Revision: 03/07/2006

% some initialisations
n = size(x,1);
p = size(x,2);
q = size(y,2);
teller_if_lus = 0;
cutoffWeights = sqrt(chi2inv(0.975,q));

if nargin < 4
    alfa=0.75;
    h=floor(2*floor((n+kmax+2)/2)-n+2*(n-floor((n+kmax+2)/2))*alfa);
    k = 0;
    rmsecv = 1;
elseif nargin == 4
    alfa=0.75;
    h=floor(2*floor((n+kmax+2)/2)-n+2*(n-floor((n+kmax+2)/2))*alfa);
    k = 0;
elseif nargin == 5
    k = 0;
end

if q == 1   
    FixedWeights = weightscvLtsregres(x,y,kmax,h,k);
else
    FixedWeights = weightscvMcdregres(x,y,kmax,h,k,cutoffWeights);
end
kmax=FixedWeights.kmax;

if rmsecv 
    weights = FixedWeights.weightsk;
    resrob = FixedWeights.resrob;
    
    if k == 0
        w_min = FixedWeights.w_min;
    end
    if size(resrob.Hsubsets.H0,2)==1
        resrob.Hsubsets.H0=resrob.Hsubsets.H0';
    end
    Hsets = [FixedWeights.Hsets;resrob.Hsubsets.H0;resrob.Hsubsets.H1;resrob.Hsubsets.Hfreq];
    
    for i = 1:n
        disp(['observation ',num2str(i),' is left out'])
        x_min_i = removal(x,i,0);
        y_min_i = removal(y,i,0);
        
        same.value = 0;
        if isempty(find(resrob.Hsubsets.H0 == i))
            if teller_if_lus >= 1
                same.value = 1;
            end
            teller_if_lus = teller_if_lus + 1;
        end
        
        % constructing Hsets of right size: h - 1. 
        Hsets_min_i = RemoveObsHsets(Hsets,i);
        
        if k == 0
            res = removeObsRobpca(x,i,kmax,Hsets_min_i,same,1);
        else
            res = removeObsRobpca(x,i,k,Hsets_min_i,same);
        end
        
        if isempty(find(resrob.Hsubsets.H0 == i))
            same.res = res;
        end
        
        Prob_min_i = res.Pk_min_i;
        murob_min_i = res.muk_min_i;
        
        Tk_min_i = [];
        Lk_min_i = [];
        coeffk_min_i = [];
        
        if k == 0
            j_ind = kmax;
        else
            j_ind = k;
        end
        
        Tkmax_min_i = (x_min_i - repmat(murob_min_i,n-1,1))*Prob_min_i(:,1:j_ind);
        
        if q == 1
            [resLts2,rawLts2] = ltsregres(Tkmax_min_i,y_min_i,'plots',0,'Hsets',Hsets_min_i,'h',h-1);
        else
            if k == 0
                Mcdres = mcdcov([Tkmax_min_i,y_min_i],'h',h-1,'plots',0,'Hsets',Hsets_min_i);
                % the full mcdregres is not needed, we only need the results of mcdcov.
                resMcd2.Mu = Mcdres.center';
                resMcd2.Sigma = Mcdres.cov;
            end
        end
        
        if k == 0
            j_start1 = 1;
            j_end1 = kmax;
        else
            j_start1 = k;
            j_end1 = k;
        end
        
        for j = j_start1:j_end1
            if k == 0
                if q == 1
                    [Bk,intk,weightslts] = extractlts(rawLts2,x_min_i, y_min_i, murob_min_i, Prob_min_i,kmax,j,h-1,n-1);
                else
                    Tk_min_i = (x_min_i - repmat(murob_min_i,n-1,1))*Prob_min_i(:,1:j);
                    [Bk,intk,sigmayykmaxrew_k,sigmattkmaxrew_k] = extractmcdregres(resMcd2,Tk_min_i,y_min_i,kmax,n-1,q,j,h-1,cutoffWeights);
                    coeffk = [Bk;intk];
                end
            else
                if q == 1
                    Bk = resLts2.slope;
                    intk = resLts2.int;
                    weightslts = resLts2.flag;
                else
                    resMcdregres = mcdregres(Tkmax_min_i,y_min_i,'Hsets',Hsets_min_i,'plots',0,'h',h-1);
                    Bk = resMcdregres.slope;
                    intk = resMcdregres.int;
                    coeffk = [Bk;intk];
                end
            end
            finalB = Prob_min_i(:,1:j)*Bk;
            finalInt = intk - murob_min_i*finalB;
            yhat_min_i = x(i,:)*finalB + finalInt;
            residk_min_i(i,(j - 1)*q + 1:j*q) = y(i,:) - yhat_min_i;
            
            % calculation of the resd: 
            if q > 1
                if k == 0
                    rewE2=sigmayykmaxrew_k-coeffk(1:j,1:q)'*sigmattkmaxrew_k*coeffk(1:j,1:q);
                    cen=zeros(q,1);
                    resd(i,j)=sqrt(libra_mahalanobis(residk_min_i(i,(j - 1)*q + 1:j*q),cen','cov',rewE2))'; %robust distances of residuals
                else
                    resd(i,j) = sqrt(libra_mahalanobis(residk_min_i(i,(j - 1)*q + 1:j*q),zeros(q,1),'cov',resMcdregres.cov));
                end
            else
                if k == 0
                    scale=sqrt(sum(weightslts.*residk_min_i(i,(j - 1)*q + 1:j*q).^2)/(sum(weightslts)-1));
                    resd(i,j) = residk_min_i(i,(j-1)*q + 1:j*q)/scale;
                else
                    resd(i,j) = residk_min_i(i,(j-1)*q + 1:j*q)/resLts2.scale;
                end
            end
        end
    end
    
    if k == 0
        for j = 1:kmax
            resk = residk_min_i(:,(j-1)*q + 1:j*q);
            if q == 1
                rmsecv_min(j) = sqrt(1/sum(w_min)*w_min*(resk).^2);
            else
                rmsecv_min(j) = sqrt(1/sum(w_min)*w_min*(mean((resk').^2))');
            end
        end
        result.rmsecv = rmsecv_min;
        result.residu = residk_min_i;
    else
        weights = weights(:,k);
        if q == 1
            rmsep = sqrt(1/sum(weights)*weights'*(residk_min_i(:,k)).^2);
        else
            rmsep = sqrt(1/sum(weights)*weights'*(mean((residk_min_i(:,(k-1)*q + 1:k*q)').^2))');
        end
        result.rmsep = rmsep;
        result.residu = residk_min_i(:,(k-1)*q + 1:k*q);
    end
end

result.outWeights = FixedWeights;
result.rss =  FixedWeights.rss;
result.R2 = FixedWeights.R2;

%-------------------------------------------------------------------------
function Hsets_min_i = RemoveObsHsets(Hsets,i)

% Removes the correct index from the $h$-subsets in Hsets to 
% obtain (h - 1)-subsets.
% Every h-set is put as a row in Hsets.
% i is the index of the observation that is removed from the whole data.

for r = 1:size(Hsets,1)
    if ~isempty(find(Hsets(r,:)== i))
        Hsets_min_i(r,:) = removal(Hsets(r,:),0,find(Hsets(r,:) == i));
    else
        Hsets_min_i(r,:) = Hsets(r,1:(end-1));
    end
    
    for j = 1:length(Hsets_min_i(r,:))
        if Hsets_min_i(r,j) > i
            Hsets_min_i(r,j) = Hsets_min_i(r,j) - 1;
        end
    end
end
%------------------------------------------------------------------------
function out = weightscvLtsregres(x,y,kmax,h,k)

% Computes the weights needed in the R-RMSECV function.
%
% Input arguments: 
%   x    : the independent variables
%   y    : the response variables
%   kmax : the maximal number of components to be considered.
%   h    : the number of observations on which the calculations are based.
%   k    : if zero (default), the kmax approach is used (for RMSECV). 
%          Else robpca is calculated for a specific k (for RMSEP). 
%
% Output arguments:
%   if  k = 0 (RMSECV):
%       out.w_min    : the weights obtained by taking the minimum over all k 
%       out.weightsk : the weights for every observation and every k (n x kmax).
%       out.resrob   : the results of robpca on x for kmax components.
%   if k ~= 0 (RMSEP):
%       out.weightsk : the weights for a specific k
%       out.resrob   : the results of robpca on x for k components.
%   out.R2       : the weighted Rsquared for each value of k
%   out.rss      : the weighted rss for each value of k


n = size(x,1);
p = size(x,2);
q = size(y,2);

if nargin < 5
    k = 0;
end

if k == 0
    ResRobWhole = robpca(x,'plots',0,'kmax',kmax,'h',h,'k',kmax);
else
    ResRobWhole = robpca(x,'plots',0,'kmax',kmax,'h',h,'k',k);
end
kmax = ResRobWhole.kmax;
Trob = ResRobWhole.T;
Mrob = ResRobWhole.M;
Prob = ResRobWhole.P;
HsetsH0 = ResRobWhole.Hsubsets.H0;
HsetsH1 = ResRobWhole.Hsubsets.H1;
HsetsHfreqrob = ResRobWhole.Hsubsets.Hfreq;

[ResLtsWhole,RawLtsWhole]=ltsregres(Trob,y,'plots',0,'h',h);

HsetsHfreq = ResLtsWhole.Hsubsets.Hfreq;

if k == 0
    j_start = 1;
    j_end = kmax;
else
    j_start = k;
    j_end = k;
end

for j = j_start:j_end
    if k == 0
        if j ~= kmax
            [Bk,intk,weightslts,HsetsHopt(j,:)] = extractlts(RawLtsWhole,x,y,Mrob,Prob,kmax,j,h,n);
        else
            [Bk,intk,weightslts] = extractlts(RawLtsWhole,x,y,Mrob,Prob,kmax,j,h,n);
            HsetsHopt(kmax,:) =  ResLtsWhole.Hsubsets.Hopt';
        end
    else
        Bk = ResLtsWhole.slope;
        intk = ResLtsWhole.int;
        weightslts = ResLtsWhole.flag;
        HsetsHopt = ResLtsWhole.Hsubsets.Hopt';
    end
    coeffk=[Bk;intk];
    finalB = Prob(:,1:j)*Bk;
    finalInt = intk - Mrob*finalB;
    fitted=x*finalB + repmat(finalInt,n,1);
    residuals(:,j)=y-fitted;
    scale=sqrt(sum(weightslts.*residuals(:,j).^2)/(sum(weightslts)-1));
    s0=scale;
    weightsk(:,j)=abs(residuals(:,j)/scale)<=sqrt(chi2inv(0.975,1));
end
    
% Determining fixed weights.
if k == 0
    if kmax == 1
        w_min = weightsk';
    else
        w_min = min(weightsk');
    end
    out.w_min = w_min;

    yw = mean(y(w_min == 1,:));
    y2=(y-repmat(yw,n,1)).^2;
    R = residuals.^2;
    D=sum(y2(w_min==1,:));
    for j = 1:kmax
        R1=R(w_min==1,j); 
        rss(j) = sum(sum(R1));
        R2(j)=1-rss(j)/sum(D); 
    end
    out.rss = 1/sum(w_min)*rss;
    out.R2 = R2;
else
    yw = mean(y(weightsk(:,k) == 1,:));
    y2=(y-repmat(yw,n,1)).^2;
    R = residuals.^2;
    D=sum(y2(weightsk(:,k)==1,:));
    R1=R(weightsk(:,k)==1,:); 
    rss(k) = sum(sum(R1));
    R2(k)=1-rss(k)/sum(D); 
    out.rss = 1/sum(weightsk(:,k))*rss(k);
    out.R2 = R2(k);
end
out.weightsk = weightsk;
out.resrob = ResRobWhole;
if size(HsetsH0,2)==1
    HsetsH0=HsetsH0';
end
out.kmax=kmax;
out.Hsets = [HsetsHopt;HsetsHfreq;HsetsH0;HsetsH1;HsetsHfreqrob];
%-----------------------------------------------------------------------------
function [Bk,intk,weightslts,Hopt] = extractlts(rawltskmax, x_min_i, y_min_i, murob_min_i, Prob_min_i,kmax,k,h,n)

% This function extracts lts-regression coefficients for a certain k based on 
% the slope and intercept of the regression with kmax coefficients. 
%
% Input arguments: 
%  rawltskmax  : the raw results of ltsregression with kmax components. 
%  x_min_i     : the regressors without observation i.
%  y_min_i     : the dependent variable without observation i.
%  murob_min_i : the center estimated using robpca on the data minus observation i.
%  Prob_min_i  : the loadings matrix estimated using robpca on the data minus observation i. 
%  k           : the number of components
%
% Output arguments:
%  Bk         : the slope for k components. 
%  intk       : the intercept for k components.
%  weightslts : the weights needed for the reweighting. 
%  Hopt       : the optimal h-subset. Not availabe for kmax. 

j = k;
coeffkraw_min_i = rawltskmax.coefficients;
sloperaw = coeffkraw_min_i(1:j,:);
intraw = coeffkraw_min_i(end,:);
Tk_min_i = (x_min_i - repmat(murob_min_i,n,1))*Prob_min_i(:,1:j);

% perform csteps on the raw estimates of the parameters:
prevobj = 0;
if j ~= kmax
    for noCsteps = 1:10
        residu = y_min_i - Tk_min_i*sloperaw - repmat(intraw,n,1);
        [sortresid,indsr] = sort(residu.^2);
        obj = sum(sortresid(1:h));
        [Q,R] = qr([Tk_min_i(indsr(1:h),:) ones(h,1)],0);
        z = R\(Q'*y_min_i(indsr(1:h),1));
        sloperaw = z(1:j,:);
        intraw = z(j+1,:);
        if noCsteps >= 20 | abs(obj - prevobj) < 10^(-4)
           if noCsteps >= 20
               disp('no convergence in Csteps')
           end
           break
        end
        prevobj = obj;
    end
    Hopt = indsr(1:h)';
end

% reweighting:
factor =rawconsfactorlts(h,n);
residu = y_min_i - Tk_min_i*sloperaw - repmat(intraw,n,1);
[sortresid,indsr] = sort(residu.^2);
sh0 = sqrt(1/(h)*sum(sortresid(1:h)));
s0 = sh0*factor;
m = 2*(n)/asvarscalekwad((h),(n));
quantile = tinv(0.9875,m);
weightslts = abs(residu/s0) <= quantile;
[Q,R] = qr([Tk_min_i(weightslts == 1,:) ones(sum(weightslts),1)],0);
z = R\(Q'*y_min_i(weightslts == 1));
Bk = z(1:j,:);
intk = z(j+1,:);
%----------------------------------------------------------------------------------------------------------------------
function out = weightscvMcdregres(x,y,kmax,h,k,cutoffWeights)

% computes the weights needed in the R-PRESS function.
%
% Input arguments: 
%   x    : the independent variables
%   y    : the response variables
%   kmax : the maximal number of components to be considered.
%   h    : the number of observations on which the calculations are based.
%   k    : if zero (default), the kmax approach is used then (for RMSECV). 
%          Else robpca is calculated for a specific k (for RMSEP). 
%
% Output arguments:
%   if  k = 0 (RMSECV):
%       out.w_min    : the weights obtained by taking the minimum over all k 
%       out.weightsk : the weights for every observation and every k (n x kmax).
%       out.resrob   : the results of robpca on [X,Y] for kmax components.
%   if k ~= 0 (RMSEP):
%       out.weightsk : the weights for a specific k
%       out.resrob   : the results of robpca on [X,Y] for k components.
%   out.R2       : the weighted Rsquared for each value of k
%   out.rss      : the weighted rss for each value of k

n = size(x,1);
p = size(x,2);
q = size(y,2);

if nargin < 5
    k = 0;
end

if k == 0
    ResRobWhole = robpca(x,'plots',0,'k',kmax,'kmax',kmax,'h',h);
else
    ResRobWhole = robpca(x,'plots',0,'k',k,'kmax',kmax,'h',h);
end
kmax=ResRobWhole.kmax;
Trob = ResRobWhole.T;
Prob = ResRobWhole.P;
Mrob = ResRobWhole.M;
HsetsH0 = ResRobWhole.Hsubsets.H0;
HsetsH1 = ResRobWhole.Hsubsets.H1;
HsetsHfreqrob = ResRobWhole.Hsubsets.Hfreq;

ResMCDWhole = mcdregres(Trob,y,'h',h,'plots',0);
HsetsHfreq = ResMCDWhole.Hsubsets.Hfreq;
    
if k == 0
    for j = 1:kmax
        if j ~= kmax
            if k == 0
                Tk = (x - repmat(Mrob,n,1))*Prob(:,1:j);
                [Bk,intk,sigmayykmaxrew_k,sigmattkmaxrew_k,HsetsHopt(j,:)] = extractmcdregres(ResMCDWhole,Tk,y,kmax,n,q,j,h,cutoffWeights);
            else
                Tk = (x - repmat(Mrob,n,1))*Prob(:,1:j);
                [Bk,intk,sigmayykmaxrew_k,sigmattkmaxrew_k,HsetsHopt(j,:)] = extractmcdregres(ResMCDWhole,Tk,y,j,n,q,j,h,cutoffWeights);
            end
        else
            [Bk,intk,sigmayykmaxrew_k,sigmattkmaxrew_k] = extractmcdregres(ResMCDWhole,Trob,y,kmax,n,q,j,h,cutoffWeights);
            HsetsHopt(kmax,:) = ResMCDWhole.Hsubsets.Hopt;
        end
        coeffk = [Bk;intk];
        finalB = Prob(:,1:j)*Bk;
        finalInt = intk - Mrob*finalB;
        yhat = x*finalB + repmat(finalInt,n,1);
        residu(:,(j-1)*q+1:j*q) = y - yhat;
        
        cen=zeros(q,1)';
        cov=sigmayykmaxrew_k - coeffk(1:j,1:q)'*sigmattkmaxrew_k*coeffk(1:j,1:q);
        [nn,pp]=size(residu(:,(j-1)*q+1:j*q));
        resd = sqrt(libra_mahalanobis(residu(:,(j-1)*q+1:j*q),cen,'cov',cov))';
        weightsk(:,j) = (abs(resd)<=cutoffWeights);
    end
else
    ResMCDWhole = mcdregres(Trob(:,1:k),y,'h',h,'plots',0);
    if size(ResMCDWhole.flag,2)~=1
        ResMCDWhole.flag = ResMCDWhole.flag';
    end
    
    weightsk(:,k) = ResMCDWhole.flag;
    
    HsetsHfreq = ResMCDWhole.Hsubsets.Hfreq;
    HsetsHopt = ResMCDWhole.Hsubsets.Hopt;
end


% Determining fixed weights.

if k == 0
    if kmax == 1
        w_min = weightsk';
    else
        w_min = min(weightsk');
    end
    out.w_min = w_min;
    
    yw = mean(y(w_min == 1,:));
    y2=(y-repmat(yw,n,1)).^2;
    R = residu.^2;
    D=sum(y2(w_min==1,:));
    for j = 1:kmax
        R1=R(w_min==1,(j-1)*q+1:j*q); 
        rss(j) = sum(sum(R1));
        R2(j)=1-rss(j)/sum(D); 
    end
    out.rss = 1/(q*sum(w_min))*rss;
    out.R2 = R2;
else
    yw = mean(y(weightsk(:,k) == 1,:));
    y2=(y-repmat(yw,n,1)).^2;
    R = ResMCDWhole.res.^2;
    D=sum(y2(weightsk(:,k)==1,:));
    R1=R(weightsk(:,k)==1,:); 
    rss = sum(sum(R1));
    R2=1-rss/sum(D); 
    out.rss = 1/(q*sum(weightsk(:,k)))*rss;
    out.R2 = R2;
end
out.weightsk = weightsk;
out.resrob = ResRobWhole;
if size(HsetsH0,2)==1
    HsetsH0=HsetsH0';
end
out.kmax=kmax;
out.Hsets = [HsetsHopt;HsetsHfreq;HsetsH0;HsetsH1;HsetsHfreqrob];

%---------------------------------------------------------------
function rawconsfaclts=rawconsfactorlts(quan,n)

rawconsfaclts=(1/sqrt(1-((2*n)/(quan*(1/norminv((quan+n)/(2*n)))))*...
		normpdf(1/(1/(norminv((quan+n)/(2*n)))))));
%--------------------------------------------------------------

function asvar=asvarscalekwad(quan,n)

alfa=quan/n;
alfa=1-alfa;
qalfa=chi2inv(1-alfa,1);
c2=gamcdf(qalfa/2,1/2+1);
c1=1/c2;
c3=3*gamcdf(qalfa/2,1/2+2);
asvar=qalfa*(1-alfa)-c2;
asvar=asvar^2;
asvar=(c3-2*qalfa*c2+(1-alfa)*(qalfa^2))-asvar;
asvar=c1^2*asvar;