%TREEC Trainable decision tree classifier
%
% W = TREEC(A,CRIT,PRUNE,T)
% W = A*TREEC([],CRIT,PRUNE,T)
% W = A*TREEC(CRIT,PRUNE,T)
%
% Computation of a decision tree classifier out of a dataset A using
% a binary splitting criterion CRIT:
% INFCRIT - information gain
% MAXCRIT - purity (default)
% FISHCRIT - Fisher criterion
%
% Pruning is defined by PRUNE:
% PRUNE = -1 pessimistic pruning as defined by Quinlan.
% PRUNE = -2 test-set pruning using the dataset T, or, if T is not
%            supplied, an artificially generated test set of 5 x the size
%            of the training set based on Parzen density estimates;
%            see PARZENML and GENDATP.
% PRUNE =  0 no pruning (default).
% PRUNE >  0 early pruning (forward stopping), e.g. PRUNE = 3 gives a
%            moderately pruned tree and PRUNE = 10 a heavily pruned one.
%
% If CRIT or PRUNE are set to NaN they are optimised by REGOPTC.
%
% SEE ALSO (<a href="http://37steps.com/prtools">PRTools Guide</a>)
% DATASETS, MAPPINGS, TREE_MAP, REGOPTC
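%
% EXAMPLE
% A minimal usage sketch (assumes the PRTools routines GENDATB and
% TESTC are available on the path):
%
%   a = gendatb([50 50]);        % two-class banana training set
%   t = gendatb([200 200]);      % independent test set
%   w = treec(a,'infcrit',-1);   % tree with pessimistic pruning
%   e = t*w*testc;               % estimated classification error
%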
% Copyright: R.P.W. Duin, r.p.w.duin@37steps.com
% Faculty EWI, Delft University of Technology
% P.O. Box 5031, 2600 GA Delft, The Netherlands
% $Id: treec.m,v 1.9 2009/07/26 18:52:08 duin Exp $
function w = treec(varargin)
mapname = 'DecTree';
argin = shiftargin(varargin,'char');
argin = setdefaults(argin,[],'maxcrit',[],[]);
if mapping_task(argin,'definition')
w = define_mapping(argin,'untrained',mapname);
elseif mapping_task(argin,'training') % Train a mapping.
[a,crit,prune,t] = deal(argin{:});
parmin_max = [1,3;-1,10];
optcrit = inf;
if isnan(crit) & isnan(prune) % optimize criterion and pruning, grid search
global REGOPT_OPTCRIT REGOPT_PARS
for n = 1:3
defs = {n,0};
v = regoptc(a,mfilename,{crit,prune},defs,[2],parmin_max,testc([],'soft'),[0,0]);
if REGOPT_OPTCRIT < optcrit
w = v; optcrit = REGOPT_OPTCRIT; regoptpars = REGOPT_PARS;
end
end
REGOPT_PARS = regoptpars;
elseif isnan(crit) % optimize criterion
defs = {1,0};
w = regoptc(a,mfilename,{crit,prune},defs,[1],parmin_max,testc([],'soft'),[0,0]);
elseif isnan(prune) % optimize pruning
defs = {1,0};
w = regoptc(a,mfilename,{crit,prune},defs,[2],parmin_max,testc([],'soft'),[0,0]);
else % training for given parameters
islabtype(a,'crisp');
isvaldfile(a,1,2); % at least 1 object per class, 2 classes
%a = testdatasize(a);
a = prdataset(a);
% First get some useful parameters:
[m,k,c] = getsize(a);
nlab = getnlab(a);
% Define the splitting criterion:
if isempty(crit), crit = 2; end
if ~ischar(crit)
if crit == 0 || crit == 1, crit = 'infcrit';
elseif crit == 2, crit = 'maxcrit';
elseif crit == 3, crit = 'fishcrit';
else error('Unknown criterion value');
end
end
% Now the training can really start:
if isempty(prune)
tree = maketree(+a,nlab,c,crit);
elseif nargin > 2
% We have to apply a pruning strategy:
if prune == -1, prune = 'prunep'; end
if prune == -2, prune = 'prunet'; end
% The strategy can be prunep/prunet:
if ischar(prune)
tree = maketree(+a,nlab,c,crit);
if strcmp(prune,'prunep')
tree = prunep(tree,a,nlab);
elseif strcmp(prune,'prunet')
if isempty(t) % no test set supplied, generate one (see GENDATP)
t = gendatp(a,5*sum(nlab==1));
end
tree = prunet(tree,t);
else
error('unknown pruning option defined');
end
else
% otherwise the tree is just cut after level 'prune'
tree = maketree(+a,nlab,c,crit,prune);
end
else
error('Wrong number of parameters')
end
% Store the results:
w = prmapping('tree_map','trained',{tree,1},getlablist(a),k,c);
w = setname(w,mapname);
w = setcost(w,a);
end
end
return
%MAKETREE General tree building algorithm
%
% tree = maketree(A,nlab,c,crit,stop)
%
% Constructs a binary decision tree using the criterion function
% specified in the string crit ('maxcrit', 'fishcrit' or 'infcrit'
% (default)) for a set of objects A. stop is an optional argument
% defining early stopping according to the Chi-squared test as
% defined by Quinlan [1]: stop = 0 (default) gives a perfect tree
% (no pruning), stop = 3 gives a pruned version and stop = 10 a
% heavily pruned version.
%
% Definition of the resulting tree:
%
% tree(n,1) - feature number to be used in node n
% tree(n,2) - threshold t to be used
% tree(n,3) - node to be processed if value <= t
% tree(n,4) - node to be processed if value > t
% tree(n,5:4+c) - a posteriori probabilities for all classes in
% node n
%
% If tree(n,3) == 0, node n is an end node and its class is tree(n,1).
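%
% For example (illustrative values only), in a two-class tree the row
%    tree(n,:) = [3  0.7  2  5  0.1  0.9]
% splits on feature 3 at threshold 0.7: objects with value <= 0.7 are
% sent to node 2, the others to node 5, and the estimated class
% posteriors in node n are [0.1 0.9].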
%
% This is a low-level routine called by treec.
%
% See also infstop, infcrit, maxcrit, fishcrit and mapt.
% Authors: Guido te Brake, TWI/SSOR, Delft University of Technology
% R.P.W. Duin, TN/PH, Delft University of Technology
% Copyright: R.P.W. Duin, duin@ph.tn.tudelft.nl
% Faculty of Applied Physics, Delft University of Technology
% P.O. Box 5046, 2600 GA Delft, The Netherlands
function tree = maketree(a,nlab,c,crit,stop)
[m,k] = size(a);
if nargin < 5, stop = 0; end;
if nargin < 4, crit = []; end;
if isempty(crit), crit = 'infcrit'; end;
% Construct the tree:
% When all objects have the same label, create an end-node:
if all([nlab == nlab(1)])
% Avoid giving 0-1 probabilities, but 'regularize' them a bit using
% a 'uniform' Bayesian prior:
p = ones(1,c)/(m+c); p(nlab(1)) = (m+1)/(m+c);
tree = [nlab(1),0,0,0,p];
else
% now the tree is recursively constructed further:
[f,j,t] = feval(crit,+a,nlab); % use desired split criterion
if isempty(t)
crt = 0;
else
crt = infstop(+a,nlab,j,t); % use desired early stopping criterion
end
p = sum(expandd(nlab),1);
if length(p) < c, p = [p,zeros(1,c-length(p))]; end
% When the stop criterion is not reached yet, we recursively split
% further:
if crt > stop
% Make the left branch:
J = find(a(:,j) <= t);
tl = maketree(+a(J,:),nlab(J),c,crit,stop);
% Make the right branch:
K = find(a(:,j) > t);
tr = maketree(+a(K,:),nlab(K),c,crit,stop);
% Fix the node numberings before the branches can be 'glued'
% together into the complete tree:
[t1,t2] = size(tl);
tl = tl + [zeros(t1,2) tl(:,[3 4])>0 zeros(t1,c)];
[t3,t4] = size(tr);
tr = tr + (t1+1)*[zeros(t3,2) tr(:,[3 4])>0 zeros(t3,c)];
% Make the complete tree: the split-node and the branches:
tree= [[j,t,2,t1+2,(p+1)/(m+c)]; tl; tr];
else
% We reached the stop criterion, so make an end-node:
[mt,cmax] = max(p);
tree = [cmax,0,0,0,(p+1)/(m+c)];
end
end
return
%MAXCRIT Maximum purity criterion for best feature split.
%
% [f,j,t] = maxcrit(A,nlabels)
%
% Computes the value of the maximum purity f for all features over
% the data set A given its numeric labels. j is the optimum feature,
% t its threshold. This is a low level routine called for constructing
% decision trees.
%
% [1] L. Breiman, J.H. Friedman, R.A. Olshen, and C.J. Stone,
% Classification and regression trees, Wadsworth, California, 1984.
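%
% As implemented below, the routine counts for every feature and every
% class the number R of objects lying strictly below the class minimum
% (or strictly above the class maximum) and scores such a candidate
% split by R*(m-R), favouring thresholds that place many objects
% entirely outside the range of one class.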
% Copyright: R.P.W. Duin, duin@ph.tn.tudelft.nl
% Faculty of Applied Physics, Delft University of Technology
% P.O. Box 5046, 2600 GA Delft, The Netherlands
function [f,j,t] = maxcrit(a,nlab)
[m,k] = size(a);
c = max(nlab);
% -variable T is a (2*c) x k matrix containing:
% minimum feature values of class 1
% maximum feature values of class 1
% minimum feature values of class 2
% maximum feature values of class 2
% etc.
% -variable R (same size) contains:
% the number of objects below the minimum of class 1
% the number of objects above the maximum of class 1
% the number of objects below the minimum of class 2
% the number of objects above the maximum of class 2
% etc.
% These values are collected and computed in the next loop:
T = zeros(2*c,k); R = zeros(2*c,k);
for j = 1:c
L = (nlab == j);
if sum(L) == 0
T([2*j-1:2*j],:) = zeros(2,k);
R([2*j-1:2*j],:) = zeros(2,k);
else
T(2*j-1,:) = min(a(L,:),[],1);
R(2*j-1,:) = sum(a < ones(m,1)*T(2*j-1,:),1);
T(2*j,:) = max(a(L,:),[],1);
R(2*j,:) = sum(a > ones(m,1)*T(2*j,:),1);
end
end
% From R the purity index for all features is computed:
G = R .* (m-R);
% and the best feature is found:
[gmax,tmax] = max(G,[],1);
[f,j] = max(gmax);
Tmax = tmax(j);
if Tmax ~= 2*floor(Tmax/2)
t = (T(Tmax,j) + max(a(find(a(:,j) < T(Tmax,j)),j)))/2;
else
t = (T(Tmax,j) + min(a(find(a(:,j) > T(Tmax,j)),j)))/2;
end
if isempty(t)
[f,j,t] = infcrit(a,nlab);
prwarning(3,'Maxcrit not feasible for decision tree, infcrit is used')
end
return
%INFCRIT Information gain and its best feature split.
%
% [f,j,t] = infcrit(A,nlabels)
%
% Computes for every feature the information gain f at its best
% threshold, given the dataset A and its numeric labels. f=1 means
% perfect discrimination, f=0 a complete mixture. j is the optimum
% feature, t its threshold. This is a low-level routine called for
% constructing decision trees.
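%
% For each candidate threshold the routine below minimises the weighted
% average entropy of the resulting left (L) and right (R) partitions,
%
%    H = -( sum_j L_j*log2(L_j/|L|) + sum_j R_j*log2(R_j/|R|) ) / (|L|+|R|)
%
% with L_j and R_j the per-class object counts on either side of the
% threshold; the gain reported per feature is f = 1 - min(H).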
% Copyright: R.P.W. Duin, duin@ph.tn.tudelft.nl
% Faculty of Applied Physics, Delft University of Technology
% P.O. Box 5046, 2600 GA Delft, The Netherlands
function [g,j,t] = infcrit(a,nlab)
[m,k] = size(a);
c = max(nlab);
mininfo = ones(k,2);
% determine feature domains of interest
[sn,ln] = min(a,[],1);
[sx,lx] = max(a,[],1);
JN = (nlab(:,ones(1,k)) == ones(m,1)*nlab(ln)') * realmax;
JX = -(nlab(:,ones(1,k)) == ones(m,1)*nlab(lx)') * realmax;
S = sort([sn; min(a+JN,[],1); max(a+JX,[],1); sx]);
% S(2,:) to S(3,:) are interesting feature domains
P = sort(a);
Q = (P >= ones(m,1)*S(2,:)) & (P <= ones(m,1)*S(3,:));
% these are the feature values in those domains
for f=1:k, % repeat for all features
af = a(:,f);
JQ = find(Q(:,f));
SET = P(JQ,f)';
if JQ(1) ~= 1
SET = [P(JQ(1)-1,f), SET];
end
n = length(JQ);
if JQ(n) ~= m
SET = [SET, P(JQ(n)+1,f)];
end
n = length(SET) -1;
T = (SET(1:n) + SET(2:n+1))/2; % all possible thresholds
L = zeros(c,n); R = L; % left and right node object counts per class
for j = 1:c
J = find(nlab==j); mj = length(J);
if mj == 0
L(j,:) = realmin*ones(1,n); R(j,:) = L(j,:);
else
L(j,:) = sum(repmat(af(J),1,n) <= repmat(T,mj,1), 1) + realmin;
R(j,:) = sum(repmat(af(J),1,n) > repmat(T,mj,1), 1) + realmin;
end
end
infomeas = - (sum(L .* log10(L./(ones(c,1)*sum(L, 1))), 1) ...
+ sum(R .* log10(R./(ones(c,1)*sum(R, 1))), 1)) ...
./ (log10(2)*(sum(L, 1)+sum(R, 1))); % criterion value for all thresholds
[mininfo(f,1),j] = min(infomeas); % finds the best
mininfo(f,2) = T(j); % and its threshold
end
g = 1-mininfo(:,1)';
[finfo,j] = min(mininfo(:,1)); % best over all features
t = mininfo(j,2); % and its threshold
return
%FISHCRIT Fisher's Criterion and its best feature split
%
% [f,j,t] = fishcrit(A,nlabels)
%
% Computes the value of Fisher's criterion f for all features
% over the dataset A with given numeric labels. Two classes only. j
% is the optimum feature, t its threshold. This is a low-level
% routine called for constructing decision trees.
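%
% Per feature the criterion computed below is f = (m1-m2)^2/(s1^2+s2^2),
% the squared difference of the two class means divided by the sum of
% the class variances; the threshold on the selected feature is the
% midpoint (m1+m2)/2 of the class means.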
% Copyright R.P.W. Duin, duin@ph.tn.tudelft.nl
% Faculty of Applied Physics, Delft University of Technology
% P.O. Box 5046, 2600 GA Delft, The Netherlands
function [f,j,t] = fishcrit(a,nlab)
[m,k] = size(a);
c = max(nlab);
if c > 2
error('Not more than 2 classes allowed for Fisher Criterion')
end
% Get the mean and variances of both the classes:
J1 = find(nlab==1);
J2 = find(nlab==2);
u = (mean(a(J1,:),1) - mean(a(J2,:),1)).^2;
s = std(a(J1,:),0,1).^2 + std(a(J2,:),0,1).^2 + realmin;
% The Fisher ratio becomes:
f = u ./ s;
% Find then the best feature:
[ff,j] = max(f);
% Given the feature, compute the threshold:
m1 = mean(a(J1,j),1);
m2 = mean(a(J2,j),1);
w1 = m1 - m2; w2 = (m1*m1-m2*m2)/2;
if abs(w1) < eps % the means are equal, so the Fisher
% criterion (should) become 0; set the threshold halfway
% between the maximum of class 1 and the minimum of class 2
t = (max(a(J1,j),[],1) + min(a(J2,j),[],1)) / 2;
else
t = w2/w1;
end
return
%INFSTOP Quinlan's Chi-square test for early stopping
%
% crt = infstop(A,nlabels,j,t)
%
% Computes the Chi-square test described by Quinlan [1] to be used
% in maketree for forward pruning (early stopping) using dataset A
% and its numeric labels. j is the feature used for splitting and t
% the threshold.
%
% [1] J.R. Quinlan, Simplifying Decision Trees,
% Int. J. Man - Machine Studies, vol. 27, 1987, pp. 221-234.
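%
% With L_j and R_j the per-class object counts left and right of the
% split, and LL_j = (L_j+R_j)*|L|/m, RR_j = (L_j+R_j)*|R|/m the counts
% expected when the split and the labels are independent, the returned
% statistic is the usual Chi-square sum
%
%    crt = sum_j ( (L_j-LL_j)^2/LL_j + (R_j-RR_j)^2/RR_j )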
%
% See maketree, treec, classt, prune
% Guido te Brake, TWI/SSOR, TU Delft.
% Copyright: R.P.W. Duin, duin@ph.tn.tudelft.nl
% Faculty of Applied Physics, Delft University of Technology
% P.O. Box 5046, 2600 GA Delft, The Netherlands
function crt = infstop(a,nlab,j,t)
[m,k] = size(a);
c = max(nlab);
aj = a(:,j);
ELAB = expandd(nlab);
L = sum(ELAB(aj <= t,:),1) + 0.001;
R = sum(ELAB(aj > t,:),1) + 0.001;
LL = (L+R) * sum(L) / m;
RR = (L+R) * sum(R) / m;
crt = sum(((L-LL).^2)./LL + ((R-RR).^2)./RR);
return
%PRUNEP Pessimistic pruning of a decision tree
%
% tree = prunep(tree,a,nlab,num)
%
% Must be called with a tree and the training set a it was constructed
% on. num is the starting node; if omitted, pruning starts at the root.
% Pessimistic pruning is as defined by Quinlan.
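%
% In the test used below, the subtree rooted at node num is replaced by
% a leaf of its most frequent class when E + 0.5 < EA + SD, where E is
% the number of errors made by that leaf, EA the number of errors of
% the subtree plus half its number of leaves, and SD = sqrt(EA*(N-EA)/N)
% for the N objects reaching the node.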
%
% See also maketree, treec, mapt
% Guido te Brake, TWI/SSOR, TU Delft.
% Copyright: R.P.W. Duin, duin@ph.tn.tudelft.nl
% Faculty of Applied Physics, Delft University of Technology
% P.O. Box 5046, 2600 GA Delft, The Netherlands
function tree = prunep(tree,a,nlab,num)
if nargin < 4, num = 1; end;
[N,k] = size(a);
c = size(tree,2)-4;
if tree(num,3) == 0, return, end;
w = prmapping('treec','trained',{tree,num},[1:c]',k,c);
ttt=tree_map(prdataset(a,nlab),w);
J = testc(ttt)*N;
EA = J + nleaves(tree,num)./2; % expected number of errors in tree
P = sum(expandd(nlab,c),1); % distribution of classes
%disp([length(P) c])
[pm,cm] = max(P); % most frequent class
E = N - pm; % errors if the subtree is replaced by a leaf
SD = sqrt((EA * (N - EA))/N);
if (E + 0.5) < (EA + SD) % clean tree while removing nodes
[mt,kt] = size(tree);
nodes = zeros(mt,1); nodes(num) = 1; n = 0;
while sum(nodes) > n; % find all nodes to be removed
n = sum(nodes);
J = find(tree(:,3)>0 & nodes==1);
nodes(tree(J,3)) = ones(length(J),1);
nodes(tree(J,4)) = ones(length(J),1);
end
tree(num,:) = [cm 0 0 0 P/N];
nodes(num) = 0; nc = cumsum(nodes);
J = find(tree(:,3)>0);% update internal references
tree(J,[3 4]) = tree(J,[3 4]) - reshape(nc(tree(J,[3 4])),length(J),2);
tree = tree(~nodes,:);% remove obsolete nodes
else
K1 = find(a(:,tree(num,1)) <= tree(num,2));
K2 = find(a(:,tree(num,1)) > tree(num,2));
tree = prunep(tree,a(K1,:),nlab(K1),tree(num,3));
tree = prunep(tree,a(K2,:),nlab(K2),tree(num,4));
end
return
%PRUNET Prune tree by testset
%
% tree = prunet(tree,a)
%
% The test set a is used to prune a decision tree.
% Copyright: R.P.W. Duin, duin@ph.tn.tudelft.nl
% Faculty of Applied Physics, Delft University of Technology
% P.O. Box 5046, 2600 GA Delft, The Netherlands
function tree = prunet(tree,a)
[m,k] = size(a);
[n,s] = size(tree);
c = s-4;
erre = zeros(1,n);
deln = zeros(1,n);
w = prmapping('treec','trained',{tree,1},[1:c]',k,c);
[f,lab,nn] = tree_map(a,w); % bug, this works only if a is dataset, labels ???
[fmax,cmax] = max(tree(:,[5:4+c]),[],2);
nngood = nn([1:n]'+(cmax-1)*n);
errn = sum(nn,2) - nngood;% errors in each node
sd = 1;
while sd > 0
erre = zeros(n,1);
deln = zeros(1,n);
endn = find(tree(:,3) == 0)'; % endnodes
pendl = max(tree(:,3*ones(1,length(endn)))' == endn(ones(n,1),:)');
pendr = max(tree(:,4*ones(1,length(endn)))' == endn(ones(n,1),:)');
pend = find(pendl & pendr); % parents of two endnodes
erre(pend) = errn(tree(pend,3)) + errn(tree(pend,4));
deln = pend(find(erre(pend) >= errn(pend))); % nodes to be turned into leaves
sd = length(deln);
if sd > 0
tree(tree(deln,3),:) = -1*ones(sd,s);
tree(tree(deln,4),:) = -1*ones(sd,s);
tree(deln,[1,2,3,4]) = [cmax(deln),zeros(sd,3)];
end
end
return
%NLEAVES Computes the number of leaves in a decision tree
%
% number = nleaves(tree,num)
%
% This procedure counts the number of leaves in the subtree of tree
% rooted at node num. If num is omitted, the root is taken (num = 1).
%
% This is a utility used by maketree.
% Guido te Brake, TWI/SSOR, TU Delft
% Copyright: R.P.W. Duin, duin@ph.tn.tudelft.nl
% Faculty of Applied Physics, Delft University of Technology
% P.O. Box 5046, 2600 GA Delft, The Netherlands
function number = nleaves(tree,num)
if nargin < 2, num = 1; end
if tree(num,3) == 0
number = 1 ;
else
number = nleaves(tree,tree(num,3)) + nleaves(tree,tree(num,4));
end
return