%DTC Verzakov Tree - Trainable Decision Tree Classifier
%
% W = DTC(A, CRIT, CHISQSTOPVAL, PRUNE, T, NFEAT)
%
% INPUT
% A Training dataset.
% Object weights and class priors:
% It is possible to assign individual weights to dataset
% objects by using the 'weights' identifier:
%
% A = SETIDENT(A, WEIGHTS, 'weights')
%
% If weights are not defined they are assumed to be equal to 1 for
% all objects. The actual weights used by the training routine
% are computed from the supplied weights together with the class
% priors (if priors are not set, the apparent priors are used):
%
% ACTUAL_WEIGHTS(I) = M*(WEIGHTS(I)/CW(C))*(PRIOR(C)/SUM(PRIOR))
%
% where M is the total number of (labelled) objects in A,
% CW is the vector of weight sums per class, and C is
% the class of object I. The sum of all actual weights is M.
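%
% For example (a purely illustrative calculation): with M = 100 objects
% in two classes of sizes 60 and 40, unit weights and priors [0.5 0.5],
% an object of class 1 receives 100*(1/60)*0.5 ~ 0.83 and an object of
% class 2 receives 100*(1/40)*0.5 = 1.25; the actual weights again sum
% to 100 = M.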
%
% Feature types:
% Features are treated differently based on their domain information.
% If the feature domain is empty or is an interval/collection of intervals,
% then the feature is considered continuous and
% branches are created by splitting at a threshold value.
% Otherwise (the feature domain specifies a set of values or a set of
% names), the feature is considered nominal and
% branches corresponding to all present feature values are
% created.
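%
% For example (a sketch; SETFEATDOM is the PRTools dataset method
% assumed here for setting feature domains):
%
% DOM = cell(1, size(A, 2)); % empty domains: all features continuous
% DOM{3} = [1 2 5]; % declare feature 3 nominal with values {1, 2, 5}
% A = setfeatdom(A, DOM);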
%
% Unknown and 'non-applicable' values:
% If a feature value is NaN then it is assumed to be unknown.
% In such a situation the object with the unknown value
% is split into fractions and sent down all branches.
% The concept of the 'non-applicable' feature value is different
% from the concept of the unknown (missing) value.
% ...
% 'Non-applicable' values of continuous features are
% encoded as INF. If the feature domain is a (set of) interval(s),
% then the INF value has to be explicitly added to the domain
% definition. The 'non-applicable' value of a nominal feature
% has no predefined encoding; if it is needed, the user
% has to include such a value in the domain definition.
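%
% For example (sketch):
%
% A(5, 2) = NaN; % feature 2 is unknown for object 5
% A(7, 2) = Inf; % feature 2 is non-applicable for object 7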
%
% CRIT Splitting criterion name.
% 'igr' Information Gain Ratio (default):
% As defined by Quinlan. A penalty on the number of distinct
% values of a continuous feature is used. If the gain is zero or
% negative due to this penalization, the split is not performed.
% This leads to smaller trees and may give a non-zero training error.
% This criterion does not use costs (costs are used only at the classification step).
%
% 'gini' Gini impurity index, or more precisely the change in this
% index. The GINI index can be interpreted as the misclassification rate
% of a stochastic prior-based classifier, so costs are
% naturally embedded. If the change in the (absolute) error is less than
% or equal to 0.1 (or the change in cost is less than or equal to 0.1 of the
% minimal absolute value of the non-zero costs), the split is not performed.
% This leads to smaller trees and may give a non-zero training error.
%
% 'miscls' Classification error criterion.
% To be used only for educational purposes because
% it gives rather inferior results. Costs are naturally embedded.
%
% CHISQSTOPVAL Early-stopping critical value for the chi-squared test
% on the difference between the original node class distribution and
% the branch class distributions. CHISQSTOPVAL is 0 by default,
% which means that branches are discarded only if they bring no
% change in the class distribution.
%
% PRUNE Pruning type name
% 'prunep' - pessimistic (top-down) pruning as defined by Quinlan.
% Pessimistic pruning cannot be performed if a cost matrix is defined.
% 'prunet' - test set (bottom-up) pruning using the (required)
% dataset T.
% These implementations of both pruning algorithms may not be
% exactly correct if there are unknown values in the datasets.
%
% T Test set for the test-set pruning
%
% OUTPUT
% W Classifier mapping
%
% DESCRIPTION
% If the (fully grown) branches of a (sub)tree do not improve the classification
% error (misclassification cost) they are immediately discarded.
% This may happen because regularized posteriors are used. As a result
% the algorithm is more stable and trees are smaller, but the split on
% the training set may not be perfect.
%
% REFERENCES
% [1] J.R. Quinlan, Simplifying Decision Trees, International Journal of
% Man-Machine Studies, 27(3), pp. 221-234, 1987.
% [2] J.R. Quinlan, Improved Use of Continuous Attributes in C4.5, Journal
% of Artificial Intelligence Research, 4, pp. 77-90, 1996.
%
% see also DATASETS, MAPPINGS, TREEC
% Copyright: S. Verzakov, s.verzakov@gmail.com
% Based on PRTools' treec, tree_map, and their subroutines
% by Guido te Brake and R.P.W. Duin
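%
% EXAMPLE (a minimal usage sketch; GENDATB, GENDAT, and TESTC are standard
% PRTools routines assumed to be on the path):
%
% A = gendatb([100 100]); % two-class banana data
% [A, T] = gendat(A, 0.7); % 70% for training, the rest held out
% W = dtc(A, 'igr'); % information gain ratio, no pruning
% WP = dtc(A, 'gini', 0, 'prunet', T); % Gini criterion, test-set pruning
% disp([testc(T*W) testc(T*WP)]); % held-out errors of both trees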
function w = dtc(a, crit, chisqstopval, prune, t, nfeat)
% When no input data is given, an empty tree is defined:
if (nargin == 0) || isempty(a)
if nargin < 2 || isempty(crit)
crit = 'igr';
end
if nargin < 3 || isempty(chisqstopval)
chisqstopval = 0;
end
if nargin < 4 || isempty(prune)
prune = '';
end
if nargin < 5
t = [];
end
if nargin < 6
nfeat = [];
end
w = prmapping('dtc', {crit, chisqstopval, prune, t, nfeat});
w = setname(w, ['DecTree' upper(crit)]);
elseif (nargin == 2) && ismapping(crit)
% Execution
w = crit;
tree = +w;
% B contains posteriors.
% We do not need to convert posteriors to costs.
% PRTools will do it automatically
b = mapdt(tree, +a);
b = setdat(a,b,w);
% b = setfeatlab(b, getlabels(w));
w = b;
else
% Training
if nargin < 2 || isempty(crit)
crit = 'igr';
end
if nargin < 3 || isempty(chisqstopval)
chisqstopval = 0;
end
if nargin < 4 || isempty(prune)
prune = '';
end
if nargin < 5
t = [];
end
if nargin < 6
nfeat = [];
end
if ~any(strcmpi(crit, {'igr', 'gini', 'miscls'}))
error('Unknown splitting criterion');
end
islabtype(a,'crisp');
isvaldfile(a, 1, 2); % at least 1 object per class, 2 classes
a = seldat(prdataset(a)); % get rid of all unlabelled objects
m = size(a,1);
% Get weights (if defined)
weights = getident(a, 'weights');
if isempty(weights)
weights = ones([m 1]);
else
idx = (weights > 0);
if nnz(idx) < m
a = a(idx, :);
weights = weights(idx);
end
isvaldset(a, 1, 2);
end
% First get some useful parameters:
% Sizes
[m, k, c] = getsize(a);
cs = classsizes(a);
% Determine if features are categorical
featdom = getfeatdom(a);
featdom = featdom(:)';
if isempty(featdom)
feattype = zeros(1, k);
else
feattype = cellfun(@(x) ischar(x) || (size(x,1)==1), featdom);
end
% Features to use
usefeat = true([1, k]);
if ~isempty(nfeat)
nfeat = min(nfeat, k);
end
% Labelling
nlab = getnlab(a);
% Get priors
prior = getprior(a);
prior = prior/sum(prior);
% Define weights:
% the total sum of weights is equal to m
% Compute class weights
cw = zeros(size(cs));
for i=1:c
cw(i) = sum(weights(nlab == i));
end
% Rescale objects weights
% by class weight factors derived from priors
cwf = m*prior./cw;
weights = weights.*(cwf(nlab)).';
% Misclassification cost
cost = a.cost;
% Now the training can really start:
tree = makedt(+a, feattype, usefeat, nfeat, c, nlab, weights, cost, crit, chisqstopval);
if ~isempty(prune)
if strcmpi(prune, 'prunep')
if ~isempty(cost)
error('Pessimistic pruning based on misclassification costs is not implemented');
end
tree = prunep(tree);
elseif strcmpi(prune, 'prunet')
if isempty(t)
error('Test set is not specified for the test set pruning');
end
tree = prunet(tree, t, cost);
else
error('Unknown pruning method');
end
tree = cleandt(tree);
end
% Store the results:
w = prmapping('dtc', 'trained', {tree}, getlablist(a), k, c);
w = setname(w, ['DecTree' upper(crit)]);
w = setcost(w, cost);
end
return
%MAKEDT General tree building algorithm
%
% TREE = MAKEDT(A, FEATTYPE, USEFEAT, NFEAT, C, NLAB, WEIGHTS, COST, CRIT, CHISQSTOPVAL)
%
% INPUT
% A Data matrix
%
% FEATTYPE Row defining the feature types (0 - continuous, 1 - nominal)
%
% USEFEAT Row defining the features to be used for splitting
%
% NFEAT The maximum number of features for which splitting has to be
% attempted. If the number of available features is greater
% than NFEAT, a random subset of NFEAT features will be used for
% splitting.
%
% C Total number of classes in the initial dataset
%
% NLAB Numeric class labels of objects in A
%
% WEIGHTS Weights of objects in A
%
% COST Misclassification costs
%
% CRIT Name of splitting criterion to be used
%
% CHISQSTOPVAL Critical value for the chi-squared test
%
% OUTPUT
% TREE Structure containing arrays defining decision tree.
% .NSMP The number of samples. NSMP(J) is the sum of weights of objects
% which reached the node J.
%
% .POST Posterior probabilities. POST(J, :) is the class
% distribution at node J. Object weights are taken into
% account. Probabilities of exactly 0 and 1 are avoided by
% Bayesian regularization with uniform priors.
%
% .CIDX Class index. CIDX(J) is the class assigned to
% node J if it is considered a leaf.
%
% .ERRL Leaf error. ERRL(J) is the misclassification error (cost)
% on training samples which reached the node J if this node is
% considered to be a leaf.
%
% .ERRT Tree error. ERRT(J) is the misclassification error (cost)
% on training samples which reached node J if this node is
% considered to be the root of a (sub)tree.
%
% .SIZE Tree size. SIZE(J) is the (sub)tree size with root at the
% node J.
%
% .FIDX Feature index. FIDX(J) is the index of the feature on
% which node J is split. FIDX(J) == 0 means that J is a leaf.
%
% .FVAL Feature value(s). For nominal features FVAL{J} is the set
% of feature values observed at node J (LENGTH(FVAL{J}) == 0
% for a leaf, otherwise it is > 1).
% For continuous features, if LENGTH(FVAL{J}) == 1 then it contains
% the threshold THR for splitting into the left (<= THR) and the
% right (> THR) branches. If FVAL{J} == [] (and J is not a leaf)
% then the split is performed between applicable and
% non-applicable values.
%
% .NIDX Branch node indices. For nominal features NIDX{J}(K) is
% the index of the branch node of node J with value FVAL{J}(K)
% of feature FIDX(J). For continuous features (if LENGTH(FVAL{J}) == 1)
% NIDX{J}(1) is the index of the left branch node, NIDX{J}(2) is the
% index of the right branch node. If FVAL{J} == []
% (and J is not a leaf) then NIDX{J}(1) is the index of the branch node
% containing objects with applicable values of feature FIDX(J) and
% NIDX{J}(2) is the index of the branch node containing objects with
% non-applicable values of the same feature.
%
% This is a low-level routine called by DTC.
%
% See also IGR, GINI, MISCLS
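%
% Example of reading one split from these arrays (an illustrative sketch,
% names on the left are arbitrary): for an internal node J split on a
% continuous feature with a scalar threshold,
%
% f = tree.fidx(J); % feature used at node J
% thr = tree.fval{J}; % threshold for that feature
% jl = tree.nidx{J}(1); % left branch node (feature value <= thr)
% jr = tree.nidx{J}(2); % right branch node (feature value > thr)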
function tree = makedt(a, feattype, usefeat, nfeat, c, nlab, weights, cost, crit, chisqstopval)
% Construct the tree:
% Find (absolute) class frequencies
C = zeros(1, c);
for j=1:c
C(j) = sum(weights(nlab == j));
end
NC = nnz(C);
tree.nsmp = sum(C);
% regularization by 'uniform' Bayesian priors;
C0 = C;
C = C + 1;
p = C/sum(C);
if isempty(cost)
[maxpost, cidx] = max(p);
errc = tree.nsmp*(1-maxpost);
%errc = tree.nsmp - C0(cidx);
else
costp = p*cost;
[mincost, cidx] = min(costp);
errc = mincost;
end
tree.post = p;
tree.cidx = cidx;
tree.errl = errc;
tree.errt = errc;
tree.size = 1;
if NC ~= 1 % not a pure class dataset
% now the tree is recursively constructed further:
% use desired split criterion
[fidx, fval, nb, bidx, chisqval] = findsplit(+a, feattype, usefeat, nfeat, c, nlab, weights, cost, crit);
% When the stop criterion is not reached yet, we recursively split
% further:
if ~isempty(fidx) && (chisqval > chisqstopval)
tree.fidx = fidx;
tree.fval = {fval};
if feattype(fidx) > 0
usefeat(fidx) = 0;
end
uidx = bidx == 0;
nu = nnz(uidx);
if nu > 0
knsmp = tree.nsmp - sum(weights(uidx));
end
tree.nidx = {zeros(1, nb)};
tree.errt(1) = 0;
for j=1:nb
tree.nidx{1}(j) = tree.size(1) + 1;
J = bidx == j;
if nu == 0
bweights = weights(J);
else
J = J | uidx;
bweights = weights(J);
buidx = uidx(J);
bknsmp = sum(bweights(~buidx));
bweights(buidx) = (bknsmp/knsmp)*bweights(buidx);
end
branch = makedt(+a(J, :), feattype, usefeat, nfeat, c, nlab(J), bweights, cost, crit, chisqstopval);
branch.nidx = cellfun(@(x) x + tree.size(1)*(x>0), branch.nidx, 'UniformOutput', false);
tree.errt(1) = tree.errt(1) + branch.errt(1);
tree.size(1) = tree.size(1) + size(branch.nidx, 1);
tree.nsmp = [tree.nsmp; branch.nsmp];
tree.post = [tree.post; branch.post];
tree.cidx = [tree.cidx; branch.cidx];
tree.errl = [tree.errl; branch.errl];
tree.errt = [tree.errt; branch.errt];
tree.size = [tree.size; branch.size];
tree.fidx = [tree.fidx; branch.fidx];
tree.fval = [tree.fval; branch.fval];
tree.nidx = [tree.nidx; branch.nidx];
end
end
end
% no improvement in error (cost), rollback
if (tree.size(1) > 1) && (tree.errt(1) >= tree.errl(1))
tree.nsmp = tree.nsmp(1);
tree.post = tree.post(1,:);
tree.cidx = tree.cidx(1);
tree.errl = tree.errl(1);
tree.errt = tree.errl(1); % sic!
tree.size = 1;
end
if tree.size(1) == 1
% We reached the stop criterion or no further split is possible
% so we make a leaf node:
tree.fidx = 0;
tree.fval = {[]};
tree.nidx = {[]};
end
return
%MAPDT Tree mapping and node statistic calculation
%
% [P, S] = MAPDT(TREE, A, NLAB, WEIGHTS)
%
function [p, s] = mapdt(tree, a, nlab, weights)
persistent dt st
if isstruct(tree)
dt = tree;
if nargin < 4
weights = [];
end
if nargin < 3
nlab = [];
end
m = size(a, 1);
[n, c] = size(dt.post);
p = zeros([m, c]);
if (nargout < 2) || isempty(nlab)
st = [];
for i=1:m
p(i, :) = mapdt(1, +a(i, :));
end
else
st = zeros([n, c]);
nlab(nlab > c) = 0;
if isempty(weights)
weights = ones(m, 1);
end
for i=1:m
p(i, :) = mapdt(1, +a(i, :), nlab(i), weights(i));
end
end
s = st;
clear dt st
else
j = tree;
while j > 0
if ~isempty(st) && (nlab > 0)
st(j, nlab) = st(j, nlab) + weights;
end
k = 0;
fidx = dt.fidx(j);
if fidx ~= 0
nidx = dt.nidx{j};
aval = a(fidx);
if ~isnan(aval)
fval = dt.fval{j};
if isempty(fval)
k = nidx(2 - (aval ~= inf));
elseif length(fval) == 1
if aval ~= inf
k = nidx(2 - (aval <= fval));
elseif length(nidx) == 3
k = nidx(3);
end;
else
k = nidx(aval == fval);
if isempty(k)
k = 0;
end
end
else
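% unknown feature value: descend into all branches and combine the
% branch posteriors, weighting each branch by the fraction of the
% training weight that went down that branch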
p = zeros([1, size(dt.post, 2)]);
if isempty(st) || (nlab <= 0)
for b=1:length(nidx)
k = nidx(b);
f = (dt.nsmp(k)/dt.nsmp(j));
p = p + f*mapdt(nidx(b), a);
end
else
for b=1:length(nidx)
k = nidx(b);
f = (dt.nsmp(k)/dt.nsmp(j));
p = p + f*mapdt(nidx(b), a, nlab, f*weights);
end
end
k = -1;
end
end
if k == 0
p = dt.post(j, :);
end
j = k;
end
end
return
%FINDSPLIT General routine for finding the best split
%
% [FIDX, FVAL, NB, BIDX, CHISQVAL, CRITVAL] = FINDSPLIT(A, FEATTYPE, USEFEAT, NFEAT, C, NLAB, WEIGHTS, COST, CRIT)
%
function [fidx, fval, nb, bidx, chisqval, critval] = findsplit(a, feattype, usefeat, nfeat, c, nlab, weights, cost, crit)
selfeatidx = find(usefeat);
nf = length(selfeatidx);
if ~isempty(nfeat) && (nfeat < nf)
permidx = randperm(length(selfeatidx));
selfeatidx = selfeatidx(permidx(1:nfeat));
nf = nfeat;
end
fval = cell([1, nf]);
chisqval = zeros([1, nf]);
critval = nan([1, nf]);
% repeat for all selected features
for f=1:nf
fidx = selfeatidx(f);
af = a(:, fidx);
kidx = ~isnan(af); % known values index
if nnz(kidx) == 0
continue
end
MU = sum(weights(~kidx)) + realmin;
af = af(kidx);
wk = weights(kidx);
nlabk = nlab(kidx);
switch feattype(fidx)
case 0 % continuous/ordered feature
naidx = af == inf;
NA = repmat(realmin, [1 c]);
MNA = c*realmin;
nlabna = nlabk(naidx);
nna = length(nlabna);
if nna > 0
for j = 1:c
NA(1,j) = sum(wk(nlabna == j)) + realmin;
end
MNA = sum(NA);
end
apidx = find(~naidx);
af = af(apidx);
wap = wk(apidx);
nlabap = nlabk(apidx);
[af, sortidx] = sort(af);
wap = wap(sortidx);
nlabap = nlabap(sortidx);
if length(af) == 1
uv = af;
ns = 0;
sli = [];
else
labchngcount = cumsum([1; double(diff(nlabap) ~= 0)]);
uniquevallowidx = find([true; ((diff(af)./(0.5*abs(af(1:end-1) + af(2:end)) + realmin)) > 1e-8)]);
uniquevalhighidx = [(uniquevallowidx(2:end) - 1); length(af)];
% unique values
uv = af(uniquevalhighidx);
% split low indices in unique values
sli = find(labchngcount(uniquevalhighidx(2:end)) - labchngcount(uniquevallowidx(1:end-1)) > 0);
% split low indices in af
splitlowidx = uniquevalhighidx(sli);
ns = length(splitlowidx);
end
if (ns == 0) && (nna == 0)
continue
end
% applicable, left, and right branch class counts
AP = zeros(1, c);
L = zeros(ns, c);
R = zeros(ns, c);
for j = 1:c
J = find(nlabap == j);
mj = length(J);
AP(j) = sum(wap(J));
if (ns > 0) && (mj > 0)
L(:, j) = (repmat(splitlowidx, [1, mj]) >= repmat(J.', [ns, 1]))*wap(J) + realmin;
R(:, j) = AP(j) - L(:, j) + realmin;
end
end
AP = AP + 2*realmin;
% total count of applicable
MAP = sum(AP);
% known
K = AP + NA;
MK = MNA + MAP;
% object counts for branches
ML = sum(L, 2);
MR = sum(R, 2);
[cv, i] = feval(crit, 0, MU, MK, K, MNA, NA, MAP, AP, ML, L, MR, R, cost, weights, uv, sli);
critval(f) = cv;
if ~isnan(cv) && (cv > -inf)
if (ns > 0) && ~isempty(i)
t = splitlowidx(i);
fval{f} = 0.5*(af(t) + af(t+1));
if nna == 0
APL = AP*(ML(i)/MAP);
APR = AP*(MR(i)/MAP);
chisqval(f) = sum(((L(i, :) - APL).^2)./APL + ((R(i, :) - APR).^2)./APR);
else
KNA = K*(MNA/MK);
KL = K*(ML(i)/MK);
KR = K*(MR(i)/MK);
chisqval(f) = sum(((NA - KNA).^2)./KNA + ((L(i, :) - KL).^2)./KL + ((R(i, :) - KR).^2)./KR);
end
else
fval{f} = [];
KNA = K*(MNA/MK);
KAP = K*(MAP/MK);
chisqval(f) = sum(((NA - KNA).^2)./KNA + ((AP - KAP).^2)./KAP);
end
end
case 1 % nominal feature
vf = unique(af);
v = length(vf);
B = zeros(v, c);
for j=1:c
J = find(nlabk == j); % index into the known-value subset
mj = length(J);
if mj > 0
B(:, j) = (repmat(vf, [1, mj]) == repmat(af(J).', [v 1]))*wk(J);
end
end
B = B + realmin;
% class counts for known
K = sum(B, 1);
% total known count
MK = sum(K);
% object counts for branches
MB = sum(B, 2);
cv = feval(crit, 1, MU, MK, K, MB, B, cost, weights);
critval(f) = cv;
if ~isnan(cv) && (cv > -inf)
KB = MB.*K/MK;
chisqval(f) = sum(sum(((B - KB).^2)./KB));
fval{f} = vf;
end
end
end
% best criterion over all features
testfeatidx = find(~isnan(critval) & (critval > -inf));
if isempty(testfeatidx)
fidx = [];
fval = [];
nb = [];
bidx = [];
chisqval = [];
critval = [];
else
[critval, fidx] = max(critval(testfeatidx));
fidx = testfeatidx(fidx);
fval = fval{fidx};
chisqval = chisqval(fidx);
fidx = selfeatidx(fidx);
af = a(:, fidx);
m = size(af, 1);
bidx = zeros(m, 1);
switch feattype(fidx)
case 0
apidx = af < inf;
if ~isempty(fval)
lidx = af <= fval;
ridx = apidx & ~lidx;
bidx(lidx) = 1;
bidx(ridx) = 2;
nb = 2;
else
bidx(apidx) = 1;
nb = 1;
end
naidx = ~isnan(af) & ~apidx;
if nnz(naidx) > 0
nb = nb + 1;
bidx(naidx) = nb;
end
case 1
nb = length(fval);
for i=1:nb
bidx(af == fval(i)) = i;
end
end
end
return
%IGR The information gain ratio
%
% [CRITVAL, IDX] = IGR(FEATTYPE, MU, MK, K, MNA, NA, MAP, AP, ML, L, MR, R, COST, WEIGHTS, UV, SLI)
% [CRITVAL, IDX] = IGR(FEATTYPE, MU, MK, K, MB, B, COST, WEIGHTS)
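%
% Sketch of the quantity computed below (all terms are weight/count
% scaled, not normalized to probabilities):
%
% CRITVAL = (INFOOLD - INFONEW - INFOTHRESH) / INFOSPLIT
%
% INFOOLD - class entropy of the parent node,
% INFONEW - summed class entropies of the branches,
% INFOTHRESH - log2 penalty on the number of candidate thresholds
% (continuous features only, cf. Quinlan [2]),
% INFOSPLIT - entropy of the split itself (branch sizes, unknown and
% non-applicable values).
% A zero or negative penalized gain gives CRITVAL = -INF, i.e. no split.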
function [critval, idx] = igr(feattype, varargin)
switch feattype
case 0
[MU, MK, K, MNA, NA, MAP, AP, ML, L, MR, R, cost, weights, uv, sli] = deal(varargin{:});
M = MK + MU;
infoold = - K * log2(K.'/MK);
infonew = - NA * log2(NA.'/MNA);
infosplit = - MU * log2(MU/M) - MNA * log2(MNA/M);
if ~isempty(R)
infolr = - ( ...
sum(L .* log2(L./(repmat(ML, [1 size(L, 2)]))), 2) + ...
sum(R .* log2(R./(repmat(MR, [1 size(R, 2)]))), 2) ...
);
% best criterion value over all thresholds
[infolr, idx] = min(infolr);
infonew = infonew + infolr;
infosplit = infosplit - [ML(idx), MR(idx)] * log2([ML(idx); MR(idx)]/M);
else
idx = [];
infonew = infonew - AP * log2(AP.'/MAP);
infosplit = infosplit - MAP * log2(MAP/M);
end
infothresh = log2(max(length(uv) - 1, 1));
infogain = (infoold - infonew - infothresh);
case 1
idx = [];
[MU, MK, K, MB, B, cost, weights] = deal(varargin{:});
M = MK + MU;
infoold = - K * log2(K.'/MK);
infosplit = - MU.' * log2(MU/M);
infonew = - sum(sum(B .* log2(B./(repmat(MB, [1 size(B, 2)]))), 2), 1);
infosplit = infosplit - MB.' * log2(MB/M);
infogain = (infoold - infonew);
end
% infogain = infogain / M;
% infosplit = infospit / M;
if infogain > 0
critval = infogain / infosplit;
else
critval = -inf;
end
return
%GINI The Gini impurity criterion
%
% [CRITVAL, IDX] = GINI(FEATTYPE, MU, MK, K, MNA, NA, MAP, AP, ML, L, MR, R, COST, WEIGHTS, UV, SLI)
% [CRITVAL, IDX] = GINI(FEATTYPE, MU, MK, K, MB, B, COST, WEIGHTS)
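%
% Sketch of the quantity computed below: with class counts N_j in a
% branch of total weight N, the (count-scaled) Gini purity of the branch
% is SUM_j N_j^2/N. CRITVAL is the increase in purity (or, with a cost
% matrix, the decrease in cost-weighted impurity) summed over the
% branches and divided by the total number of objects. Changes not
% exceeding DELTAMIN (0.1, or 0.1 of the smallest non-zero |cost|)
% give CRITVAL = -INF, i.e. no split.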
function [critval, idx] = gini(feattype, varargin)
switch feattype
case 0
[MU, MK, K, MNA, NA, MAP, AP, ML, L, MR, R, cost, weights, uv, sli] = deal(varargin{:});
if isempty(cost)
purityold = K * (K.'/MK);
%impurityold = MK - purityold;
if ~isempty(R)
puritylr = ( ...
sum(L .* (L./(repmat(ML, [1 size(L, 2)]))), 2) + ...
sum(R .* (R./(repmat(MR, [1 size(R, 2)]))), 2) ...
);
% best criterion value over all thresholds
[puritylr, idx] = max(puritylr);
puritynew = puritylr + NA * (NA.'/MNA);
else
idx = [];
puritynew = AP * (AP.'/ MAP) + NA * (NA.'/MNA);
end
%impuritynew = MK - puritynew;
deltaimpurity = puritynew - purityold;
else
impurityold = K * cost * (K.'/MK);
if ~isempty(R)
impuritylr = ...
sum((L*cost) .* (L./(repmat(ML, [1 size(L, 2)]))), 2) + ...
sum((R*cost) .* (R./(repmat(MR, [1 size(R, 2)]))), 2);
[impuritylr, idx] = min(impuritylr);
impuritynew = impuritylr + NA * cost * (NA.'/MNA);
else
idx = [];
impuritynew = AP * cost* (AP.'/ MAP) + NA * cost *(NA.'/MNA);
end
deltaimpurity = impurityold - impuritynew;
end
case 1
idx = [];
[MU, MK, K, MB, B, cost, weights] = deal(varargin{:});
if isempty(cost)
purityold = K * (K.'/MK);
%impurityold = MK - purityold;
puritynew = sum(sum(B .* (B./(repmat(MB, [1 size(B, 2)]))), 2));
%impuritynew = MK - puritynew;
deltaimpurity = puritynew - purityold;
else
impurityold = K * cost * (K.'/MK);
impuritynew = ( ...
sum(sum((B*cost) .* (B./(repmat(MB, [1 size(B, 2)]))), 2), 1) ...
);
deltaimpurity = impurityold - impuritynew;
end
end
if isempty(cost)
deltamin = 0.1;
else
deltamin = 0.1*min(abs(cost(abs(cost) > 0)));
end
if deltaimpurity > deltamin
critval = deltaimpurity/(MU+MK);
else
critval = -inf;
end
return
%MISCLS Misclassification error criterion
%
% [CRITVAL, IDX] = MISCLS(FEATTYPE, MU, MK, K, MNA, NA, MAP, AP, ML, L, MR, R, COST, WEIGHTS, UV, SLI)
% [CRITVAL, IDX] = MISCLS(FEATTYPE, MU, MK, K, MB, B, COST, WEIGHTS)
%
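% Sketch of the quantity computed below: CRITVAL is the increase in the
% correctly classified weight (or, with a cost matrix, the decrease in
% misclassification cost) obtained by the split, divided by the total
% number of objects. No minimum-change threshold is applied here (see
% the commented-out block at the end of this function).
%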
function [critval, idx] = miscls(feattype, varargin)
switch feattype
case 0
[MU, MK, K, MNA, NA, MAP, AP, ML, L, MR, R, cost, weights, uv, sli] = deal(varargin{:});
if isempty(cost)
ccold = max(K, [], 2);
if ~isempty(R)
ccnew = max(L, [], 2) + max(R, [], 2);
[ccnew, idx] = max(ccnew);
ccnew = ccnew + max(NA, [], 2);
else
idx = [];
ccnew = max(AP, [], 2) + max(NA, [], 2);
end
deltamc = ccnew - ccold;
else
mcold = min(K*cost, [], 2);
if ~isempty(R)
mcnew = min(L*cost, [], 2) + min(R*cost, [], 2);
[mcnew, idx] = min(mcnew);
mcnew = mcnew + min(NA*cost, [], 2);
else
idx = [];
mcnew = min(AP*cost, [], 2) + min(NA*cost, [], 2);
end
deltamc = mcold - mcnew;
end
case 1
idx = [];
[MU, MK, K, MB, B, cost, weights] = deal(varargin{:});
if isempty(cost)
ccold = max(K, [], 2);
ccnew = sum(max(B, [], 2), 1);
deltamc = ccnew - ccold;
else
mcold = min(K*cost, [], 2);
mcnew = sum(min(B*cost, [], 2), 1);
deltamc = mcold - mcnew;
end
end
%if isempty(cost)
% deltamin = min(weights(weights > 0));
%else
% deltamin = min(weights(weights > 0))*min(cost(cost > 0));
%end
%
%if ~isempty(deltamin)
% deltamin = 0.5*deltamin;
%else
% deltamin = 0;
%end
%
%if deltamc <= deltamin
% critval = -inf;
%else
% critval = deltamc / (MU+MK);
%end
critval = deltamc / (MU+MK);
return
%PRUNEP Pessimistic pruning of a decision tree
%
% TREE = PRUNEP(TREE,NODE)
%
% Pessimistic (top-down) pruning as defined by Quinlan [1].
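%
% Rule applied below (matching the code): a subtree rooted at a node is
% replaced by a leaf when
%
% ERRL + 0.5 < E + SQRT(E*(1 - E/NSMP)), with E = ERRT + 0.5*NLEAVES,
%
% i.e. when the continuity-corrected leaf error is below the corrected
% subtree error plus one standard deviation.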
function tree = prunep(tree, node)
persistent pt;
if (nargout ~= 0) || (nargin ~= 1) || ~isscalar(tree) || ~isnumeric(tree) || ~isint(tree)
if nargin < 2 || isempty(node)
node = 1;
end
pt = tree;
prunep(node);
tree = pt;
clear pt;
else
node = tree;
if pt.fidx(node) > 0
tnidx = (node+1):(node + pt.size(node) - 1);
nleaves = nnz(pt.fidx(tnidx) == 0);
errt = pt.errt(node) + 0.5*nleaves;
errl = pt.errl(node) + 0.5;
nsmp = pt.nsmp(node);
sd = sqrt(errt*(1-errt/nsmp));
if errl < (errt + sd)
pt.fidx(tnidx) = -1;
pt.errt(node) = pt.errl(node);
pt.size(node) = 1;
pt.fidx(node) = 0;
pt.fval(node) = {[]};
pt.nidx(node) = {[]};
else
errt = 0;
for i=1:length(pt.nidx{node})
idx = pt.nidx{node}(i);
prunep(idx);
errt = errt + pt.errt(idx);
end
pt.errt(node) = errt;
end
end
end
return
%PRUNET Prune tree by testset
%
% TREE = PRUNET(TREE,T,COST)
%
% The test set T is used to prune the decision tree.
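%
% Rule applied below: a node's subtree is replaced by a leaf whenever the
% leaf error (cost) measured on T does not exceed the sum of the subtree
% branch errors (costs) measured on T.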
function tree = prunet(tree, t, cost)
persistent pt;
if (nargout ~= 0) || (nargin ~= 1) || ~isscalar(tree) || ~isnumeric(tree) || ~isint(tree)
if nargin < 3
cost = [];
end
[m, k, c] = getsize(t);
cs = classsizes(t);
prior = getprior(t);
prior = prior/sum(prior);
% convert the per-class weight factors to per-object weights for MAPDT
cwt = m*prior./cs;
nlabt = getnlab(t);
weights = ones(m, 1);
weights(nlabt > 0) = cwt(nlabt(nlabt > 0));
pt = tree;
mt = size(pt.post,1);
[p, s] = mapdt(pt, +t, nlabt, weights);
% error (cost) in each node as if there were leafs
if isempty(cost)
idx = sub2ind([mt, c], (1:mt).', pt.cidx);
pt.test_errl = sum(s,2) - s(idx);
else
pt.test_errl = sum(s .* cost(:, pt.cidx).', 2);
end
pt.test_errt = pt.test_errl;
prunet(1);
tree = pt;
clear pt;
else
node = tree;
if pt.fidx(node) > 0
test_errt = 0;
errt = 0;
for i=1:length(pt.nidx{node})
idx = pt.nidx{node}(i);
prunet(idx);
test_errt = test_errt + pt.test_errt(idx);
errt = errt + pt.errt(idx);
end
if pt.test_errl(node) <= test_errt
% mark the whole subtree (not only the direct children) for removal
tnidx = (node+1):(node + pt.size(node) - 1);
pt.fidx(tnidx) = -1;
pt.errt(node) = pt.errl(node);
pt.size(node) = 1;
pt.fidx(node) = 0;
pt.fval(node) = {[]};
pt.nidx(node) = {[]};
else
pt.test_errt(node) = test_errt;
pt.errt(node) = errt;
end
end
end;
return
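%CLEANDT Remove pruned nodes from the tree arrays
%
% TREE = CLEANDT(TREE)
%
% Deletes the nodes marked for removal by the pruning routines
% (FIDX == -1) and renumbers the remaining branch indices and
% subtree sizes accordingly.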
function tree = cleandt(tree)
rnidx = tree.fidx == -1;
if nnz(rnidx) == 0
return
end
rncount = cumsum(rnidx);
fn = fieldnames(tree);
for i=1:length(fn)
tree.(fn{i})(rnidx, :) = [];
end
tree.nidx = cellfun(@(x) x - rncount(x).', tree.nidx, 'UniformOutput', false);
idx = (1:length(rnidx)).';
idx(rnidx) = [];
tree.size = tree.size - (rncount(idx + tree.size - 1) - rncount(idx));
return