ECG-Kit 1.0
(3,359 bytes)
%GENDATK K-Nearest neighbor data generation
%
% B = GENDATK(A,N,K,S)
% B = A*GENDATK([],N,K,S)
% B = A*GENDATK(N,K,S)
%
% INPUT
% A Dataset
% N Number of points, scalar or vector of class sizes (optional; default: 50 per class)
% K Number of nearest neighbors (optional; default: 1)
% S Standard deviation (optional; default: 1)
%
% OUTPUT
% B Generated dataset
%
% DESCRIPTION
% Generation of N points using the K-nearest neighbors of objects in the
% dataset A. First, N points of A are chosen in a random order. Next, to each
% of these points and for each direction (feature), a Gaussian-distributed
% offset is added with the zero mean and the standard deviation: S * the mean
% signed difference between the point of A under consideration and its K
% nearest neighbors in A.
%
% The result of this procedure is that the generated points follow the local
% density properties of the point from which they originate.
%
% If A is a multi-class dataset the above procedure is followed class by
% class, neglecting objects of other classes and possibly unlabeled objects.
%
% If N is a vector of sizes, exactly N(I) objects are generated
% for class I. Default N is 50 objects per class.
%
% SEE ALSO (<a href="http://37steps.com/prtools">PRTools Guide</a>)
% DATASETS, MAPPINGS, GENDAT, GENDATP
% Copyright: R.P.W. Duin, r.p.w.duin@37steps.com
function B = gendatk(varargin)

    % Generate new objects class by class: randomly picked objects of A are
    % displaced by a randomly weighted average of the differences between the
    % object and its K nearest neighbours within the same class.
    %
    % Inputs arrive through VARARGIN and are unpacked below as A, N, K, S
    % (defaults [], [], 1, 1). Output B is a dataset when A was a dataset and
    % a plain array (via unary plus) otherwise.

    argin = shiftargin(varargin,'vector');
    argin = setdefaults(argin,[],[],1,1);
    if mapping_task(argin,'definition')
        % No data supplied: return the mapping definition instead of data.
        B = define_mapping(argin,'generator','KNN generation');
        return
    end

    % execution
    [A,N,k,stdev] = deal(argin{:});

    if isdataset(A)
        Adouble = false;
        A = prdataset(A);
        A = setlablist(A);              % remove empty classes first
    else
        % Not a dataset: wrap it with the single label 1 and remember to
        % hand back plain data at the end.
        Adouble = true;
        A = prdataset(A,1);
    end

    [m,n,c] = getsize(A);
    prior = getprior(A);
    if isempty(N),
        N = repmat(50,1,c);             % Default: 50 for each of the c classes.
    end
    N = genclass(N,prior);              % Generate class frequencies according to the priors.

    lablist = getlablist(A);
    B = [];
    labels = [];

    % Loop over classes.
    for j=1:c
        a  = getdata(A,j);              % The j-th class.
        na = size(a,1);

        % A class with na objects offers at most na-1 neighbours. Without
        % this clamp I(2:k+1,:) below indexes past the end of I and the
        % function aborts with an index-out-of-range error.
        kj = max(min(k,na-1),0);
        if kj < k
            warning('gendatk:tooFewObjects', ...
                'Class %d has only %d objects; using %d nearest neighbors instead of %d.', ...
                j, na, kj, k);
        end

        [D,I] = sort(distm(a));         % Column-wise sort; only the ordering I is used.
        I = I(2:kj+1,:);                % Indices of the kj nearest neighbors (row 1 is skipped,
                                        % presumably the object itself at distance zero).
        alf = randn(kj,N(j))*stdev;     % Normally distributed 'noise'.
        nu = ceil(N(j)/na);             % It is possible that NU > 1 if many objects have to be generated.
        J = randperm(na);
        J = repmat(J,nu,1)';
        J = J(1:N(j));                  % First N(j) entries of the NU repeated permutations.
        nbr = I(:,J);                   % kj-by-N(j) neighbor indices; identical for every feature.
        b = zeros(N(j),n);

        % Loop over features.
        for f = 1:n
            % Take all objects given by J, consider feature F.
            % Their nearest neighbors are given by NBR = I(:,J).
            % We reshape them as a N(j) by kj matrix (N(j) is the length of J).
            % Compute all differences between them and the original objects.
            % Multiply these differences by the random weights stored in alf.
            % Transpose and sum over the kj neighbors, normalize by kj.
            % Transpose again and add to the original objects.
            b(:,f) = a(J,f) + sum(( ( a(J,f)*ones(1,kj) - ...
                reshape(+a(nbr,f),kj,N(j))' ) .* alf' )' /kj, 1)';
        end

        B = [B;b];
        labels = [labels; repmat(lablist(j,:),N(j),1)];
    end

    if Adouble
        B = +B;
    else
        % Carry over priors, feature labels, name and feature size from A.
        B = prdataset(B,labels,'prior',A.prior);
        B = set(B,'featlab',getfeatlab(A),'name',getname(A),'featsize',getfeatsize(A));
    end

return;