ECG-Kit 1.0

File: <base>/common/prtools/gendat.m (4,852 bytes)
%GENDAT Random sampling of datasets for training and testing
% 
%  [A,B,IA,IB] = GENDAT(X,N,SEED)
%  [A,B,IA,IB] = X*GENDAT([],N,SEED)
%  [A,B,IA,IB] = X*GENDAT(N,SEED)
%  [A,B,IA,IB] = GENDAT(X,ALF,SEED)
%  [A,B,IA,IB] = X*GENDAT([],ALF,SEED)
%  [A,B,IA,IB] = X*GENDAT(ALF,SEED)
% 
% INPUT
%   X      Dataset.
%   N,ALF  Number/fraction of objects to be selected (def: bootstrapping).
%          Alternatively a vector of numbers of objects for each class.
%   SEED   A state of the random number generation according to RANDRESET
%
% OUTPUT
%   A,B    Datasets
%   IA,IB  Original indices from the dataset X
%
% DESCRIPTION
% Generation of N objects from dataset X. They are stored in dataset A,
% the remaining objects in dataset B. IA and IB are the indices of the
% objects selected from X for A and B. The random object generation follows
% the class prior probabilities. So if the prior probability of a class is
% PA, then in expectation PA*N objects are selected from that class. If N
% is large or if one of the classes has too few objects in A, the number of
% generated objects might be less than N.
% 
% If N is a vector of sizes, exactly N(i) objects are generated for class i.
% Classes are ordered as given by GETLABLIST(A).  
%
% If the function is called without specifying N, the data set X is
% bootstrapped and stored in A. Not selected samples are stored in B.
%
% ALF should be a scalar < 1. For each class a fraction ALF of the objects
% is selected for A and the not selected objects are stored in B.
%
% If X is a cell array of datasets the command is executed for each
% dataset separately. Results are stored in cell arrays. For each dataset
% the random seed is reset, resulting in aligned sets for the generated
% datasets if the sets in X were aligned.
% 
% EXAMPLES 
% See PREX_PLOTC.
%
% SEE ALSO (<a href="http://37steps.com/prtools">PRTools Guide</a>)
% DATASETS, MAPPINGS, GENSUBSETS, RANDRESET

% Copyright: R.P.W. Duin, r.p.w.duin@37steps.com

function [A,B,IA,IB] = gendat(varargin)

  % Bring the argument list into the fixed {X, N, SEED} layout; missing
  % entries default to [].
  argin = shiftargin(varargin,'vector');
  argin = setdefaults(argin,[],[],[]);

  % Definition call: only a mapping object is returned in A
  % (B, IA and IB are not assigned on this path).
  if mapping_task(argin,'definition')
    A = define_mapping(argin,'generator','Data sampling');
    return
  end

  % ---- execution ----
  [X,N,seed] = deal(argin{:});

  % Without a user-supplied seed, take the one reported by RANDRESET, then
  % (re)apply it so that the same seed can be handed on to every element
  % of a cell array below.
  if isempty(seed)
    seed = randreset;
  end
  randreset(seed);

  % A cell array of datasets is handled element by element through the
  % mapping mechanism, each time with the same N and seed.
  if iscell(X)
    [A,B,IA,IB] = X*feval(mfilename,[],N,seed);
    return
  end

  % Remember whether plain doubles were supplied, so that plain doubles
  % can be returned at the end.
  wasDouble = false;
  if isdouble(X)
    wasDouble = true;
  end

  % Work on a dataset (datafiles are left as they are).
  if ~isdatafile(X)
    X = prdataset(X);
  end
  X = setlablist(X);                      % remove empty classes first
  [nobj,nfeat,nclass] = getsize(X);       % nfeat is not used below

  % The sampling code needs at least one class. Unlabeled data is given
  % a temporary labeling, which is undone before the output is built.
  unlabeled = false;
  if nclass == 0
    X = cdats(X,1);
    nclass = 1;
    unlabeled = true;
  end

  csize = classsizes(X);                  % number of objects per class

  % N has to be empty, a scalar, or have one entry per class.
  if ~isempty(N) && length(N) ~= 1 && length(N) ~= nclass
    error('Data size should be scalar or a vector matching the number of classes')
  end

  % Translate fractions (ALF) into object counts.
  crisp = islabtype(X,'crisp');
  if ~crisp
    % No per-class sampling for this label type: collapse to one total.
    if numel(N) > 1
      prwarning(1,'Specification of numbers of objects per class not possible for given label type')
      N = sum(N);
    end
    if N < 1, N = ceil(N*nobj); end
  elseif ~isempty(N) && all(N < 1)
    % Crisp labels: the fraction(s) are applied to each class size.
    if length(N) == 1
      N = ceil(N*csize);
    else
      N = ceil(N(:).*csize(:));
    end
  end

  % Select the indices IA, either by bootstrapping (no N given) or by
  % subsampling.
  IA = [];
  if (nargin < 2) || isempty(N)
    % Bootstrap: per class, draw as many objects as the class holds,
    % with replacement.
    for ci = 1:nclass
      members = findnlab(X,ci);
      draw = ceil(rand(csize(ci),1)*csize(ci));
      IA = [IA; members(draw)];
    end
  elseif ~crisp
    % Subsampling without class information: the first N entries of a
    % random permutation of all objects.
    order = randperm(nobj);
    if (N > nobj)
      % Only a warning: the permutation is repeated until it is long
      % enough, so objects are selected more than once.
      prwarning(4,'More objects requested than available.')
      order = repmat(order,1,ceil(N/nobj));
    end
    IA = order(1:N);
  else
    % Subsampling per class. Use the class frequencies as priors when
    % the dataset has no prior set.
    if isempty(X,'prior')
      prior = classsizes(X);
      prior = prior/sum(prior);
    else
      prior = getprior(X);
    end
    N = genclass(N,prior);                % per-class numbers of objects
    for ci = 1:nclass
      members = findnlab(X,ci);
      order = randperm(csize(ci));
      if (N(ci) > csize(ci))
        % Only a warning: objects of this class will be repeated.
        prwarning(1,'More objects requested than available in class %d.',ci)
      end
      order = repmat(order,1,ceil(N(ci)/csize(ci)));
      IA = [IA; members(order(1:N(ci)))];
    end
  end

  % Everything not selected for A goes to B.
  IB = (1:nobj)';
  IB(IA) = [];

  % Undo the temporary labeling of originally unlabeled data.
  if unlabeled
    X = setlabels(X,[]);
  end

  % Finally, extract the datasets.
  A = X(IA,:);
  B = X(IB,:);
  if wasDouble
    A = +A;
    B = +B;
  end

return;