Predicting Mortality of ICU Patients: The PhysioNet/Computing in Cardiology Challenge 2012 1.0.0

File: <base>/sources/reko.kemppainen_at_gmail.com/entry7/main_learn.m (16,796 bytes)
function main_learn

% MATLAB script for processing a Challenge 2012 data set with a Challenge entry
% Version 2.0  (22 March 2012)
%
% A script similar to this one will be used as part of the evaluation of
% Challenge entries written in m-code.  A companion script will be used to
% perform the same function for Challenge entries written in C and other
% languages.  We have provided these scripts so that Challenge participants
% can test their entries to verify that they run properly in the environment
% that will be used to test them.
%
% Each Challenge .txt file (record) contains data for one patient, in 3 columns
% (timestamp, parameter, and value).  The three Challenge data sets contain
% 4000 records each.
%
% This script supplies a complete set of 4000 records, one at a time, to an
% entry, and collects its output for each record (a binary prediction of
% survival, for event 1, and an estimate of mortality risk, for event 2)
% in a summary output file.
%
% The summary output file is then scored by comparing its contents with the
% patients' known outcomes.  This script includes MATLAB code that can
% calculate unofficial scores for the training data (set A), for which the
% known outcomes are provided to participants.  Entries will be ranked using
% official scores obtained using the same methods, but based on testing with
% sets B and C, for which the outcomes are not provided to participants.
%
% If your entry is written in m-code, it must be in the form of a function
% named physionet2012, with this signature:
%     [risk,survival]=physionet2012(tm,category,val);
% See the sample entry at http://physionet.org/challenge/2012/physionet2012.m
% for descriptions of the input and output variables.
%
% To use this script to obtain an unofficial score for your entry on set A:
%  1. Download these files from http://physionet.org/challenge/2012/ and save
%     them in your MATLAB working directory:
%	genresults.m		   (this file)
%	lemeshow.m		   (function needed to calculate Event 2 score)
%	set-a.zip or set-a.tar.gz  (zip archive or tarball of set A files)
%	Outcomes-a.txt		   (known outcomes for set A)
%  2. Unzip set-a.zip (or unpack set-a.tar.gz), creating a subdirectory within
%	your working directory called 'set-a'.  When you have completed this
%	step, the set-a directory should contain 4000 individual .txt files.
%  3. Save a copy of your entry (physionet2012.m) in your working directory.
%  4. The next few lines are MATLAB code that clears any previously set
%     variables, and sets the name of the directory containing the input
%     data, the name for the summary output file, and the name of the file
%     containing the known outcomes.  Change them if necessary.

        clear all;close all;clc
        set_name='set-a';
	fname_out='Outputs-a.txt';
	results='Outcomes-a.txt';

%  5. Start MATLAB and type
%       Main_Challenge
%     The fname_out file will be generated in the
%     current directory.
%
% You can also use this script to run your entry on test set B, but it will
% not be able to calculate scores in this case, since the outcomes are provided
% for set A only.  To do this, download and unzip/unpack set B into a 'set-b'
% subdirectory as you did for set A, and uncomment the next three lines:
%
%       set_name='set-b';
%       fname_out='Outputs-b.txt'
%       results=[];
%
% Note that if you run a test on either set A or B more than once, you should
% delete (or rename) your summary output file (named in fname_out), since this
% code appends new outputs to any existing output file rather than starting
% over.


cur_dir=pwd;
fdel='/';
if(ispc)
    fdel='\';
end
data_dir=[cur_dir fdel set_name fdel];

cd(data_dir)
records=dir('*.txt');
cd(cur_dir)

I=length(records);
DATA=zeros(I,3) + NaN;
display(['Processing records ...'])



[ALL_CATEGORIES,time_series_names,descriptors]=get_param_names();
num_params=length(ALL_CATEGORIES);
num_ts_params=length(time_series_names);
num_descriptors=length(descriptors);


MEAN_DATA_24=zeros(I,num_ts_params) + NaN;
MEAN_DATA_48=zeros(I,num_ts_params) + NaN;

DESCRIPTORS=zeros(I,num_descriptors) + NaN;

%num_data=zeros(num_params,1)+NaN; 

SAPS_SCORES=zeros(I,1);
SAPS_SCORES_48=zeros(I,1);

tm=[];
category=[];
val=[];

% Open fname_out and append to any previous contents
%fid_out=fopen(fname_out,'a');

% Each Challenge .txt file (record) contains data for one patient, in 3 columns
% (timestamp, parameter, and value).  During each iteration of the loop below,
% the contents of a single record are loaded into arrays named tm,
% category, and val.  Each data set (A, B, and C) contains 4000 records.
header={'tm','category','val'};
for i=1:I
    record_id=records(i).name(1:end-4);
    fname=[data_dir record_id '.txt'];
    
    fid_in=fopen(fname,'r');
    C=textscan(fid_in,'%q %q %f','delimiter', ',','HeaderLines',1);
    fclose(fid_in);
    for n=1:length(header)
        eval([header{n} '=C{:,n};'])
    end

    % saves the parsed data
    
    [times,values,names]=extract_param_series(tm,category,val);
        

    [ts_times,ts_values,ts_names]=get_param_subset(time_series_names,times,values,names);

    [des_times,des_values,des_names]=get_param_subset(descriptors,times,values,names);
    
    % plots all param time series
    %plot_params(times,values,names);
    
    DESCRIPTORS(i,:)=cell2mat(des_values(:))';
    
    means24=calculate_mean(ts_times,ts_values,ts_names,time_series_names,[0 24*60]);
    means48=calculate_mean(ts_times,ts_values,ts_names,time_series_names,[24*60 48*60]);
    MEAN_DATA_24(i,:)=means24;
    MEAN_DATA_48(i,:)=means48;
    
  %  savefile = 'parsed_data.mat';
   % save(savefile, 'means24', 'means48','times','values','names')
    
   % [risk,survival]=physionet2012(tm,category,val);

  % SAPS_SCORES(i)=saps_score(tm,category,val,1,[0 24]);
   
 %  SAPS_SCORES_48(i)=saps_score(tm,category,val,1,[24 48]);
    % The outputs of the analysis are now available in the risk (event 2)
    % and survival (event 1) variables output by the function.
    


end

%loadfile = 'parsed_data.mat';
%load(loadfile, 'means24', 'means48','times','values','names');



record_id_res=[];
SAPS=[];
SOFA=[];
LOS=[];
Survival=[];
IHD=[];

if(~isempty(results))

    % The file of known outcomes contains six columns.  The Challenge goal is
    % to predict the sixth column, IHD (in-hospital death).
    variables={'record_id_res','SAPS','SOFA','LOS','Survival','IHD'};
    fid_result=fopen(results,'r');
    C=textscan(fid_result,'%f %f %f %f %f %f','delimiter', ',','HeaderLines',1);
    fclose(fid_result);
    for n=1:length(variables)
        eval([variables{n} '=C{:,n};'])
    end

end

% X_Test=[SAPS_SCORES-SAPS_SCORES_48];
% X_Test=[X_Test SAPS_SCORES];
% [B1,th1,w1]=get_param_coeffs_by_name('GCS_diff');
% PHAT1 = mnrval(B1,X_Test);
% 
% X_Test=[MEAN_DATA_24(:,strcmp(time_series_names,'GCS'))];
% [B2,th2,w2]=get_param_coeffs_by_name('GCS_diff');
% PHAT2 = mnrval(B2,X_Test);
% 
% X_Test=SAPS_SCORES;
% [B3,th3,w3]=get_param_coeffs_by_name('SAPS');
% PHAT3 = mnrval(B3,X_Test);
% 
% % X_Test=[MEAN_DATA_24(:,strcmp(time_series_names,'ALP'))-MEAN_DATA_48(:,strcmp(time_series_names,'ALP'))];
% % [B4,th4,w4]=get_param_coeffs_by_name('ALP_diff');
% % PHAT4 = mnrval(B4,X_Test);
% 
% PHAT(:,2)=(w1*PHAT1(:,2)+w2*PHAT2(:,2))/(w1+w2);
% PHAT(:,2)=(PHAT1(:,2))
% X_Test=SAPS_SCORES;
% 
% % X_Test=MEAN_DATA_24(:,strcmp(time_series_names,'GCS'))
% % X_Test=MEAN_DATA_24(:,strcmp(time_series_names,'ALP'))
% 
% X_Test=[MEAN_DATA_24(:,strcmp(time_series_names,'GCS'))-MEAN_DATA_48(:,strcmp(time_series_names,'GCS'))];
% X_Test=[MEAN_DATA_24(:,strcmp(time_series_names,'GCS')) X_Test];
% B = mnrfit(X_Test,IHD+1);
% PHAT = mnrval(B,X_Test);
% 
% % 
% % X=[MEAN_DATA_24 MEAN_DATA_24-MEAN_DATA_48 ];
% % X=[DESCRIPTORS(:,strcmp(des_names,'Age')) X];
% % ICUTYPE=[DESCRIPTORS(:,strcmp(des_names,'ICUType'))];
% % 
% % X1=[MEAN_DATA_24];
% % X2=[MEAN_DATA_48 ];
% % plot_means_comparison(X,IHD,time_series_names);
% % plot_means_comparison2(X1,X2,IHD,time_series_names);
% % % X=[MEAN_DATA_24(:,strcmp(time_series_names,'Glucose'))-MEAN_DATA_48(:,strcmp(time_series_names,'Glucose')) ];
% % % X=[MEAN_DATA_24(:,strcmp(time_series_names,'Glucose')) X];

PCA_classification(MEAN_DATA_24,MEAN_DATA_48,DESCRIPTORS,IHD,time_series_names,des_names);

% scores1=PCA_classification(X(ICUTYPE==1,:),IHD(ICUTYPE==1));
% scores2=PCA_classification(X(ICUTYPE==2,:),IHD(ICUTYPE==2));
% scores3=PCA_classification(X(ICUTYPE==3,:),IHD(ICUTYPE==3));
% scores4=PCA_classification(X(ICUTYPE==4,:),IHD(ICUTYPE==4));

figure()
plot(30:2:50,scores1)
figure()
plot(30:2:50,scores2)
figure()
plot(30:2:50,scores3)
figure()
plot(30:2:50,scores4)


X=[DESCRIPTORS(:,strcmp(des_names,'Gender'))];
plot_means(DESCRIPTORS(:,strcmp(des_names,'Age')),DATA,IHD,{'ICUType'});

pause();

X=[MEAN_DATA_24(:,strcmp(time_series_names,'GCS'))-MEAN_DATA_48(:,strcmp(time_series_names,'GCS')) ];
X=[MEAN_DATA_24(:,strcmp(time_series_names,'GCS')) X];
Y=(IHD-1)+IHD; % classes y ={-1,1}

R=~isnan(X);
PHAT=teach_log_reg(X,Y,R);


B2 = mnrfit(X,IHD+1);
PHAT2 = mnrval(B2,X);




pause(.1)

X_Test=[MEAN_DATA_24(:,strcmp(time_series_names,'HCO3'))-MEAN_DATA_48(:,strcmp(time_series_names,'HCO3'))];
X_Test=[MEAN_DATA_24(:,strcmp(time_series_names,'HCO3')) X_Test];

X_Test=[MEAN_DATA_24(:,strcmp(time_series_names,'Glucose'))-MEAN_DATA_48(:,strcmp(time_series_names,'Glucose')) X_Test ];
X_Test=[MEAN_DATA_24(:,strcmp(time_series_names,'Glucose')) X_Test];

X_Test=[MEAN_DATA_24(:,strcmp(time_series_names,'Urine'))-MEAN_DATA_48(:,strcmp(time_series_names,'Urine')) X_Test ];
X_Test=[MEAN_DATA_24(:,strcmp(time_series_names,'Urine')) X_Test];

X_Test=[MEAN_DATA_24(:,strcmp(time_series_names,'GCS'))-MEAN_DATA_48(:,strcmp(time_series_names,'GCS')) X_Test ];
X_Test=[MEAN_DATA_24(:,strcmp(time_series_names,'GCS')) X_Test];

X_Test=[MEAN_DATA_24(:,strcmp(time_series_names,'Na'))-MEAN_DATA_48(:,strcmp(time_series_names,'Na')) X_Test ];
X_Test=[MEAN_DATA_24(:,strcmp(time_series_names,'Na')) X_Test];

X_Test=[MEAN_DATA_24(:,strcmp(time_series_names,'HR'))-MEAN_DATA_48(:,strcmp(time_series_names,'HR')) X_Test ];
X_Test=[MEAN_DATA_24(:,strcmp(time_series_names,'HR')) X_Test];

X_Test=[MEAN_DATA_24(:,strcmp(time_series_names,'BUN'))-MEAN_DATA_48(:,strcmp(time_series_names,'BUN')) X_Test ];
X_Test=[MEAN_DATA_24(:,strcmp(time_series_names,'BUN')) X_Test];

%   X_Test=[MEAN_DATA_24(:,strcmp(time_series_names,'WBC'))-MEAN_DATA_48(:,strcmp(time_series_names,'WBC')) X_Test ];
%   X_Test=[(MEAN_DATA_24(:,strcmp(time_series_names,'WBC'))) X_Test];


%X_Test=[MEAN_DATA_24(:,strcmp(time_series_names,'ALP'))-MEAN_DATA_48(:,strcmp(time_series_names,'ALP')) X_Test ];
X_Test=[DESCRIPTORS(:,strcmp(des_names,'Age')) X_Test];

X_Test=[DESCRIPTORS(:,strcmp(des_names,'ICUType')) X_Test];



%%X_Test=[MEAN_DATA_24(:,strcmp(time_series_names,'Weight'))];
B = mnrfit(X_Test,IHD+1);

%% for printing the decimals
%%fprintf('value of b is %1.10e\n',B(1))

%all params
X_Test=[MEAN_DATA_24 MEAN_DATA_24-MEAN_DATA_48];

analyze_2d(MEAN_DATA_24,[MEAN_DATA_24-MEAN_DATA_48],IHD,time_series_names);

PHAT = mnrval(B,X_Test);








% a=find_score_coef(MEAN_DATA_24(:,strcmp(time_series_names,'GCS')),IHD);
% 
% params_to_use=zeros(size(MEAN_DATA_24,2),1);
% params_to_use(6:20)=1;
% 
% Y=(IHD-1)+IHD; % classes y ={-1,1}
% % X=[MEAN_DATA_24 MEAN_DATA_24-MEAN_DATA_48];
% 
% %X=[MEAN_DATA_24(params_to_use)];
% X=[SAPS_SCORES];
% % X_mean=mean(X,1);
% % X=X./repmat(X_mean,4000,1);
% R=~isnan(X);
% 
%  B=teach_log_reg(X,Y,R);
% 
% B = mnrfit(SAPS_SCORES,IHD+1);
% PHAT = mnrval(B,SAPS_SCORES);
% 
% R=~isnan(SAPS_SCORES);
% %B=teach_log_reg(SAPS_SCORES,Y,R);
% 
% plot_2d_change(MEAN_DATA_24(:,strcmp(time_series_names,'ALP')),MEAN_DATA_24(:,strcmp(time_series_names,'GCS')),DATA,IHD,time_series_names);
% plot_means(MEAN_DATA_24(:,strcmp(time_series_names,'ALP')),DATA,IHD,{'ALP'});
% plot_means(MEAN_DATA_24(:,strcmp(time_series_names,'GCS')),DATA,IHD,{'GCS'});
% 
% B = mnrfit(MEAN_DATA_24(:,strcmp(time_series_names,'GCS')),IHD+1);
% PHAT = mnrval(B,MEAN_DATA_24(:,strcmp(time_series_names,'GCS')));
% 
% plot_means(PHAT,DATA,IHD,{'GCS'});
% plot_2d_change(MEAN_DATA_24(:,strcmp(time_series_names,'GCS')),PHAT(:,1),DATA,IHD,{'CGS'});
% figure(2)
% scatter(MEAN_DATA_24(:,strcmp(time_series_names,'GCS')),(PHAT(:,1)),'xb')
% 
% 
% 
% 
% B = mnrfit(MEAN_DATA_24(:,strcmp(time_series_names,'ALP')),IHD+1);
% PHAT = mnrval(B,MEAN_DATA_24(:,strcmp(time_series_names,'ALP')));
% 
% plot_means(PHAT,DATA,IHD,{'ALP'});
% plot_2d_change(MEAN_DATA_24(:,strcmp(time_series_names,'ALP')),PHAT(:,1),DATA,IHD,{'ALP'});
% figure(2)
% scatter(MEAN_DATA_24(:,strcmp(time_series_names,'ALP')),(PHAT(:,1)),'xb')
% 
% 
% 
% 
% figure(2)
% scatter(SAPS_SCORES,(PHAT(:,1)),'xb')
% hold on
% scatter(SAPS_SCORES(IHD==1),PHAT(IHD==1),'r')
% scatter(SAPS_SCORES(IHD==0),PHAT(IHD==0),'g')
% 
% plot_2d_change(SAPS_SCORES,PHAT,DATA,IHD,ALL_CATEGORIES);
% 
% 
% % plots 2d 24h vs. changes
% %plot_2d_change(MEAN_DATA_24,MEAN_DATA_24-MEAN_DATA_48,DATA,IHD,ALL_CATEGORIES);
% 
% plot_2d(MEAN_DATA_24,MEAN_DATA_24-MEAN_DATA_48,DATA,IHD,time_series_names);
% 
% % plots parameter histograms in each group
% % plot_means(MEAN_DATA_24,DATA,IHD,ALL_CATEGORIES);
% 
% plot_means(MEAN_DATA_24-MEAN_DATA_48,DATA,IHD,time_series_names);
% 
% 
% plot_means(SAPS_SCORES,DATA,IHD,{'SAP'});
% 
% 
% plot_means(PHAT,DATA,IHD,{'phat'});
% 
    function th_search

        best_lam=0;
        max_score1_lam=0;
        max_score2_lam=0;
        best_lam_score=[];
        lam=0.01:0.002:.03;
        lam=[0.01 0.1 1 10 100];


        for lam_idx=1:length(lam)

        Y=(IHD-1)+IHD; % classes y ={-1,1}
        X=X_Test;
        R=~isnan(X);
        PHAT=teach_log_reg(X,Y,R,lam(lam_idx));

        %[IDX,C,sumd] = kmeans(double(R),5);

        %X_trunc_saps=[SAPS_SCORES(isnan(PHAT(:,2)))-SAPS_SCORES_48(isnan(PHAT(:,2))) SAPS_SCORES(isnan(PHAT(:,2))) ];
        %B_saps = mnrfit(X_trunc_saps,IHD(isnan(PHAT(:,2)))+1);
        %PHAT_saps = mnrval(B_saps,X_trunc_saps);

        max_score1=0;
        max_score2=0;
        best_th=0;

        for class_th=.1:.01:0.5



            DATA(:,1)=str2double('0000');


                DATA(:,2)=PHAT(:,2);
            %DATA(isnan(PHAT(:,2)),2)=PHAT_saps(:,2);
            DATA(:,3)=PHAT(:,2)> class_th;
           % DATA(isnan(PHAT(:,2)),3)=PHAT_saps(:,2)> 0.33;

        % % %     DATA(:,2)=PHAT(:,2);
        % % %     DATA(isnan(PHAT(:,2)),2)=PHAT_saps(:,2);
        % % %     DATA(~isnan(PHAT(:,2)),3)=PHAT(~isnan(PHAT(:,2)),2)> class_th;
        % % %     DATA(isnan(PHAT(:,2)),3)=PHAT_saps(:,2)> 0.33;

            DATA(DATA(:,2)<0.01,2)=0.01;
            DATA(DATA(:,2)>0.99,2)=0.99;
            %DATA(:,3)=PHAT1(:,2)>0.17 & PHAT2(:,2)>class_th ;

        %     if max_score1==0
        %           savefile = 'DATA_learn.mat';
        %         save(savefile, 'DATA')
        %    
        %     end


         if(~isempty(results))


            % Calculate sensitivity (Se) and positive predictivity (PPV)
            TP=sum(DATA(IHD==1,3));
            FN=sum(~DATA(IHD==1,3));
            FP=sum(DATA(IHD==0,3));  
            Se=TP/(TP+FN);
            PPV=TP/(TP+FP);

            show=0; % if show is 1, the decile graph will be displayed by lemeshow()
            H=lemeshow([IHD DATA(:,2)],show);

            % Use the title of figure to display the results
            title(['H= ' num2str(H) ' Se= ' num2str(Se) ' PPV= ' num2str(PPV) '. ' num2str(class_th) ])

            % The event 1 score is the smaller of Se and PPV.
            score1 = min(Se, PPV);
            if score1>max_score1
                max_score1=score1;
                best_th=class_th;
                max_score2=H;
              %  display(['Unofficial Event 1 score: ' num2str(score1)]);
            end

            % The event 2 score is the Hosmer-Lemeshow statistic (H).
            %display(['Unofficial Event 2 score: ' num2str(H)]);

            figure(10)
            hold on
            scatter(class_th,score1,'x')
         end


        %B'

        end
        best_lam_score(lam_idx)=max_score1;
        if max_score1 > max_score1_lam
        best_lam=0;
        max_score1_lam=max_score1
        max_score2_lam=max_score2
        best_th
        end

        end
    end
end