Information-Based Similarity 1.0.0

File: (9,047 bytes)
/* file: ibs.c		Albert Yang, MD		25 August 2004		0.9
.			Last revised:		27 October 2004 (GBM)   1.1
_______________________________________________________________________________
ibs: calculates information-based similarity index (IBS) between two
data sets

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.  See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place - Suite 330, Boston, MA 02111-1307, USA.

You may contact the author by e-mail (ccyang@physionet.org/).  For updates to
this software, please visit PhysioNet (http://www.physionet.org/).
_______________________________________________________________________________

Revision history:
  0.9 (25-Aug-2004, Albert Yang)	Original version
  1.0 (26-Oct-2004, George Moody)	Removed limits on input series length,
					fixed bugs that caused crashes for
					large values of M, memory leaks, loss
					of precision in index calculations
  1.1 (27-Oct-2004, George Moody)	Check for and avoid overflow in
					histograms

TODO: collect word frequency histograms while reading the input series to
      reduce memory requirements

Compile this program by

    gcc -o ibs -O ibs.c -lm

This program reads two text files of numbers, which are interpreted as values
of two time series.  Within each series, pairs of consecutive values are
compared to derive a binary series, which has values that are either 1 (if the
second value of the pair was greater than the first) or 0 (otherwise).  A
user-specified parameter, M, determines the length of "words" (M-tuples) to
be analyzed by this progam.

Within each binary series (bs1 and bs2), all M-tuples of consecutive values are
treated as "words"; the function counts the occurrences of each of the 2**M
possible "words" and then derives the word rank order frequency (WROF) list for
the series.  Finally, it calculates the information-based similarity between
the two WROF lists, and outputs this number.  Depending on the input series and
on the choice of M, the value of the index can vary between 0 (completely
dissimilar) and 1 (identical).

*/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define MAXINT	(1 << (sizeof(int)*8 - 1) - 1) /* largest positive int value */
#define MAXCOUNT  (MAXINT - 1)	/* largest possible count in a histogram */

/* Sort a word frequency histogram in order to derive a WROF list */
void quicksort(int *array[2], int left, int right, int ic)
{
    if (left < right) {
	int i = left-1, j = right+1, pivot = array[ic][left], temp;

	while (i < j) {
	    while (array[ic][--j] > pivot)
		;
	    while (array[ic][++i] < pivot)
		;
	    if (i < j) {	/* swap ith and jth elements of array */
		temp = array[0][i];
		array[0][i] = array[0][j];
		array[0][j] = temp;
		temp = array[1][i];
		array[1][i] = array[1][j];
		array[1][j] = temp;
	    }
	}
	quicksort(array, left, j, ic);
	quicksort(array, j+1, right, ic);
    }
}

char *pname;	/* the name of this program (main's argv[0]) */

void help()
{
    fprintf(stderr, "usage: %s M SERIES1 SERIES2\n", pname);
    fprintf(stderr,
 "  where M is the word length (an integer greater than 1), and\n"
 "  SERIES1 and SERIES2 are one-column text files containing the\n"
 "  data of the two series that are to be compared.  The output\n"
 "  is the information-based similarity index of the input series\n"
 "  evaluated for M-tuples (words of length M).\n\n"
 "  For additional information, see http://physionet.org/physiotools/ibs/.\n");
}

char *bs1, *bs2;		/* binary series */
FILE *RF1, *RF2;   		/* input file pointers */
int *temp, *w1[2], *w2[2]; 	/* workspace for histograms and WROF lists */

/* This function releases all dynamically allocated memory, closes the input
   files, and exits this program (it does not return to the caller). */
void cleanup(int exitcode)
{
    if (RF1) fclose(RF1);
    if (RF2) fclose(RF2);
    if (bs1) free(bs1);
    if (bs2) free(bs2);
    if (temp) free(temp);
    if (w1[0]) free(w1[0]);
    if (w1[1]) free(w1[1]);
    if (w2[0]) free(w2[0]);
    if (w2[1]) free(w2[1]);
    exit(exitcode);
}

int main(int argc, char **argv)
{
    double dr, f1, f2, p, pi, pj, x1, x2;
    int i, j, m, mtuple, nw;
    long maxdat = 0L, n, nlin1 = 0L, nlin2 = 0L, ns1, ns2;

    /* Read the argument list. */
    pname = argv[0];
    if ((argc < 4) || (m = atoi(argv[1])) < 2) {
	help();
	exit(1);
    }
    if (m >= sizeof(int)*8 - 1) {
	fprintf(stderr, "%s: insufficient memory (try a smaller M)\n", pname);
	exit(1);
    }

    /* Allocate and initialize workspace for word histograms and WROF lists. */
    nw = 1 << m;
    if ((temp  = calloc(nw, sizeof(int))) == NULL ||
	(w1[0] = calloc(nw, sizeof(int))) == NULL ||
	(w1[1] = calloc(nw, sizeof(int))) == NULL ||
	(w2[0] = calloc(nw, sizeof(int))) == NULL ||
	(w2[1] = calloc(nw, sizeof(int))) == NULL) {
	fprintf(stderr, "%s: insufficient memory (try a smaller M)\n", pname);
	cleanup(1);
    }
    for (i = 0; i < nw; i++)
	w1[0][i] = w2[0][i] = i;
  
    /* Open the input files. */
    if ((RF1 = fopen(argv[2], "r")) == NULL) {
	fprintf(stderr, "%s: can't open %s\n", pname, argv[2]);
	cleanup(1);
    }
    if ((RF2 = fopen(argv[3], "r")) == NULL) {
	fprintf(stderr, "%s: can't open %s\n", pname, argv[3]);
	cleanup(1);
    }

    /* Read input series 1. */
    fscanf(RF1, "%f", &x1);
    while (fscanf(RF1, "%f", &x2) == 1) {
	if (nlin1 >= maxdat) {
	    char *pbs;

	    maxdat += 50000;	/* allow bs1 to grow (the increment is
				   arbitrary) */
	    if ((pbs = realloc(bs1, maxdat * sizeof(char))) == NULL) {
		fprintf(stderr,
			"%s: insufficient memory for %s\n", pname, argv[2]);
		cleanup(1);
	    }
	    bs1 = pbs;
	}
	bs1[nlin1++] = (x2 > x1) ? 1 : 0;
	x1 = x2;
    }
    if (nlin1 < m) {
	fprintf(stderr, "%s: input series %s is too short (< %d values)\n",
		pname, argv[2], m);
	cleanup(1);
    }

    /* Read input series 2. */ 
    maxdat = 0L;
    fscanf(RF2, "%f", &x1);
    while (fscanf(RF2, "%f", &x2) == 1) {
	if (nlin2 >= maxdat) {
	    char *pbs;

	    maxdat += 50000;	/* allow bs2 to grow (the increment is
				   arbitrary) */
	    if ((pbs = realloc(bs2, maxdat * sizeof(char))) == NULL) {
		fprintf(stderr,
			"%s: insufficient memory for %s\n", pname, argv[3]);
		cleanup(1);
	    }
	    bs2 = pbs;
	}
	bs2[nlin2++] = (x2 > x1) ? 1 : 0;
	x1 = x2;
    }
    if (nlin2 < m) {
	fprintf(stderr, "%s: input series %s is too short (< %d values)\n",
		pname, argv[3], m);
	cleanup(1);
    }
    
    /* Accumulate word frequency histograms. */
    for (n = 0, f1 = ns1 = nlin1 - m + 1; n < ns1; n++) {
	mtuple = 0;
	for (j = 1 ; j <= m ; j++){
	    mtuple = mtuple + (1 << (m-j))*bs1[n+j-1];
	}
       	if (++w1[1][mtuple] > MAXCOUNT) w1[1][mtuple] = MAXCOUNT;
    }

    for (n = 0, f2 = ns2 = nlin2 - m + 1; n < ns2; n++) {
	mtuple = 0;
	for (j = 1 ; j <= m ; j++){
	    mtuple = mtuple + (1 << (m-j))*bs2[n+j-1];
	}
       	if (++w2[1][mtuple] > MAXCOUNT) w2[1][mtuple] = MAXCOUNT;
    }
  
    /* Sort histograms to obtain WROF lists. At this point, w1[0] contains
       word values, and w1[1] contains word counts (e.g., w1[0][5] = 5, and
       w1[1][5] contains the number of times that 5 occurs in bs1). */
    for (i = 0; i < nw; i++)
	temp[i] = w1[1][i];		/* save w1[1] (word counts) */
    quicksort(w1, 0, nw - 1, 1);	/* sort bins by word counts */
    for (i = 0; i < nw; i++)
	w1[1][i] = nw - i - 1;		/* store word rank in w1[1] */
    quicksort(w1, 0, nw - 1, 0);	/* restore order by word value */
    for (i = 0; i < nw; i++)
	w1[0][i] = temp[i];		/* replace word values with ranks */
  
    /* Repeat for w2. */
    for (i = 0; i < nw; i++)
	temp[i] = w2[1][i];
    quicksort(w2, 0, nw - 1, 1);
    for (i = 0; i < nw; i++)
	w2[1][i] = nw - i - 1;
    quicksort(w2, 0, nw - 1, 0);
    for (i = 0 ; i < nw; i++)
	w2[0][i] = temp[i];

    /* Calculate similarity index. */
    for (dr = p = i = 0; i < nw; i++) {
	pi = w1[0][i]/f1;	/* pi = probability of finding i by choosing
				   a random word from bs1 */
	pj = w2[0][i]/f2;	/* pj = probability of finding i by choosing
				   a random word from bs2 */
	if (pi > 0)
	    pi = -pi * log(pi);	/* Shannon entropy of word i in bs1 */
	else
	    pi = 0;
	if (pj > 0)
	    pj = -pj * log(pj);
	else
	    pj = 0;
	dr += abs(w1[1][i] - w2[1][i]) * (pi + pj);
	p +=  pi + pj;
    }

    if (p > 0)
	dr /= (p * (nw-1));
    else
	dr = -1;	/* This should never happen!  (It might occur if if we
			   had not already checked to see that at least m
			   values were available in each binary series.) */
     printf("%lf\n", dr);
     cleanup(0);
}