Information-Based Similarity 1.0.0
(9,047 bytes)
/* file: ibs.c Albert Yang, MD 25 August 2004 0.9
. Last revised: 27 October 2004 (GBM) 1.1
_______________________________________________________________________________
ibs: calculates information-based similarity index (IBS) between two
data sets
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place - Suite 330, Boston, MA 02111-1307, USA.
You may contact the author by e-mail (ccyang@physionet.org/). For updates to
this software, please visit PhysioNet (http://www.physionet.org/).
_______________________________________________________________________________
Revision history:
0.9 (25-Aug-2004, Albert Yang) Original version
1.0 (26-Oct-2004, George Moody) Removed limits on input series length,
fixed bugs that caused crashes for
large values of M, memory leaks, loss
of precision in index calculations
1.1 (27-Oct-2004, George Moody) Check for and avoid overflow in
histograms
TODO: collect word frequency histograms while reading the input series to
reduce memory requirements
Compile this program by
gcc -o ibs -O ibs.c -lm
This program reads two text files of numbers, which are interpreted as values
of two time series. Within each series, pairs of consecutive values are
compared to derive a binary series, which has values that are either 1 (if the
second value of the pair was greater than the first) or 0 (otherwise). A
user-specified parameter, M, determines the length of "words" (M-tuples) to
be analyzed by this progam.
Within each binary series (bs1 and bs2), all M-tuples of consecutive values are
treated as "words"; the function counts the occurrences of each of the 2**M
possible "words" and then derives the word rank order frequency (WROF) list for
the series. Finally, it calculates the information-based similarity between
the two WROF lists, and outputs this number. Depending on the input series and
on the choice of M, the value of the index can vary between 0 (completely
dissimilar) and 1 (identical).
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define MAXINT (1 << (sizeof(int)*8 - 1) - 1) /* largest positive int value */
#define MAXCOUNT (MAXINT - 1) /* largest possible count in a histogram */
/* Sort a word frequency histogram in order to derive a WROF list */
void quicksort(int *array[2], int left, int right, int ic)
{
if (left < right) {
int i = left-1, j = right+1, pivot = array[ic][left], temp;
while (i < j) {
while (array[ic][--j] > pivot)
;
while (array[ic][++i] < pivot)
;
if (i < j) { /* swap ith and jth elements of array */
temp = array[0][i];
array[0][i] = array[0][j];
array[0][j] = temp;
temp = array[1][i];
array[1][i] = array[1][j];
array[1][j] = temp;
}
}
quicksort(array, left, j, ic);
quicksort(array, j+1, right, ic);
}
}
char *pname; /* the name of this program (main's argv[0]) */
void help()
{
fprintf(stderr, "usage: %s M SERIES1 SERIES2\n", pname);
fprintf(stderr,
" where M is the word length (an integer greater than 1), and\n"
" SERIES1 and SERIES2 are one-column text files containing the\n"
" data of the two series that are to be compared. The output\n"
" is the information-based similarity index of the input series\n"
" evaluated for M-tuples (words of length M).\n\n"
" For additional information, see http://physionet.org/physiotools/ibs/.\n");
}
char *bs1, *bs2; /* binary series */
FILE *RF1, *RF2; /* input file pointers */
int *temp, *w1[2], *w2[2]; /* workspace for histograms and WROF lists */
/* This function releases all dynamically allocated memory, closes the input
files, and exits this program (it does not return to the caller). */
void cleanup(int exitcode)
{
if (RF1) fclose(RF1);
if (RF2) fclose(RF2);
if (bs1) free(bs1);
if (bs2) free(bs2);
if (temp) free(temp);
if (w1[0]) free(w1[0]);
if (w1[1]) free(w1[1]);
if (w2[0]) free(w2[0]);
if (w2[1]) free(w2[1]);
exit(exitcode);
}
int main(int argc, char **argv)
{
double dr, f1, f2, p, pi, pj, x1, x2;
int i, j, m, mtuple, nw;
long maxdat = 0L, n, nlin1 = 0L, nlin2 = 0L, ns1, ns2;
/* Read the argument list. */
pname = argv[0];
if ((argc < 4) || (m = atoi(argv[1])) < 2) {
help();
exit(1);
}
if (m >= sizeof(int)*8 - 1) {
fprintf(stderr, "%s: insufficient memory (try a smaller M)\n", pname);
exit(1);
}
/* Allocate and initialize workspace for word histograms and WROF lists. */
nw = 1 << m;
if ((temp = calloc(nw, sizeof(int))) == NULL ||
(w1[0] = calloc(nw, sizeof(int))) == NULL ||
(w1[1] = calloc(nw, sizeof(int))) == NULL ||
(w2[0] = calloc(nw, sizeof(int))) == NULL ||
(w2[1] = calloc(nw, sizeof(int))) == NULL) {
fprintf(stderr, "%s: insufficient memory (try a smaller M)\n", pname);
cleanup(1);
}
for (i = 0; i < nw; i++)
w1[0][i] = w2[0][i] = i;
/* Open the input files. */
if ((RF1 = fopen(argv[2], "r")) == NULL) {
fprintf(stderr, "%s: can't open %s\n", pname, argv[2]);
cleanup(1);
}
if ((RF2 = fopen(argv[3], "r")) == NULL) {
fprintf(stderr, "%s: can't open %s\n", pname, argv[3]);
cleanup(1);
}
/* Read input series 1. */
fscanf(RF1, "%f", &x1);
while (fscanf(RF1, "%f", &x2) == 1) {
if (nlin1 >= maxdat) {
char *pbs;
maxdat += 50000; /* allow bs1 to grow (the increment is
arbitrary) */
if ((pbs = realloc(bs1, maxdat * sizeof(char))) == NULL) {
fprintf(stderr,
"%s: insufficient memory for %s\n", pname, argv[2]);
cleanup(1);
}
bs1 = pbs;
}
bs1[nlin1++] = (x2 > x1) ? 1 : 0;
x1 = x2;
}
if (nlin1 < m) {
fprintf(stderr, "%s: input series %s is too short (< %d values)\n",
pname, argv[2], m);
cleanup(1);
}
/* Read input series 2. */
maxdat = 0L;
fscanf(RF2, "%f", &x1);
while (fscanf(RF2, "%f", &x2) == 1) {
if (nlin2 >= maxdat) {
char *pbs;
maxdat += 50000; /* allow bs2 to grow (the increment is
arbitrary) */
if ((pbs = realloc(bs2, maxdat * sizeof(char))) == NULL) {
fprintf(stderr,
"%s: insufficient memory for %s\n", pname, argv[3]);
cleanup(1);
}
bs2 = pbs;
}
bs2[nlin2++] = (x2 > x1) ? 1 : 0;
x1 = x2;
}
if (nlin2 < m) {
fprintf(stderr, "%s: input series %s is too short (< %d values)\n",
pname, argv[3], m);
cleanup(1);
}
/* Accumulate word frequency histograms. */
for (n = 0, f1 = ns1 = nlin1 - m + 1; n < ns1; n++) {
mtuple = 0;
for (j = 1 ; j <= m ; j++){
mtuple = mtuple + (1 << (m-j))*bs1[n+j-1];
}
if (++w1[1][mtuple] > MAXCOUNT) w1[1][mtuple] = MAXCOUNT;
}
for (n = 0, f2 = ns2 = nlin2 - m + 1; n < ns2; n++) {
mtuple = 0;
for (j = 1 ; j <= m ; j++){
mtuple = mtuple + (1 << (m-j))*bs2[n+j-1];
}
if (++w2[1][mtuple] > MAXCOUNT) w2[1][mtuple] = MAXCOUNT;
}
/* Sort histograms to obtain WROF lists. At this point, w1[0] contains
word values, and w1[1] contains word counts (e.g., w1[0][5] = 5, and
w1[1][5] contains the number of times that 5 occurs in bs1). */
for (i = 0; i < nw; i++)
temp[i] = w1[1][i]; /* save w1[1] (word counts) */
quicksort(w1, 0, nw - 1, 1); /* sort bins by word counts */
for (i = 0; i < nw; i++)
w1[1][i] = nw - i - 1; /* store word rank in w1[1] */
quicksort(w1, 0, nw - 1, 0); /* restore order by word value */
for (i = 0; i < nw; i++)
w1[0][i] = temp[i]; /* replace word values with ranks */
/* Repeat for w2. */
for (i = 0; i < nw; i++)
temp[i] = w2[1][i];
quicksort(w2, 0, nw - 1, 1);
for (i = 0; i < nw; i++)
w2[1][i] = nw - i - 1;
quicksort(w2, 0, nw - 1, 0);
for (i = 0 ; i < nw; i++)
w2[0][i] = temp[i];
/* Calculate similarity index. */
for (dr = p = i = 0; i < nw; i++) {
pi = w1[0][i]/f1; /* pi = probability of finding i by choosing
a random word from bs1 */
pj = w2[0][i]/f2; /* pj = probability of finding i by choosing
a random word from bs2 */
if (pi > 0)
pi = -pi * log(pi); /* Shannon entropy of word i in bs1 */
else
pi = 0;
if (pj > 0)
pj = -pj * log(pj);
else
pj = 0;
dr += abs(w1[1][i] - w2[1][i]) * (pi + pj);
p += pi + pj;
}
if (p > 0)
dr /= (p * (nw-1));
else
dr = -1; /* This should never happen! (It might occur if if we
had not already checked to see that at least m
values were available in each binary series.) */
printf("%lf\n", dr);
cleanup(0);
}