De-Identification Software Package 1.1

File: (5,350 bytes)
# File: deid.config
# Example configuration file used by the de-identification 
# software, deid.pl, to de-identify the gold standard corpus
# and output performance statistics.
# Authors: Margaret Douglass, Ishna Neamatullah, William J. Long, Li-wei Lehman 
# Last modified by Li-wei Lehman (lilehman@mit.edu) Nov. 2007

#Description: This configuration file allows you to
# (1) set certain global variables (2) turn certain filters on/off,
# and (3) turn certain dictionaries on/off. The general format is
# <configuration string> = <value> 
#
# The <value> is 'y'/'n' or '0'/'1' or some other values depending
# on the configuration string 
# For example, for the configuration string "Gold standard comparison",
# set the value to either 0 or 1. See rest of this file for more examples.
# IMPORTANT: do not change the configuration string, as this might
# cause the software to not recognize the configuration setting.

########################################################################
#################Configure Comparison or Output Mode####################
# "Gold standard comparison = 0" for output mode.
# "Gold standard comparison = 1" for performance comparison mode; a
# gold standard corpus and PHI list must be provided.
# Keep exactly one of the two lines below uncommented:
# the first selects comparison mode (1), the second output mode (0).
Gold standard comparison = 1
#Gold standard comparison = 0


########################################################################
########Configure Date Related Variables for De-identification##########
# Date offset should be an integer that represents the number of days
# by which dates are shifted when re-identifying dates in the medical notes.
# This date offset will be applied to all patients.  To use a different
# date shift for each patient, set "PID to date offset mapping"
# to 'y', and provide the mapping in a file called "shift.txt" in the
# same directory.

# Date offset is 0 in GS comparison mode, since we are not outputting
# any de-identified text with date shift.  For output mode, set Date offset
# to the number of days by which dates will be shifted for all
# patients. Note that this offset is ignored if a PID to date offset file
# is available.
# Value is a whole number of days.
Date offset = 0 

# PID to date shift mappings: if set to 'y', the code will
# load patient-specific date shifts from the file "shift.txt".
# Allowed values: 'y' or 'n'.
PID to date offset mapping = n

#Format for the default date should be MM/DD/YYYY
# NOTE(review): this file does not say when the default date is applied;
# confirm against deid.pl before changing it.
Date Default = 01/01/2020

# The "Two Digit Year Threshold" is used to determine whether
# to interpret the year as a year in the 1900's or 2000's.
# Must be a 1- or 2-digit number.
# Two digit years > Threshold are interpreted as in the 1900's
# Two digit years <= Threshold are interpreted as in the 2000's
# The following threshold is set according to the re-identified dates
# that appear in our gold standard corpus. 
# Example with a threshold of 30: two-digit year 30 -> 2030, 31 -> 1931.
Two Digit Year Threshold = 30


########################################################################
##################Configure De-identification Filters:##################
# De-identification filters used:
# PHI categories filtered:
# 1. Social Security Numbers (SSN)
# 2. Uniform Resource Locators (URL)
# 3. Email addresses
# 4. Telephone/fax numbers
# 5. Provider/unit/medical record numbers
# 6. Ages over 90
# 7. Locations and hospital names
# 8. Dates
# 9. Names
# 10. U.S. States
#Note: 
# GS (gold standard) filters patterns (e.g. ward names) specific to 
# gold std corpus (which are nursing notes).  The filter for DS should 
# always be set to "n" for this distribution, as it applies only to 
# patterns we see in our discharge summaries, which are not included 
# in this distribution.

# Use 'y' to set the filter on or 'n' to turn the filter off
# One switch per PHI category listed above, in the same order
# ("Unit number filter" covers provider/unit/medical record numbers).
SSN filter = y
URL filter = y
Email filter = y
Telephone filter = y
Unit number filter = y
Age filter = y
Location filter = y
Date filter = y
Name filter = y
State filter = n
# Corpus-specific filters: GS targets patterns specific to the gold standard
# nursing notes; DS targets discharge-summary patterns and should stay 'n'
# in this distribution.
GS filter = y
DS filter = n

#########################################################################
#########Configure Dictionary Loading for De-identification #############
# Note: there are more dictionaries than listed here. The ones listed
# here are those whose loading you can enable or disable.
# Generic first/lastname dictionaries are always loaded.
# Lists used:
# 1. PID to patient name mappings: "lists/pid_patientname.txt";
# 2. Country names: "lists/countries_unambig.txt";
# 3. Company names: 
#    a) "lists/company_names_unambig.txt", 
#    b) "lists/company_names_ambig.txt".
# 4. Ethnicities: "lists/ethnicities_unambig.txt";
# 5. Hospitals: "lists/stripped_hospitals.txt";
# 6. Locations:
#   a) "lists/locations_unambig.txt",
#   b) "lists/locations_ambig.txt".
# 7. LocalPlaces:
#   a) "lists/local_places_unambig.txt",
#   b) "lists/local_places_ambig.txt".
# 8. Doctor names:
#   a) "lists/doctor_first_names.txt"
#   b) "lists/doctor_last_names.txt"
# 9. US States:
#   a) "lists/us_states.txt",
#   b) "lists/us_states_abbre.txt",
#   c) "lists/more_us_state_abbreviations.txt".


#Configure lists/dictionaries:
# Use 'y' to load the dictionary or 'n' to not load the dictionary
# Note that we load the State dictionary for de-identification
# of patterns of zipcode and university/college names with
# State names in them.

# Allowed values: 'y' (load) or 'n' (do not load).
PID to patient name mapping = y
Country names = n
Company names = y
Ethnicities = n
Hospital names = y
Location names = y
Doctor names = y
LocalPlaces names = y
# Loaded even though "State filter" is 'n': the State dictionary is used for
# zipcode and university/college name patterns.
State names = y