De-Identification Software Package 1.1
(5,350 bytes)
# File: deid.config
# Example configuration file used by the de-identification
# software, deid.pl, to de-identify the gold standard corpus
# and output performance statistics.
# Authors: Margaret Douglass, Ishna Neamatullah, William J. Long, Li-wei Lehman
# Last modified by Li-wei Lehman (lilehman@mit.edu) Nov. 2007
#Description: This configuration file allows you to
# (1) set certain global variables (2) turn certain filters on/off,
# and (3) turn certain dictionaries on/off. The general format is
# <configuration string> = <value>
#
# The <value> is 'y'/'n' or '0'/'1' or some other values depending
# on the configuration string
# For example, for the configuration string "Gold standard comparison",
# set the value to either 0 or 1. See rest of this file for more examples.
# IMPORTANT: do not change the configuration string, as this might
# cause the software to not recognize the configuration setting.
########################################################################
#################Configure Comparison or Output Mode####################
# "Gold standard comparison = 0" for output mode.
# "Gold standard comparison = 1" for performance comparison mode; a
# gold standard corpus and PHI list must be provided.
Gold standard comparison = 1
#Gold standard comparison = 0
########################################################################
########Configure Date Related Variables for De-identification##########
# Date offset should be an integer that represents the number of days
# by which to shift dates when re-identifying dates in the medical notes.
# This date offset will be applied to all patients. To use a different
# date shift for each patient, set "PID to date offset mapping"
# to 'y', and provide the mapping in a file called "shift.txt" in the
# same directory.
# Date offset is 0 in GS comparison mode, since we are not outputting
# any de-identified text with date shift. For output mode, set Date offset
# to the number of days that will be used to shift dates for all
# patients. Note that this offset is ignored if a PID to date offset file
# is available.
Date offset = 0
# PID to date shift mappings: if set to 'y', the code will
# load patient-specific date shifts from the file "shift.txt".
PID to date offset mapping = n
#Format for the default date should be MM/DD/YYYY
Date Default = 01/01/2020
# The "Two Digit Year Threshold" is used to determine whether
# to interpret the year as a year in the 1900's or 2000's.
# Must be a 1- or 2-digit number.
# Two digit years > Threshold are interpreted as in the 1900's
# Two digit years <= Threshold are interpreted as in the 2000's
# The following threshold is set according to the re-identified dates
# that appear in our gold standard corpus.
Two Digit Year Threshold = 30
########################################################################
##################Configure De-identification Filters:##################
# De-identification filters used:
# PHI categories filtered:
# 1. Social Security Numbers (SSN)
# 2. Uniform Resource Locators (URL)
# 3. Email addresses
# 4. Telephone/fax numbers
# 5. Provider/unit/medical record numbers
# 6. Ages over 90
# 7. Locations and hospital names
# 8. Dates
# 9. Names
# 10. U.S. States
#Note:
# The GS (gold standard) filter targets patterns (e.g. ward names) specific
# to the gold standard corpus (which consists of nursing notes). The DS
# filter should always be set to "n" for this distribution, as it applies
# only to patterns we see in our discharge summaries, which are not included
# in this distribution.
# Use 'y' to set the filter on or 'n' to turn the filter off
SSN filter = y
URL filter = y
Email filter = y
Telephone filter = y
Unit number filter = y
Age filter = y
Location filter = y
Date filter = y
Name filter = y
State filter = n
GS filter = y
DS filter = n
#########################################################################
#########Configure Dictionary Loading for De-identification #############
# Note: there are more dictionaries than listed here. The ones listed
# here are those whose loading you can enable or disable.
# Generic first/lastname dictionaries are always loaded.
# Lists used:
# 1. PID to patient name mappings: "lists/pid_patientname.txt";
# 2. Country names: "lists/countries_unambig.txt";
# 3. Company names:
# a) "lists/company_names_unambig.txt",
# b) "lists/company_names_ambig.txt".
# 4. Ethnicities: "lists/ethnicities_unambig.txt";
# 5. Hospitals: "lists/stripped_hospitals.txt";
# 6. Locations:
# a) "lists/locations_unambig.txt",
# b) "lists/locations_ambig.txt".
# 7. LocalPlaces:
# a) "lists/local_places_unambig.txt",
# b) "lists/local_places_ambig.txt".
# 8. Doctor names:
# a) "lists/doctor_first_names.txt"
# b) "lists/doctor_last_names.txt"
# 9. US States:
# a) "lists/us_states.txt"
# b) "lists/us_states_abbre.txt"
# c) "lists/more_us_state_abbreviations.txt"
#Configure lists/dictionaries:
# Use 'y' to load the dictionary or 'n' to not load the dictionary
# Note that we load the State dictionary for de-identification
# of zip code patterns and of university/college names that
# contain State names.
PID to patient name mapping = y
Country names = n
Company names = y
Ethnicities = n
Hospital names = y
Location names = y
Doctor names = y
LocalPlaces names = y
State names = y