De-Identification Software Package 1.1
(191,762 bytes)
#!/usr/bin/perl
#**************************************************************************************************************
#file: deid.pl Original author: M. Douglass 2004
#Last revised: Apr 2009 DeID 1.1
#
#
#_______________________________________________________________________________
#
#deid.pl: De-identification algorithm -- scrubs PHI from free-text medical records
#(e.g. Discharge Summaries and Nursing Notes)
#
#Copyright (C) 2004-2007 Margaret Douglas and Ishna Neamatullah
#
#This code is free software; you can redistribute it and/or modify it under
#the terms of the GNU Library General Public License as published by the Free
#Software Foundation; either version 2 of the License, or (at your option) any
#later version.
#
#This library is distributed in the hope that it will be useful, but WITHOUT ANY
#WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
#PARTICULAR PURPOSE. See the GNU Library General Public License for more
#details.
#
#You should have received a copy of the GNU Library General Public License along
#with this code; if not, write to the Free Software Foundation, Inc., 59
#Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
#You may contact the author by e-mail or postal mail
#(MIT Room E25-505, Cambridge, MA 02139 USA). For updates to this software,
#please visit PhysioNet (http://www.physionet.org/).
#_______________________________________________________________________________
#
# De-identification Algorithm: Scrubs PHI from free-text medical records
#(e.g. Discharge Summaries and Nursing Notes)
# Original version written by:
# Margaret Douglass (douglass AT alum DOT mit DOT edu)
# William Long (wjl AT mit DOT edu)
# Modified by:
# Ishna Neamatullah (ishna AT alum DOT mit DOT edu) in Sept 5, 2006
# Last modified by:
# Li-wei Lehman (lilehman AT alum DOT mit DOT edu) in April, 2009
#
# Command to run the software:
# perl deid.pl <filename> <config_filename>
#
# Input arguments:
# filename (without extension, where extension must be .text): file to be de-identified
# config_filename: configuration file
#
# Required library file: stat.pm
#
# Output files:
# filename.res: de-identified file
# filename.phi: file containing all PHI locations, used in calculating performance statistics
# filename.info: file containing information useful for debugging
# code performance statistics printed on screen if Gold Standard available for nursing notes (details in README)
#**************************************************************************************************************
#use Stat;
# Declaring some variables for algorithm run configuration
# Variables to switch on/off filter functions
$allfilters = "";
$ssnfilter = "";
$urlfilter = "";
$emailfilter = "";
$telfilter = "";
$unitfilter = "";
$agefilter = "";
$locfilter = "";
$datefilter = "";
$namefilter = "";
$us_state_filter = "";
$ds_specific_filter = ""; #filter for discharge summmary specific patterns
$gs_specific_filter = ""; #filter for gold std specific patterns
my $offset; # positive date shift in number of days
my $comparison = ""; # 1=comparison with gold standard, 0=no comparison with gold standard
# Variables to switch on/off dictionaries/lists
$alllists = "";
$pid_patientname_list = "";
$pid_dateshift_list = "";
$country_list = "";
$company_list = "";
$ethnicity_list = "";
$hospital_list = "";
$doctor_list = "";
$location_list = "";
$local_list = "";
$state_list = "";
use Time::Local;
my ($mday,$mon,$year) = (localtime(time))[3,4,5];
my $shortyear = substr($year,1,4);
my $longyear = '20'.$shortyear;
# Nursing note date as retrieved from the header
my $nn_year;
# Sets whether a de-identified version should be output: 0 = no new version of text output, 1 = fully de-identified version of text output
# Note: Generally keep output_deid_text = 1
$output_deid_text = 1;
#Default date used to de-identify the Gold Std if no default date is specified in the
#config file. You can change the default date by setting the "Default date" variable to
#some other dates (in MM/DD/YYYY format).
$DEFAULT_DATE = "01/01/2000";
#The "Two Digit Year Threshold" is used to determine whether
#to interpret the year as a year in the 1900's or 2000's.
#Must be a 1- or 2-digit number.
#Two digit years > Threshold are interepreted as in the 1900's
#Two digit years <= Threshold are interpreted as in the 2000's
$TWO_DIGIT_YEAR_THRESHOLD = 30;#change this default by setting "Two Digit Year Threshold" in config file.
# "Valid Year Low" and "Valid Year High" (must be 4-digit numbers) are
# used in certain date pattern checking routines to determine if a
# four digit number that appear in a potential
# date pattern is a year or not -- it is a valid year if it is
# in the range of [Valid Year Low, Valid Year High].
$VALID_YEAR_LOW = 1900;
$VALID_YEAR_HIGH = 2030;
my @known_phi;
my @known_first_name;
my @known_last_name;
# Hash that stores PHI information from de-identification
# KEY = (patient number), VALUE = array with each element = (%HASH with KEY = start-end, VALUE = array of types of PHI for the note number/index of the array)
%all_phi;
my %ID;
# Declares some global variables
%lhash = ();
@typename = ();
$ntype = 1;
@extend = ('phrases');
##########################################################################################
# Sets paths to lists and dictionaries in working directory that will be used in this algorithm
$date_shift_file = "shift.txt"; # contains mapping of PID to date offset
$countries_file = "lists/countries_unambig.txt";
$ethnicities_unambig_file = "lists/ethnicities_unambig.txt";
$companies_file = "lists/company_names_unambig.txt";
$companies_ambig_file = "lists/company_names_ambig.txt";
$common_words_file = "dict/common_words.txt";
$medical_words_file = "dict/sno_edited.txt";
$very_common_words_file = "dict/commonest_words.txt";
$female_unambig_file = "lists/female_names_unambig.txt";
$female_ambig_file = "lists/female_names_ambig.txt";
$female_popular_file = "lists/female_names_popular.txt";
$male_unambig_file = "lists/male_names_unambig.txt";
$male_ambig_file = "lists/male_names_ambig.txt";
$male_popular_file = "lists/male_names_popular.txt";
$last_unambig_file = "lists/last_names_unambig.txt";
$last_ambig_file = "lists/last_names_ambig.txt";
$last_popular_file = "lists/last_names_popular.txt";
#$last_name_prefixes_file = "lists/last_name_prefixes.txt";
$doctor_first_unambig_file = "lists/doctor_first_names.txt";
$doctor_last_unambig_file = "lists/doctor_last_names.txt";
$prefixes_unambig_file = "lists/prefixes_unambig.txt";
$locations_unambig_file = "lists/locations_unambig.txt";
$locations_ambig_file = "lists/locations_ambig.txt";
$local_places_ambig_file = "lists/local_places_ambig.txt";
$local_places_unambig_file = "lists/local_places_unambig.txt";
$hospital_file = "lists/stripped_hospitals.txt";
$last_name_prefix_file = "lists/last_name_prefixes.txt";
$patient_file = "lists/pid_patientname.txt"; # contains mapping of PID to patient name
$us_states_file = "lists/us_states.txt";
$us_states_abbre_file = "lists/us_states_abbre.txt";
$more_us_states_abbre_file = "lists/more_us_state_abbreviations.txt";
$us_area_code_file = "lists/us_area_code.txt";
$medical_phrases_file = "dict/medical_phrases.txt";
$unambig_common_words_file = "dict/notes_common.txt";
############################################################################################################
# Declares some arrays of context words that can be used to identify PHI
# Days of the month
@days = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday");
# Titles that precede last names (ignore .'s)
@titles = ("MISTER", "DOCTOR", "DOCTORS", "MISS", "PROF", "PROFESSOR", "REV", "RABBI", "NURSE", "MD", "PRINCESS", "PRINCE", "DEACON", "DEACONESS", "CAREGIVER", "PRACTITIONER", "MR", "MS");
@strict_titles = ("Dr", "DRS", "Mrs"); #treat words after these strict_titles as PHI
%titles = ();
foreach $title (@titles){
$titles{$title} = 1;
}
# Name indicators that precede or follow names
@name_indicators = ("problem","problem:", "proxy", "daughter","daughters", "dtr", "son", "brother","sister", "mother", "mom", "father", "dad", "wife", "husband", "neice", "nephew", "spouse", "partner", "cousin", "aunt", "uncle", "granddaughter", "grandson", "grandmother", "grandmom", "grandfather", "granddad", "relative", "friend", "neighbor", "visitor", "family member", "lawyer", "priest", "rabbi", "coworker", "co-worker", "boyfriend", "girlfriend", "name is", "named", "rrt", "significant other", "jr", "caregiver", "proxys", "friends", "sons", "brothers", "sisters", "sister-in-law", "brother-in-law", "mother-in-law", "father-in-law", "son-in-law", "daughter-in-law", "dtr-in-law", "surname will be", "name will be", "name at discharge will be", "name at discharge is");
# Phrases that precede locations
@location_indicators = ("lives in", "resident of", "lived in", "lives at", "comes from", "called from", "visited from", "arrived from", "returned to");
@employment_indicators_pre = ("employee of", "employed by", "employed at", "CEO of", "manager at", "manager for", "manager of", "works at", "business");
# Hospital indicators that follow hospital names
@hospital_indicators = ("Hospital", "General Hospital", "Gen Hospital", "gen hosp", "hosp", "Medical Center", "Med Center", "Med Ctr", "Rehab", "Clinic", "Rehabilitation", "Campus", "health center", "cancer center", "development center", "community health center", "health and rehabilitation", "Medical", "transferred to", "transferred from", "transfered to", "transfered from");
# Location indicators that follow locations
@loc_indicators_suff = ("city", "town", "beach", "valley","county", "harbor", "ville", "creek", "springs", "mountain", "island", "lake", "lakes", "shore", "garden", "haven", "village", "grove", "hills", "hill", "shire", "cove", "coast", "alley", "street", "terrace", "boulevard", "parkway", "highway", "university", "college", "tower");
# Location indicators that are most likely preceded by locations
@loc_ind_suff_c = ("town", "ville", "harbor", "tower");
# Location indicators that precede locations
#@loc_indicators_pre = ("cape", "fort", "lake", "mount", "santa", "los", "great","east","west","north","south");
@loc_indicators_pre = ("cape", "fort", "lake", "mount", "santa", "los", "east","west","north","south");
@apt_indicators = ("apt", "suite"); #only check these after the street address is found
@street_add_suff = ("park", "drive", "street", "road", "lane", "boulevard", "blvd", "avenue", "highway", "circle","ave", "place", "rd", "st");
#Strict street address suffix: case-sensitive match on the following,
#and will be marked as PHI regardless of ambiguity (common words)
@strict_street_add_suff = ("Park", "Drive", "Street", "Road", "Lane", "Boulevard", "Blvd", "Avenue", "Highway","Ave",,"Rd", "PARK", "DRIVE", "STREET", "ROAD", "LANE", "BOULEVARD", "BLVD", "AVENUE", "HIGHWAY","AVE", "RD");
# Age indicators that follow ages
@age_indicators_suff = ("year old", "y\. o\.", "y\.o\.", "yo", "y", "years old", "year-old", "-year-old", "years-old", "-years-old", "years of age", "yrs of age");
# Age indicators that precede ages
@age_indicators_pre = ("age", "he is", "she is", "patient is");
# Digits, used in identifying ages
@digits = ("one","two","three","four","five","six","seven","eight","nine", "");
# Different variations of the 12 months
@months = ("January", "Jan", "February", "Feb", "March", "Mar", "April", "Apr", "May", "June", "Jun", "July", "Jul", "August", "Aug", "September", "Sept", "Sep", "October", "Oct", "November", "Nov", "December", "Dec");
######################################################################################
# If the correct number of input argument is provided, sets the input and output filenames.
if ($#ARGV == 1) {
$data_file = "$ARGV[0].text"; # data_file: input file
$output_file = "$ARGV[0].phi"; # output_file: file containing PHI locations
$debug_file = "$ARGV[0].info"; # debug_file: file used for debugging, contains PHI and non-PHI locations
$deid_text_file = "$ARGV[0].res"; # deid_text_file: de-identified text file
$gs_file = "$ARGV[0].deid"; # gs_file: Gold Standard of the input file
print "\n*******************************************************************************************************************\n";
print "De-Identification Algorithm: Identifies Protected Health Information (PHI) in Discharge Summaries and Nursing Notes";
print "\n*******************************************************************************************************************\n";
$config_file = $ARGV[1];
open CF, $config_file or die "Cannot open $config_file";
while ($cfline = <CF>) {
chomp $cfline;
if ($cfline =~ /\A[\#]+/){
next;
}
if ($cfline =~ /\bGold\s+standard\s+comparison\s*\=\s*([0-9])/ig) {
$comparison = ($1);
}
#Date default expects MM/DD/YYYY
if ($cfline =~ /\bDate\s+default\s*\=\s*(\d\d)\/(\d\d)\/(\d\d\d\d)/ig) {
my $mm = $1; $dd = $2; $yyyy = $3;
$DEFAULT_DATE = "$mm/$dd/$yyyy";
#print "Default date is $DEFAULT_DATE\n";
}
#The "Two Digit Year Threshold" is used to determine whether
#to interpret the year as a year in the 1900's or 2000's
if ($cfline =~ /\bTwo\s+Digit\s+Year\s+Threshold\s*=\s*(\d{1,2})/ig) {
$TWO_DIGIT_YEAR_THRESHOLD = "$1";
#print "Two Digit Year Threshold is $TWO_DIGIT_YEAR_THRESHOLD\n";
}
if ($cfline =~ /\bDate\s+offset\s*\=\s*([0-9]+)/ig) {
$offset = ($1);
#print "Date offset is $1\n";
}
if ($cfline =~ /\bSSN\s+filter\s*\=\s*([a-z])/ig) {
$ssnfilter = ($1);
}
if ($cfline =~ /\bURL\s+filter\s*\=\s*([a-z])/ig) {
$urlfilter = ($1);
}
if ($cfline =~ /\bEmail\s+filter\s*\=\s*([a-z])/ig) {
$emailfilter = ($1);
}
if ($cfline =~ /\bTelephone\s+filter\s*\=\s*([a-z])/ig) {
$telfilter = ($1);
}
if ($cfline =~ /\bUnit\s+number\s+filter\s*\=\s*([a-z])/ig) {
$unitfilter = ($1);
}
if ($cfline =~ /\bAge\s+filter\s*\=\s*([a-z])/ig) {
$agefilter = ($1);
}
if ($cfline =~ /\bLocation\s+filter\s*\=\s*([a-z])/ig) {
$locfilter = ($1);
}
if ($cfline =~ /\bDate\s+filter\s*\=\s*([a-z])/ig) {
$datefilter = ($1);
}
if ($cfline =~ /\bName\s+filter\s*\=\s*([a-z])/ig) {
$namefilter = ($1);
}
if ($cfline =~ /\bState\s+filter\s*\=\s*([a-z])/ig) {
$us_state_filter = ($1);
}
if ($cfline =~ /\bDS\s+filter\s*\=\s*([a-z])/ig) {
$ds_specific_filter = ($1);
}
if ($cfline =~ /\bGS\s+filter\s*\=\s*([a-z])/ig) {
$gs_specific_filter = ($1);
}
######################################################
#get the config info for dictionaries loading
if ($cfline =~ /\bPID\s+to\s+patient\s+name\s+mapping\s*\=\s*([a-z])/ig) {
$pid_patientname_list = ($1);
}
if ($cfline =~ /\bPID\s+to\s+date\s+offset\s+mapping\s*\=\s*([a-z])/ig) {
$pid_dateshift_list = ($1);
}
if ($cfline =~ /\bCountry\s+names\s*\=\s*([a-z])/ig) {
$country_list = ($1);
}
if ($cfline =~ /\bCompany\s+names\s*\=\s*([a-z])/ig) {
$company_list = ($1);
}
if ($cfline =~ /\bEthnicities\s*\=\s*([a-z])/ig) {
$ethnicity_list = ($1);
}
if ($cfline =~ /\bHospital\s+names\s*\=\s*([a-z])/ig) {
$hospital_list = ($1);
}
if ($cfline =~ /\bLocation\s+names\s*\=\s*([a-z])/ig) {
$location_list = ($1);
}
if ($cfline =~ /\bDoctor\s+names\s*\=\s*([a-z])/ig) {
$doctor_list = ($1);
}
if ($cfline =~ /\bLocalPlaces\s+names\s*\=\s*([a-z])/ig) {
$local_list = ($1);
}
if ($cfline =~ /\bState\s+names\s*\=\s*([a-z])/ig) {
$state_list = ($1);
}
}
}
# Prints an error message on the screen if number of arguments is incorrect
else {
print "\n===========================================================================================";
print "\nError: Wrong number of arguments entered";
print "\nThe algorithm takes 2 arguments:";
print "\n 1. filename (the filename of medical notes, without extension, where extension must be .text)";
print "\n 2. config_filename (the configuration filename)";
print "\nExample (for Gold Standard Comparison): perl deid.pl id deid.config";
print "\nExample (for output mode using Gold Standard): perl deid.pl id deid-output.config";
print "\nFor further documentation, please consult the README.txt file";
print "\n===========================================================================================\n";
exit;
}
# After setting file names and configuring the run, indicates that de-identification has commenced
print "\n\nStarting de-identification (version 1.1) ...\n\n";
#check if we can open the .phi file
open F, ">$output_file" or die "Cannot open $output_file";
close F;
# Calls setup to create some lookup lists in memory
setup();
if ($comparison==1) {
print "Running deid in performance comparison mode.\n";
print "Using PHI locations in $gs_file as comparison. Output files will be:\n";
print "$output_file: the PHI locations found by the code.\n";
print "$debug_file: debug info about the PHI locations.\n";
#check if the gold std file exists
open GS, $gs_file or die "Cannot open $gs_file. Make sure that the gold standard file exists!\n"; # GS = Gold Standard file
close GS;
}
else {
#check if we can open the .res file
open F, ">$deid_text_file" or die "Cannot open $deid_text_file";
close F;
print "Running deid in output mode. Output files will be: \n";
print "$output_file: the PHI locations found by the code.\n";
print "$deid_text_file: the scrubbed text.\n";
print "$debug_file: debug info about the PHI locations.\n";
}
deid();
# Calls function stat() to calculate code performance statistics, if comparison mode = 1
if ($comparison==1) {
require "stat.pm";
&stat($gs_file, $output_file);
}
# End of top level of code
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
sub numerically {
#print "a is $a b is $b\n";
$a <=> $b;
}
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Reads in a file and pushes each line onto an array. Returns the array.
sub preload {
my ($file) = @_;
my @res = ();
open FILE, $file or die "Cannot open file $file";
while ($line = <FILE>) {
chomp $line;
push(@res, uc($line));
}
close FILE;
return @res;
}
# End of preload()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Reads in a file and pushes each line onto an array. Returns the array.
sub preload_uc {
my ($file) = @_;
my @res = ();
open FILE, $file or die "Cannot open file $file";
while ($line = <FILE>) {
chomp $line;
push(@res, uc($line));
}
close FILE;
return @res;
}
# End of preload_uc()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Reads in a file and creates a dictionary that records each line in that file under the given association, by mapping the line to a '1' in the dictionary of the association
sub preload_assoc {
my ($file,$assoc) = @_;
open FILE, $file or die "Cannot open file $file";
while ($line = <FILE>) {
chomp $line;
$$assoc{uc($line)}=1;
}
close FILE;
}
# End of preload_assoc()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Reads in a file and calls setup_hash on each line
sub setup_hash {
my ($file, $tname) = @_;
my @entry;
$typename[$ntype]= $tname;
open FILE, $file or die "Cannot open file $file";
while ($line = <FILE>) {
chomp $line;
&setup_item($ntype,$line);
}
$ntype++;
close FILE;
}
# End of setup_hash()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Reads in a file and calls setup_hash on each line
sub setup_lst_hash {
my ($tname, @hlst) = @_;
$typename[$ntype]= $tname;
foreach $line (@hlst) {
&setup_item($ntype,$line);
}
$ntype++;
}
# End of setup_lst_hash()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
sub setup_item {
my ($type,$line) = @_;
my ($head, @lst) = split (/([^a-zA-Z0-9_\']+)/,uc($line));
my $ix = $type;
if(@lst){
push @extend, [@lst];
$ix = "$type,$#extend";
}
my $entry = $lhash{$head};
if ($entry){
$lhash{$head} .= "|" . $ix;
}
else{$lhash{$head}=$ix;
}
}
# End of setup_item()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
sub typename {
my ($num) = @_;
return($typename[$num]);
}
# End of typename()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Takes in an array of words and compares them with hashes of known PHI. Recognizes and adds PHI using addType().
sub lookhash {
my @txtlst = @_;
my $pos = 0;
my $npos = 0;
my $tl,$ty;
my $txt = 1; # item is text not separator
my $item;
while (@txtlst) {
$item = shift(@txtlst);
$npos = $pos + length($item);
if ($item =~ /([a-zA-Z\']+)/ig) {
if($item){
$item = uc($item);
$tl = $lhash{$item};
#if the term ends with 's, remove it and see if
#there is a match in PHI hash
#if (!($tl) && $item =~/([a-zA-Z\'\-]+)\'s$/ig) {
if (!($tl) && $item =~/([a-zA-Z\']+)\'s$/ig) {
$tl = $lhash {$1};
}
if($tl){
# Compares with known PHI by calling bestmatch()
($xpos, $done, @types)=&bestmatch($tl,@txtlst);
splice(@txtlst,0,$#txtlst-$done);
$npos += $xpos;
foreach $typ (@types){
#print "item $item, adding type $type for position $pos-$npos\n";
#print "type is $typ , key is $pos - $npos \n";
addType("$pos-$npos",$typ);
#print "positions are $pos-$npos, typ is $typ\n";
}
}
} #end if $item
$txt = 0;
} #end if $txt
else {
$txt = 1;
}
$pos = $npos;
}
}
# End of lookhash()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
sub bestmatch {
my ($tl,@txtlst) = @_;
my $pos = 0;
my $bestpos = 0;
my $bestrest = $#txtlst;
my @besttyp = ();
my ($type, $rest, $xpos, @nlst);
foreach $ck (split '\|',$tl) {
($type, $rest) = split ',',$ck;
if($rest) {
($xpos , @nlst) = &matchrest($rest,@txtlst);
if ($xpos) {
if($xpos > $bestpos) {
$bestrest = $#nlst; $bestpos = $xpos;
@besttyp = ($typename[$type]);
}
elsif($xpos == $bestpos) {
push @besttyp,$typename[$type];
}
}
}
elsif ($bestpos == 0) {
push @besttyp,$typename[$type];
}
}
return ($bestpos, $bestrest, @besttyp);
}
# End of bestmatch()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
sub matchrest {
my ($rest, @txtlst) = @_;
my @mlst = @{$extend[$rest]};
my $item;
my $pos = 0;
my $pm = 0;
my $tmppos = 0;
if ($mlst[0] eq '|') {
$pm = 1; shift(@mlst);
}
foreach $i (@mlst) {
if ($i !~ /([a-zA-Z\'\-]+)$/ig){
next;
}
$item = shift(@txtlst);
$pos += length($item);
while ( $#txtlst >= 0 && ($item !~ /([a-zA-Z\'\-]+)/ig)){
$item = shift(@txtlst);
$pos += length ($item);
#print "item is $item, len is $#txtlst";
}
if ($pm ? (uc($item) !~ /$i/) : ($i ne uc($item))) {
#print "pm is $pm, returning zero\n";
return 0;
}
}
return ($pos, @txtlst);
}
# End of matchrest()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: setup()
# Arguments: None
# Returns: None
# Called by: Topmost level of code
# Description: Creates some lookup lists to have in memory
sub setup {
# This part is not necessary
open LP, $last_name_prefix_file or die "Cannot open $last_name_prefix_file";
while ($line = <LP>) {
chomp $line;
$prefixes{uc($line)} = 1;
}
close LP;
#Added to reduce false positives
&setup_hash($medical_phrases_file,"MedicalPhrase");
# Sets up hashes of some PHI lists for direct identification
if ($namefilter =~ /y/) {
&setup_hash($female_unambig_file,"Female First Name (un)");
&setup_hash($female_ambig_file,"Female First Name (ambig)");
&setup_hash($male_unambig_file,"Male First Name (un)");
&setup_hash($male_ambig_file,"Male First Name (ambig)");
&setup_hash($last_unambig_file,"Last Name (un)");
&setup_hash($last_popular_file,"Last Name (popular/ambig)");
&setup_hash($last_ambig_file,"Last Name (ambig)");
&setup_hash($female_popular_file, "Female First Name (popular/ambig)");
&setup_hash($male_popular_file, "Male First Name (popular/ambig)");
if ($doctor_list =~ /y/) {
&setup_hash($doctor_first_unambig_file, "Doctor First Name");
&setup_hash($doctor_last_unambig_file, "Doctor Last Name");
}
}
if ($locfilter =~ /y/) {
if ($location_list =~ /y/) {
&setup_hash($locations_ambig_file,"Location (ambig)");
&setup_hash($locations_unambig_file,"Location (un)");
} else {
@loc_unambig = ();
@more_loc_unambig = ();
@loc_ambig = ();
}
if ($hospital_list =~ /y/) {
&setup_hash($hospital_file,"Hospital");
}
if ($ethnicity_list =~ /y/) {
&setup_hash($ethnicities_unambig_file, "Ethnicity");
}
if ($company_list =~ /y/) {
&setup_hash($companies_file, "Company");
&setup_hash($companies_ambig_file, "Company (ambig)");
}
if ($country_list =~ /y/) {
&setup_hash($countries_file, "Country");
}
if ($local_list =~ /y/){
&setup_hash($local_places_unambig_file, "Location (un)");
&setup_hash($local_places_ambig_file, "Location (ambig)");
}
}
# Preloads PHI in some lists into corresponding arrays
@female_popular = &preload($female_popular_file);
@male_popular = &preload($male_popular_file);
#@last_name_prefixes = &preload_uc($last_name_prefixes_file);
@prefixes_unambig = &preload_uc($prefixes_unambig_file);
if ($hospital_list =~ /y/) {
@hospital = &preload($hospital_file);
} else {@hospital = ();}
if ($state_list =~ /y/){
@us_states = &preload($us_states_file);
@us_states_abbre = &preload($us_states_abbre_file);
@more_us_states_abbre = &preload($more_us_states_abbre_file);
}
# Generates associations between PHI in some lists and PHI categories
&preload_assoc($common_words_file,"common_words");
&preload_assoc($medical_words_file,"common_words");
&preload_assoc($very_common_words_file,"very_common_words");
&preload_assoc($unambig_common_words_file, "unambig_common_words");
&preload_assoc($male_unambig_file, "male_unambig");
&preload_assoc($female_unambig_file, "female_unambig");
&preload_assoc($female_ambig_file, "female_ambig");
&preload_assoc($male_ambig_file, "male_ambig");
&preload_assoc($last_ambig_file, "last_ambig");
&preload_assoc($male_popular_file, "male_popular");
&preload_assoc($female_popular_file, "female_popular");
&preload_assoc($us_area_code_file,"us_area_code");
# Opens debug file for debugging
open D, ">".$debug_file;
close D;
}
# End of setup()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: deid()
# Arguments: None
# Returns: None
# Called by: Topmost level of code
# Description: One of the 2 major branches of the code
# This function takes over de-identification if no performance statistic is required
# Function first loads the patient name list file.
# Then reads the medical text line by line.
# If the line indicates START_OF_RECORD, read the Patient ID (PID), Note ID (NID), (and Note Date if any) info
# Scan for PHI a paragraph at a time
# output PHI locations into .phi file
my $currentID;
my @known_phi;
my @known_first_name;
my @known_last_name;
my %pidPtNames; # key = pid, value = [0] first name [1] last name
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
sub deid {
$allText = "";
$allallText = "";
open DF, $data_file or die "Cannot open $data_file";
my $paraCount = 0;
my $stpos = 0;
my %deids; #key = start index, value = array of (end index, ID)
my %phiTerms; #key = lc(word), value = # of occurrences
my %phiT;
my %phiTT;
my $noteDate;
my $line;
$currentID = 0; #initialize current ID to a non-existent PID 0
# Code runs through text by paragraph so things that extend over lines aren't missed
$para = "";
#load the patient name file
if ($pid_patientname_list =~ /y/) {
open PF, $patient_file or die "Cannot open $patient_file";
while ($pfline = <PF>) {
chomp $pfline;
if ($pfline =~ /((.)+)\|\|\|\|((.)+)\|\|\|\|((.)+)/ig) {
my $pid = $1;
$known_first_names = $3;
$known_last_names = $5;
$pidPtNames{$pid}[0] = $known_first_names;
$pidPtNames{$pid}[1] = $known_last_names;
}# end if pfline = ~ /((.)+)\|\|\|\|((.)+)\|\|\|\|((.)+)/ig)
} # end while $pfline = <PF>
} #end if pid_patient_name_list ~= y
###End loading the patient names
while ($line = <DF>) {
#If this is a new record, set PID, Note ID
#we assume all notes for the same patient go together
if ( $line =~ /\ASTART_OF_RECORD=(([0-9]+)(\|)+([0-9]+)(\|)+(\d\d\/\d\d\/\d{4})?)/) {
# $currentNote = $2;
# $thePT = $4;
# $noteDate =$6; #noteDate is empty in gold std
#print "pt $thePT, currentNote $currentNote\n";
#if it's gold standard, the header contains PID then NID
$currentNote = $4;
$thePT = $2;
$noteDate =$6; #noteDate is empty in gold std
my $label = "Patient $thePT\tNote $currentNote";
open DEST, ">>".$debug_file or die "Cannot open $debug_file";
print DEST "$label\n";
close DEST;
#Gold Standard corpus does not specify note date, so assign a default
#If you would like deid to date shift the notes on a per note basis, make sure you
#specify the record date in the header!
if (length($noteDate) ==0){
#$noteDate="01/01/2020";
$noteDate = $DEFAULT_DATE;
}
#if this is a new patient, set the PID, and lookup patient name
if ($thePT != $currentID) { #This is a new patient!
$currentID = $thePT;
%phiTerms = (); #clear the phiTerms on new pt
# Find the patient name for the current PID
#lookup first and last name of this patient
if ($pid_patientname_list =~ /y/ && exists $pidPtNames{$currentID} ) {
$known_first_names = $pidPtNames{$currentID}[0];
$known_last_names = $pidPtNames{$currentID}[1];
#print "Found patient names, first = $known_first_names last = $known_last_names\n";
if ($known_first_names =~ /([a-z][a-z]+)[\s\-]([a-z][a-z]+)/ig) {
@known_first_name = ($1, $2);}
else {
if ($known_first_names =~ /\b([a-z])(\.?)\s([a-z]+)/ig) {
@known_first_name = ($3);
}
elsif ($known_first_names =~ /\b([a-z][a-z]+)\s([a-z])(\.?)/ig) {
@known_first_name = ($1);
}
else {
@known_first_name = ($known_first_names);
}
}
if ($known_last_names =~ /([a-z][a-z]+)[\s\-]([a-z][a-z]+)/ig) {
@known_last_name = ($1, $2);}
else {
if ($known_last_names =~ /\b([a-z])(\.?)\s([a-z]+)/ig) {
@known_last_name = ($3);
}
elsif ($known_last_names =~ /\b([a-z][a-z]+)\s([a-z])(\.?)/ig) {
@known_last_name = ($1);
}
else {
@known_last_name = ($known_last_names);
}
}
} else { #if no pid/patient name file, just set the first name and last name to null
@known_first_name = ();
@known_last_name = ();
}
} # end if this is a new patient
#output the header to output file
$allText = ""; #reset,
$allallText = ""; #reset
$stpos = 0;
$para = "";
#if output mode, output the header line (with patient and note ID) to .res file
if ($comparison == 0) {
open TF, ">>$deid_text_file" or die "Cannot open $deid_text_file"; #now open in append mode
print TF "\n$line";
close TF;
}
next; #skip to next line
} #end if this is start of a record
else { #else this is not the start of a record, just append the line to the end of the current text
chomp $line;
$allText .= $line."\n";
$allallText .= $line."\n";
#$myline = $line;
#chomp $myline;
#$allText .= $myline."\n";
#$allallText .= $myline."\n";
}
#Look for paragraph separator: if this is a line is entirely non-alphanumeric or
#if it starts with spaces, or if it is an empty line, or if this line marks the end of record,
#then call findPHI() for the current paragraph we have so far.
#If end of record is encoutnered, output the de-id text; else if
#it's not end of record yet, append the line to the paragraph.
# if ( (!($line =~ /[a-zA-Z\d]+/)) || $line =~ /^ +/ || $line eq "" || ($line =~ /\|\|\|\|END_OF_RECORD/) ) {
if ( (!($line =~ /[a-zA-Z\d]+/)) || ($line eq "") || ($line =~ /\|\|\|\|END_OF_RECORD/) ) {
if ($para =~ /\w/ ){ #if para contains alphanumeric
# Calls findPHI() with current paragraph; resulting PHI locations are stored in %phiT
#%phiT = findPHI("Para $paraCount", $date, $stpos, $para);
%phiT = findPHI("Para $paraCount", $noteDate, $stpos, $para);
$paraCount++;
# %phiT is copied over to %phiTT
foreach $x (sort numerically keys %phiT) {
@{$phiTT{$x}} = @{$phiT{$x}};
}
#Sorts keys in %phiT; outputs text accordingly
foreach $k (keys %phiT) {
my ($start, $end) = split '-', $k;
# $deids_end = ${@{$deids{$start}}}[0]; #does not work with perl v5.10
my @deidsval = ${@{$deids{$start}}};
$deids_end = $deidsval[0];
$found = $phiT{$k};
foreach $t (@{$phiT{$k}}) {
}
my $word = lc(substr $allallText, $start, ($end - $start));
#print "Key in PhiT = $k, word = $word\n"; #DEBUG
# if ($end > ${@{$deids{$start}}}[0]) {
if ($end > $deidsval[0]) {
$deids{$start}[0] = $end;
$deids{$start}[1] = $currentID;
$deids{$start}[2] = $noteDate;
}
#############################################################################
#Now remember the PHI terms that are important names for checking for repeated occurrences of PHIs
#PHI Name Tags
#(NI) Name indicators
#(LF) Lastname Firstnames
#(PTitle) plural titles
#(MD) followed by "MD" or "M.D"
#(PRE) checks up to 3 words following "PCP Name" ("PCP", "physician", "provider", "created by", "name");
#(NameIs) followed by pattern "name is"
#(Prefixes) for @prefixes_unambig)
#(STitle) @specific_titles = ("MR", "MISTER", "MS");
#(Titles) @titles
#(NamePattern) all other name patterns in sub name3
#remember all the PHI of type name (strict_titles) and name (indicators)
#print "checking for repeated occurences of PHIs: word is $word, phitype is (@{$phiT{$k}})\n";
if ( ($word !~ /\d/) && ((length $word) > 3) && !(isCommon($word)) &&
( isPHIType( "(NI)", @{$phiT{$k}}) ||
isPHIType( "(PTitle)", (@{$phiT{$k}})) || isPHIType( "(LF)", (@{$phiT{$k}})) ||
isPHIType( "(NamePattern)", (@{$phiT{$k}})) ||
isPHIType( "(MD)", (@{$phiT{$k}})) ||
isPHIType( "(NameIs)", (@{$phiT{$k}})) || isPHIType("(STitle)", (@{$phiT{$k}})) ||
isPHIType("(Titles)", (@{$phiT{$k}}) ))
) {
#$phiTerms{$word}++;
if (!(exists $phiTerms{$word})) {
$phiTerms{$word} = 1;}
else {
$phiTerms{$word} = $phiTerms{$word}+ 1;
}
} #end if
} # end foreach $k (keys %phiT)
} #end if ($para =~ /\W/)
if ($line =~ /\|\|\|\|END_OF_RECORD/ ) {
open DEST, ">>".$debug_file or die "Cannot open $debug_file";
####################################################
#check for repeated occurences of PHIs for this note
while ($allallText =~ /\b([A-Za-z]+)\b/g) {
my $token = $1;
my $start = (length ($`)); #$` is the string preceding what was matched by the last successful match
my $end = $start + length($token);
if (!(exists $deids{$start}) ) {
#if (!(exists ${@{$deids{$start}}}[0])) {
L:
foreach $word (keys %phiTerms) {
if ( (uc($token) eq uc($word)) ) {
$deids{$start}[0] = $end;
$deids{$start}[1] = $currentID;
$deids{$start}[2] = $noteDate;
$term = substr $text, $start, ($end - $start +1);
$outstr = "$start \t $end \t $term \t Name (Repeated Occurrence) \n";
print DEST $outstr;
next L;
} #end if
} # end foreach
} # end if
} # end while
#end checking for repeated occurences of PHIs
close DEST;
#####################################################
##output PHI locations to the .phi file
open OUTF, ">>$output_file" or die "Cannot open $output_file";
print OUTF "\nPatient $currentID\tNote $currentNote";
foreach $k (sort numerically keys %deids) {
my @deidvals = @{$deids{$k}};
$thisend = $deidvals[0];
if ($thisend ){
print OUTF "\n$k\t$k\t$thisend";
}
}
close OUTF;
###output de-ided text to .res file
if ($comparison==0) {
outputText(\%deids, \%phiTT);
}
#now that we have output text for this record, we reset the
#variables to get ready for the next record
$para = "";
$paraCount = 0;
$stpos = 0;
%deids=(); #clear the deid hash
%phiTT=(); #clear the phiTT hash
%phiT=();
$allText = ""; #reset,
$allallText = ""; #reset
} else { # this is not end of record yet ...
my $tmp = length($para);
$stpos += length ($para);
$para = $line.' ';
}
} #end if line starts with empty spaces || empty line || end of record
else { # else this line is still a part of the current paragraph
#$para .= ' '.$line; #just append to end of current paragraph
#$para .= $line.' '; #just append to end of current paragraph
if ($line eq "") {
$para .= "\n";
} else {
$para .= $line.' '; #just append to end of current paragraph
}
}
} #end while ($line=<DF>)
close DF;
}
# End of deid()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: findPHI()
# Arguments: string $label ("Para $paraCount"), string $date (e.g. "yyyy-mm-dd"), int $startIndex (normally 0), string $text (paragraph of text)
# Returns: hash %approved (key=start-end of each PHI, value=PHI types)
# Called by: deid()
# Description: Dispatched from deid() to perform de-identification
# Reads in a paragraph of text and runs the de-identification algorithm on it
sub findPHI {
my ($label, $curr_date, $startIndex, $text) = @_;
# Initializes the hash %phi which stores PHI locations and their PHI types
%phi = (); #key = start-end, value = (type1, type2, ...)
local %end = (); # key = start, value = end (dynamic scope so addType can do it)
if ($text !~ /\w/) {
return %phi;
} #[wjl] skip blank lines
# Splits the text into separate items at spaces
#my @txtlst = split (/([^a-zA-Z0-9_\'\-]+)/,$text); # used by lookhash and appx match
my @txtlst = split (/([^a-zA-Z0-9_\']+)/,$text);
# Performs exact matching with the hashes of PHI lists
&lookhash(@txtlst);
# Initializes hash %phik which stores only start and end indices of each PHI in %phi
%phik = ();
foreach $k (keys %phi) {
($st,$end) = split ('-',$k);
$phik{$st} = $end;
}
# Calls each filter module
if ($datefilter =~ /y/) {
&commonHoliday($text, 0);
&date ($text, $curr_date);
&dateWithContextCheck($text, $curr_date);
&yearWithContextCheck($text, $curr_date);
&seasonYear ($text, 0);
}
if ($telfilter =~ /y/) {
&telephone($text, 0);
&pager ($text, 0);
}
if ($locfilter =~ /y/) {
&wardname($text, 0);
&location1 ($text, 0);
&location2 ($text, 0);
}
if ($emailfilter =~ /y/) {
&email ($text, 0);
}
if ($urlfilter =~ /y/) {
&url ($text, 0);
}
if ($ssnfilter =~ /y/) {
&ssn ($text, 0);
}
if ($agefilter =~ /y/) {
&age ($text, 0);
}
if ($namefilter =~ /y/) {
&name1 ($text, 0);
&name2 ($text, 0);
&name3 ($text, 0);
&knownPatientName($text, 0);
&problem ($text, 0);
&signatureField ($text, 0);
}
if ($unitfilter =~ /y/) {
&mrn ($text, 0);
&unit ($text, 0);
&providerNumber ($text, 0);
}
if ($ds_specific_filter =~ /y/){
#discharge summary specific filters
#filter not enabled for this version
#&dischargeSummarySpecific ($text, 0);
}
# Call new function here >>>>>>>>>>
# Follow format shown here if necessary
# &functionName ($text, 0);
open DEST, ">>".$debug_file or die "Cannot open $debug_file";
#print DEST "$label\n";
# Sub-function: finalPHICheck()
# After findPHI() has performed most PHI checks, goes through the identified PHI before adding them to the final PHI files
my %approved;
my ($startp, $endp) = (0, 0);
my $notAmbig = 0;
my $stg1 = "";
my $stg2 = "";
my $prevAmbig = 0;
my $oldk = "";
my $prevKey = "";
my ($oldstartp, $oldendp,) = (0, 0);
my $oldtext = "";
# Prunes keys and checks whether each PHI is ambiguous or is an indicator (e.g. hospital indicator)
foreach $k (&pruneKeys("phi",$text)) {
($startp, $endp) = split "-", $k;
my $the_word = (substr $text, $startp, ($endp - $startp));
$notAmbig = 0; #so by default, the term is ambiguous
foreach $tt (@{$phi{$k}}){
#if(($tt !~ /ambig/) && ($tt !~ /Indicator/) ) {
if (($tt !~ /ambig/) && ($tt !~ /Indicator/) && ($tt !~ /MedicalPhrase/)) {
#so IF the term matches ANY type that's non-ambiguous, THEN set it as non-ambiguous
$notAmbig = 1; last;}
} #end for each
my $notIndicator = 1; #default to be not an indicator
foreach $tt (@{$phi{$k}}){
if ($tt =~ /Indicator/) {
$notIndicator = 0; last;
}
}
$prevText = (substr $text, $oldstartp, ($oldendp - $oldstartp));
$newText = (substr $text, $startp, ($endp - $startp));
$a = (isType($prevKey, "Male First Name", 1) && ($prevAmbig==1) && (!isCommon($prevText)));
$b = (isType($k, "Last Name", 1) && ($notAmbig==0) && (!isCommon($newText)));
#if (this is ambig) and (previous is ambig) and ...
if ((($notAmbig==0) && ($prevAmbig==1) && (isType($prevKey, "Male First Name", 1) || (isType($prevKey, "Female First Name", 1))) && (!isCommon($prevText)) && (!isCommon($newText)) && ($prevText !~ /\./) && isType($k, "Last Name", 1) && (($startp-$oldendp)<3)) ||
#if (this is not-ambig) and (previous is ambig) and ...
(($notAmbig==1) && ($prevAmbig==1) && (isType($prevKey, "Male First Name", 1) || (isType($prevKey, "Female First Name", 1))) && (!isCommon($prevText)) && (!isCommon($newText)) && ($prevText !~ /\./) && isType($k, "Last Name", 1) && (($startp-$oldendp)<3)) ||
#if (this is not-ambig) and (previous is ambig) and ...
(($notAmbig==1) && ($prevAmbig==1) && isType($prevKey, "Last Name", 1) && (!isCommon($prevText)) && (!isCommon($newText)) && ($prevText !~ /\./) && isType($k, "First Name", 1) && (($startp-$oldendp)<3)) ||
#commented out on 1/31/07
(($notAmbig==0) && ($prevAmbig==0) && (isType($prevKey, "Male First Name", 1) || (isType($prevKey, "Female First Name", 1))) && (!isCommon($prevText)) && (!isCommon($newText)) && ($prevText !~ /\./) && isType($k, "Last Name", 1) && (($startp-$oldendp)<3)) ||
#if (this is ambig) and (previous is not ambig) and ...
(($notAmbig==0) && ($prevAmbig==0) && isType($prevKey, "Last Name", 1) && (!isCommon($prevText)) && (!isCommon($newText)) && ($prevText !~ /\./) && isType($k, "First Name", 1) && (($startp-$oldendp)<3))) {
print DEST ($startIndex + $oldstartp)."\t".($startIndex+$oldendp)."\t".(substr $text, $oldstartp, ($oldendp - $oldstartp +1));
###################
my $oldtext = $text;
my $newKey = ($startIndex + $oldstartp)."-".($startIndex + $oldendp);
###my $text = (substr $text, $oldstartp, ($oldendp - $oldstartp));
foreach $tt (@{$phi{$prevKey}}) {
print DEST "\t$tt";
push @{$approved{$newKey}}, $tt;}
print DEST "\n";
print DEST ($startIndex + $startp)."\t".($startIndex+$endp)."\t".(substr $oldtext, $startp, ($endp - $startp +1));
my $newKey = ($startIndex + $startp)."-".($startIndex + $endp);
###my $text = (substr $oldtext, $startp, ($endp - $startp));
foreach $tt (@{$phi{$k}}) {
print DEST "\t$tt";
push @{$approved{$newKey}}, $tt;
}
print DEST "\n";
}
# If the PHI is not ambiguous and not an indicator, recognizes it as PHI; add it to PHI file
elsif ($notAmbig && $notIndicator) {
###################
print DEST ($startIndex + $startp)."\t".($startIndex+$endp)."\t".(substr $text, $startp, ($endp - $startp +1));
my $newKey = ($startIndex + $startp)."-".($startIndex + $endp);
###my $text = (substr $text, $startp, ($endp - $startp));
foreach $tt (@{$phi{$k}}){
print DEST "\t$tt";
if (($tt !~ /ambig/) && ($tt !~ /Indicator/)) {
push @{$approved{$newKey}}, $tt;
}
}
print DEST "\n";
} # Else ck keys discarded
else {
###################
# Otherwise keeps the remaining PHI as non-PHI
print DEST ($startIndex + $startp)."\t".($startIndex+$endp)."\t# ".(substr $text, $startp, ($endp - $startp +1));
foreach $tt (@{$phi{$k}}) {
print DEST "\t$tt";
}
print DEST "\n";}
# Sets ambiguous variables for current PHI to be recognized as previous PHI for the next round
if ($notAmbig==0) {
$prevAmbig = 1;
$prevKey = $k;
($oldstartp, $oldendp) = split "-", $prevKey;
$oldtext = $text;
}
else {
$prevAmbig = 0;
}
}
close DEST;
# End of sub-function finalPHICheck()
#***********************************************************************************************************
return %approved;
}
# End of findPHI()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: date ()
# Searches for date ranges following formats that appear most frequently in text
sub date {
#$text = $_[0];
my ($text, $date) = @_;
my $year = substr $date, 0, 4;
# Checks if dates should be filtered
if ($datefilter =~ /y/) {
# Searches for the pattern mm/dd-mm/dd where the items are valid dates
while ($text =~ /\b((\d\d?)\/(\d\d?)\-(\d\d?)\/(\d\d?))\b/ig) {
if (isValidDate($2,$3,-1) && isValidDate($4,$5,-1)) {
$date_range = $1;
$start = length($`);
$end = $start + length($date_range);
$key = "$start-$end";
addType($key, "Date range (1)");
}
}
# Searches for mm/dd/yy-mm/dd/yy or mm/dd/yyyy-mm/dd/yyyy where the items are valid dates
while ($text =~ /\b((\d\d?)\/(\d\d?)\/(\d\d|\d\d\d\d)\-(\d\d?)\/(\d\d?)\/(\d\d|\d\d\d\d))\b/ig) {
if (isValidDate($2,$3,$4) && isValidDate($5,$6,$7)) {
$date_range = $1;
$start = length($`);
$end = $start + length($date_range);
$key = "$start-$end";
addType($key, "Date range (2)");
}
}
# Searches for mm/dd-mm/dd/yy or mm/dd-mm/dd/yyyy where the items are valid dates
while ($text =~ /\b((\d\d?)\/(\d\d?)\-(\d\d?)\/(\d\d?)\/(\d\d|\d\d\d\d))\b/ig) {
if (isValidDate($6,$2,$3) && isValidDate($6,$4,$5)) {
$date_range = $1;
$start = length($`);
$end = $start + length($date_range);
$key = "$start-$end";
addType($key, "Date range (3)");
}
}
} #end if date filter is on
# End of sub-function date1()
if ($datefilter =~ /y/) {
# Checks for month/day/year
while ($text =~ /\b(\d\d?)[\-\/](\d\d?)[\-\/](\d\d|\d{4})\b/g) {
my $startI = (length($`));
my $endI = $startI + length($&);
my $key = $startI."-".$endI;
my $first_num = $1;
my $second_num = $2;
my $third_num = $3;
my $beginr = substr $text, ($startI - 2), 2;
my $endr = substr $text, $endI, 2;
if (($beginr !~ /(\|\|)/) && ($endr !~ /(\|\|)/)) {
if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
#print "checking M/D/Y, $first_num, $second_num, $third_num\n";
if (isValidDate ($first_num, $second_num, $third_num)) {
addType ($key, "Month/Day/Year");
}
}
}
} #end while
# Checks for month/day/year
while ($text =~ /\b(\d\d?)\.(\d\d?)\.(\d\d|\d{4})\b/g) {
my $startI = (length($`));
my $endI = $startI + length($&);
my $key = $startI."-".$endI;
my $first_num = $1;
my $second_num = $2;
my $third_num = $3;
my $beginr = substr $text, ($startI - 2), 2;
my $endr = substr $text, $endI, 2;
#print "2. checking M/D/Y, $first_num, $second_num, $third_num\n";
if (($beginr !~ /(\|\|)/) && ($endr !~ /(\|\|)/)) {
if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
if (isValidDate ($first_num, $second_num, $third_num)) {
addType ($key, "Month/Day/Year");
}
}
}
}
# Checks for day/month/year
while ($text =~ /\b(\d\d?)[\-\/](\d\d?)[\-\/](\d\d|\d{4})\b/g){
my $startI = (length($`));
my $endI = $startI + length($&);
my $key = $startI."-".$endI;
my $first_num = $1;
my $second_num = $2;
my $third_num = $3;
my $beginr = substr $text, ($startI - 2), 2;
my $endr = substr $text, $endI, 2;
if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
if (isValidDate ($second_num, $first_num, $third_num)) {
}
}
}
# Checks for year/month/day
while ($text =~ /\b(\d\d|\d{4})[\-\/](\d\d?)[\-\/](\d\d?)\b/g){
my $startI = (length($`));
my $endI = $startI + length($&);
my $key = $startI."-".$endI;
my $yr = $1;
$nn_year = $yr;
my $mo = $2;
my $da = $3;
my $beginr = substr $text, ($startI - 2), 2;
my $endr = substr $text, $endI, 2;
if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
if (isValidDate ($mo, $da, $yr) && (($yr>50) || ($yr<6))) {
$prevChars = (substr $text, ($startI-4), 4);
$nextChars = (substr $text, $endI, 11);
if (($prevChars =~ /(\d)(\s)?(\|)(\s)?/) && ($nextChars =~ /\s\d{2}\:\d{2}\:\d{2}(\s)?(\|)/)) {
addType ($key, "Header Date");
$longyear = $yr;
}
else {
addType ($key, "Year/Month/Day");
}
}
}
} #end while
# Checks for year/month/day
while ($text =~ /\b(\d\d|\d{4})\.(\d\d?)\.(\d\d?)\b/g){
my $startI = (length($`));
my $endI = $startI + length($&);
my $key = $startI."-".$endI;
my $yr = $1;
$nn_year = $yr;
my $mo = $2;
my $da = $3;
my $beginr = substr $text, ($startI - 2), 2;
my $endr = substr $text, $endI, 2;
if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
#if (isValidDate ($mo, $da, $yr) && (($yr>50) || ($yr<6))) {
if (isValidDate ($mo, $da, $yr)) {
$prevChars = (substr $text, ($startI-4), 4);
$nextChars = (substr $text, $endI, 11);
if (($prevChars =~ /(\d)(\s)?(\|)(\s)?/) && ($nextChars =~ /\s\d{2}\:\d{2}\:\d{2}(\s)?(\|)/)) {
addType ($key, "Header Date");
$longyear = $yr;
}
else {
addType ($key, "Year/Month/Day");
}
}
}
}
# Checks for year/day/month
while ($text =~ /\b(\d\d|\d{4})[\-\/](\d\d?)[\-\/](\d\d?)\b/g){
my $startI = (length($`));
my $endI = $startI + length($&);
my $key = $startI."-".$endI;
my $yr = $1;
$nn_year = $yr;
my $mo = $3;
my $da = $2;
my $beginr = substr $text, ($startI - 2), 2;
my $endr = substr $text, $endI, 2;
if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%\/]/) && ($endr !~ /\S\d/)) {
#if (isValidDate ($mo, $da, $yr) && (($yr>50) || ($yr<6))) {
if (isValidDate ($mo, $da, $yr)) {
$prevChars = (substr $text, ($startI-4), 4);
$nextChars = (substr $text, $endI, 11);
if (($prevChars =~ /(\d)(\s)?(\|)(\s)?/) && ($nextChars =~ /\s\d{2}\:\d{2}\:\d{2}(\s)?(\|)/)) {
addType ($key, "Header Date");
$longyear = $yr;
}
else {
}
}
}
} #end while
# Checks for month/4-digit year
while ($text =~ /\b((\d\d?)[\-\/](\d{4}))/g) {
my $startI = (length($`));
my $endI = $startI + length($&);
my $beginr = substr $text, ($startI - 2), 2;
my $endr = substr $text, $endI, 2;
my $first_num = $2;
my $second_num = $3;
my $st = length($`);
my $endb = $st + length ($2) + length($3) + 1;
my $key = "$st-$endb";
if (($beginr !~ /\|\|/) && ($endr !~ /\|\|/)) {
if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /[\/\.\%]/)) {
#if (($first_num <= 12) && ($first_num > 0) && ($second_num>=1900)) {
if (($first_num <= 12) && ($first_num > 0) &&
( $second_num >= $VALID_YEAR_LOW && $second_num <= $VALID_YEAR_HIGH ) ) {
addType ($key, "Month/Year 1"); }}}
} #end while
# Checks for 4-digit year/month
while ($text =~ /\b((\d{4})[\-\/](\d\d?))\b/g) {
my $first_num = $2;
my $second_num = $3;
my $st = length($`);
my $endb = $st + length ($2) + length($3) + 1;
my $key = "$st-$endb";
if (($begin !~ /\d[\/\.\-]/) && ($end !~ /[\/\.\%]/)) {
#if (($second_num <= 12) && ($second_num > 0) && ($first_num>=1900) && ($first_num<2010)) {
if (($second_num <= 12) && ($second_num > 0) && ($first_num>=$VALID_YEAR_LOW) && ($first_num <= $VALID_YEAR_HIGH)) {
addType ($key, "Year/Month"); }}
} #end while
# Checks for spelled-out months
# Accounts for ambiguity around the dates, e.g. acronyms for measurements, spelled out months and such
foreach $m (@months) {
while ($text =~ /\b((\d{1,2})[ \-]?$m[ \-\,]? ?\'?\d{2,4})\b/ig) { # 2-May-04
my $day = $2;
my $completeDate = $1;
my $st = length($`);
my $key = "$st-".($st + length($1));
if (($day < 32) && ($day > 0)) {
addType ($key, "Day Month Year");
}
}
while ($text =~ /\b((\d{1,2}) ?(\-|to|through)+ ?(\d{1,2})[ \-]?$m[ \-\,]? ?\'?\d{2,4})\b/ig) { # 2-May-04
my $day1 = $2;
my $day2 = $4;
my $completeDate = $1;
my $st = length($`);
my $key = "$st-".($st + length($1));
if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 >0)) {
addType ($key, "Date range (4)");
}
}
while ($text =~ /\b((\d{1,2}) ?\-\> ?(\d{1,2})[ \-]?$m[ \-\,]? ?\'?\d{2,4})\b/ig) { # 2-May-04
my $day1 = $2;
my $day2 = $3;
my $completeDate = $1;
my $st = length($`);
my $key = "$st-".($st + length($1));
if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 > 0)) {
addType ($key, "Date range (5)");
}
}
while ($text =~ /\b($m\b\.? (\d{1,2})[\,\s]+ *\'?\d{2,4})\b/ig) { # Apr. 2 05
my $day = $2;
my $completeDate = $1;
my $st = length($`);
my $key = "$st-".($st + length($1));
if (($day < 32) && ($day > 0)) {
addType ($key, "Month Day Year");
}
}
while ($text =~ /\b($m\b\.? (\d{1,2}) ?(\-|to|through)+ ?(\d{1,2})[\,\s]+ *\'?\d{2,4})\b/ig) { # Apr. 2 05
my $day1 = $2;
my $day2 = $4;
my $completeDate = $1;
my $st = length($`);
my $key = "$st-".($st + length($1));
if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) & ($day2 > 0)) {
addType ($key, "Date range (6)");
}
}
while ($text =~ /\b($m\b\.? (\d{1,2}) ?\-\> ?(\d{1,2})[\,\s]+ *\'?\d{2,4})\b/ig) { # Apr. 2 05
my $day1 = $2;
my $day2 = $3;
my $completeDate = $1;
my $st = length($`);
my $key = "$st-".($st + length($1));
if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) & ($day2 > 0)) {
addType ($key, "Date range (7)");
}
}
#while ($text =~ /\b($m\b\.?,? ?(\d{1,2})(|st|nd|rd|th|)? ?[\,\s]+ *\'?\d{2,4})\b/ig) { # Apr. 12 2000
while ($text =~ /\b($m\b\.?,? ?(\d{1,2})(|st|nd|rd|th|) ?[\,\s]+ *\'?\d{2,4})\b/ig) { # Apr. 12th 2000
my $day = $2;
my $completeDate = $1;
my $st = length($`);
my $key = "$st-".($st + length($1));
if (($day < 32) && ($day > 0)) {
addType ($key, "Month Day Year (2)");
#addType ($key, "Month Day Year");
}
}
# while ($text =~ /\b($m\b\.?,? ?(\d{1,2})(|st|nd|rd|th|)?)\b/ig) { # Apr. 12
while ($text =~ /\b($m\b\.?,?\s*(\d{1,2})(|st|nd|rd|th|)?)\b/ig) { # Apr. 12
my $day = $2;
my $completeDate = $1;
my $st = length($`);
my $key = "$st-".($st + length($1));
if (($day < 32) && ($day > 0)) {
addType ($key, "Month Day");
}
}
while ($text =~ /\b($m\b\.?,? ?(\d{1,2})(|st|nd|rd|th|)? ?(\-|to|through)+ ?(\d{1,2})(|st|nd|rd|th|)?)\b/ig) { # Apr. 12
my $day1 = $2;
my $day2 = $4;
my $completeDate = $1;
my $st = length($`);
my $key = "$st-".($st + length($1));
if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 > 0)) {
addType ($key, "Date range (8)");
}
}
while ($text =~ /\b($m\b\.?,? ?(\d{1,2})(|st|nd|rd|th|)? ?\-\> ?(\d{1,2})(|st|nd|rd|th|)?)\b/ig) { # Apr. 12th
my $day1 = $2;
my $day2 = $4;
my $completeDate = $1;
my $st = length($`);
my $key = "$st-".($st + length($1));
if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 > 0)) {
addType ($key, "Date range (9)");
}
}
while ($text =~ /\b((\d{1,2})(|st|nd|rd|th|)?( of)?[ \-]\b$m)\b/ig) { # 12-Apr, or Second of April
my $day = $2;
my $completeDate = $1;
my $st = length($`);
my $key = "$st-".($st + length($1));
if (($day < 32) && ($day > 0)) {
addType ($key, "Day Month");
}
}
###
# while ($text =~ /\b((\d{1,2})(|st|nd|rd|th|)?\s+(of)?\s+[\-]\b$m\.?,?)\s*(\d{2,4})\b/ig) { # 12-Apr, or Second of April
while ($text =~ /\b(((\d{1,2})(|st|nd|rd|th|)?\s+(of\s)?[\-]?\b($m)\.?,?)\s+(\d{2,4}))\b/ig) { # 12-Apr, or Second of April
my $day = $3;
my $month = $6;
my $year = $7;
my $completeDate = $1;
my $st = length($`);
my $key = "$st-".($st + length($1));
if (($day < 32) && ($day > 0)) {
addType ($key, "Day Month Year 2");
}
}
###
while ($text =~ /\b((\d{1,2})(|st|nd|rd|th|)? ?(\-|to|through)+ ?(\d{1,2})(|st|nd|rd|th|)?( of)?[ \-]\b$m)\b/ig) { # 12-Apr
my $day1 = $2;
my $day2 = $5;
my $completeDate = $1;
my $st = length($`);
my $key = "$st-".($st + length($1));
if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 > 0)) {
addType ($key, "Date range (10)");
}
}
while ($text =~ /\b((\d{1,2})(|st|nd|rd|th|)? ?\-\> ?(\d{1,2})(|st|nd|rd|th|)?( of)?[ \-]\b$m)\b/ig) { # 12-Apr
my $day1 = $2;
my $day2 = $5;
my $completeDate = $1;
my $st = length($`);
my $key = "$st-".($st + length($1));
if (($day1 < 32) && ($day1 > 0) && ($day2 < 32) && ($day2 > 0)) {
addType ($key, "Date range (11)");
}
}
while ($text =~ /\b($m\.?,? ?(of )?\d{2}\d{2}?)\b/ig) { # Apr. 2002
my $year = $2;
my $completeDate = $1;
my $st = length($`);
my $key = "$st-".($st + length($1));
addType ($key, "Month Year");
}
}
}
}
# End of function date()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: age()
# Checks for ages that are >=90 (assuming that no one is over 125 years old, just as a sanity check)
# When ages are spelled-out, assumes that the number will not be over hundred, i.e. highest spelled-out age="hundred"
sub age {
$text = $_[0];
if ($agefilter =~ /y/) {
foreach $i (@age_indicators_suff) {
if (($text =~ /\b(ninety)[\s\-]+ *$i\b/ig) || ($text =~ /\b(hundred)[\s\-]+ *$i\b/ig)) {
my $age = $1;
my $st = length($`);
my $key = "$st-".((length $age) + $st);
addType ($key, "Age over 90");
}
while ($text =~ /\b(([A-Za-z]+)([\s \-])([A-Za-z]+)) ? *$i\b/ig) {
foreach $j (@digits) {
$first = $2;
$second = $4;
my $age1 = $1;
my $st1 = length($`);
my $end1 = (length $age1) + $st1;
my $key1 = "$st1-$end1";
$st2 = $st1+length($2)+length($3);
$end2 = $st2+(length($second));
my $key2 = "$st2-$end2";
if ((($first=~/\bninety\b/ig) || ($first=~/\bhundred\b/ig)) && (($second=~/\b$digits\b/ig))) {
addType ($key1, "Age over 90");
}
else {
if (!(($first=~/\bninety\b/ig) || ($first=~/\bhundred\b/ig))) {
if (($second=~/\bninety\b/ig) || ($second=~/\bhundred\b/ig)) {
addType ($key2, "Age over 90");
}
}
}
}
}
while ($text =~ /\b(\d+) *$i/ig) {
my $age = $1;
my $st = length($`);
my $key = "$st-".((length $age) + $st);
if (($age >= 90) && ($age <= 125)) {
addType ($key, "Age over 90");
}
}
}
foreach $i (@age_indicators_pre) {
while ($text =~ /\b($i + *)(([A-Za-z]+)([\s \-])([A-Za-z]+))\b/ig) {
foreach $j (@digits) {
$first = $3;
$second = $5;
my $age1 = $2;
my $st1 = length($`)+length($1);
my $end1 = (length $age1) + $st1;
my $key1 = "$st1-$end1";
$st2 = $st1;
$end2 = $st2+length($first);
my $key2 = "$st2-$end2";
if ((($first=~/\bninety\b/ig) || ($first=~/\bhundred\b/ig)) && (($second=~/\b$digits\b/ig) || (length($second)))) {
addType ($key1, "Age over 90");
}
else {
if (!(($first=~/\bninety\b/ig) || ($first=~/\bhundred\b/ig))) {
if (($second=~/\bninety\b/ig) || ($second=~/\bhundred\b/ig)) {
addType ($key2, "Age over 90");
}
}
}
}
}
while ($text =~ /\b($i + *)(\d+)\b/ig) {
my $age = $2;
my $st = length($`)+length($1);
my $key = "$st-".((length $age) + $st);
if (($age >= 90) && ($age <= 125)) {
addType ($key, "Age over 90");
}
}
}
}
}
# End of function age()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: pager()
# Checks for pager numbers
sub pager {
$text = $_[0];
if ($telfilter =~ /y/) {
foreach $i ("Pager", "Pg", "Pgr", "Page", "Beeper", "Beeper number", "Pager number") { #removed "P"
while ($text =~ /\b$i[\s\#\:\-\=]*([a-zA-Z]\s+)*[a-zA-Z]*\s*(\d\d\d+)\b/gi) {
my $num = $2;
my $end = length($`)+length($&);
my $key = ($end - (length $num))."-$end";
addType ($key, "Pager number");
}
while ($text =~ /\b$i[\s\#\:\-\=]*/gi){
my $startp = length($`);
my $endp = length($`)+length($&);
#get the next 30 characters
my $the_chunck = (substr $text, $endp, 30);
#now look for a 5-digit number
while ($the_chunck =~ /(\D)(\d{5})(\D)/gi){
my $pager_startp = $endp + length($`) + length($1);
my $pager_endp = $pager_startp + length($2);
my $key = "$pager_startp-$pager_endp";
addType ($key, "Pager number");
} #end while
} #end while
}
}
}
# End of function pager()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: email()
# Looks for e-mail addresses
sub email {
$text = $_[0];
if ($emailfilter =~ /y/) {
while ($text =~ /\b([\w\.]+\w ?@ ?\w+[\.\w+]\.\w{2,3})\b/g) {
my $st = length($`);
my $key = "$st-".($st+length($&));
addType ($key, "E-mail address");
}
}
}
# End of function email()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: url()
# Checks for URLs of different types
sub url {
$text = $_[0];
if ($urlfilter =~ /y/) {
while ($text =~ /\bhttps?\:\/\/[\w\.]+\w{2,4}\/\S+\b/gi) {
my $st = length($`);
my $key = "$st-".($st+length($&));
addType ($key, "URL");
}
while ($text =~ /\bftp\:\/\/[\w\.]+\w{2,4}\/\S+\b/gi) {
my $st = length($`);
my $key = "$st-".($st+length($&));
addType ($key, "URL");
}
while ($text =~ /\bwww\.[\w\.]+\w{2,4}\/\S+\b/gi) {
my $st = length($`);
my $key = "$st-".($st+length($&));
addType ($key, "URL");
}
while ($text =~ /\bwww\.[\w\.]+\w{2,4}\b/gi) {
my $st = length($`);
my $key = "$st-".($st+length($&));
addType ($key, "URL");
}
while ($text =~ /\bweb\.[\w\.]+\w{2,4}\/\S+\b/gi) {
my $st = length($`);
my $key = "$st-".($st+length($&));
addType ($key, "URL");
}
while ($text =~ /\bweb\.[\w\.]+\w{2,4}\b/gi) {
my $st = length($`);
my $key = "$st-".($st+length($&));
addType ($key, "URL");
}
while ($text =~ /\bhttps?\:\/\/[\w\.]+\w{2,4}\b/g) {
my $st = length($`);
my $key = "$st-".($st+length($&));
addType ($key, "URL");
}
while ($text =~ /\bftp\:\/\/[\w\.]+\w{2,4}\b/g) {
my $st = length($`);
my $key = "$st-".($st+length($&));
addType ($key, "URL");
}
}
}
# End of function url()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: ssn()
# Checks for social security numbers (SSNs)
sub ssn {
$text = $_[0];
if ($ssnfilter =~ /y/) {
while ($text =~ /\b\d\d\d([- \/]?)\d\d\1\d\d\d\d\b/g) {
my $st = length($`);
my $key = "$st-".($st+length($&));
addType ($key, "Social Security Number");
}
}
}
# End of function ssn()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: name1()
# Uses name prefixes to make last names
sub name1 {
$text = $_[0];
if ($namefilter =~ /y/) {
my @keylst = sort numerically keys %phi;
my $key;
for($i=0;$i<$#keylst;$i++){
$key = $keylst[$i];
if (isType($key, "Last Prefix", 0)){
($f1,$t1) = split ('-',$key);
($f2,$t2) = split ('-',$keylst[$i+1]);
if ($f2 < $t1+3){
if (isType($keylst[$i+1], "Last Name", 1)){
print "Found Last Prefix match, Adding $keylst[$i+1] type for last name!!";
addType("$f1-$t2","Last Name");
}
}
}
}
#####################################################
# Uses common-sense heuristics to try to find more names
foreach $i (@name_indicators){
#while ($text =~ /\b($i)(s)?( *)(\-|\,|\.|\()?( *)([A-Za-z]+\b)(\s+)(and )?( *)([A-Za-z]+)\b/ig) {
while ($text =~ /\b($i)(s)?( *)(\-|\,|\.|\()?( *)([A-Za-z]+\b)\b/ig) {
$potential_name = $6;
$start = length($`)+length($1) + length ($2) + length($3) + length($4) + length($5);
$end = $start + length($potential_name);
$key = "$start-$end";
my $tmpstr = substr $text, $start, $end-$start;
my $tmp = isType($key, "Name",1);
if (isProbablyName($key, $potential_name)){
addType ($key, "Name (NI)");
} # end if the first word after the name indicator is a name
my $new_start = $end + length($6) + length($7);
my $new_end = $new_start + length($8);
#########now check the next word
my $rest = substr $text, $end+1, 20;
if (($rest =~ /\b(and )?( *)([A-Za-z]+)\b/ig)){
my $new_start = $end + 1 + length($`) + length($1)+length($2);
my $new_end = $new_start + length ($3);
my $keyAfter = "$new_start-$new_end";
my $wordAfter = substr $rest, (length ($`)+ length($1) + length($2)) , length ($3);
if ( !isNameIndicator($wordAfter) && ( ( !isCommon($wordAfter) ||
((isType($keyAfter, "Name", 1) && isType($keyAfter, "(un)")) ||
(isType($keyAfter, "Name", 1) && ($wordAfter =~ /\b(([A-Z])([a-z]+))\b/g)) ||
(!isCommonest($wordAfter) && isType($keyAfter, "Name", 1)) ||
(isType($keyAfter, "popular",1)) ) ) ) ) {
if ($rest =! /\b[\d]\b/ig) { #disregard if the rest contains numeric patterns
addType ($keyAfter, "Name2 (NI)");
}
}
elsif ($1 =~ /and/ig){
if (! (isCommon($wordAfter) || isNameIndicator($wordAfter))) {
addType ($keyAfter, "Name2 (NI)");
}
}
} #end if rest
} # end of while
} #end for each name indicator
#**********************************************************************************************************
# Searches for the name pattern LASTNAME, FIRSTNAME
# First checks if word2 is a firstname
# If it is, then if word1 is not a common or commonest word, identifies word1 as a lastname
while ($text =~ /\b([A-Za-z]+)( ?\, ?)([A-Za-z]+)\b/ig) {
$input1 = $1;
$input2 = $3;
my $st1 = length($`);
my $end1 = $st1 + length($input1);
my $key = $key1;
my $key1 = "$st1-$end1";
my $st2 = $end1+length($2);
my $end2 = $st2 + length($input2);
my $key = $key2;
my $key2 = "$st2-$end2";
if ((isType($key2, "Name", 1)) && (isType($key1, "Name (ambig)", 1)) && (!isNameIndicator($input1)) ) {
addType ($key1, "Last Name (LF)");
addType ($key2, "First Name1 (LF)");
}
if ((isType($key1, "Name", 1)) && (isType($key2, "Name (ambig)", 1)) && (!isNameIndicator($input1)) ) {
addType ($key2, "Last Name (LF)");
addType ($key1, "First Name2 (LF)");
}
if (isType($key2, "First Name", 1)) {
if ( (isType($key1, "Last Name", 1) && (!isCommonest($input1)) && (!isNameIndicator($input1))) ||
((!isCommon($input1)) && (!isCommonest($input1))) ) {
addType ($key1, "Last Name (LF)");
addType ($key2, "First Name3 (LF)");
}
}
}
}
}
# End of function name1()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: seasonYear()
# Checks for both season and year from patterns like "<season> of? <year>"
sub seasonYear {
$text = $_[0];
@seasons = ("winter", "spring", "summer", "autumn", "fall");
foreach $i (@seasons) {
while ($text =~ /\b(($i)(( +)of( +))? ?\,?( ?)\'?(\d{2}|\d{4}))\b/gi) {
$f2=$2;
$f3=$3;
$f6=$6;
$f7=$7;
if (length($f7)==4) {
if (($f7<=$longyear) && ($f7>1900)) {
my $st1 = length($`);
my $end1 = (length $f2) + $st1;
my $key1 = "$st1-$end1";
my $st2 = $end1+(length $f3)+(length $f6);
my $key2 = "$st2-".((length $f7) + $st2);
addType ($key2, "Year (4 digits)");
}
}
else {
my $st1 = length($`);
my $end1 = (length $f2) + $st1;
my $key1 = "$st1-$end1";
my $st2 = $end1+(length $f3)+(length $f6);
my $key2 = "$st2-".((length $f7) + $st2);
addType ($key2, "Year (4 digits)");
}
}
}
}
# End of function seasonYear()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: name2()
# Checks for more than 1 name following plural drs, drs., doctors, professors,
# Checks up to 3 names
sub name2 {
$text = $_[0];
if ($namefilter =~ /y/) {
@plural_titles = ("doctors", "drs", "drs\.", "professors");
foreach $p (@plural_titles) {
while ($text =~ /\b((($p +)([A-Za-z]+) *(and +)?\,? *)([A-Za-z]+) *(and +)?\,? *)([A-Za-z]+)?\b/gi) {
my %names = ();
$st3 = length($`);
$st4 = $st3+length($3);
$end4 = $st4+length($4);
$key4 = "$st4-$end4";
$st6 = $st3+length($2);
$end6 = $st6+length($6);
$key6 = "$st6-$end6";
$st8 = $st3+length($1);
$end8 = $st8+length($8);
$key8= "$st8-$end8";
$names{$4} = $key4;
$names{$6} = $key6;
$names{$8} = $key8;
foreach $i (keys %names) {
$val = $names{$i};
if (length($i)>0) {
if (!(isCommonest($i)) || (isType($val, "Name", 1))) {
#print "addtype, val is $val\n";
addType ($val, "Name5 (PTitle)");
}
}
}
}
}
#****************************************************************************************
# Checks for names followed by "MD" or "M.D"
# Checks up to 3 previous words
while ($text =~ /\b((([A-Za-z\']+) +)?(([A-Za-z])\. +)?([A-Za-z\-\']+)((\, *)|(\s+))(rrt|md|m\.d\.|crt|np|rn|nnp|msw|r\.n\.)(\.|\,)*\b)/ig) {
$name = $1;
$start = length($`);
$end = $start + length($name);
$key = "$start-$end";
$name1 = $3; #if present, would be first name
$name2 = $4; #if present, would be initial
$name3 = $6; #if present would be last name
$st1 = length($`);
$end1 = $st1 + (length $name1);
$key1 = "$st1-$end1";
$st2 = $st1 + length($2);
$end2 = $st2 + (length $name2);
$key2 = "$st2-$end2";
$st3 = length($`) + (length $2) + (length $4);
$end3 = $st3 + (length $name3);
$key3 = "$st3-$end3";
if ((!($text =~ /(m\.?d\.?\')/)) && (!($text =~ /(m\.?d\.?s)/))) {
if (length($name1)>0) {
if ((length($name1)==1) || ((length($name1)==2) && ($name1 =~ /\b([A-Za-z])\.+\b/ig))) {
addType($key1, "Name Initial (MD)");
}
else {
if (isProbablyName($key1, $name1)) {
addType($key1,"Name6 (MD)" );
}
}
}
if (length($name2)>0 && length($name3)>0) {
if ((length($name2)==1) || ((length($name2)==2) && ($name2 =~ /\b([A-Za-z])\.+\b/ig))) {
#addType($key2, "Name Initial (MD)");
}
else {
#if (!(isCommon($name2) && !(isType($key2, "Name", 1)))) {
if (isProbablyName($key2, $name2)) {
addType($key2,"Name7 (MD)" );
}
}
}
if (length($name3)>0) {
if ((length($name3)==1) || ((length($name3)==2) && ($name3 =~ /\b([A-Za-z])\.\b/ig))) {
#addType($key3, "Name Initial (MD)");
}
else {
if (isProbablyName($key3, $name3)) {
addType($key3,"Name8 (MD)" );
}
} # end else
} #endif
} #end if text does not have M.D.' or M.D.s
}
#****************************************************************************************
# Removes PCP name field, leaving "PCP name:" intact, from discharge summaries
# Does not check for name patterns, since these should be caught by the other methods
# Required mainly for unknown names, checks up to 3 words following "PCP Name"
# Follows the pattern seen in discharge summaries, may not work well in nursing notes
#@name_pre = ("PCP", "physician", "provider", "created by", "name");
@name_pre = ("PCP", "physician", "provider", "created by");
foreach $l (@name_pre) {
while ($text =~/\b(($l( +name)?( +is)?\s\s*)([A-Za-z\-]+)((\s*\,*\s*)? *)([A-Za-z\-]+)(((\s*\,*\s*)? *)([A-Za-z\-]+))?)\b/ig) {
my $key1 = $5;
my $st1 = length($`)+(length $2);
my $end1 = $st1+(length $5);
my $keyloc1 = "$st1-$end1";
my $key2 = $8;
my $st2 = $end1+(length $6);
my $end2 = $st2+(length $8);
my $keyloc2 = "$st2-$end2";
my $key3 = $12;
my $st3 = $end2+(length $10);
my $end3 = $st3+(length $12);
my $keyloc3 = "$st3-$end3";
my %pcp = ();
$pcp{$keyloc1} = $key1;
$pcp{$keyloc2} = $key2;
$pcp{$keyloc3} = $key3;
foreach my $keyloc (keys %pcp ) {
my $val = $pcp{$keyloc};
if (length($val)>0) {
if ((length($val)==1) || ($val =~ /\b([A-Za-z])\.\b/ig)) {
addType($keyloc, "Name Initial (PRE)");
}
else {
#if (!(isCommonest($val) && !(isType($keyloc, "Name", 1)))) {
if (isProbablyName($keyloc, $val)){
addType($keyloc,"Name9 (PRE)" );
}
}
}
}
}
#followed by pattern "name is"
while ($text =~ /\b(($l( +name)?( +is)? ?([\#\:\-\=\.\,])+ *)([A-Za-z\-]+)((\s*\,*\s*)? *)([A-Za-z\-]+)((\s*\,*\s*)? *)([A-Za-z\-]+)?)\b/ig) {
my $key1 = $6;
my $st1 = length($`)+(length $2);
my $end1 = $st1+(length $6);
my $keyloc1 = "$st1-$end1";
my $key2 = $9;
my $st2 = $end1+(length $7);
my $end2 = $st2+(length $9);
my $keyloc2 = "$st2-$end2";
my $key3 = $12;
my $st3 = $end2+(length $10);
my $end3 = $st3+(length $12);
my $keyloc3 = "$st3-$end3";
my %pcp = ();
my $firstfound = 0;
my $secondfound = 0;
$pcp{$keyloc1} = $key1;
$pcp{$keyloc2} = $key2;
$pcp{$keyloc3} = $key3;
$blah = isCommonest($key3);
$blah2 = isType($keyloc3, "Name", 1);
if (length($key1)>0) {
if ((length($key1)==1) || ($key1 =~ /\b([A-Za-z])\.\b/ig)) {
addType($keyloc1, "Name Initial (NameIs)");
$firstfound = 1;
}
else {
if (isProbablyName($keyloc1, key1)){
addType($keyloc1,"Name10 (NameIs)" );
$firstfound = 1;
}
}
}
if ($firstfound == 1) {
if (length($key2)>0) {
if ((length($key2)==1) || ($key2 =~ /\b([A-Za-z])\.\b/ig)) {
addType($keyloc2, "Name Initial (NameIs)");
$secondfound = 1;
}
else {
if (isProbablyName($keyloc2, $key2)){
addType($keyloc2,"Name11 (NameIs)" );
$secondfound = 1;
}
}
}
}
if ($secondfound == 1) {
if (length($key3)>0) {
if ((length($key3)==1) || ($key3 =~ /\b([A-Za-z])\.\b/ig)) {
addType($keyloc3, "Name Initial (NameIs)");
}
else {
if (isProbablyName ($keyloc3, $key3)){
addType($keyloc3,"Name12 (NameIs)" );
}
}
}
}
}
}
}
}
# End of function name2()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: providerNumber()
# Removes "provider number", whole field, from discharge summaries
# Does not remove the field if there is no number following it
sub providerNumber {
$text = $_[0];
if ($unitfilter =~ /y/) {
while ($text =~ /\b(provider(( +)number)?( ?)[\#\:\-\=\s\.]?( ?)(\d+)([\/\-\:](\d+))?)\b/gi) {
my $unit = $1;
my $st = length($`);
my $key = "$st-".((length $unit) + $st);
addType ($key, "Provider Number");
}
}
}
# End of function providerNumber()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: signatureField()
# Removes signature fields from discharge summaries
# signature field taken as 3 or more underscores
# does not remove doctor names, since these are handled by other name handlers
sub signatureField {
$text = $_[0];
while ($text =~ /\b(\_\_+)\b/gi) {
my $sigfield = $1;
my $st = length($`);
my $key = "$st-".((length $sigfield) + $st);
addType ($key, "Signature");
}
}
# End of function signatureField()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: location1()
# Removes complete street addresses only when including one of the terms in @street_add_suff
sub location1 {
$text = $_[0];
if ($locfilter =~ /y/) {
#check if ambiguous locations are preceded by any of the location indicators,
#if so, add in PHI list
foreach $i (@location_indicators){
while ($text =~ /\b($i)(\s+)([A-Za-z]+)\b/ig) {
#print "Location Match 1 is $1, 2 is $2, 3 is $3\n";
my $st = length ($`) + length ($1) + length ($2);
my $end = $st + length ($3);
my $key = "$st-$end";
my $word = substr $text, $st, ($end-$st);
#print "word is $word\n";
if (isType($key, "Location",1) || (length ($word) > 1 && !(isCommon($word))) ) {
addType ($key, "Location");
}
}
} #end for each i in location indicators
#if the company dictionary is loaded, check if any of the ambiguous company names
#are preceded by the employment indicators
if ($company_list =~/y/){
#print "company list is on\n";
foreach $i (@employment_indicators_pre){
while ($text =~ /\b($i)(\s+)([A-Za-z]+)\b/ig) {
$st = length ($`) + length ($1) + length ($2);
$end = $st + length ($3);
$key = "$st-$end";
my $word = substr $text, $st, ($end-$st);
my $tmp = isCommon($word);
if (isType($key, "Company",1)|| (length ($word) > 1 && !(isCommon($word))) ) {
#print "adding $3 key $key as company unambig";
addType ($key, "Company");
}
}
} #end for each i in location indicators
}
#strict address suffix, case-sensitive match, PHI regardless of ambiguity
foreach $i (@strict_street_add_suff) {
#make it a case-sensitive match for street address suffix
while ($text =~ /\b(([0-9]+ +)?(([A-Za-z\.\']+) +)?([A-Za-z\.\']+) +\b$i\.?\b)\b/g) {
$st = length($`);
$end = $st + length($1);
#check next segment for apartment, suite, floor #s
my $nextSeg = substr $text, $end, 30;
#print "check nextSeg for apt and sutie #, seg is $nextSeg\n";
foreach $k (@apt_indicators){
if ($nextSeg =~ /\b($k\.?\#? +[\w]+)\b/gi) {
$end += length ($`) + length($1);
}
}
$key = "$st-$end";
#addType ($key, "Street Address");
if (length($3) == 0) {
if (!isUnambigCommon($5)) {
addType ($key, "Street Address");
}
}
elsif (!((isUnambigCommon($4)) && (isUnambigCommon($5)))) {
addType($key, "Street Address");
}
} # end while
} #end foreach
}#end if
#Non-strict address suffix, case-insensitive match, PHI if no ambiguity
if ($locfilter =~ /y/) {
foreach $i (@street_add_suff) {
while ($text =~ /\b(([0-9]+) +(([A-Za-z]+) +)?([A-Za-z]+) +$i)\b/gi) {
$st = length($`);
$end = $st + length($1);
$key = "$st-$end";
if (length($3) == 0) {
if (!isUnambigCommon($word)){
addType ($key, "Street Address");
}
}
elsif ( ! (isUnambigCommon($4) || isUnambigCommon($5))){
addType($key, "Street Address");
}
}
}
}
#****************************************************************************************
# Removes 2-word location PHI ending with @loc_indicators_suff or preceded by @loc_indicators_pre
# Words potentially PHI
if ($locfilter =~ /y/) {
foreach $i (@loc_indicators_suff) {
while ($text =~ /\b(([A-Za-z\-]+)? +)?(([A-Za-z\-]+) + *$i +)\b/ig) {
if (!isCommon($4)) {
$st2 = length($`)+length($1);
$end2 = $st2 + length($3);
$key2 = "$st2-$end2";
addType ($key2, "Location");
if (length $2>0) {
if (!isCommon($2)) {
$st1 = length($`);
$end1 = $st1 + length($2);
$key1 = "$st1-$end1";
addType ($key1, "Location");}
}
}
}
}
}
if ($locfilter =~ /y/) {
# Words most likely PHI
foreach $i (@loc_ind_suff_c) {
while ($text =~ /\b(([A-Za-z]+ +)?)(([A-Za-z]+)$i+)\b/ig) {
if (!isCommon($3)) {
$st2 = length($`)+length($1);
$end2 = $st2 + length($3);
$key2 = "$st2-$end2";
addType ($key2, "Location");
}
}
}
}
if ($locfilter =~ /y/) {
# Words potentially PHI
foreach $i (@loc_indicators_pre) {
while ($text =~ /\b((($i + *([A-Za-z\-]+)) *)([A-Za-z\-]+)?)\b/ig) {
if (!isCommon($4)) {
$st2 = length($`);
$end2 = $st2 + length($3);
$key2 = "$st2-$end2";
addType ($key2, "Location");
if (length $5>0) {
if (!isCommon($5)) {
$st1 = length($`)+length($2);
$end1 = $st1 + length($5);
$key1 = "$st1-$end1";
addType ($key1, "Location");
}
}
}
}
}
} # end if locfilter =~ /y/
@universities_pre = ("University", "U", "Univ", "Univ.");
#catches "University of", "U of", "Univ of", "Univ. of"
if ($locfilter =~ /y/) {
# Words potentially PHI
foreach $i (@universities_pre) {
while ($text =~ /\b((($i +of *([A-Za-z\-]+)) *)([A-Za-z\-]+)?)\b/ig) {
my $tmp = isUSStateAbbre($4);
if (isUSStateAbbre($4) || isUSState($4) || !isCommon($4) ) {
$st2 = length($`);
$end2 = $st2 + length($3);
$key2 = "$st2-$end2";
addType ($key2, "Location");
if (length $5>0) {
if (!isCommon($5)) {
$st1 = length($`)+length($2);
$end1 = $st1 + length($5);
$key1 = "$st1-$end1";
addType ($key1, "Location (Universities)");
}
}
}
}
}
} # end if locfilter =~ /y/
}
# End of function location1()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: location2()
# Searches for multiple hospital and location terms
sub location2 {
$text = $_[0];
if ($locfilter =~ /y/) {
foreach $hos (@hospital) {
my @hospital_terms = split " ", $hos;
$len = 0;
foreach $h (@hospital_terms) {
if (length($h) != 0) {
$len = $len+1;
$hos[$len] = $h;
}
}
if ($len == 1) {
while ($text =~ /\b($hos[1])\b/ig) {
$hospital = $1;
$st = length($`);
$end = $st + length($hospital);
$key = "$st-$end";
addType($key, "Hospital1");
}
}
if ($len == 2) {
while ($text =~ /\b($hos[1])( )($hos[2])\b/ig) {
$hos1 = $1;
$hos2 = $3;
$space = $2;
$st1 = length($`);
$end1 = $st1 + length($hos1);
$key1 = "$st1-$end1";
addType($key1, "Hospital2");
$st2 = $end1 + length($space);
$end2 = $st2 + length($hos2);
$key2 = "$st2-$end2";
addType($key2, "Hospital3");
}
}
if ($len == 3) {
while ($text =~ /\b($hos[1])( )($hos[2])( )($hos[3])\b/ig) {
$hos1 = $1;
$hos2 = $3;
$hos3 = $5;
$st1 = length($`);
$end1 = $st1 + length($hos1);
$key1 = "$st1-$end1";
addType($key1, "Hospital4");
$st2 = $end1 + length($2);
$end2 = $st2 + length($hos2);
$key2 = "$st2-$end2";
addType($key2, "Hospital5");
$st3 = $end2 + length($4);
$end3 = $st3 + length($hos3);
$key3 = "$st3-$end3";
addType($key3, "Hospital6");
}
}
if ($len == 4) {
while ($text =~ /\b($hos[1])( )($hos[2])( )($hos[3])( )($hos[4])\b/ig) {
$hos1 = $1;
$hos2 = $3;
$hos3 = $5;
$st1 = length($`);
$end1 = $st1 + length($hos1);
$key1 = "$st1-$end1";
addType($key1, "Hospital");
$st2 = $end1 + length($2);
$end2 = $st2 + length($hos2);
$key2 = "$st2-$end2";
addType($key2, "Hospital");
$st3 = $end2 + length($4);
$end3 = $st3 + length($hos3);
$key3 = "$st3-$end3";
addType($key3, "Hospital");
$st4 = $end3 + length($6);
$end4 = $st4 + length($hos4);
$key4 = "$st4-$end4";
addType($key4, "Hospital");
}
}
}
foreach $loc (@loc_unambig) {
my @loc_terms = split " ", $loc;
$len = 0;
foreach $h (@loc_terms) {
if (length($h) != 0) {
$len = $len+1;
$loc[$len] = $h;
}
}
if ($len == 1) {
while ($text =~ /\b($loc[1])\b/ig) {
$location = $1;
$st = length($`);
$end = $st + length($location);
$key = "$st-$end";
addType($key, "Location");
}
}
if ($len == 2) {
while ($text =~ /\b($loc[1])( )($loc[2])\b/ig) {
$loc1 = $1;
$loc2 = $3;
$st1 = length($`);
$end1 = $st1 + length($loc1);
$key1 = "$st1-$end1";
addType($key1, "Location");
$st2 = $end1 + length($2);
$end2 = $st2 + length($loc2);
$key2 = "$st2-$end2";
addType($key2, "Location");
}
}
if ($len == 3) {
while ($text =~ /\b($loc[1])( )($loc[2])( )($loc[3])\b/ig) {
$loc1 = $1;
$loc2 = $3;
$loc3 = $5;
$st1 = length($`);
$end1 = $st1 + length($loc1);
$key1 = "$st1-$end1";
addType($key1, "Location");
$st2 = $end1 + length($2);
$end2 = $st2 + length($loc2);
$key2 = "$st2-$end2";
addType($key2, "Location");
$st3 = $end2 + length($4);
$end3 = $st3 + length($loc3);
$key3 = "$st3-$end3";
addType($key3, "Location");
}
}
if ($len == 4) {
while ($text =~ /\b($loc[1])( )($loc[2])( )($loc[3])( )($loc[4])\b/ig) {
$loc1 = $1;
$loc2 = $3;
$loc3 = $5;
$st1 = length($`);
$end1 = $st1 + length($loc1);
$key1 = "$st1-$end1";
addType($key1, "Location");
$st2 = $end1 + length($2);
$end2 = $st2 + length($loc2);
$key2 = "$st2-$end2";
addType($key2, "Location");
$st3 = $end2 + length($4);
$end3 = $st3 + length($loc3);
$key3 = "$st3-$end3";
addType($key3, "Location");
$st4 = $end3 + length($6);
$end4 = $st4 + length($loc4);
$key4 = "$st4-$end4";
addType($key4, "Location");
}
}
}
#######
#PO Box number
while ($text =~ /\b(P[\.]?O[\.]? *Box *[\#]? *[0-9]+)\b/gi) {
$location = $1;
$st = length($`);
$end = $st + length($location);
$key = "$st-$end";
addType($key, "PO Box");
}
######
#Zipcodes
foreach $loc (@us_states_abbre) {
while ($text =~ /\b($loc *[\.\,]*\s*\d{5}[\-]?[0-9]*)\b/gi) {
$location = $1;
$st = length($`);
$end = $st + length($location);
$key = "$st-$end";
addType($key, "State/Zipcode");
}
}
#Zipcodes with more US states abbreviations
foreach $loc (@more_us_states_abbre) {
while ($text =~ /\b($loc *[\.\,]*\s*\d{5}[\-]?[0-9]*)\b/gi) {
$location = $1;
$st = length($`);
$end = $st + length($location);
$key = "$st-$end";
addType($key, "State/Zipcode");
}
}
#Zipcodes with full US state names
foreach $loc (@us_states) {
while ($text =~ /\b($loc *[\.\,]*\s*\d{5}[\-]?[0-9]*)\b/gi) {
$location = $1;
$st = length($`);
$end = $st + length($location);
$key = "$st-$end";
addType($key, "State/Zipcode");
}
}
##########
#remove US states if filter flag for State is on
if ($us_state_filter =~ /y/) {
foreach $loc (@us_states) {
my @loc_terms = split " ", $loc;
$len = 0;
foreach $h (@loc_terms) {
if (length($h) != 0) {
$len = $len+1;
$loc[$len] = $h;
}
}
if ($len == 1) {
while ($text =~ /\b($loc[1])\b/ig) {
$location = $1;
$st = length($`);
$end = $st + length($location);
$key = "$st-$end";
addType($key, "State");
}
}
if ($len == 2) {
while ($text =~ /\b(($loc[1])( )($loc[2]))\b/ig) {
$location = $1;
$st = length($`);
$end = $st + length($location);
$key = "$st-$end";
addType($key, "State");
}
}
if ($len == 3) {
while ($text =~ /\b(($loc[1])( )($loc[2])( )($loc[3]))\b/ig) {
$location = $1;
$st = length($`);
$end = $st + length($location);
$key = "$st-$end";
addType($key, "State");
}
}
if ($len == 4) {
while ($text =~ /\b(($loc[1])( )($loc[2])( )($loc[3])( )($loc[4]))\b/ig) {
$location = $1;
$st = length($`);
$end = $st + length($location);
$key = "$st-$end";
addType($key, "State");
}
}
}
} #end if us_state_filter is on
#######
# Sub-function: hospitalIndicators()
# Searches for hospital indicators and checks if previous and following words are hospitals
foreach $h (@hospital_indicators) {
while ($text =~ /((([A-Za-z\-\']+)( + *))?(([A-Za-z\-\']+)( + *))?($h\b)(( + *)([A-Za-z\-\']+))?(( + *)([A-Za-z\-\']+))?\b)/ig) {
my $typeadded = 0;
$st1 = length($`);
$end1 = $st1 + length($3);
$key1 = "$st1-$end1";
$st2 = $st1 + length($2);
$end2 = $st2 + length($6);
$key2 = "$st2-$end2";
$st3 = $st1 + length($2) + length ($5) + length($8) + length($10);
$end3 = $st3 + length($11);
$key3 = "$st3-$end3";
$st4 = $end3 + length($13);
$end4 = $st4 + length($14);
$key4 = "$st4-$end4";
$st5 = $end2 + length($7);
$end5 = $st5 + length($8);
$key5 = "$st5-$end5";
if (length($5)==0) {
if ((length($3) > 1) && (!isUnambigCommon($3)) && (!(isCommon ($3)) || (isType ($key1, "Hospital", 1)))) {
addType ($key1, "Hospital");
#addType ($key5, "Hospital-Ind");
$typeadded = 1;
}
}
elsif ((length($6) > 1) && (!isUnambigCommon($6)) && (!(isCommon ($6)) || isUSState($6) || isUSStateAbbre($6) || (isType ($key2, "Hospital", 1)))) {
addType ($key2, "Hospital");
#addType ($key5, "Hospital-Ind");
$typeadded = 1;
if ((length($3) > 1) && (!isUnambigCommon($3)) && (!(isCommon ($3)) || isUSState($3) || isUSStateAbbre($3) || (isType ($key1, "Hospital", 1)))) {
addType ($key1, "Hospital");
#addType ($key5, "Hospital-Ind");
$typeadded = 1;
}
}
# #Generating too many false positives.
# #Need a better common word dictionary to enable this.
#if ($typeadded == 0) {
# if ((length($11) > 1) && (!(isCommonest ($11)) || (isType ($key3, "Hospital", 1)))) {
# #addType ($key3, "Hospital");
# #addType ($key5, "Hospital-Ind");
# #if ((length($14) > 1) && (!(isCommonest ($14)) || (isType ($key4, "Hospital", 1)))) {
# # # addType ($key4, "Hospital");
# # #addType ($key5, "Hospital-Ind");
# # }
# }
#} #end if (typeadded == 0)
}
}
}
}
# End of function location2()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: problem()
# Checks for names preceded by "problem", pattern found in discharge summaries
sub problem {
$text = $_[0];
$k = "problem";
$l = ":";
while ($text =~ /\b(([A-Za-z\-]+) + *($k))\b/ig) {
if ((!isCommon($2)) || (isNameAmbig($2))) {
$st = length($`);
$end = $st + length($2);
$key = "$st-$end";
addType ($key, "Last Name");
}
}
}
# End of function problem()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: mrn()
# Checks for medical record numbers, i.e. numbers preceded by "mrn" or "medical record number"
sub mrn {
$text = $_[0];
if ($unitfilter =~ /y/) {
while ($text =~ /\b(mrn( *)[\#\:\-\=\s\.]?( *)(\t*)( *)(\d+)([\/\-\:](\d+))?)\b/gi) {
my $unit = $1;
my $st = length($`);
my $key = "$st-".((length $unit) + $st);
addType ($key, "Medical Record Number");
}
@numbers = ("number", "no", "num", "");
foreach $i (@numbers) {
while ($text =~ /\b(medical record( *)$i?( *)[\#\:\-\=\s\.]?( *)(\t*)( *)(\d+)([\/\-\:](\d+))?)\b/gi) {
my $unit = $1;
my $st = length($`);
my $key = "$st-".((length $unit) + $st);
addType ($key, "Medical Record Number");
}
}
}
}
# End of function mrn()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: unit()
# Checks for unit numbers in discharge summaries
# Removes the entire field of "unit <number>" or unit <number>/<number>
# If "unit" is not followed by a number, does not remove it
sub unit {
$text = $_[0];
if ($unitfilter =~ /y/) {
@numbers = ("number", "no", "num", "");
foreach $i (@numbers) {
while ($text =~ /\b(unit( ?)$i?( *)[\#\:\-\=\s\.]?( *)(\t*)( *)(\d+)([\/\-\:](\d+))?)\b/gi) {
my $unit = $1;
my $st = length($`);
my $key = "$st-".((length $unit) + $st);
addType ($key, "Unit Number");
}
}
}
}
# End of function unit()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: name3()
# Checks every lastnameprefix
# If the following word is either a name or not commonest, identifies it as lastname
sub name3 {
$text = $_[0];
if ($namefilter =~ /y/) {
foreach $line (@prefixes_unambig) {
while ($text =~ /\b(($line)([\s\'\-])+ *)([A-Za-z]+)\b/ig) {
my $pre = $2;
my $prestart = length($`);
my $preend = $prestart+(length $pre);
my $prekey = "$prestart-$preend";
my $lname = $4;
my $lstart = $prestart+length($1);
my $lend = $lstart+length($4);
my $lnamekey = "$lstart-$lend";
if ((!(isCommonest ($lname))) || (isType ($lnamekey, "Name", 1))) {
addType ($prekey, "Name Prefix (Prefixes)");
addType ($lnamekey, "Last Name (Prefixes)");
}
}
}
#****************************************************************************************
@specific_titles = ("MR", "MISTER", "MS");
foreach $i (@specific_titles) {
while ($text =~ /\b($i\.( *))([A-Za-z\'\-]+)\b/ig) {
$potential_name = $3;
$start = length($`)+length($1);
$end = $start + length($potential_name);
$key = "$start-$end";
if (isType($key, "Name", 1)) {
addType ($key, "Name13 (STitle)");
}
elsif (!(isCommon($potential_name))) {
addType ($key, "Name14 (STitle)");
}
}
}
#****************************************************************************************
# Goes through word by word looking for unspotted names
# All words have already been marked as potential names (where appropriate) by the previous routines
# Looks for last names following titles (Dr. Smith)
# Also should pick up "Dr. S"
# Looks for the last name prefixes
foreach $i (@strict_titles) {
L:
while ($text =~ /\b($i\b\.? *)([A-Za-z\'\-]+)\b/ig) { # added ' -
my $tt = $1;
my $word = $2;
my $st = length($`) + length($1);
my $fi = $st + length($2);
my $key = "$st-$fi";
if (exists $prefixes{uc($word)}) {
addType ($key, "Last Name (STitle)");
my $start = $fi;
my $nextWord = substr $text, $start;
if ($nextWord =~ /\A( ?)(\')?( ?)([A-Za-z]+)\b/g) {
my $token = $4;
my $lstart = $start+length($1)+length($2)+length($3);
my $lend = $lstart+length($4);
my $fi += length($1)+length($4);
if (exists $prefixes{uc($token)}){
addType ("$start-$fi", "Last Name (STitle)");
my $start = $fi;
my $nextWord = $nextWord;
my $token = $token;
if ($nextWord =~ /\A( ?$token( ?))([A-Za-z]+)\b/g) {
$word = $3;
$key = "$fi-".($fi + length($2) + length($3));
} else {
next L;
}
}
else {
# Has already identified one prefix, should now check to see if next word is name or is not commonest
$word = $token;
$key = "$lstart-$lend";
if (isProbablyName($key, $word)){
addType ($key, "Last Name (STitle)");
}
}
} else {
next L;
}
} else { #else $word is not a prefix
if ($word =~ /\'([A-Za-z]+)/) {
$word = $1;
$st--;
$key = $st."-".$fi;
}
if ($word =~ /([A-Za-z]+)\'/) {
$word = $1;
$key = $st."-".($fi-1);
}
}
###########################################################
if (exists $phi{$key}) {
addType($key, "Last Name (STitle)");
if ((isType($key, "First Name", 1))) {
addType($key, "First Name (STitle)");
}
} else {
if (isProbablyName($key, $word)){
addType ($key, "Last Name (STitle)");
}
else {
addType ($key, "Last Name (STitle)");
}
} #end else (!exists $phi{$key})
################################
#added to catch Dr. <firstname> <lastname>
#check the word after $word
my ($tmpStart,$tmpEnd) = split '-', $key;
my $following = substr $text, $tmpEnd;
if($following =~/\A(\s+)([A-Za-z\-\']{2,})\b/g){
my $fword = $2;
my $newst = $tmpEnd + (length ($1));
my $nextKey = "$newst-".($newst + length($2));
if (isProbablyName($nextKey, $fword)){
addType($nextKey, "Name (STitle)");
}
}
##########################################
} #end while text matches the pattern
} #end for each $i strict_titles
#****************************************************************************************
# Goes through word by word looking for unspotted names
# All words have already been marked as potential names (where appropriate) by the previous routines
# Looks for last names following titles (Dr. Smith)
# Also should pick up "Dr. S"
# Looks for the last name prefixes
#mark as ambiguous if common words
foreach $i (@titles) {
L:
while ($text =~ /\b($i\b\.? ?)([A-Za-z]+) *([A-Za-z]+)?\b/ig) {
my $tt = $1;
my $word = $2;
my $st = length($`) + length($1);
my $fi = $st + length($2);
my $key = "$st-$fi";
my $wordAfter = $3; ##added to catch last names
my $stAfter = $fi + 1;
my $fiAfter = $stAfter + length ($3);
my $keyAfter = "$stAfter-$fiAfter";
if (exists $prefixes{uc($word)}) {
addType ($key, "Last Name (Titles)");
my $start = $fi;
my $nextWord = substr $text, $start;
if ($nextWord =~ /\A( ?)(\')?( ?)([A-Za-z]+)\b/g) {
my $token = $4;
my $lstart = $start+length($1)+length($2)+length($3);
my $lend = $lstart+length($4);
my $fi += length($1)+length($4);
if (exists $prefixes{uc($token)}){
addType ("$start-$fi", "Last Name (Titles)");
my $start = $fi;
my $nextWord = $nextWord;
my $token = $token;
if ($nextWord =~ /\A( ?$token( ?))([A-Za-z]+)\b/g) {
$word = $3;
$key = "$fi-".($fi + length($2) + length($3));
} else {
next L;
}
}
else {
# Has already identified one prefix, should now check to see if next word is name or is not commonest
$word = $token;
$key = "$lstart-$lend";
if (isProbablyName($key, $word) && length($word) > 1 ){
addType ($key, "Last Name (Titles)");
}
}
} else {
next L;
}
} else {
if ($word =~ /\'([A-Za-z]+)/) {
$word = $1;
$st--;
$key = $st."-".$fi;
}
if ($word =~ /([A-Za-z]+)\'/) {
$word = $1;
$key = $st."-".($fi-1);
}
}
if (length ($wordAfter) > 1) {
my $tmp = isCommon($wordAfter);
if (!isCommonest($wordAfter) || (isType($keyAfter, "Name", 1) && isType($keyAfter, "(un)")) ||
(isType($keyAfter, "Name", 1) && ($wordAfter =~ /\b(([A-Z])([a-z]+))\b/g)) ) {
addType($keyAfter, "Last Name (Titles)");
addType($key, "First Name (Titles)");
}
}
elsif (exists $phi{$key}) {
if ((isType($key, "Name", 1))) {
addType($key, "Last Name (Titles)");
}
} else {
if ( length($word) > 1 && !(isCommon($word)) ) {
addType ($key, "Last Name (Titles)");
}
else {
if (($word =~ /\b[A-Z][a-z]+\b/) || ($tt =~ /$i\. /)) {
addType ($key, "Last Name (Titles ambig)");
} else {
addType ($key, "Last Name (Titles ambig)");
}
}
}
}
}
#****************************************************************************************
# Implements simple rules for finding names that aren't in the list or are ambiguous...
# first name + last name -> first name + last name (ambig), first name +
#not-on-any-safe-word-list, else save the second word and see whether it appears
#in the patient text not associated with the first name or any other name indicator
# Also first + initial + last name
# Finds each prefix, labels the next not uncommonest word
# Finds all first names (unambig), look at following word -> make last name unambigs
foreach $k (keys %phi) {
if (((isType($k, "Male First Name", 1)) || (isType($k, "Female First Name", 1))) && ((isType($k, "(un)", 1)) || (isType($k, "pop", 1)))) {
my ($start, $end) = split '-', $k;
my $following = substr $text, $end;
# No middle initial
if ($following =~ /\A( +)([A-Za-z\']{2,})\b/g) { #added to catch firstname s.a. O'Connell
my $fword = $2;
my $st = $end + (length $1);
my $nextKey = "$st-".($st + length($2));
if (exists $phi{$nextKey}) {
if ((isType($nextKey, "Name", 1) == 1) && isProbablyName($nextKey, $fword)) {
addType($nextKey, "Last Name (NamePattern1)");
addType($k,"First Name4 (NamePattern1)"); # make it unambig
}
}
else {
if (isProbablyName($nextKey, $fword)){
addType ($nextKey, "Last Name (NamePattern1)");
addType($k,"First Name5 (NamePattern1)"); }}}# make it unambig
# Looks for that middle initial
if ($following =~ /\A( +)([A-Za-z])(\.? )([A-Za-z\-][A-Za-z\-]+)\b/g) {
my $initial = $2;
my $lastN = $4;
my $st = $end + (length $1);
my $iniKey = "$st-".($st+1);
my $stn = $st + (length $2) + (length $3);
my $nextKey = "$stn-".($stn + (length $4));
if (exists $phi{$nextKey}) {
if ((isType($nextKey, "Last Name", 0) == 0)) {
addType($nextKey, "Last Name (NamePattern1)");
addType($iniKey, "Initial (NamePattern1)");
addType($k,"First Name11 (Name Pattern1)");
}
}
else {
if ($following =~ /\A( +)([A-Za-z])(\.? )([A-Za-z][A-Za-z]+)\b\s*\Z/g){
addType ($nextKey, "Last Name (NamePattern1)");
addType($iniKey, "Initial (NamePattern1)");
addType($k,"First Name6 (NamePattern1)");
}
elsif (!(isCommonest($lastN))) {
addType ($nextKey, "Last Name (NamePattern1)");
addType($iniKey, "Initial (NamePattern1)");
addType($k,"First Name7 (NamePattern1)");
}
}
}
}
}
# Finds all last names (unambig), looks at proceeding word -> make first names unambigs
foreach $k (keys %phi) {
if (isType($k, "Last Name", 1) && (isType($k, "(un)", 1))) {
my ($start, $end) = split '-', $k;
my $preceding = substr $text, 0, $start;
if ($preceding =~ /\b([A-Za-z]+)( *)\Z/g) {
my $pword = $1;
my $st = length($`);
my $prevKey = "$st-".($st + (length $1));
if (exists $phi{$prevKey}) {
#my $result = isNameIndicator($pword);
#print "pword is $pword, isNameIndicator returns $result";
#if ((isType($prevKey, "First Name", 1)) && (!isType($prevKey, "Name Indicator", 0))) {
if ((isType($prevKey, "First Name", 1)) && (!isNameIndicator($pword)) ) {
addType($prevKey, "First Name8 (NamePattern2)");
} # Else it's been positively identified as something that is not a name so leave it
}
else {
# Sees whether it appears in the common words...
if (!(isCommon($pword))) {
addType ($prevKey, "First Name9 (NamePattern2)");
}
}
}
}
}
#****************************************************************************************
# Looks for compound last names -> last name + last name (ambig), last name + not-on-any-safe-word-list, last name "-" another word
# Last name with an ambiguous name preceding it has already labeled the preceding thing a first name; no huge loss if it's just a weird first part of a compound last name
foreach $k (keys %phi) {
if (isType($k, "Last Name", 0)) {
my ($start, $end) = split '-', $k;
my $following = substr $text, $end;
if ($following =~ /\A-([A-Za-z]+)\b/g) { #hypen-ated last name
my $newend = $end+length($1)+1;
my $nextKey = "$end-$newend";
addType ($nextKey, "Last Name (NamePattern3)");
}
if ($following =~ /\A( *)([A-Za-z]+)\b/g) {
my $fword = $2;
my $st = $end + (length $1);
my $nextKey = "$st-".($st + length($2));
if (exists $phi{$nextKey}) {
if (!(isType($nextKey, "ambig", 1))) {
if (isType($nextKey, "Last Name", 0) == 0) {
addType($nextKey, "Last Name (NamePattern3)");
}
} # Else it's been positively identified as something that is not a name so leaves it
}
else {
# Sees whether it appears in the common words
if (!(isCommon($fword))) {
addType ($nextKey, "Last Name (NamePattern3)");
}
}
}
}
}
#****************************************************************************************
# Looks for initials
# Many last names get classified as first names and other PHI -> looks for initial before all unambig names and locations
INI:
foreach $k (keys %phi) {
if ( ((!(isType($k, "ambig", 1))) || isType($k, "(un)",1)) && (isType($k, "Name", 1))) {
#if (isType($k, "Name", 1)) {
my ($start, $end) = split '-', $k;
my $preceding = substr $text, 0, $start;
# Checks for two initials
if ($preceding =~ /\b([A-Za-z][\. ] ?[A-Za-z]\.?) ?\Z/g) {
my $key = (length ($`))."-".(length($`) + (length $1));
addType ($key, "Initials (NamePattern4)");
if (!(isType($k, "Last Name", 0))) {
}
}
# Checks if preceding word is an initial
elsif ($preceding =~ /\b([A-Za-z]\.?) ?\Z/g) { #1 initial
my $tmp = substr $text, $start, $end - $start +1;
my $init = $1;
my $key = (length ($`))."-".(length($`) + (length $1));
if (lc($init) eq "s") {
if ((substr $preceding, (length($`) - 1), 1) eq "'") { #for 's
# next INI;
}
}
if ((lc($init) eq "a") || (lc($init) eq "i")) {
if (isCommon(substr $text, $start, ($end - $start))) {
# next INI;
}
}
if (length($init)==2 || length($init)==1) {
addType ($key, "Initials (NamePattern4)");
}
if (!(isType($k, "Last Name", 0))) {
addType ($k, "Last Name (NamePattern4)");
}
}
}
}
#****************************************************************************************
# Looks for initials; similar to previous code block
foreach $k (keys %phi) {
if (isType($k, "Last Name", 1) && (!isType ($k, "ambig", 1))) {
my ($start, $end) = split '-', $k;
my $preceding = substr $text, 0, $start;
#two initials (why would they write that? Why not?) {
if ($preceding =~ /\b([A-Za-z][\. ] ?[A-Za-z]\.?) ?\Z/g) {
my $key = (length ($`))."-".(length($`) + (length $1));
addType ($key, "Initials (NamePattern5)");
if (!(isType($k, "Last Name", 0))) {
addType ($k, "Last Name (NamePattern5)");
}
}
elsif ($preceding =~ /\b([A-Za-z]\.?) ?\Z/g) { #1 initial
my $init = $1;
my $key = (length ($`))."-".(length($`) + (length $1));
if (lc($init) eq "s") {
if ((substr $preceding, (length($`) - 1), 1) eq "'") { #for 's
#next INI;
}
}
if ((lc($init) eq "a") || (lc($init) eq "i")) {
if (isCommon(substr $text, $start, ($end - $start))) {
# next INI;
}
}
}
}
}
#****************************************************************************************
# Searches for patterns "name and/or," comma list names
foreach $k (keys %phi) {
if ((isType($k, "Last Name", 0)) || (isType($k, "Male First Name", 0)) || (isType($k, "Female First Name", 0))) {
my ($start, $end) = split '-', $k;
my $following = substr $text, $end;
if ((length $following) == 0) { next; }
# First just looks for "and"/"or"
if ($following =~ /\A and ([A-Za-z]+)\p{IsPunct}/ig) {
my $word = $text1;
my $key = ($end + 5)."-".($end + 5 + length($1));
if ((isType($key, "Name", 1)) || (!(isCommon($word)))) {
addType ($key, "Last Name (NamePattern6)");
}
}
elsif ($following =~ /\A and ([A-Za-z]+)\b/ig) {
my $word = $1;
my $key = ($end + 5)."-".($end + 5 + length($1));
if (!(isCommon($word))) {
addType ($key, "Last Name (NamePattern6)");
}
}
elsif ($following =~ /\A or ([A-Za-z]+)\b/ig) {
my $word = $1;
my $key = ($end + 4)."-".($end + 4 + length($1));
if (!(isCommon($word))) {
addType ($key, "Last Name (NamePattern6)");
}
}
elsif ($following =~ /\A( ?[\&\+] ?)([A-Za-z]+)\b/ig) {
my $word = $2;
my $st = $end + (length $1);
my $key = "$st-".($st + length($2));
if (!(isCommon($word))) {
addType ($key, "Last Name (NamePattern6)");
}
}
elsif ($following =~ /\A, ([A-Za-z]+)(,? and )([A-Za-z]+)\b/ig) {
# Searches up to 3 names in a list
my $name1 = $1;
my $name2 = $3;
my $st2 = $end + 2 + (length $name1) + length($2);
my $key1 = ($end + 2)."-".($end + 2 + (length $name1));
my $key2 = "$st2-".($st2 + length($name2));
if (!(isCommon($name1))) {
addType ($key1, "Last Name (NamePattern6)");
}
if (!(isCommon($name2))) {
addType ($key2, "Last Name (NamePattern6)");
}
}
}
}
}
}
# End of function name3()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: commonHoliday()
# Searches for some common holiday names that can identify the date
# Extension: Add new holiday names to this regex
sub commonHoliday() {
$text = $_[0];
if ($datefilter =~ /y/) {
while ($text =~ /\b(christmas|thanksgiving|easter|hannukah|rosh hashanah|ramadan)\b/ig) {
$holidayname = $1;
$start = length($`);
$end = $start + length($holidayname);
$key = $start."-".$end;
addType ($key, "Holiday");
}
}
}
# End of function commonHoliday()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: knownPatientName()
# Searches for PID-specific patient names known a priori, i.e. the patient first and last names for this particular PID
# Indiscriminately removes these PHI from anywhere in the text
# Extension: To include new PID-patient name mappings, extend the file $patient_file
sub knownPatientName {
$text = $_[0];
if ($namefilter =~ /y/) {
foreach $i (@known_first_name) {
while ($text =~ /\b($i)\b/ig) {
my $start = length($`);
my $end = $start + length($1);
my $key = "$start-$end";
addType ($key, "Known patient firstname");
}
}
foreach $j (@known_last_name) {
while ($text =~ /\b($j)\b/ig) {
my $start = length($`);
my $end = $start + length($1);
my $key = "$start-$end";
addType ($key, "Known patient lastname");
}
}
}
}
# End of function knownPatientName()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
#Returns true if the number passed in matches a defined us area code
sub isCommonAreaCode {
$areacode = $_[0];
return ($us_area_code{$areacode});
}
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: telephone()
# Searches for telephone numbers, with or without area codes, with or without extensions
# Extension: To add new formats, add a new rule
sub telephone() {
$text = $_[0];
if ($telfilter =~ /y/) {
while ($text =~ /\b\(?\d{3}\s*[ \-\.\/\=\,]*\s*\d{4}\)?\b/g) { #added back \b to avoid high fp
#while ($text =~ /\b\d{3}\s*[ \-\.\/\=\,]*\s*\d{4}\b/g) { #added back \b to avoid high fp
my $start = length($`);
my $end = $start + length($&);
my $nextSeg = substr $text, $end, 20;
#catch extensions
if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {
$end += length($1);
} elsif ($nextSeg =~ /\A(\s*ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
$end += length($1);
} elsif ($nextSeg =~ /\A(\s*extension\s*\(?[\d]+\)?)\b/) {
$end += length($1);
}
my $key = "$start-$end";
#now checks the context
$context_len = 20;
my $start_pos = $start - $context_len;
if ($start_pos < 0) {
$start_pos = 0;
}
my $len = $context_len;
if (length ($text) < ($end + $context_len)){
$len = length($text) - $end;
}
my $textBefore = substr $text, $start_pos, ($start - $start_pos);
my $textAfter = substr $text, $end, $len;
if (isProbablyPhone($textBefore)){
addType ($key, "Telephone/Fax (1)");
}
}
#pattern such as ###-###-####
#let's not worry about patterns such as ###-HART and ###-LUNG for now
#while ($text =~ /\(?\d{3}?\s?[\)\.\/\-\=\, ]*\s?\d{3}\s?[ \-\.\/\=]*\s?\d{4}\b/g) {
while ($text =~ /\d{3}\s*[\)\.\/\-\, ]*\s*\d{3}\s*[ \-\.\/]*\s*\d{4}/g) {
#if (isCommonAreaCode($1)){
my $st = length($`);
my $end = $st + length($&);
my $nextSeg = substr $text, $end, 20;
if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {
$end += length($1);
} elsif ($nextSeg =~ /\A(\s*ex\.?\s*[\(]?[\d]+[\)]?)\b/) {
$end += length($1);
} elsif ($nextSeg =~ /\A([\,]?\s?ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
$end += length($1);
} elsif ($nextSeg =~ /\b(\s?extension\s*\(?[\d]+\)?)\b/) {
$end += length($1);
}
my $key = "$st-$end";
addType ($key, "Telephone/Fax (2)");
}
#allow arbitrary line break (almost) anywhere in the phone numbers (except first 3 digit to reduce fp)
#only scrubbs the pattern, if it's a known area code
while ($text =~ /(\d\d\d)\s*[\)\.\/\-\, ]*\s*\d\s*\d\s*\d\s*[ \-\.\/]*\s*\d\s*\d\s*\d\s*\d/g) {
if (isCommonAreaCode($1)){
my $st = length($`);
my $end = $st + length($&);
my $nextSeg = substr $text, $end, 20;
if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {
$end += length($1);
} elsif ($nextSeg =~ /\A(\s*ex\.?\s*[\(]?[\d]+[\)]?)\b/) {
$end += length($1);
} elsif ($nextSeg =~ /\A([\,]?\s?ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
$end += length($1);
} elsif ($nextSeg =~ /\b(\s?extension\s*\(?[\d]+\)?)\b/) {
$end += length($1);
}
my $key = "$st-$end";
addType ($key, "Telephone/Fax (2)");
}
}
#check phone pattern that has 1 extra or 1 less digit at end
#in case pattern such as xxx-xxx-xxx?, check if the first 3 digits match with
#common area code
while (($text =~ /\(?(\d{3})\s*[\)\.\/\-\=\, ]*\s*\d{3}\s*[ \-\.\/\=]*\s*\d{3}\b/g)){
#match it with common local area code
if (isCommonAreaCode($1)){
my $st = length($`);
my $end = $st + length($&);
my $nextSeg = substr $text, $end, 20;
if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {
$end += length($1);
} elsif ($nextSeg =~ /\A(\s*ex\.?\s*[\(]?[\d]+[\)]?)\b/) {
$end += length($1);
} elsif ($nextSeg =~ /\A(\s?ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
$end += length($1);
} elsif ($nextSeg =~ /\A(\s?extension\s*\(?[\d]+\)?)\b/) {
$end += length($1);
}
my $key = "$st-$end";
addType ($key, "Telephone/Fax (3)");
} #end if the first 3 digits are area codes
} #end while
#check phone pattern that has 1 extra digit at end
#in case pattern such as xxx-xxx-xxxxx, check if the first 3 digits match with
#common area code
while (
($text =~ /\(?(\d{3})\s*[\)\.\/\-\=\, ]*\s*\d{3}\s*[ \-\.\/\=]*\s*\d{5}\b/g)) {
#match it with common local area code
if (isCommonAreaCode($1)){
my $st = length($`);
my $end = $st + length($&);
my $nextSeg = substr $text, $end, 20;
if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {
$end += length($1);
} elsif ($nextSeg =~ /\A(\s*ex\.?\s*[\(]?[\d]+[\)]?)\b/) {
$end += length($1);
} elsif ($nextSeg =~ /\A(\s?ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
$end += length($1);
} elsif ($nextSeg =~ /\A(\s?extension\s*\(?[\d]+\)?)\b/) {
$end += length($1);
}
my $key = "$st-$end";
addType ($key, "Telephone/Fax (4)");
} #end if the first 3 digits are area codes
} #end while
#in case typed in pattern such as ###-####-###
while ($text =~ /\(?\d{3}?\s?[\)\.\/\-\=\, ]*\s?\d{4}\s?[ \-\.\/\=]*\s?\d{3}\b/g) {
my $st = length($`);
my $end = $st + length($&);
my $nextSeg = substr $text, $end, 20;
if ($nextSeg =~ /\A(\s*x\.?\s*[\(]?[\d]+[\)]?)\b/) {
$end += length($1);
} elsif ($nextSeg =~ /\A(\s*ex\.?\s*[\(]?[\d]+[\)]?)\b/) {
$end += length($1);
}
elsif ($nextSeg =~ /\A(\s*ext[\.]?\s*[\(]?[\d]+[\)]?)\b/) {
$end += length($1);
} elsif ($nextSeg =~ /\A(\s*extension\s*\(?[\d]+\)?)\b/) {
$end += length($1);
}
my $key = "$st-$end";
addType ($key, "Telephone/Fax (5)");
}
}
}
# End of function telephone()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Add new function here >>>>>>>>>>
# Follow format shown here if necessary
# sub functionName {
# $text = $_[0];
# while ($text =~ /(<search pattern>)/) {
# $startIndex = length($');
# $endIndex = $startIndex + length($1);
# $phiKey = $startIndex."-".$endIndex;
# addType ($phiKey, "Name of PHI Category");
# }
# }
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: pruneKeys()
# Arguments: hash %keys, string $text
# Returns: array @keylst
# Called by: findPHI()
# Description: Extracts PHI locations from hash %keys, compares each loc with previous loc to prevent overlaps
# Returns an array of pruned PHI keys
sub pruneKeys {
my ($keyh,$text) = @_;
my $lk = "";
my ($ls,$le)=(0,0);
my ($cs,$ce)=(0,0);
my @keylst = ();
foreach $k (sort numerically keys %{$keyh}) {
#print "prunkey, key = $k values = \n";
$ls = $cs;
$le = $ce;
($cs,$ce)= split ('-',$k);
if ($cs > $le){push (@keylst,$lk);} # proper relation
elsif($cs > $ls){
if($ce > $le) {
my $stgl = substr($text,$ls,$le-$ls);
my $stgc = substr($text,$cs,$ce-$cs);
$cs = $ls; $k = "$ls-$ce"; $$keyh{$k} = $$keyh{$lk}} # include both transfer types
else{$cs = $ls; $ce = $le; $k = $lk;}} # use previous (current in previous)
elsif($le > $ce){$cs = $ls; $ce = $le; $k = $lk;} # use previous (current in previous)
$lk = $k;}
#print "pushing $lk to keylst\n";
push (@keylst,$lk); # last one
return (@keylst)
}
# End of pruneKeys()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: addType()
# Arguments: string $key ("start-end" of PHI loc), string $type (PHI type)
# Returns: None
# Called by: findPHI()
# Description: Pushes PHI key and type into the hash %phi
# Keeps track of all possible PHI types for each PHI key
sub addType {
my ($key,$type) = @_;
($st,$end) = split '-',$key;
if ($end > $end{$st}) {
$end{$st} = $end;
}
#print "in addType, key is $key\n";
push @{$phi{$key}}, $type;
$t = (@{$phiT{$key}});
$start = $st - 1 - 64;
$ending = $end - 1 - 64;
return;
}
# End of addType()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: isType()
# Arguments: string $key ("start-end" of PHI loc), string $type (PHI type), int $pattern (1 if PHI type can be matched with '=~'; 0 otherwise)
# Returns: 1 if PHI $key is of PHI type $type; 0 otherwise
# Called by: findPHI()
# Description: Given a PHI loc, checks its PHI type in the existing PHI hash. If the type in the hash is equal to the given type, then returns 1.
sub isType {
my ($key, $type, $pattern) = @_;
foreach $tt (@{$phi{$key}}){
# print "isType, tt is $tt key is $key\n";
if ($pattern) {
if ($tt =~ /$type/) {
return 1;
}
}
elsif ($tt eq $type) {
return 1;
}
}
return 0;
}
# End of isType()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: isPHIType()
# Arguments: string $mytype (PHI type), array of string @phiTypes
# Returns: 1 if PHI $mytype appears in @phiTypes, 0 otherwise.
# Called by: deid()
# Description: Given a PHI type, checks if it appears in @phiTypes, if so, returns 1. Returns 0 otherwise.
sub isPHIType {
my ( $mytype, @phiTypes) = @_;
#foreach $tt (@{$phi{$key}}){
foreach $tt (@phiTypes){
if ($tt =~ /$mytype/) {
return 1;
} #end if
} #end foreach
return 0;
}
# End of isPHIType()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: isCommon()
# Arguments: string $word
# Returns: 1 if given word is a common word; 0 otherwise
# Called by: findPHI()
# Description: Compares the given word to the common_words association, compiled from dictionary files for common English words and from SNOMED.
# Returns 1 if given word is in one of those lists, i.e. is a common word.
sub isCommon {
my $word = $_[0];
chomp $word;
$word = uc($word);
return ($common_words{$word} || $unambig_common_words{$word});
}
# End of isCommon()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: isUnambigCommon()
# Arguments: string $word
# Returns: 1 if given word is a really common word or unambig med terms; 0 otherwise
sub isUnambigCommon {
my $word = $_[0];
$word = uc($word);
return $unambig_common_words{$word};
}
# End of isUnambigCommon()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: isNameAmbig()
# Arguments: string $word
# Returns: 1 if given word is an ambiguous name
# Called by: findPHI()
# Description: Searches for given word in lists of ambiguous male, female and last names
# Returns 1 if word is in any of those lists
sub isNameAmbig {
my $word = $_[0];
$word = uc($word);
return (($male_ambig{$word}) || ($female_ambig{$word}) || ($last_ambig{$word}));
}
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
#Function: isProbablyName()
#Arguments: string $key, $potential_name
#Returns true if
#name is Not a common word OR
#name is Unambiguous name OR
#name maybe ambiguous BUT starts with Capital_letter followed by small_case letter OR
#name is popular
sub isProbablyName{
my ($key, $potential_name) = @_;
if ( (!isCommon($potential_name)) ||
((isType($key, "Name", 1) && isType($key, "(un)")) ||
(isType($key, "Name", 1) && ($potential_name =~ /\b(([A-Z])([a-z]+))\b/g)) ||
(isType($key, "popular",1)) )) {
return 1;
} else {
return 0;
}
}
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
#Function: isUSStateAbbre
#Data structure used for us states
#Returns true if the word is a US state abbreviation
sub isUSStateAbbre {
my $word = $_[0];
$word = uc($word);
foreach $loc (@us_states_abbre){
if ($word =~/\b$loc\b/gi){
return 1;
}
}
foreach $loc (@more_us_states_abbre){
if ($word =~/\b$loc\b/gi){
return 1;
}
}
return 0;
}
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
#Function: isNameIndicator
#Returns true if the word is a name indicator
sub isNameIndicator {
my $word = $_[0];
$word = uc($word);
foreach $nam (@name_indicators){
#print "nam in name indicators is $nam";
if ($word =~/\b$nam\b/gi){
return 1;
}
}
return 0;
}
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
#Function: isUSState
#Returns true if the word is a US State
sub isUSState {
my $word = $_[0];
$word = uc($word);
#return (($us_states{$word}));
foreach $loc (@us_states){
if ($word =~/\b$loc\b/gi){
return 1;
}
}
return 0;
}
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: isFirstName()
# Arguments: string $word
# Returns: 1 if given word is an ambiguous or unambiguous firstname
# Called by: findPHI()
sub isFirstName {
my $word = $_[0];
#$word = ($word);
$word = uc($word);
return (($male_ambig{$word}) || ($female_ambig{$word}) || ($male_unambig{$word}) || ($female_unambig{$word}) || ($male_popular{$word}) || ($female_popular{$word}));
}
# End of isFirstName()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: isCommonest()
# Arguments: string $word
# Returns: 1 if given word is a commonest word; 0 otherwise
# Called by: findPHI()
# Description: Compares the given word to the commonest_words association, compiled from dictionary file for commonest English words.
# Returns 1 if given word is in that list, i.e. is a commonest word.
sub isCommonest {
my $word = $_[0];
$word = uc($word);
return ($very_common_words{$word} || isUnambigCommon($word));
}
# End of isCommonest()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: getNote()
# Arguments: int $patient (patient number), int $note (note number)
# Returns: string $noteText
# Called by: deidStats() (obsolete)
# Description: Given patient and note numbers, looks up header in $header_file
# If it finds the header, extracts the body of the note, and returns text until the end pattern
sub getNote{
my ($patient, $note) = @_;
open DF, $data_file or die "Cannot open $data_file";
my $noteFound = 0;
my $noteText = "";
D:
# Parses the data file line-by-line to match the header found in the header file
while ($line = <DF>) {
chomp $line;
# If header is found in the text, then matches the end pattern, and sets the body of the note (excluding the header) as the note text
if ($line =~ /\b$patient\|\|\|\|$note\|\|\|\|/) {
$noteFound = 1;
$noteText = "";
}
else {
if ($noteFound) {
if ($line eq "||||END_OF_RECORD"){
#$noteText = $noteText."\n".$1;
#$noteText = $noteText.$1."\n";
$end = $2;
last D;
}
else {
$noteText = $noteText.$line."\n";
}
}
}
}
close DF;
# If the note text has zero length, prints an error message
if ((length $noteText) == 0) {
print("Warning. No text found for Patient $patient, Note $note ");
}
# Returns the body of the note (everything excluding the header) as the note text
return $noteText;
}
# End of getNote()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: isValidDate()
# Arguments: int $month, int $day, int $year (If the date being validated doesn't specify year, the "year" argument should be "-1")
# Returns: 1 if given date is valid based on the calendar
# Called by: findPHI(), isValidDay()
# Description: Verifies if the given date is valid or not.
sub isValidDate{
my ($month, $day, $year) = @_;
if (($year!=-1) && (length $year) == 2) {
#if ($year < 30) {
if ($year <= $TWO_DIGIT_YEAR_THRESHOLD) {
$year = "20".$year;
}
else {
$year = "19".$year;
}
}
#if (($year != -1) && ($year < 1900 || $year > 2030)){
#if (($year != -1) && ($year <= $VALID_YEAR_LOW || $year >= $VALID_YEAR_HIGH)){
if (($year != -1) && ($year < $VALID_YEAR_LOW || $year > $VALID_YEAR_HIGH)){
return 0;
}
# Invalid months and days
if (($month< 1) || ($month > 12) || ($day < 1) || ($day > 31)) {
return 0;
}
# Checks validity of February days
if ($month == 2) {
if (($year != -1) && (($year % 4) == 0) && ($year != 2000)) {
return ($day <= 29);
}
return ($day <= 28);
# Checks validity of months consisting of 30 days
}
elsif (($month == 4) || ($month == 6) || ($month== 9) || ($month == 11)) {
return ($day <= 30);
}
# Checks validity of months consisting of 31 days
return ($day <= 31);
}
# End of isValidDate()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: isValidDay()
# Arguments: int $day, string $month, int $year
# Returns: 1 if given date is valid; 0 otherwise
# Called by: findPHI()
# Description: Verifies validity of date when month is printed, by calling isValidDate()
sub isValidDay {
my ($day, $month, $year) = @_;
my $mnum = 0;
# Converts printed out months to numerical months
if ($month =~ /Jan|January|Januar/i) { $mnum = 1; }
elsif ($month =~ /Feb|February|Februar/i) { $mnum = 2; }
elsif ($month =~ /Mar|March|Maerz/i) { $mnum = 3; }
elsif ($month =~ /Apr|April/i) { $mnum = 4; }
elsif ($month =~ /May|Mai/i) { $mnum = 5; }
elsif ($month =~ /June|Jun|Juni/i) { $mnum = 6; }
elsif ($month =~ /July|Jul|Juli/i) { $mnum = 7; }
elsif ($month =~ /August|Aug/i) { $mnum = 8; }
elsif ($month =~ /September|Sept|Sep/i) { $mnum = 9; }
elsif ($month =~ /October|Oct/i) { $mnum = 10; }
elsif ($month =~ /November|Nov/i) { $mnum = 11; }
elsif ($month =~ /December|Dec/i) { $mnum = 12; }
if ($mnum == 0) { return 0; }
return (isValidDate($mnum, $day, $year));
}
# End of isValidDay()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: outputText()
# Arguments: hash %deids (key=PHI start index, value=PHI end index), hash %phiT (key=PHI start-end indices, value=PHI type)
# Returns: None
# Called by: deid()
# Description: Creates the de-identified version of the text
# Replaces dates with shifted dates, and other PHI with their PHI types
sub outputText {
my %deids = %{$_[0]};
my %phiT = %{$_[1]};
# These are the date and PID of the medical record. The year needs to be initialized.
# my $checkYear = "2005";
my $checkYear;
my $checkDate;
my $checkID;
# Forms associations between printed months and numerical ones
# This is because the date-shifting function only accepts numerical months, and months in the text can be non-numerical as well
my %months;
$months{"jan"} = 1;
$months{"feb"} = 2;
$months{"mar"} = 3;
$months{"apr"} = 4;
$months{"may"} = 5;
$months{"jun"} = 6;
$months{"jul"} = 7;
$months{"aug"} = 8;
$months{"sep"} = 9;
$months{"oct"} = 10;
$months{"nov"} = 11;
$months{"dec"} = 12;
#TF is the .res file (de-identified corpus), and OF is the (.phi file) list of PHIs
# open TF, ">$deid_text_file" or die "Cannot open $deid_text_file";
open TF, ">>$deid_text_file" or die "Cannot open $deid_text_file"; #now open in append mode
my $lastEnd = 0;
#open OF, ">>$output_file_old" or die "Cannot open $output_file_old";
my $phiType = "";
# Prints the PHI locations to the output file (.phi)
foreach $k (sort numerically keys %deids) {
my @deidsval = @{$deids{$k}};
local $added = 0;
my $phiID;
# Loops over each PHI recorded in %deids
# %deids maps each PHI start index to an array of 3 items
# These items are in order: the PHI end index, the PID of the PHI, the record date of the PHI
#if (exists ${@{$deids{$k}}}[0]) {
if (exists $deidsval[0]){
# Sets some key variables for the PHI
#the following no longer works with perl v5.10
#$deidsend = ${@{$deids{$k}}}[0]; # End index
#$checkID = ${@{$deids{$k}}}[1]; # PID
#$checkDate = ${@{$deids{$k}}}[2]; # Record date
$deidsend = $deidsval[0]; # End index
$checkID = $deidsval[1]; # PID
$checkDate = $deidsval[2]; # Record date
$checkYear = (substr $checkDate, 6, 4); # Record year
if (length($checkYear) ==0) {
$checkYear = extractYear($DEFAULT_DATE);
print "Warning, in outputText(), cannot extract year from noteDate, setting year to default year $checkYear.";
}
# Immediately prints the start and end indices of the PHI to the .phi file
#print OF "$k\t$deidsend\n";
# Sets the $key to the current PHI
my $key = $k."-".$deidsend;
my $lastlast = $lastEnd;
my $phiText;
# If this PHI is a date element, shifts the date and replaces it in the text
# Output format is YYYY/MM/DD always
# For month/date formats, assumes that year = record year; for month/year formats, assumes that day = 1
# Needs to go over %phiT for every PHI in %deids
# %phiT maps each PHI key to its type, e.g. "Mary" -> "First Name"
# This part is necessary because re-identification depends on the PHI's type
foreach $ky (sort numerically keys %phiT) {
my $ky = $ky;
($startp, $endp) = split "-", $ky;
$notAmbig = 0;
# Checks to see if PHI type matches any of the date patterns
# Each PHI may have more than one listed type, e.g. First Name AND Last Name
# For each PHI type listed for the specific PHI
foreach $t (@{$phiT{$ky}}) {
$datephi1 = "Year/Month/Day"; # e.g. 1999/2/23
$datephi2 = "Year/Day/Month"; # e.g. 1999/23/2, note: pattern currently not filtered in sub date()!
$datephi3 = "Month/Day/Year"; # e.g. 2/23/1999
$datephi5 = "Day/Month/Year"; # 23/2/1999, note: pattern currently not filtered in sub date()!
$datephi4 = "Month/Day"; # e.g. 2/23, using record year as year
$datephi6 = "Day/Month"; # e.g. 23/2, using record current year as year, note: pattern currently not filtered in sub date()!
$datephi7 = "Month/Year"; # e.g. 2/1999, using 1 as day
$datephi8 = "Year/Month"; # e.g. 1999/2, using 1 as day
$datephi9 = "Day Month Year"; # e.g. 23 february 1999
$datephi10 = "Month Day Year"; # e.g. feb 23 1999 or feb. 23rd 1999
$datephi11 = "Month Day"; # e.g. feb 23, using record year as year
$datephi12 = "Day Month"; # e.g. 23 february, using record year as year
$datephi13 = "Month Year"; # e.g. feb 1999, or february of 1999 or feb. 1999, using 1 as day
$datephi14 = "Header Date"; # not important
$datephi15 = "4 digits"; # 4-digit year, e.g. 1999
$datephi16 = "2 digits"; # 2-digit year, e.g. '99
$datephi17 = "Day Month Year 2"; # e.g. 23rd february 1999
#if ($ky =~/$key/) {$phiType = $t;}
if ($ky eq $key) {$phiType = $t;}
# Calls the date-shifting function alterdate() with a date argument appropriate for the date pattern
# This is because alterdate() accepts an argument of a fixed date pattern
# Prints the resulting shifted date in deid_text_file (.res)
# If the current PHI has not been output to the .res file yet, checks if PHI type is date
# Shifts the date and writes the shifted date to TF (.res file)
if ($added == 0) {
#if (($t =~ /$datephi1/) && ($ky=~/$key/)){
if (($t =~ /$datephi1/) && ($ky eq $key)){
$date = (substr $allText, $k, ($deidsend-$k));
$date =~ s/\-/\//g;
$date =~ s/\./\//g;
$altereddate = &alterdate($date, $checkID);
$date =~ /(\d+)(.)(\d+)(.)(\d+)/;
$longyear = $1;
print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
$added = 1;
$lastEnd = $deidsend;
}
elsif (($t =~ /$datephi17/) && ($ky eq $key)){
$date = (substr $allText, $k, ($deidsend-$k));
$date =~ /\b(((\d{1,2})(|st|nd|rd|th|)?\s+(of\s)?[\-]?\b([A-Za-z]+)\.?,?)\s+(\d{2,4}))\b/ig; # 12-Apr, or Second of April
$mon = $6;
$day = $3;
$year = $7;
foreach $m (sort keys %months) {
if ($mon =~ /$m/ig) {
$month = $months{$m};
}
}
$date = "$year/$month/$day";
$altereddate = &alterdate($date, $checkID );
print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
$added = 1;
$lastEnd = $deidsend;
}
#elsif (($t =~ /$datephi9/) && ($ky=~/$key/)){
elsif (($t =~ /$datephi9/) && ($ky eq $key)){
$date = (substr $allText, $k, ($deidsend-$k));
$date =~ /(\d+) ([A-Za-z]+)\,? (\d+)/;
$mon = $2;
$day = $1;
$year = $3;
foreach $m (sort keys %months) {
if ($mon =~ /$m/ig) {
$month = $months{$m};
}
}
$date = "$year/$month/$day";
$longyear = $5;
$altereddate = &alterdate($date, $checkID );
print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
#local $added = 1;
$added = 1;
$lastEnd = $deidsend;
}
#elsif (($t =~ /$datephi10/) && ($ky=~/$key/)){
elsif (($t =~ /$datephi10/) && ($ky eq $key)){
$date = (substr $allText, $k, ($deidsend-$k));
$date =~ /\b(([A-Za-z]+)\b\.?,? ?(\d{1,2})(|st|nd|rd|th|)? ?[\,\s]+ *\'?(\d{2,4}))\b/ig;
$mon = $2;
$day = $3;
$year = $5;
#$date =~ /([A-Za-z]+) (\d+)\,? (\d+)/;
#$mon = $1;
#$day = $2;
#$year = $3;
#print "DatePHI10: Before date shift: month= $mon, day = $day , year = $year \n";
foreach $m (sort keys %months) {
if ($mon =~ /$m/ig) {
$month = $months{$m};
}
}
$date = "$year/$month/$day";
$altereddate = &alterdate($date, $checkID);
print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
#local $added = 1;
$added = 1;
$lastEnd = $deidsend;
}
#elsif (($t =~ /$datephi2/) && ($ky=~/$key/)){
elsif (($t =~ /$datephi2/) && ($ky eq $key)){
$date = (substr $allText, $k, ($deidsend-$k));
$date =~ /(\d+)(.)(\d+)(.)(\d+)/;
$date = "$1/$5/$3";
$longyear = $1;
$altereddate = &alterdate($date, $checkID);
print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
#local $added = 1;
$added = 1;
$lastEnd = $deidsend;
}
#elsif (($t =~ /$datephi3/) && ($ky=~/$key/)){
elsif (($t =~ /$datephi3/) && ($ky eq $key)){
$date = (substr $allText, $k, ($deidsend-$k));
$date =~ /(\d+)(.)(\d+)(.)(\d+)/;
$date = "$5/$1/$3";
$altereddate = &alterdate($date, $checkID);
$longyear = $5;
print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
$added = 1;
$lastEnd = $deidsend;
}
#elsif (($t =~ /$datephi5/) && ($ky=~/$key/)){
elsif (($t =~ /$datephi5/) && ($ky eq $key)){
$date = (substr $allText, $k, ($deidsend-$k));
$date =~ /(\d+)(.)(\d+)(.)(\d+)/;
$date = "$5/$3/$1";
$altereddate = &alterdate($date, $checkID );
print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
$longyear = $5;
$added = 1;
$lastEnd = $deidsend;
}
#elsif (($t =~ /$datephi4/) && ($ky=~/$key/)){
elsif (($t =~ /$datephi4/) && ($ky eq $key)){
$date = (substr $allText, $k, ($deidsend-$k));
$date =~ s/\-/\//g;
$date = "$checkYear/$date";
$altereddate = &alterdate($date, $checkID);
$altereddate = (substr $altereddate, 5, (length($altereddate)-5));
print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
$added = 1;
$lastEnd = $deidsend;
}
#elsif (($t =~ /$datephi11/) && ($ky=~/$key/)){
elsif (($t =~ /$datephi11/) && ($ky eq $key)){
$date = (substr $allText, $k, ($deidsend-$k));
#$date =~ /([A-Za-z]+) (\d+)/;
#$mon = $1;
#$day = $2;
$date =~ /\b(([A-Za-z]+)\b\.?,?\s*(\d{1,2})(|st|nd|rd|th|)?)\b/ig; # Apr. 12
$mon = $2;
$day = $3;
foreach $m (sort keys %months) {
if ($mon =~ /$m/ig) {
$month = $months{$m};
}
}
$date = "$checkYear/$month/$day";
$altereddate = &alterdate($date, $checkID);
$altereddate = substr($altereddate, 5, (length($altereddate)-5));
print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
$added = 1;
$lastEnd = $deidsend;
}
#elsif (($t =~ /$datephi6/) && ($ky=~/$key/)){
elsif (($t =~ /$datephi6/) && ($ky eq $key)){
$date = (substr $allText, $k, ($deidsend-$k));
$date =~ /(\d+)(.)(\d+)/;
#print "DatePHI6: date is $date, 3 is $3, 1 is $1\n";
$date = "$3/$1";
$date = "$checkYear/$date";
$altereddate = &alterdate($date, $checkID);
$altereddate = substr($altereddate, 5, (length($altereddate)-5));
print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
$added = 1;
$lastEnd = $deidsend;
}
#elsif (($t =~ /$datephi12/) && ($ky=~/$key/)){
elsif (($t =~ /$datephi12/) && ($ky eq $key)){
$date = (substr $allText, $k, ($deidsend-$k));
#$date =~ /(\d+) ([A-Za-z]+)/;
#$mon = $2;
#$day = $1;
$date =~ /\b((\d{1,2})(|st|nd|rd|th|)?( of)?[ \-]\b([A-Za-z]+))\b/ig;
$mon = $5;
$day = $2;
#print "month is $month, day is $day\n";
foreach $m (sort keys %months) {
if ($mon =~ /$m/ig) {
$month = $months{$m};
}
}
$date = "$checkYear/$month/$day";
$altereddate = &alterdate($date, $checkID);
$altereddate = substr($altereddate, 5, (length($altereddate)-5));
print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
$added = 1;
$lastEnd = $deidsend;
}
#elsif (($t =~ /$datephi13/) && ($ky=~/$key/)){
elsif (($t =~ /$datephi13/) && ($ky eq $key)){
$date = (substr $allText, $k, ($deidsend-$k));
$date =~ /([A-Za-z]+)(\.)?(\s+)(of\s+)?(\d+)/ig;
$mon = $1;
$year = $5;
foreach $m (sort keys %months) {
if ($mon =~ /$m/ig) {
$month = $months{$m};
}
}
$date = "$year/$month/1";
$longyear = $3;
$altereddate = &alterdate($date, $checkID );
print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
$added = 1;
$lastEnd = $deidsend;
}
#elsif (($t =~ /$datephi7/) && ($ky=~/$key/)){
elsif (($t =~ /$datephi7/) && ($ky eq $key)){
$date = (substr $allText, $k, ($deidsend-$k));
$date =~ /(\d+)(.)(\d+)/;
$date = $3.'/'.$1.'/1';
$altereddate = &alterdate($date, $checkID);
print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
$longyear = $3;
$added = 1;
$lastEnd = $deidsend;
}
#elsif (($t =~ /$datephi8/) && ($ky=~/$key/)){
elsif (($t =~ /$datephi8/) && ($ky eq $key)){
$date = (substr $allText, $k, ($deidsend-$k));
$date =~ /(\d+)(.)(\d+)/;
$date = $1.'/'.$3.'/1';
$altereddate = &alterdate($date, $checkID);
$longyear = $1;
print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$altereddate**]";
$added = 1;
$lastEnd = $deidsend;
}
#elsif (($t =~ /$datephi14/) && ($ky=~/$key/)) {
elsif (($t =~ /$datephi14/) && ($ky eq $key)) {
$date = (substr $allText, $k, ($deidsend-$k));
$date =~ /(\d+)(\-)(\d+)(\-)(\d+)/;
$date = $1.'/'.$3.'/'.$5;
$altereddate = &alterdate($date, $checkID);
$longyear = $1;
print TF (substr $allText, $lastEnd, ($k - $lastEnd))."$date";
$added = 1;
$lastEnd = $deidsend;
}
#elsif (($t =~ /$datephi15/) && ($ky=~/$key/)) {
elsif (($t =~ /$datephi15/) && ($ky eq $key)) {
$year = (substr $allText, $k, ($deidsend-$k));
$date = $year.'/1/1';
$altereddate = &alterdate($date, $checkID);
$alteredyear = (substr $altereddate, 0, 4);
print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$alteredyear**]";
$added = 1;
$lastEnd = $deidsend;
}
#elsif (($t =~ /$datephi16/) && ($ky=~/$key/)) {
elsif (($t =~ /$datephi16/) && ($ky eq $key)) {
$year = (substr $allText, $k, ($deidsend-$k));
#if ($year > 20) {
# $year = '19'.$year;
#} else {
# #$year = '20.'.$year;
# $year = '20'.$year;
# }
$date = $year.'/1/1';
$altereddate = &alterdate($date, $checkID);
$alteredyear = (substr $altereddate, 2, 2);
print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$alteredyear**]";
$added = 1;
$lastEnd = $deidsend;
}
# If the current PHI is not a date, then indicates that it has not been output yet
else { my $added = 0;}
}
}
}
# If the PHI is not a date, replaces it in deid_text_file (.res) by its PHI type tag
if ($added==0) {
if ($k > $lastEnd || ($k==0)) {
$phiText = (substr $allText, $k, ($deidsend-$k));
# Parentheses are eliminated so that they do not trip up the run
$phiText =~ s/\(//g;
$phiText =~ s/\)//g;
$phiText =~ s/\+//g;
$found = 0;
# Assigns a unique ID to each PHI, e.g. all instances of "Mary" may be assigned "1", but "John" may be assigned "2"
# %ID maps each PHI (e.g. "Mary") to its ID (e.g. "1")
foreach $phik (keys %ID) {
if ($phik =~/$phiText/ig) {
$found = 1;
}
}
# If the current PHI to be added to .res file is already recorded in %ID, then retrieves its unique ID
if ($found==1) {
$phiID = $ID{$phiText};
}
# If the current PHI is not recorded in %ID, records it in %ID and assigns the PHI a unique ID
else {
$ID{$phiText} = keys(%ID) + 1;
$phiID = $ID{$phiText};
}
# Prints the PHI type and PHI ID in place of the original PHI in the .res file
print TF (substr $allText, $lastEnd, ($k - $lastEnd))."[**$phiType $phiID**]";
$lastEnd = $deidsend;
}
else {
if ($lastEnd < $deidsend) {
$lastEnd = $deidsend;
}
}
if ($lastEnd == 0) {
$lastEnd = $lastlast;
}
}
}
}
#close OF;
#print "Finished outputing to the .phi file.";
# Prints the remaining non-PHI text to the .res file
print TF (substr $allText, $lastEnd);
#print "finished outputing to the .res file";
close TF;
}
# End of outputText()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: alterdate()
# Arguments: string $date (yyyy/mm/dd), int $pid (patient ID)
# Returns: string $year-$month-$day
# Called by: outputText()
# Description: Converts given date to shifted date, depending on date-shift-mode, by calling doalterdate().
# Returns shifted date
sub alterdate {
# Separates date fields by splitting along "/" or "-"
# Then calls doalterdate on the resulting array of date elements
$d = $_[0];
if (substr($d,0,6) =~ /(\/)/) {
@d = split '/', $d;
}
elsif (substr($d,0,6) =~ /(\-)/) {
@d = split '-', $d;
}
($entryyear, $entrymonth)=@d[0..1];
return join "-", &doalterdate(@d, $_[1]);
}
# End of alterdate()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: doalterdate()
# Arguments: int $year, int $month, int $day, int $pid (patient ID)
# Returns: array @($year, $month, $day)
# Called by: alterdate()
# Description: Converts given date, depending on date-shift-mode, to shifted date.
# Returns shifted date in an array format to calling function alterdate(). alterdate() converts the shifted date to a string.
sub doalterdate {
my $year=$_[0];
if (length $year == 2) {
# Limits 2-digit years between 1900 and 2020
# Converts them to 4-digit years
#if ($year<=10) {$year = 2000+$year;}
#if ($year<=20) {$year = 2000+$year;}
if ($year<=$TWO_DIGIT_YEAR_THRESHOLD) {$year = 2000+$year;}
else {$year = 1900+$year;}
}
my $month=$_[1];
my $day=$_[2];
my $pid=$_[3];
if ($pid_dateshift_list =~ /y/) {
open SF, $date_shift_file or die "Cannot open $date_shift_file";
while ($line = <SF>) {
chomp $line;
if ($line =~ /\A($pid)\|\|\|\|([0-9\-]+)/) {
$offset = $2;
}
}
close SF;
}
my $ml=&monthlength($month, $year);
# $offset = days of offset (positive or negative shift)
# Sets the shifted year
$offset_local = $offset;
if ($offset_local>0) {
$year += 4*int($offset_local/1461);
$offset_local -=1461*int($offset_local/1461);
}
if ($offset_local<0){
$year -= 4*int(-$offset_local/1461);
$offset_local +=1461*int(-$offset_local/1461);
}
# Shifts number of days
$day +=$offset_local;
$ml=&monthlength($month, $year);
# Changes $day, $month, $year based on the remaining offset after shifting $year
while ($day>$ml) {
$ml=&monthlength($month, $year);
$day=$day-$ml;
$month++;
if ($month>12) {
$month -=12;
$year++;
}
$ml = &monthlength($month, $year);
}
while ($day<1) {
$ml=&monthlength($month-1, $year);
$day=$day + $ml;
$month--;
if ($month<1) {
$month +=12;
$year--;
}
}
# Formats the output of single-digit month and day: "2" becomes "02"
if (length($month)<2) {
$month="0".$month;
}
if (length($day)<2) {
$day="0".$day;
}
# Returns the shifted date as an array
return ($year, $month, $day);
}
# End of doalterdate()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: altermonthdate()
# Arguments: int $month, int $day, int $pid (patient ID)
# Returns: call doalterdate(), returns array ($year, $month, $day)
# Called by: None
# Description: Calls doalterdate() which performs dateshift
# This function is not used
sub altermonthdate {
my ($month, $day, $pid)=@_;
my $year;
if (($month-$entrymonth)%12<6) {
if ($month<$entrymonth) {
$year=$entryyear+1;
}
else {
$year=$entryyear;
}
}
else {
if ($month>$entrymonth) {
$year=$entryyear-1;
}
else {
$year=$entryear;
}
}
return (&doalterdate($year, $month, $day, $pid))[1,2];
}
# End of altermonthdate()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: getcentury()
# Arguments: int 2-digit $year
# Returns: int
# Called by: None
# Description: Converts a 2-digit year into a 4-digit year based on entryyear
# This function is not used
sub getcentury {
my $year=$_[0];
if (($year-$entryyear)%100<10) {
$year=$entryyear+(($year-$entryyear)%100);
}
else {
$year=$entryyear-(($entryyear-$year)%100);
}
}
# End of getcentury()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: monthlength()
# Arguments: int $m, int $y
# Returns: int
# Called by: doalterdate()
# Description: Returns the number of days in given month-year
sub monthlength {
my($m, $y)=@_;
while ($m<=0) {
$m += 12;
$y --;
}
while ($m>=13) {
$m -= 12;
$y ++;
}
# Checks for February
if ($m==2) {
if ($y % 4 ==0) {
if($y % 100 ==0) {
if ($y % 400 ==0){
return 29;
}
else {
return 28;
}
}
else {
return 29;
}
}
else {
return 28;
}
}
# Checks for months consisting of 30 days
elsif (($m==4) || ($m==6) || ($m==9) || ($m==11)) {
return 30;
}
# Checks for months consisting of 31 days
else {
return 31;
}
}
# End of monthlength()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Application-specific code follows. Code may contain patterns specific to our medical notes.
# Customize by replacing with your application-specific filters.
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
#Function: isProbablyPhone
#Argument: context string
#Returns: Always returns 1 (true) unless the context is one of the words defined in
#phone_pre_disqualifier. For future extensions, can add qualifier words
#such as "phone", "pager", etc.
sub isProbablyPhone {
@phone_pre_disqualifier = ("HR","Heart", "BP", "SVR", "STV", "VT", "Tidal Volumes", "Tidal Volume", "TV", "CKS");
$context = $_[0];
foreach $i (@phone_pre_disqualifier) {
if ($context =~ /\b$i\b/i){
return 0;
}
}
return 1;
}
#end of isProbablyPhone()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# Function: wardname()
# Argument: string
# Searches for ward names specific to our hospital
sub wardname() {
$text = $_[0];
#Added to catch gs specific wardnames
if ($gs_specific_filter =~ /y/){
@ward_indicators = ("Quartermain");
foreach $ward_ind (@ward_indicators){
while ($text =~ /\b(($ward_ind) ?(\d))\b/ig){
$wardname = $1;
$start = length($`);
$end = $start + length($wardname);
$key = $start."-".$end;
addType ($key, "Wardname");
}
}
}
}
# End of function wardname()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
#Function: isProbablyMeasurement
#Argument: context string
#Returns: Returns true if the text passed in contains (more specifically, ends with) any of the measurement indicators.
sub isProbablyMeasurement {
@measurement_indicators_pre = ("increased to","decreased from","rose to","fell from", "down to",
"increased from", "dropped to", "dec to", "changed to","remains on", "change to");
$context = $_[0];
foreach $i (@measurement_indicators_pre) {
if ($context =~ /\b$i\b/i){ #only match if it ends with the phrase
return 1;
}
}
return 0;
}
#end isProbablyMeasurement()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
#Function: isProbablyDate
#Argument: textBefore string and textAfter string
#Returns: returns 1 if the context check determines that textBefore or textAfter
#are most likely strings preceding or following a date PHI; returns 0 otherwise.
#Description: Subroutine performs a context check on the textBefore and textAfter. If it contains one of the
#keywords (see below) that indicates it's probably not a date, then return false; otherwise return true.
#Called by dateWithContextCheck on partial dates MM-DD
sub isProbablyDate{
my ($textBefore, $textAfter) = @_;
if ((!isProbablyMeasurement($textBefore)) && ($textBefore !~ /\b(drop|up|cc|dose|doses|range|ranged|pad|rate|bipap|pap|unload|ventilation|scale|cultures|blood|at|up|with|in|of|RR|ICP|CVP|strength|PSV|SVP|PCWP|PCW|BILAT|SRR|VENT|PEEP\/PS|flowby|drinks|stage) ?\Z/i) && ($textAfter !~ /\A ?(packs|litres|puffs|mls|liters|L|pts|patients|range|psv|scale|beers|per|esophagus|tabs|tablets|systolic|sem|strength|hours|pts|times|drop|up|cc|mg|\/hr|\/hour|mcg|ug|mm|PEEP|hr|hrs|hour|hours|bottles|bpm|ICP|CPAP|years|days|weeks|min|mins|minutes|seconds|months|mons|cm|mm|m|sessions|visits|episodes|drops|breaths|wbcs|beat|beats|ns|units|amp|qd|chest pain|intensity)\b/i)) {
return 1;
}
return 0;
}
#end isProbablyDate()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
#Function: isProbablyDate2
#Argument: textBefore string and textAfter string
#Returns: returns 1 if the context check determines that textBefore or textAfter
#are most likely strings preceding or following a date PHI; returns 0 otherwise.
#Subroutine performs a context check on the textBefore and textAfter. If it contains one of the
#keywords (see below) that indicates it's probably not a date, then return false; otherwise return true.
#Called by dateWithContextCheck on partial dates MM/DD.
sub isProbablyDate2{
if ( (!isProbablyMeasurement($textBefore)) && ($textBefore !~ /\b(drop|up|cc|range|ranged|pad|rate|rating|bipap|pap|unload|ventilation|scale|blood|at|up|with|RR|ICP|CVP|strength|PSV|SVP|PCWP|BILAT|SRR|VENT|PEEP\/PS|flowby) ?\Z/i) && ($textAfter !~ /\A ?(packs|litres|puffs|mls|liters|L|pts|patients|range|psv|scale|drinks|beers|per|esophagus|tabs|tab|tablet|tablets|systolic|sem|strength|hours|pts|times|drop|up|cc|mg|\/hr|\/hour|mcg|ug|mm|hr|hrs|hour|hours|bottles|bpm|ICP|CPAP|years|days|weeks|min|mins|minutes|seconds|months|mons|cm|mm|m|sessions|visits|drops|breaths|wbcs|beat|beats|ns|units|amp)\b/i)) {
return 1;
}
return 0;
}
#end isProbablyDate2()
#***********************************************************************************************************
#***********************************************************************************************************
#Function: extractYear
#Argument: a date string in the format MM/DD/YYYY
#Returns: the 4-digit year if the date is in the correct format
#returns 0000 otherwise.
sub extractYear{
$date = $_[0];
if ($date =~ /\b(\d\d)\/(\d\d)\/(\d\d\d\d)\b/){
$year = $3;
} else{
$year = 0000;
}
return $year;
}
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
#Function: dateWithContextCheck
#Argument: text string
#Returns: none
#Description: Find date patterns. Performs context check on text before and after the date patterns
#to determine if its an actual date.
sub dateWithContextCheck{
#$text = $_[0];
my ($text, $date) = @_;
my $year = extractYear($date);
#my $year = substr $date, 0, 4;
#**********************************************************************************************
# Searches for numerical date formats
# Checks if dates should be filtered
if ($datefilter =~ /y/) {
# Searches for mm/dd or mm/yy
while ($text =~ /\b([A-Za-z0-9%\/]+ +)?(\d\d?)([\/\-])(\d\d?)\/?\/?( +[A-Za-z]+)?\b/g) {
$pre = $1;
$post = $5;
my $first_num = $2;
my $divider = $3;
my $second_num = $4;
my $postdate = $5;
my $startI = length($`) + length($pre);
my $endI = $startI + length($first_num)+length($divider)+length($second_num);
my $key = $startI."-".$endI;
my $beginr = substr $text, ($startI - 2), 2;
my $endr = substr $text, $endI, 2;
#Excludes nn/nn formats when preceded by any in array @prev
#@prev = ("cvp", "noc", "peep/ps","%");
@prev = ("cvp", "noc", "%", "RR", "PCW");
my $dateorno = 1;
foreach $j (@prev) {
if ((!($pre =~ /\b$j/ig)) && (!($post =~ /\bpersantine\b/ig))) {
}
else {
$dateorno = 0;
}
}
my $context_len = 12; #number of characters we extract before the date
if ($dateorno == 1) {
if (($beginr !~ /\d[\/\.\-]/) && ($endr !~ /\A[\%]/) && ($endr !~ /\S\d/)) {
# Checks if date identified is valid as mm/dd; then adds the date as PHI
if (isValidDate ($first_num, $second_num, -1)) {
if ($second_num == 5) {
my $start_pos = $startI - $context_len;
if ($start_pos < 0) {
$start_pos = 0;
}
my $len = $context_len;
if (length($text) < ($endI + $context_len)){
$len = length($text) - $endI;
}
my $textBefore = substr $text, $start_pos, ($startI - $start_pos);
my $textAfter = substr $text, $endI, $len;
if ( (!isProbablyMeasurement($textBefore)) && ($textBefore !~ /\bPSV? \Z/i) && ($textBefore !~ /\b(CPAP|PS|range|bipap|pap|pad|rate|unload|ventilation|scale|strength|drop|up|cc|rr|cvp|at|up|in|with|ICP|PSV|of) \Z/i) && ($textAfter !~ /\A ?(packs|psv|puffs|pts|patients|range|scale|mls|liters|litres|drinks|beers|per|esophagus|tabs|pts|tablets|systolic|sem|strength|times|bottles|drop|drops|up|cc|mg|\/hr|\/hour|mcg|ug|mm|PEEP|L|hr|hrs|hour|hours|dose|doses|cultures|blood|bpm|ICP|CPAP|years|days|weeks|min|mins|minutes|seconds|months|mons|cm|mm|m|sessions|visits|episodes|drops|breaths|wbcs|beat|beats|ns)\b/i)) {
addType ($key, "Month/Day (1)");
}
} elsif ($second_num == 2) {
my $start_pos = $startI - $context_len;
if ($start_pos < 0) {
$start_pos = 0;
}
my $len = $context_len;
if (length ($text) < ($endI + $context_len)){
$len = length($text) - $endI;
}
my $textBefore = substr $text, $start_pos, ($startI - $start_pos);
my $textAfter = substr $text, $endI, $len;
if ( (!isProbablyMeasurement($textBefore)) && ($textAfter !~ /\A ?hour\b/i) && ($textBefore !~ /\b(with|drop|bipap|pap|range|pad|rate|unload|ventilation|scale|strength|up|cc|rr|cvp|at|up|with|in|ICP|PSV|of) \Z/i) && ($textAfter !~ /\A ?hr\b/i) && ($textAfter !~ /\A ?(packs|L|psv|puffs|pts|patients|range|scale|dose|doses|cultures|blood|mls|liters|litres|pts|drinks|beers|per|esophagus|tabs|tablets|systolic|sem|strength|bottles|times|drop|cc|up|mg|\/hr|\/hour|mcg|ug|mm|PEEP|hr|hrs|hour|hours|bpm|ICP|CPAP|years|days|weeks|min|mins|minutes|seconds|months|mons|cm|mm|m|sessions|visits|episodes|drops|breaths|wbcs|beat|beats|ns)\b/i)) {
addType ($key, "Month/Day (2)");
}
#} elsif (($divider eq "-") && ($startI > 4)) {
} elsif (($divider eq "-")) {
my $start_pos = $startI - $context_len;
if ($start_pos < 0) {
$start_pos = 0;
}
my $len = $context_len;
if (length ($text) < ($endI + $context_len)){
$len = length($text) - $endI;
}
my $textBefore = substr $text, $start_pos, ($startI - $start_pos);
my $textAfter = substr $text, $endI, $len;
if (isProbablyDate($textBefore, $textAfter)){
addType ($key, "Month/Day (3)");
}
}
else {
my $start_pos = $startI - $context_len;
if ($start_pos < 0) {
$start_pos = 0;
}
my $len = $context_len;
if (length ($text) < ($endI + $context_len)){
$len = length($text) - $endI;
}
my $textAfter = substr $text, $endI, $len;
my $textBefore = substr $text, $start_pos, ($startI - $start_pos);
if (isProbablyDate($textBefore, $textAfter)){
addType ($key, "Month/Day (4)");
}
}
}
# Checks if date identified is valid as mm/yy; then adds the date as PHI
# Checks for years of length 2, restricted to 1950-2030
if (($first_num <= 12) && ($first_num > 0) && ((length $second_num) == 2)
&& (($second_num>=50) || ($second_num<=30))) {
#my $textAfter = substr $text, $endI, 9;
my $start_pos = $startI - $context_len;
if ($start_pos < 0) {
$start_pos = 0;
}
my $len = $context_len;
if (length ($text) < ($endI + $context_len)){
$len = length($text) - $endI;
}
my $textAfter = substr $text, $endI, $len;
my $textBefore = substr $text, $start_pos, ($startI - $start_pos);
# print "checking mm/yy text before is $textBefore, text after is $textAfter\n";
if (isProbablyDate($textBefore, $textAfter)){
addType ($key, "Month/Year (2)");
}
} #end if the first num and second num are month/year
}
} #end if dateno ==1
} #end while the pattern match
} # end if datefilter is on
}
#end dateWithContextCheck()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
#Function: yearWithContextCheck
#Argument: text string
#Returns: none
#Description: Find date patterns. Performs context check on text before and after the year patterns
#to determine if it is a year.
sub yearWithContextCheck {
#$text = $_[0];
my ($text, $date) = @_;
#my $year = substr $date, 0, 4;
my $year = extractYear($date);
# Checks for 2-digit year written as '01, &c, only when preceded by the following medical terms
while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|avr|x2|x3|CHOLECYSTECTOMY|cva|ca|PTC|PTCA|stent|since|surgery|year) + *(\')?)(\d\d)\b/ig) {
my $num = $1;
#print "2-digit date, 1 is $1, 2 is $2, 3 is $3, 4 is $4, 5 is $5\n";
#my $key = (1 + (length($`))+length($1))."-".(pos $text);
my $startd = (length($`))+length($1);
my $endd = $startd + length($4);
my $key = $startd."-".$endd;
#check if the match is part of number followed by decimal point and a number
my $textAfter = substr $text, $endd, 2;
if ($textAfter !~ /\.\d/){
addType($key, "Year (2 digits)");
}
}
while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|avr|x2|x3|CHOLECYSTECTOMY|cva|ca|PTCA|since|surgery|year)(\'+))(\d\d)\b/ig) {
my $num = $1;
my $startd = (length($`))+length($1);
my $endd = $startd + length($4);
my $key = $startd."-".$endd;
#check if the match is part of number followed by decimal point and a number
#or if it is a time (HH:MM)
my $textAfter = substr $text, $endd, 2;
if ($textAfter !~ /(\.|\:)\d/){
addType($key, "Year (2 digits)");
}
}
# Checks for 4-digit year written as 2001, &c, only when preceded by the following medical terms
while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|in|PTCA|since|from|year) + *)(\d{4})((\,? )\d{4})?\b/ig) {
my $num1 = $1;
$s1 = length($`) + length($1);
$e1 = $s1+length($3);
$s2 = $e1+length($5);
$e2 = $e1+length($4);
$k1 = "$s1-$e1";
$k2 = "$s2-$e2";
#for 4-digit year, check if the matched number is in the range of [$VALID_YEAR_LOW,$VALID_YEAR_HIGH]
#if ($3 <= 2030 && $3 >= 1950){
if ($3 <= $VALID_YEAR_HIGH && $3 >= $VALID_YEAR_LOW){
addType($k1, "Year (4 digits)");
addType($k2, "Year (4 digits)");
}
}
# Looks for year only (esp Patient Medical History): looks for year numbers within the 30 years before
# and 2 years after the date passed in as an argument.
# for $n (($year - 30) .. $year) {
for $n (($year - 30) .. ($year+2)) {
my $short = substr $n, 2, 2;
if ($n =~ /\d\d\d\d/) {
while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|in|PTCA|since|from|year) + *)$n\b/ig) {
my $key = (length($`)+length($1))."-".(pos $text);
addType ($key, "Year (4 digits)");
}}
if ($short =~ /\d\d/) {
#while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|PTCA|since|from|year) + *(\'?))$short\b/ig) {
while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|PTCA|since|year) + *(\'?))$short\b/ig) {
my $key = (length($`)+length($1))."-".(pos $text);
addType ($key, "Year (2 digits)");
}}
if ($short =~ /\d\d/) {
# while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|PTCA|since|from|year)(\'+))$short\b/ig) {
while ($text =~ /\b((embolus|mi|mvr|REDO|pacer|ablation|cabg|x2|x3|CHOLECYSTECTOMY|cva|ca|PTCA|since|year)(\'+))$short\b/ig) {
my $key = (length($`)+length($1))."-".(pos $text);
addType ($key, "Year (2 digits)");
}}
}
}
#end yearWithContextCheck()
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************
# End of De-Identification Algorithm
#***********************************************************************************************************
#***********************************************************************************************************
#***********************************************************************************************************