Fellowship Match Rates over Time

September 21, 2017
healthcare howto R research tutorial

library(readxl)
library(tidyverse)
library(plotly)
library(ggTimeSeries)

NRMP_data <- read_excel("/Users/Sami/Documents/SamiSite/static/research/NRMP Data.xlsx", sheet = "Sheet1")

colnames(NRMP_data)
##  [1] "Year"                 "Specialty"            "US Applicants"       
##  [4] "All Applicants"       "Positions Offered"    "No. of Pgms"         
##  [7] "US Matches"           "All Matches"          "% US Matches"        
## [10] "% All Matches"        "US Ranked Positions"  "All Ranked Positions"
## [13] "Unfilled Spots"       "Salary"               "Overall"             
## [16] "Satsfied Income"      "Satsfied Medicine"    "Satisfied Specialty"
glimpse(NRMP_data)
## Observations: 115
## Variables: 18
## $ Year                   <dbl> 2016, 2016, 2016, 2016, 2016, 2016, 201...
## $ Specialty              <chr> "Allergy and Immunology", "Cardiovascul...
## $ `US Applicants`        <dbl> 108, 479, 116, 372, 92, 78, 325, 147, 1...
## $ `All Applicants`       <dbl> 151, 1108, 325, 718, 213, 108, 693, 259...
## $ `Positions Offered`    <dbl> 137, 844, 270, 466, 385, 14, 521, 280, ...
## $ `No. of Pgms`          <dbl> 86, 193, 136, 179, 137, 3, 131, 122, 14...
## $ `US Matches`           <dbl> 92, 446, 107, 309, 88, 8, 284, 126, 126...
## $ `All Matches`          <dbl> 125, 836, 247, 462, 192, 14, 513, 226, ...
## $ `% US Matches`         <dbl> 67.2, 52.8, 39.6, 66.3, 22.9, 57.1, 54....
## $ `% All Matches`        <dbl> 91.2, 99.1, 91.5, 99.1, 49.9, 100.0, 98...
## $ `US Ranked Positions`  <dbl> 882, 4176, 787, 3320, 266, 85, 2196, 61...
## $ `All Ranked Positions` <dbl> 1210, 7538, 2089, 4913, 908, 118, 4403,...
## $ `Unfilled Spots`       <dbl> 11, 7, 16, 4, 100, 0, 5, 36, 82, 0, 93,...
## $ Salary                 <dbl> 222000, 410000, 206000, 380000, NA, NA,...
## $ Overall                <dbl> 49, 54, 49, 57, NA, NA, NA, NA, 56, NA,...
## $ `Satsfied Income`      <dbl> 43, 48, 43, 48, NA, NA, NA, NA, 52, NA,...
## $ `Satsfied Medicine`    <dbl> 57, 58, 60, 61, NA, NA, NA, NA, 69, NA,...
## $ `Satisfied Specialty`  <dbl> 48, 57, 45, 60, NA, NA, NA, NA, 49, NA,...
# Let's rename the columns to make the data easier to work with

NRMP_data <- NRMP_data %>% 
  rename("year" = "Year",
         "specialty" = "Specialty",
         "us_applicants" = "US Applicants",
         "total_applicants" = "All Applicants",
         "total_positions" = "Positions Offered",
         "total_programs" = "No. of Pgms",
         "us_matches" = "US Matches",
         "total_matches" = "All Matches",
         "us_matches_percent" = "% US Matches",
         "all_matches_percent" = "% All Matches",
         "us_ranked_positions" = "US Ranked Positions",
         "all_ranked_positions" = "All Ranked Positions",
         "unfilled_spots" = "Unfilled Spots",
         "salary" = "Salary",
         "satisfaction_overall" = "Overall",
         "satisfaction_income" = "Satsfied Income",
         "satisfaction_medicine" = "Satsfied Medicine",
         "satisfied_specialty" = "Satisfied Specialty")

## Notice that you misspelled Satisfied for income and medicine. Nice going.

# Crap, I noticed that Pulm has 5 different names

unique(NRMP_data$specialty)
##  [1] "Allergy and Immunology"                      
##  [2] "Cardiovascular Disease"                      
##  [3] "Endocrinology, Diabetes, and Metabolism"     
##  [4] "Gastroenterology"                            
##  [5] "Geriatric Medicine*"                         
##  [6] "Hematology"                                  
##  [7] "Hematology and Oncology"                     
##  [8] "Hospice and Palliative Medicine"             
##  [9] "Infectious Disease"                          
## [10] "Interventional Pulmonology**"                
## [11] "Nephrology"                                  
## [12] "Oncology"                                    
## [13] "Pulmonary Disease"                           
## [14] "Pulmonary Disease and Critical Care"         
## [15] "Rheumatology"                                
## [16] "Pulmonary Disease and Critical Care Medicine"
## [17] "Clinical Oncology"                           
## [18] "Pulmonary Disease and Critical"
# Reviewing the data and NRMPs collection, I should have 3

NRMP_data$specialty[NRMP_data$specialty == "Pulmonary Disease and Critical Care"] <- "Pulmonary Disease and Critical Care Medicine"
NRMP_data$specialty[NRMP_data$specialty == "Pulmonary Disease and Critical"] <- "Pulmonary Disease and Critical Care Medicine"

unique(NRMP_data$specialty)
##  [1] "Allergy and Immunology"                      
##  [2] "Cardiovascular Disease"                      
##  [3] "Endocrinology, Diabetes, and Metabolism"     
##  [4] "Gastroenterology"                            
##  [5] "Geriatric Medicine*"                         
##  [6] "Hematology"                                  
##  [7] "Hematology and Oncology"                     
##  [8] "Hospice and Palliative Medicine"             
##  [9] "Infectious Disease"                          
## [10] "Interventional Pulmonology**"                
## [11] "Nephrology"                                  
## [12] "Oncology"                                    
## [13] "Pulmonary Disease"                           
## [14] "Pulmonary Disease and Critical Care Medicine"
## [15] "Rheumatology"                                
## [16] "Clinical Oncology"
# Let's make a plot looking at Unfilled spots in Nephrology fellowships over time

unfilled_renal <- NRMP_data %>% filter(specialty == "Nephrology") %>% select(year, unfilled_spots)

unfilled_renal %>% ggplot(aes(x = year, y = unfilled_spots)) + geom_line() + labs(title = "Unfilled Nephrology Fellowship Positions", 
    x = "Year", y = "Total Unfilled Positions", caption = "by Mirza S. Khan
Source: NRMP Fellowship Match Results and Data")

# Unfilled spots across different specialties

unfilled_all <- NRMP_data %>% select(year, specialty, unfilled_spots)

unfilled_plot <- unfilled_all %>% ggplot(aes(x = year, y = unfilled_spots)) +
  geom_line(aes(color = specialty)) + theme(plot.subtitle = element_text(vjust = 1), 
    plot.caption = element_text(vjust = 1)) +labs(title = "Unfilled Positions by Specialty", 
    x = "Year", y = "Total Unfilled Positions", 
    colour = "Specialty", caption = "by Mirza S. Khan
Source: NRMP Fellowship Match Results and Data")

ggplotly(unfilled_plot) %>% layout(showlegend = FALSE)
## The above plots the absolute number of unfilled positions by specialty, but perhaps we may be able to glean more information
## by looking at the ratio of Unfilled Spots by Total Positions available

div_unfill <- 100 * (NRMP_data$unfilled_spots / NRMP_data$total_positions)

div_unfill <- cbind(NRMP_data, div_unfill)

ggdiv <- div_unfill %>% 
  ggplot(aes(x = year, y = div_unfill)) + geom_line(aes(color = specialty)) +
  labs(title = "Unfilled Fellowship Positions by Specialty", 
    x = "Year", y = "Unfilled Fellowship Positions / Total Positions (%)", 
    colour = "Specialty", caption = "by Mirza S. Khan
Source: NRMP Fellowship Match Results and Data")

ggplotly(ggdiv) %>% layout(showlegend = FALSE)
# What you see here is in absolute terms, Nephrology has more unfilled positions, but
# ID has a greater proportion of unfilled spots between the 2 in the more recent data


# Let's look at the total number of positions available and see if this has changed
# Has the demand for fellowship positions translated into changes in supply?

# Supply = total_positions and Demand = total_applicants

unfilled_ratio <- NRMP_data %>% select(year, specialty, unfilled_spots, total_positions, total_applicants)

unfilled_ratio %>% 
  ggplot(aes(x = year)) + geom_line(aes(y = total_positions, color = specialty)) +
  geom_point(aes(y = total_applicants, color = specialty)) +
  facet_wrap(~specialty) +
  theme(legend.position = "bottom") +
  labs(caption = "by Mirza S. Khan
Source: NRMP Fellowship Match Results and Data")

# Create a new variable showing the difference in total applications and total positions
# If Supply > Demand, we will expect a negative value and associated positive unfilled positions

delta_filled <- NRMP_data$total_applicants - NRMP_data$total_positions

NRMP_data <- cbind(NRMP_data, delta_filled)

table(NRMP_data$delta_filled < 0) # There are 12 times where Supply > Demand
## 
## FALSE  TRUE 
##   103    12
NRMP_data %>% select(specialty, unfilled_spots, year) %>% filter(delta_filled < 0)
##                          specialty unfilled_spots year
## 1              Geriatric Medicine*            100 2016
## 2  Hospice and Palliative Medicine             36 2016
## 3               Infectious Disease             82 2016
## 4                       Nephrology             93 2016
## 5              Geriatric Medicine*             99 2015
## 6               Infectious Disease             70 2015
## 7                       Nephrology             68 2015
## 8              Geriatric Medicine*             81 2014
## 9               Infectious Disease             55 2014
## 10                      Nephrology             64 2014
## 11              Infectious Disease             45 2013
## 12              Infectious Disease             51 2012
# Scrape Salary data

library(rvest)

mgma <- "https://www.cejkasearch.com/physician-compensation-report/"
salary <- mgma %>% 
  read_html() %>% 
  html_table() 

salary <- as.data.frame(salary)
salary <- salary[,1:2]

unique(salary[,1])
##  [1] "Allergy & Immunology"                     
##  [2] "Anesthesiology"                           
##  [3] "Breast Surgery"                           
##  [4] "Cardiac & Thoracic Surgery"               
##  [5] "Cardiology"                               
##  [6] "Colon & Rectal Surgery"                   
##  [7] "Critical Care Medicine"                   
##  [8] "Dermatology"                              
##  [9] "Diagnostic Radiology - Interventional"    
## [10] "Diagnostic Radiology - Non-Interventional"
## [11] "Emergency Care"                           
## [12] "Endocrinology"                            
## [13] "Family Medicine"                          
## [14] "Family Medicine - with Obstetrics"        
## [15] "Gastroenterology"                         
## [16] "General Surgery"                          
## [17] "Geriatrics"                               
## [18] "Gynecological Oncology"                   
## [19] "Gynecology"                               
## [20] "Gynecology & Obstetrics"                  
## [21] "Hematology & Medical Oncology"            
## [22] "Hospitalist"                              
## [23] "Hypertension & Nephrology"                
## [24] "Infectious Disease"                       
## [25] "Internal Medicine"                        
## [26] "Maternal Fetal Medicine / Perinatology"   
## [27] "Medical Oncology"                         
## [28] "Neonatology"                              
## [29] "Neurological Surgery"                     
## [30] "Neurology"                                
## [31] "Nuclear Medicine (M.D. only)"             
## [32] "Obstetrics"                               
## [33] "Obstrtrics & Gynecology"                  
## [34] "Occupational / Environmental Medicine"    
## [35] "Oncology - Surgical"                      
## [36] "Ophthalmology"                            
## [37] "Oral - Maxillofacial Surgery"             
## [38] "Orthopedic Surgery"                       
## [39] "Orthopedic Surgery - Hand"                
## [40] "Orthopedic Surgery - Joint Replacement"   
## [41] "Orthopedic Surgery - Pediatrics"          
## [42] "Orthopedic Surgery - Spine"               
## [43] "Orthopedic-Medical"                       
## [44] "Otolaryngology"                           
## [45] "Pain Management (Non-Anesthesiology)"     
## [46] "Palliative Care"                          
## [47] "Pathology (M.D. only)"                    
## [48] "Pediatric Allergy"                        
## [49] "Pediatric Cardiac / Thoracic Surgery"     
## [50] "Pediatric Cardiology"                     
## [51] "Pediatric Endocrinology"                  
## [52] "Pediatric Gastroenterology"               
## [53] "Pediatric Hematology / Oncology"          
## [54] "Pediatric Hospitalist"                    
## [55] "Pediatric Infectious Disease"             
## [56] "Pediatric Intensive Care"                 
## [57] "Pediatric Nephrology"                     
## [58] "Pediatric Neurology"                      
## [59] "Pediatric Orthopedic Surgery"             
## [60] "Pediatric Pulmonary Disease"              
## [61] "Pediatric Surgery"                        
## [62] "Pediatrics & Adolescent"                  
## [63] "Physical Medicine & Rehabilitation"       
## [64] "Plastic & Reconstructive Surgery"         
## [65] "Psychiatry"                               
## [66] "Psychiatry - Child"                       
## [67] "Pulmonary / Critical Care"                
## [68] "Pulmonary Disease"                        
## [69] "Radiation Therapy (M.D. only)"            
## [70] "Reproductive Endocrinology"               
## [71] "Rheumatologic Disease"                    
## [72] "Sports Medicine"                          
## [73] "Transplant Surgery - Kidney"              
## [74] "Transplant Surgery - Liver"               
## [75] "Trauma Surgery"                           
## [76] "Urgent Care"                              
## [77] "Urology"                                  
## [78] "Vascular Surgery"
# Wow, they have 78 different "specialties", let's just filter out the ones we need

med_subs <- c("Allergy & Immunology", "Cardiology", "Critical Care Medicine", "Endocrinology", "Gastroenterology",
              "Geriatrics", "Hematology & Medical Oncology", "Hypertension & Nephrology", "Infectious Disease",
              "Medical Oncology", "Palliative Care", "Pulmonary / Critical Care", "Pulmonary Disease",
              "Rheumatologic Disease")

salary <- salary[salary$Physician.Specialty %in% med_subs,]

# Rename to make it easier to work with, and to help w/ merge later

salary <- salary %>% rename("specialty" = "Physician.Specialty", "salary" = "Median.Physician.Compensation.Data")

# Convert salary from chr to numeric and drop the $ and ,

salary$salary <- gsub(",", "", salary$salary) # remove the ,
salary$salary <- gsub("[[:punct:]]", " ", salary$salary) # remove the special char, $
salary$salary <- as.integer(salary$salary) # convert salary to integer (from char)

salary$year <- rep(2016) # to add a year = 2016 column

salary 
##                        specialty salary year
## 1           Allergy & Immunology 294245 2016
## 5                     Cardiology 483653 2016
## 7         Critical Care Medicine 399692 2016
## 12                 Endocrinology 242202 2016
## 15              Gastroenterology 505194 2016
## 17                    Geriatrics 234181 2016
## 21 Hematology & Medical Oncology 416738 2016
## 23     Hypertension & Nephrology 329750 2016
## 24            Infectious Disease 261630 2016
## 27              Medical Oncology 435000 2016
## 46               Palliative Care 239022 2016
## 67     Pulmonary / Critical Care 385436 2016
## 68             Pulmonary Disease 352462 2016
## 71         Rheumatologic Disease 251913 2016
# Next steps:
# 1. Convert subspecialty to be corresponding name in the NRMP data frame
# 2. Filter for year == 2016 and exclude the other salary column for 2016
# 3. Use merge(NRMP_2016, salary, by = "specialty") to combine them

Importing Data into R

January 21, 2018
howto notes R tutorial

Python Basics: From Zero to Full Monty

September 27, 2017
notes study tutorial python

Tables: Converting Markdown to Huxtable

September 21, 2017
blog howto markdown R