Sam Hopkins

Initialize Workspace and Read Data

# Load the salary data set and preview the first rows.
# NOTE: the original began with rm(list = ls()); wiping the global
# environment inside a script is an anti-pattern -- run the script in a
# fresh R session instead.
data <- read.csv("../data/salary_data.csv")
head(data)
##   Age Gender Education.Level         Job.Title Years.of.Experience Salary
## 1  32   Male      Bachelor's Software Engineer                   5  90000
## 2  28 Female        Master's      Data Analyst                   3  65000
## 3  45   Male             PhD    Senior Manager                  15 150000
## 4  36 Female      Bachelor's   Sales Associate                   7  60000
## 5  52   Male        Master's          Director                  20 200000
## 6  29   Male      Bachelor's Marketing Analyst                   2  55000
summary(data)
##       Age           Gender          Education.Level     Job.Title        
##  Min.   :21.00   Length:6704        Length:6704        Length:6704       
##  1st Qu.:28.00   Class :character   Class :character   Class :character  
##  Median :32.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :33.62                                                           
##  3rd Qu.:38.00                                                           
##  Max.   :62.00                                                           
##  NA's   :2                                                               
##  Years.of.Experience     Salary      
##  Min.   : 0.000      Min.   :   350  
##  1st Qu.: 3.000      1st Qu.: 70000  
##  Median : 7.000      Median :115000  
##  Mean   : 8.095      Mean   :115327  
##  3rd Qu.:12.000      3rd Qu.:160000  
##  Max.   :34.000      Max.   :250000  
##  NA's   :3           NA's   :5

Data Cleaning and Preparation

Remove NA values and assign 0 or 1 to gender

# Total number of NA cells across the whole data frame.
print(sum(is.na(data)))
# Show each row that contains at least one NA.
# BUG FIX: the original used data[which(is.na(data), arr.ind = TRUE), ],
# which returns a (row, col) index MATRIX; used as a row subscript it
# flattens into a mix of row and column numbers, printing duplicated and
# unrelated rows (visible in the rendered output). complete.cases() gives
# the intended per-row mask.
print(data[!complete.cases(data), ])
##        Age Gender   Education.Level           Job.Title Years.of.Experience
## 173     NA                                                               NA
## 261     NA                                                               NA
## 173.1   NA                                                               NA
## 261.1   NA                                                               NA
## 5248    26 Female Bachelor's Degree            Social M                  NA
## 173.2   NA                                                               NA
## 261.2   NA                                                               NA
## 3137    31   Male   Master's Degree Full Stack Engineer                   8
## 5248.1  26 Female Bachelor's Degree            Social M                  NA
## 6456    36   Male Bachelor's Degree      Sales Director                   6
## 1       32   Male        Bachelor's   Software Engineer                   5
## 1.1     32   Male        Bachelor's   Software Engineer                   5
## 5       52   Male          Master's            Director                  20
## 5.1     52   Male          Master's            Director                  20
## 5.2     52   Male          Master's            Director                  20
## 6       29   Male        Bachelor's   Marketing Analyst                   2
## 6.1     29   Male        Bachelor's   Marketing Analyst                   2
## 6.2     29   Male        Bachelor's   Marketing Analyst                   2
## 6.3     29   Male        Bachelor's   Marketing Analyst                   2
## 6.4     29   Male        Bachelor's   Marketing Analyst                   2
##        Salary
## 173        NA
## 261        NA
## 173.1      NA
## 261.1      NA
## 5248       NA
## 173.2      NA
## 261.2      NA
## 3137       NA
## 5248.1     NA
## 6456       NA
## 1       90000
## 1.1     90000
## 5      200000
## 5.1    200000
## 5.2    200000
## 6       55000
## 6.1     55000
## 6.2     55000
## 6.3     55000
## 6.4     55000
# Drop every row containing any NA, then encode gender numerically:
# "Male" maps to 1, everything else to 0.
data <- na.omit(data)
data$GenderN <- ifelse(data$Gender == "Male", 1, 0)

Remove redundant classes from education level and assign an integer 1-4

unique(data$Education.Level)
## [1] "Bachelor's"        "Master's"          "PhD"              
## [4] "Bachelor's Degree" "Master's Degree"   ""                 
## [7] "High School"       "phD"
# Collapse the duplicated education labels into one integer code:
# 1 = High School / unspecified, 2 = Bachelor's, 3 = Master's, 4 = PhD.
data$EducationN[data$Education.Level %in% c("", "High School")] <- 1
data$EducationN[data$Education.Level %in% c("Bachelor's", "Bachelor's Degree")] <- 2
data$EducationN[data$Education.Level %in% c("Master's", "Master's Degree")] <- 3
data$EducationN[data$Education.Level %in% c("PhD", "phD")] <- 4
unique(data$EducationN)
## [1] 2 3 4 1

Data Inspection

Correlations

library(corrplot)
## corrplot 0.92 loaded
# Correlation heat map over the numeric columns only.
# vapply (not sapply) guarantees a logical vector of the right length
# regardless of input shape.
corrplot(cor(data[, vapply(data, is.numeric, logical(1))]), method="color")

# Treat the encoded gender/education columns as categorical from here on,
# so models fit a separate effect per level instead of a linear trend.
# FIX: use `<-` for assignment, not `=` (R style convention).
data$GenderN <- as.factor(data$GenderN)
data$EducationN <- as.factor(data$EducationN)

Histograms

hist(data$Salary, main="Salary", breaks=100)

By Gender

# Salary distribution for men (GenderN == 1).
hist(data$Salary[data$GenderN == 1], main="Men", breaks=100)

# Salary distribution for women (GenderN == 0).
hist(data$Salary[data$GenderN == 0], main="Women", breaks=100)

By Education Level

# Salary distribution per education code (1 = High School/None ... 4 = PhD).
hist(data$Salary[data$EducationN == 1], main="High School/None", breaks=100)

hist(data$Salary[data$EducationN == 2], main="Bachelor's", breaks=100)

hist(data$Salary[data$EducationN == 3], main="Master's", breaks=100)

hist(data$Salary[data$EducationN == 4], main="PhD", breaks=100)

Scatterplots

# Salary against age.
plot(data$Age, data$Salary, main="Age vs Salary",xlab="Age", ylab="Salary")

# Salary against years of experience.
plot(data$Years.of.Experience, data$Salary, main="Years of Experience vs Salary", xlab="Years of Experience", ylab="Salary")


Split data into training and testing 80/20

# Split the global `data` into an 80/20 train/test partition.
#
# Side effects (kept for compatibility with the rest of the script):
# writes `indices` (training row numbers), `Training`, `Testing`, and
# `observed` (the test-set salaries) into the global environment.
# The fixed seed makes every call produce the same split, so the script
# can re-split after adding derived columns without changing rows.
splitData <- function(){

    set.seed(1)
    # seq_len() is safer than 1:nrow(data) (correct even for 0 rows).
    assign("indices", sample(seq_len(nrow(data)), as.integer(0.8*nrow(data))), envir=.GlobalEnv)
    assign("Training", data[indices, ], envir=.GlobalEnv)
    assign("Testing", data[-indices, ], envir=.GlobalEnv)
    assign("observed", Testing$Salary, envir=.GlobalEnv)

}
splitData()

Initialize vectors for later comparison

# Accumulators for the model-comparison table built at the end.
# evaluateModel() appends one entry per fitted model.
R2s <- numeric(0)        # test-set R^2 per model
RSEs <- numeric(0)       # residual standard error per model
forms <- character(0)    # formula string per model
methods <- character(0)  # human-readable method label per model
models <- list()         # the fitted model objects themselves
m <- 1                   # index of the next free slot in `models`

Define functions for modeling

library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(caret)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## Loading required package: lattice
# Score `model` on the global Testing set and record its fit statistics.
#
# Args:
#   model: a fitted model usable with predict(); when ridge = TRUE it must
#          be a glmnet/cv.glmnet object (predicted via a numeric matrix).
#   ridge: TRUE when the model needs a design matrix (newx) instead of a
#          data frame of predictors.
#
# Side effects (by design, matching the script's global-accumulator style):
# appends to the global R2s/RSEs/models, advances m, and writes the global
# `predicted`.
#
# Returns list(model, predictions, test R^2, residual standard error).
evaluateModel <- function(model, ridge = FALSE){
    if (!(ridge)){
        predicted <<- predict(model, newdata=Testing)
    } else {
        # BUG FIX: the original mixed names(Training) and names(Testing)
        # while dropping columns from Testing; it only worked because both
        # frames share identical columns. Use Testing's names throughout.
        drop_cols <- c("Salary", "Gender", "Education.Level", "Job.Title")
        X <- as.matrix(Testing[, -which(names(Testing) %in% drop_cols)])
        predicted <<- predict(model, newx=X)
    }

    SSE = sum((predicted-observed)^2)
    SST = sum((observed-mean(observed))^2)

    R2s <<- c(R2s, 1 - SSE/SST)
    # NOTE: tree/forest models have no $coefficients, so length() is 0 and
    # the denominator is just the number of predictions for those models.
    RSEs <<- c(RSEs, sqrt(SSE / (length(predicted) - length(model$coefficients))))
    models[[m]] <<- model
    m <<- m + 1

    return(list(model, predicted, R2s[m-1], RSEs[m-1]))

}

Modeling using all variables, no interaction terms

Linear Regression

# Baseline formula: all four predictors, no interaction terms.
f0 <- "Salary~Age+Years.of.Experience+GenderN+EducationN"
# Ordinary least squares via glm() (gaussian family by default).
linearModel <- evaluateModel(glm(as.formula(f0), data=Training))
forms <- c(forms, f0)
methods <- c(methods, "General Linear Model")

R^2: 0.7241793

RSE: 2.7698924 × 10^4

# summary(models[[m-1]])

Ridge Regression

# Ridge regression (alpha = 0) with lambda selected by cross-validation.
# glmnet requires a numeric design matrix, so drop the response and the
# raw character columns first.
X <- as.matrix(Training[, -which(names(Training) == "Salary" | names(Training) == "Gender" | names(Training) == "Education.Level" | names(Training) == "Job.Title")])
# FIX: removed the spurious `data = Training` argument -- cv.glmnet takes
# x/y directly and has no `data` parameter.
ridgeModel <- evaluateModel(cv.glmnet(x=X, y=Training$Salary, alpha=0, lambda=10^seq(-6, 6, by = 0.1)), TRUE)
forms <- c(forms, f0)
methods <- c(methods, "Ridge Regression")

R^2: 0.7131128

RSE: 2.8175244 × 10^4

# summary(models[[m-1]])

Decision Tree

# Single regression tree on the baseline formula.
treeModel <- evaluateModel(rpart(as.formula(f0), data=Training))
forms <- c(forms, f0)
methods <- c(methods, "Decision Tree")

R^2: 0.7530028

RSE: 2.6143161 × 10^4

# summary(models[[m-1]])

Random Forest

# Random forest on the baseline formula; rfModel0 is kept separately so
# its error curve can be plotted at the end of the document.
rfModel0 <- evaluateModel(randomForest(as.formula(f0), data=Training))
forms <- c(forms, f0)
methods <- c(methods, "Random Forest")

R^2: 0.7949237

RSE: 2.3821539 × 10^4

# importance(models[[m-1]])

Modeling using interaction terms and cross-validation

Create interaction terms and re-split data

# Flatten the numeric predictors (age and years of experience) with a 0.3
# power, since their relationships with salary appear to flatten out as x
# grows (see the scatterplots above).
data$rtAge <- (data$Age)^0.3
data$rtExp <- (data$Years.of.Experience)^0.3
# NOTE(review): EducationN is a factor here, so as.numeric() yields the
# level index (1-4), which happens to equal the original integer codes.
data$eduExp <- as.numeric(data$EducationN) * (mean(data$Years.of.Experience)/2) + data$Years.of.Experience

# Re-split so Training/Testing include the new derived columns
# (same rows as before thanks to the fixed seed inside splitData).
splitData()

Models

# Candidate feature sets using the derived columns.
f1 <- "Salary~rtAge+Years.of.Experience+EducationN+GenderN"
f2 <- "Salary~Age+rtExp+EducationN+GenderN"
f <- "Salary~Age+Years.of.Experience+eduExp+GenderN"
# 5-fold cross-validation for all caret::train() fits below.
# FIX: `verbose` only worked through partial argument matching; the
# trainControl parameter is actually named verboseIter.
crossVal <- trainControl(method="cv", number=5, verboseIter=FALSE)

# Random forests on each candidate formula.
rfModel <- evaluateModel(randomForest(as.formula(f1), data=Training))
forms <- c(forms, f1)
methods <- c(methods, "Random Forest")

rfModel <- evaluateModel(randomForest(as.formula(f2), data=Training))
forms <- c(forms, f2)
methods <- c(methods, "Random Forest")

# The eduExp interaction formula performs best of the three
# (see the comparison table below).
rfModel <- evaluateModel(randomForest(as.formula(f), data=Training))
forms <- c(forms, f)
methods <- c(methods, "Random Forest")

# Cross-validated random forest on the interaction formula...
crossValRF <- train(as.formula(f), data=Training, method="rf", trControl=crossVal)
bestModel <- evaluateModel(crossValRF)
forms <- c(forms, f)
methods <- c(methods, "CV Random Forest")

# ...and on the baseline formula; this fit ends up with the highest test
# R^2 overall, and `bestModel` retains it for the final plots.
crossValRF <- train(as.formula(f0), data=Training, method="rf", trControl=crossVal)
bestModel <- evaluateModel(crossValRF)
forms <- c(forms, f0)
methods <- c(methods, "CV Random Forest")

# Cross-validated GLM baseline.
crossValGLM <- train(as.formula(f0), data=Training, method="glm", trControl = crossVal)
cvGLM <- evaluateModel(crossValGLM)
forms <- c(forms, f0)
methods <- c(methods, "CV GLM")

# Cross-validated decision tree.
crossValDT <- train(as.formula(f0), data=Training, method="rpart", trControl = crossVal)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
cvDT <- evaluateModel(crossValDT)
forms <- c(forms, f0)
methods <- c(methods, "CV Decision Tree")

# Cross-validated ridge regression on the baseline formula.
# BUG FIX: the original passed .alpha = 1, which fits a LASSO, not ridge
# (ridge is alpha = 0, as used in the earlier cv.glmnet fit). Aligned the
# penalty with the "CV Ridge Regression" label.
crossValRR <- train(as.formula(f0), data=Training, method="glmnet", trControl = crossVal, tuneGrid = expand.grid(.alpha=0, .lambda=10^seq(-6, 6, by = 0.1)))
cvRR <- evaluateModel(crossValRR)
forms <- c(forms, f0)
methods <- c(methods, "CV Ridge Regression")

Comparison Table

library(knitr)
library(kableExtra)

# One row per fitted model, in the order they were evaluated.
results <- data.frame(
    methods,
    forms,
    R2s,
    RSEs
)

# Styled HTML comparison table; row nrow(results) - 3 (the CV random
# forest on f0, the best test R^2 in this run) is highlighted in red.
table <- kable(results, format = "html", align = "c") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE) %>%
  add_header_above(c(" " = 1, " " = 2, "Model Performance" = 1)) %>%
  row_spec(1:nrow(results), background = "#000") %>%
  row_spec(nrow(results)-3, extra_css = "background-color: #333; color: #ff0000;")

print(table)
Model Performance
methods forms R2s RSEs
General Linear Model Salary~Age+Years.of.Experience+GenderN+EducationN 0.7241793 27698.92
Ridge Regression Salary~Age+Years.of.Experience+GenderN+EducationN 0.7131128 28175.24
Decision Tree Salary~Age+Years.of.Experience+GenderN+EducationN 0.7530028 26143.16
Random Forest Salary~Age+Years.of.Experience+GenderN+EducationN 0.7949237 23821.54
Random Forest Salary~rtAge+Years.of.Experience+EducationN+GenderN 0.7964567 23732.34
Random Forest Salary~Age+rtExp+EducationN+GenderN 0.7986722 23602.82
Random Forest Salary~Age+Years.of.Experience+eduExp+GenderN 0.8429496 20846.43
CV Random Forest Salary~Age+Years.of.Experience+eduExp+GenderN 0.9115498 15644.49
CV Random Forest Salary~Age+Years.of.Experience+GenderN+EducationN 0.9120391 15601.16
CV GLM Salary~Age+Years.of.Experience+GenderN+EducationN 0.7241793 27626.48
CV Decision Tree Salary~Age+Years.of.Experience+GenderN+EducationN 0.6832623 29604.79
CV Ridge Regression Salary~Age+Years.of.Experience+GenderN+EducationN 0.7243554 27617.66

Best Model: Predicted vs Observed

# Observed vs predicted test salaries for the best model, with the RSE
# and R^2 (elements 4 and 3 of the evaluateModel return list) in the title.
predicted <- bestModel[[2]]
plot(observed, predicted, xlab="Observed", ylab="Predicted", main=paste(paste("RSE:", round(bestModel[[4]], 1)), paste("R^2:", round(bestModel[[3]], 4))))
# Red reference line: linear fit of predicted on observed.
abline(lm(predicted~observed), col="red")

# caret tuning/resampling profile for the best model.
plot(bestModel[[1]])

# Underlying random forest: error as trees are added.
plot(bestModel[[1]]$finalModel)

# Same error curve for the first (non-CV) random forest.
plot(rfModel0[[1]])

summary(cvGLM[[1]])
## 
## Call:
## NULL
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          74500.6     3920.2  19.004  < 2e-16 ***
## Age                  -2080.5      147.7 -14.082  < 2e-16 ***
## Years.of.Experience   8137.7      187.1  43.483  < 2e-16 ***
## GenderN1              5875.9      794.9   7.392 1.67e-13 ***
## EducationN2          36080.7     1635.9  22.055  < 2e-16 ***
## EducationN3          47861.0     1784.0  26.828  < 2e-16 ***
## EducationN4          60075.8     1972.9  30.451  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 795113883)
## 
##     Null deviance: 1.4955e+13  on 5358  degrees of freedom
## Residual deviance: 4.2554e+12  on 5352  degrees of freedom
## AIC: 125045
## 
## Number of Fisher Scoring iterations: 2
summary(linearModel[[1]])
## 
## Call:
## glm(formula = as.formula(f0), data = Training)
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          74500.6     3920.2  19.004  < 2e-16 ***
## Age                  -2080.5      147.7 -14.082  < 2e-16 ***
## Years.of.Experience   8137.7      187.1  43.483  < 2e-16 ***
## GenderN1              5875.9      794.9   7.392 1.67e-13 ***
## EducationN2          36080.7     1635.9  22.055  < 2e-16 ***
## EducationN3          47861.0     1784.0  26.828  < 2e-16 ***
## EducationN4          60075.8     1972.9  30.451  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 795113883)
## 
##     Null deviance: 1.4955e+13  on 5358  degrees of freedom
## Residual deviance: 4.2554e+12  on 5352  degrees of freedom
## AIC: 125045
## 
## Number of Fisher Scoring iterations: 2
# caret tuning profile for the cross-validated decision tree.
plot(cvDT[[1]])

# caret tuning profile for the cross-validated glmnet model.
plot(cvRR[[1]])