Sam Hopkins
rm(list = ls())
data <- read.csv("../data/salary_data.csv")
head(data)
## Age Gender Education.Level Job.Title Years.of.Experience Salary
## 1 32 Male Bachelor's Software Engineer 5 90000
## 2 28 Female Master's Data Analyst 3 65000
## 3 45 Male PhD Senior Manager 15 150000
## 4 36 Female Bachelor's Sales Associate 7 60000
## 5 52 Male Master's Director 20 200000
## 6 29 Male Bachelor's Marketing Analyst 2 55000
summary(data)
## Age Gender Education.Level Job.Title
## Min. :21.00 Length:6704 Length:6704 Length:6704
## 1st Qu.:28.00 Class :character Class :character Class :character
## Median :32.00 Mode :character Mode :character Mode :character
## Mean :33.62
## 3rd Qu.:38.00
## Max. :62.00
## NA's :2
## Years.of.Experience Salary
## Min. : 0.000 Min. : 350
## 1st Qu.: 3.000 1st Qu.: 70000
## Median : 7.000 Median :115000
## Mean : 8.095 Mean :115327
## 3rd Qu.:12.000 3rd Qu.:160000
## Max. :34.000 Max. :250000
## NA's :3 NA's :5
print(sum(is.na(data)))
## [1] 10
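# a quick per-column view of where those NAs sit:
# colSums(is.na(data))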
# list the rows containing at least one NA
print(data[!complete.cases(data), ])
## Age Gender Education.Level Job.Title Years.of.Experience Salary
## 173 NA NA NA
## 261 NA NA NA
## 3137 31 Male Master's Degree Full Stack Engineer 8 NA
## 5248 26 Female Bachelor's Degree Social M NA NA
## 6456 36 Male Bachelor's Degree Sales Director 6 NA
# drop the 5 incomplete rows (6704 -> 6699 observations)
data <- na.omit(data)
# numeric gender encoding: 1 = Male, 0 = everyone else
data$GenderN <- 0
data$GenderN[data$Gender == "Male"] <- 1
unique(data$Education.Level)
## [1] "Bachelor's" "Master's" "PhD"
## [4] "Bachelor's Degree" "Master's Degree" ""
## [7] "High School" "phD"
# collapse the inconsistent spellings into one ordinal scale:
# 1 = High School/blank, 2 = Bachelor's, 3 = Master's, 4 = PhD
data$EducationN[data$Education.Level %in% c("", "High School")] <- 1
data$EducationN[data$Education.Level %in% c("Bachelor's", "Bachelor's Degree")] <- 2
data$EducationN[data$Education.Level %in% c("Master's", "Master's Degree")] <- 3
data$EducationN[data$Education.Level %in% c("PhD", "phD")] <- 4
unique(data$EducationN)
## [1] 2 3 4 1
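# optional sanity check: confirm every raw label landed on the intended level
# table(data$Education.Level, data$EducationN)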
library(corrplot)
## corrplot 0.92 loaded
# correlation heatmap over the numeric columns
corrplot(cor(data[, sapply(data, is.numeric)]), method="color")
# treat the encodings as categorical from here on
data$GenderN <- as.factor(data$GenderN)
data$EducationN <- as.factor(data$EducationN)
hist(data$Salary, main="Salary", breaks=100)
hist(data$Salary[data$GenderN == 1], main="Men", breaks=100)
hist(data$Salary[data$GenderN == 0], main="Women", breaks=100)
hist(data$Salary[data$EducationN == 1], main="High School/None", breaks=100)
hist(data$Salary[data$EducationN == 2], main="Bachelor's", breaks=100)
hist(data$Salary[data$EducationN == 3], main="Master's", breaks=100)
hist(data$Salary[data$EducationN == 4], main="PhD", breaks=100)
plot(data$Age, data$Salary, main="Age vs Salary",xlab="Age", ylab="Salary")
plot(data$Years.of.Experience, data$Salary, main="Years of Experience vs Salary", xlab="Years of Experience", ylab="Salary")
# 80/20 train/test split; the fixed seed keeps the split reproducible across calls
splitData <- function(){
  set.seed(1)
  indices <<- sample(1:nrow(data), as.integer(0.8 * nrow(data)))
  Training <<- data[indices, ]
  Testing <<- data[-indices, ]
  observed <<- Testing$Salary
}
splitData()
# accumulators for the model-comparison table built at the end
R2s <- c()
RSEs <- c()
forms <- c()
methods <- c()
models <- list()
m <- 1
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-8
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
evaluateModel <- function(model, ridge = FALSE){
  if (!ridge){
    predicted <<- predict(model, newdata=Testing)
  } else {
    # glmnet predicts from a numeric matrix, so drop the response and the raw character columns
    X <- as.matrix(Testing[, !(names(Testing) %in% c("Salary", "Gender", "Education.Level", "Job.Title"))])
    predicted <<- predict(model, newx=X)
  }
  SSE <- sum((predicted - observed)^2)
  SST <- sum((observed - mean(observed))^2)
  R2s <<- c(R2s, 1 - SSE/SST)
  # model$coefficients is NULL for tree/forest/caret fits, so the
  # degrees-of-freedom correction below only applies to the GLM
  RSEs <<- c(RSEs, sqrt(SSE / (length(predicted) - length(model$coefficients))))
  models[[m]] <<- model
  m <<- m + 1
  return(list(model, predicted, R2s[m-1], RSEs[m-1]))
}
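# cross-check (optional): caret's postResample() computes RMSE and R^2 from the
# same two vectors; its R^2 is a squared correlation, so it can differ slightly
# from the 1 - SSE/SST definition above
# postResample(pred = predicted, obs = observed)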
# baseline formula shared by all of the models below
f0 <- "Salary~Age+Years.of.Experience+GenderN+EducationN"
linearModel <- evaluateModel(glm(as.formula(f0), data=Training))
forms <- c(forms, f0)
methods <- c(methods, "General Linear Model")
# summary(models[[m-1]])
# design matrix for glmnet: numeric columns only (response and raw character columns dropped)
X <- as.matrix(Training[, !(names(Training) %in% c("Salary", "Gender", "Education.Level", "Job.Title"))])
ridgeModel <- evaluateModel(cv.glmnet(x=X, y=Training$Salary, alpha=0, lambda=10^seq(-6, 6, by = 0.1)), TRUE)
forms <- c(forms, f0)
methods <- c(methods, "Ridge Regression")
# summary(models[[m-1]])
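# the CV curve and selected penalty live on the stored cv.glmnet fit:
# plot(ridgeModel[[1]])        # CV error vs log(lambda)
# ridgeModel[[1]]$lambda.min   # penalty minimizing CV error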
treeModel <- evaluateModel(rpart(as.formula(f0), data=Training))
forms <- c(forms, f0)
methods <- c(methods, "Decision Tree")
# summary(models[[m-1]])
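# rpart.plot is loaded above, so the fitted tree can be drawn directly:
# rpart.plot(treeModel[[1]])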
rfModel0 <- evaluateModel(randomForest(as.formula(f0), data=Training))
forms <- c(forms, f0)
methods <- c(methods, "Random Forest")
# importance(models[[m-1]])
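# randomForest can also plot those importance scores:
# varImpPlot(rfModel0[[1]])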
# Root-transform Age and Years.of.Experience, since their relationships
# with Salary appear to flatten out at higher values
data$rtAge <- data$Age^0.3
data$rtExp <- data$Years.of.Experience^0.3
# blend education level and experience into a single combined feature
data$eduExp <- as.numeric(data$EducationN) * (mean(data$Years.of.Experience)/2) + data$Years.of.Experience
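# quick visual check that the root transform straightens the experience curve
plot(data$rtExp, data$Salary, main="Experience^0.3 vs Salary", xlab="Years of Experience^0.3", ylab="Salary")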
splitData()
f1 <- "Salary~rtAge+Years.of.Experience+EducationN+GenderN"
f2 <- "Salary~Age+rtExp+EducationN+GenderN"
f <- "Salary~Age+Years.of.Experience+eduExp+GenderN"
crossVal <- trainControl(method="cv", number=5, verboseIter=FALSE)  # 5-fold CV
rfModel <- evaluateModel(randomForest(as.formula(f1), data=Training))
forms <- c(forms, f1)
methods <- c(methods, "Random Forest")
rfModel <- evaluateModel(randomForest(as.formula(f2), data=Training))
forms <- c(forms, f2)
methods <- c(methods, "Random Forest")
rfModel <- evaluateModel(randomForest(as.formula(f), data=Training))
forms <- c(forms, f)
methods <- c(methods, "Random Forest")
crossValRF <- train(as.formula(f), data=Training, method="rf", trControl=crossVal)
bestModel <- evaluateModel(crossValRF)
forms <- c(forms, f)
methods <- c(methods, "CV Random Forest")
crossValRF <- train(as.formula(f0), data=Training, method="rf", trControl=crossVal)
bestModel <- evaluateModel(crossValRF)
forms <- c(forms, f0)
methods <- c(methods, "CV Random Forest")
crossValGLM <- train(as.formula(f0), data=Training, method="glm", trControl = crossVal)
cvGLM <- evaluateModel(crossValGLM)
forms <- c(forms, f0)
methods <- c(methods, "CV GLM")
crossValDT <- train(as.formula(f0), data=Training, method="rpart", trControl = crossVal)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
cvDT <- evaluateModel(crossValDT)
forms <- c(forms, f0)
methods <- c(methods, "CV Decision Tree")
crossValRR <- train(as.formula(f0), data=Training, method="glmnet", trControl = crossVal, tuneGrid = expand.grid(.alpha=0, .lambda=10^seq(-6, 6, by = 0.1)))  # alpha = 0 selects the ridge penalty
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
cvRR <- evaluateModel(crossValRR)
forms <- c(forms, f0)
methods <- c(methods, "CV Ridge Regression")
library(knitr)
library(kableExtra)
results <- data.frame(
methods,
forms,
R2s,
RSEs
)
table <- kable(results, format = "html", align = "c",
               col.names = c("Method", "Formula", "R2", "RSE")) %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE) %>%
  add_header_above(c(" " = 2, "Model Performance" = 2)) %>%
  row_spec(1:nrow(results), background = "#000", color = "#fff") %>%
  row_spec(nrow(results)-3, extra_css = "background-color: #333; color: #ff0000;")  # highlight the best model
print(table)
Method | Formula | R2 | RSE |
---|---|---|---|
General Linear Model | Salary~Age+Years.of.Experience+GenderN+EducationN | 0.7241793 | 27698.92 |
Ridge Regression | Salary~Age+Years.of.Experience+GenderN+EducationN | 0.7131128 | 28175.24 |
Decision Tree | Salary~Age+Years.of.Experience+GenderN+EducationN | 0.7530028 | 26143.16 |
Random Forest | Salary~Age+Years.of.Experience+GenderN+EducationN | 0.7949237 | 23821.54 |
Random Forest | Salary~rtAge+Years.of.Experience+EducationN+GenderN | 0.7964567 | 23732.34 |
Random Forest | Salary~Age+rtExp+EducationN+GenderN | 0.7986722 | 23602.82 |
Random Forest | Salary~Age+Years.of.Experience+eduExp+GenderN | 0.8429496 | 20846.43 |
CV Random Forest | Salary~Age+Years.of.Experience+eduExp+GenderN | 0.9115498 | 15644.49 |
CV Random Forest | Salary~Age+Years.of.Experience+GenderN+EducationN | 0.9120391 | 15601.16 |
CV GLM | Salary~Age+Years.of.Experience+GenderN+EducationN | 0.7241793 | 27626.48 |
CV Decision Tree | Salary~Age+Years.of.Experience+GenderN+EducationN | 0.6832623 | 29604.79 |
CV Ridge Regression | Salary~Age+Years.of.Experience+GenderN+EducationN | 0.7243554 | 27617.66 |
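# programmatic lookup of the top row, as an alternative to eyeballing the table:
# results[which.max(results$R2s), ]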
predicted <- bestModel[[2]]
# observed vs predicted salaries for the best model
plot(observed, predicted, xlab="Observed", ylab="Predicted", main=paste("RSE:", round(bestModel[[4]], 1), "R^2:", round(bestModel[[3]], 4)))
abline(lm(predicted~observed), col="red")
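# residual check for the best model: look for structure around the zero line
plot(predicted, observed - predicted, xlab="Predicted", ylab="Residual", main="Residuals")
abline(h=0, col="red")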
plot(bestModel[[1]])              # caret: resampled RMSE across the mtry grid
plot(bestModel[[1]]$finalModel)   # randomForest: OOB error vs number of trees
plot(rfModel0[[1]])               # same diagnostic for the untuned forest
summary(cvGLM[[1]])  # "Call: NULL" below is an artifact of caret's refit
##
## Call:
## NULL
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 74500.6 3920.2 19.004 < 2e-16 ***
## Age -2080.5 147.7 -14.082 < 2e-16 ***
## Years.of.Experience 8137.7 187.1 43.483 < 2e-16 ***
## GenderN1 5875.9 794.9 7.392 1.67e-13 ***
## EducationN2 36080.7 1635.9 22.055 < 2e-16 ***
## EducationN3 47861.0 1784.0 26.828 < 2e-16 ***
## EducationN4 60075.8 1972.9 30.451 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 795113883)
##
## Null deviance: 1.4955e+13 on 5358 degrees of freedom
## Residual deviance: 4.2554e+12 on 5352 degrees of freedom
## AIC: 125045
##
## Number of Fisher Scoring iterations: 2
summary(linearModel[[1]])
##
## Call:
## glm(formula = as.formula(f0), data = Training)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 74500.6 3920.2 19.004 < 2e-16 ***
## Age -2080.5 147.7 -14.082 < 2e-16 ***
## Years.of.Experience 8137.7 187.1 43.483 < 2e-16 ***
## GenderN1 5875.9 794.9 7.392 1.67e-13 ***
## EducationN2 36080.7 1635.9 22.055 < 2e-16 ***
## EducationN3 47861.0 1784.0 26.828 < 2e-16 ***
## EducationN4 60075.8 1972.9 30.451 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 795113883)
##
## Null deviance: 1.4955e+13 on 5358 degrees of freedom
## Residual deviance: 4.2554e+12 on 5352 degrees of freedom
## AIC: 125045
##
## Number of Fisher Scoring iterations: 2
plot(cvDT[[1]])   # caret: tuning curve over the rpart complexity parameter
plot(cvRR[[1]])   # caret: tuning curve over the glmnet lambda grid