head(resume, 5)
# Some checks to confirm balance between race and
# other attributes of a resume. There should be
# some minor differences due to randomness, but
# each variable should be (and is) generally
# well-balanced.
table(resume$race, resume$years_college)
table(resume$race, resume$college_degree)
table(resume$race, resume$honors)
table(resume$race, resume$worked_during_school)
table(resume$race, resume$years_experience)
table(resume$race, resume$computer_skills)
table(resume$race, resume$special_skills)
table(resume$race, resume$volunteer)
table(resume$race, resume$military)
table(resume$race, resume$employment_holes)
table(resume$race, resume$has_email_address)
table(resume$race, resume$resume_quality)
# Regarding the callback outcome for race,
# we observe a very large difference.
tapply(
resume$received_callback,
resume[c("race", "gender")],
mean
)
# Natural question: is this statisticaly significant?
# A proper analysis would take into account the
# paired nature of the data. For each ad, let's
# compute the following statistic:
#
# -
# First contruct the callbacks for white and
# black candidates by ad ID:
table(resume$race)
cb_white <- with(
subset(resume, race == "white"),
tapply(received_callback, job_ad_id, mean)
)
cb_black <- with(
subset(resume, race == "black"),
tapply(received_callback, job_ad_id, mean)
)
# Next, compute the differences, where the
# names(cb_white) part ensures we matched up the
# job ad IDs.
diff <- cb_white - cb_black[names(cb_white)]
# Finally, we can apply a t-test on the differences:
t.test(diff)
# There is very strong evidence of an effect.
# Here's a similar check with gender. There are
# more female-inferred candidates used on the resumes.
table(resume$gender)
cb_male <- with(
subset(resume, gender == "m"),
tapply(received_callback, job_ad_id, mean)
)
cb_female <- with(
subset(resume, gender == "f"),
tapply(received_callback, job_ad_id, mean)
)
diff <- cb_female - cb_male[names(cb_female)]
# The `na.rm = TRUE` part ensures we limit to jobs
# where both a male and female resume were sent.
t.test(diff, na.rm = TRUE)
# There is no statistically significant difference.
# Was that the best analysis? Absolutely not!
# However, the analysis was unbiased. To get more
# precision on the estimates, we could build a
# multivariate model that includes many characteristics
# of the resumes sent, e.g. years of experience.
# Since those other characteristics were assigned
# independently of the race characteristics, this
# means the race finding will almost certainy will
# hold. However, it is possible that we'll find
# more interesting results with the gender investigation.
Run the code above in your browser using DataLab