Peer Influence for a music-streaming company
Peer Influence for a music-streaming company
# Load relevant packages
library(dplyr)
library(ggplot2)
library(corrplot)
library(MatchIt)
library(psych)
library(GGally)
library(gridExtra)
#read the dataset
High_Note <- read.csv("Data/peer_influence.csv", header = TRUE)
describeBy(High_Note, group = High_Note$adopter, mat = FALSE, digits=2)
##
## Descriptive statistics by group
## group: 0
## vars n mean sd median trimmed
## ï..ID 1 40300 20150.50 11633.75 20150.50 20150.50
## age 2 40300 23.95 6.37 23.00 23.09
## male 3 40300 0.62 0.48 1.00 0.65
## friend_cnt 4 40300 18.49 57.48 7.00 10.28
## avg_friend_age 5 40300 24.01 5.10 23.00 23.40
## avg_friend_male 6 40300 0.62 0.32 0.67 0.65
## friend_country_cnt 7 40300 3.96 5.76 2.00 2.66
## subscriber_friend_cnt 8 40300 0.42 2.42 0.00 0.13
## songsListened 9 40300 17589.44 28416.02 7440.00 11817.64
## lovedTracks 10 40300 86.82 263.58 14.00 36.35
## posts 11 40300 5.29 104.31 0.00 0.23
## playlists 12 40300 0.55 1.07 0.00 0.45
## shouts 13 40300 29.97 150.69 4.00 8.84
## adopter 14 40300 0.00 0.00 0.00 0.00
## tenure 15 40300 43.81 19.79 44.00 43.72
## good_country 16 40300 0.36 0.48 0.00 0.32
## mad min max range skew kurtosis se
## ï..ID 14937.19 1 40300 40299 0.00 -1.20 57.95
## age 4.45 8 79 71 1.97 6.80 0.03
## male 0.00 0 1 1 -0.50 -1.75 0.00
## friend_cnt 7.41 1 4957 4956 32.67 2087.42 0.29
## avg_friend_age 3.95 8 77 69 1.84 7.15 0.03
## avg_friend_male 0.35 0 1 1 -0.52 -0.72 0.00
## friend_country_cnt 1.48 0 129 129 4.74 38.29 0.03
## subscriber_friend_cnt 0.00 0 309 309 72.19 8024.62 0.01
## songsListened 10576.87 0 1000000 1000000 6.05 105.85 141.55
## lovedTracks 20.76 0 12522 12522 13.12 335.93 1.31
## posts 0.00 0 12309 12309 73.92 7005.34 0.52
## playlists 0.00 0 98 98 28.21 1945.28 0.01
## shouts 4.45 0 7736 7736 22.53 779.12 0.75
## adopter 0.00 0 0 0 NaN NaN 0.00
## tenure 22.24 1 111 110 0.05 -0.70 0.10
## good_country 0.00 0 1 1 0.59 -1.65 0.00
## --------------------------------------------------------
## group: 1
## vars n mean sd median trimmed
## ï..ID 1 3527 42064.00 1018.30 42064.00 42064.00
## age 2 3527 25.98 6.84 24.00 25.05
## male 3 3527 0.73 0.44 1.00 0.79
## friend_cnt 4 3527 39.73 117.27 16.00 23.69
## avg_friend_age 5 3527 25.44 5.21 24.36 24.83
## avg_friend_male 6 3527 0.64 0.25 0.67 0.65
## friend_country_cnt 7 3527 7.19 8.86 4.00 5.36
## subscriber_friend_cnt 8 3527 1.64 5.85 0.00 0.84
## songsListened 9 3527 33758.04 43592.73 20908.00 25811.69
## lovedTracks 10 3527 264.34 491.43 108.00 161.68
## posts 11 3527 21.20 221.99 0.00 1.44
## playlists 12 3527 0.90 2.56 1.00 0.59
## shouts 13 3527 99.44 1156.07 9.00 23.89
## adopter 14 3527 1.00 0.00 1.00 1.00
## tenure 15 3527 45.58 20.04 46.00 45.60
## good_country 16 3527 0.29 0.45 0.00 0.23
## mad min max range skew kurtosis se
## ï..ID 1307.65 40301 43827 3526 0.00 -1.20 17.15
## age 4.45 8 73 65 1.68 4.39 0.12
## male 0.00 0 1 1 -1.03 -0.94 0.01
## friend_cnt 17.79 1 5089 5088 26.04 1013.79 1.97
## avg_friend_age 3.91 12 62 50 1.68 5.05 0.09
## avg_friend_male 0.25 0 1 1 -0.54 -0.05 0.00
## friend_country_cnt 4.45 0 136 136 3.61 24.53 0.15
## subscriber_friend_cnt 0.00 0 287 287 34.05 1609.52 0.10
## songsListened 23276.82 0 817290 817290 4.71 46.64 734.03
## lovedTracks 140.85 0 10220 10220 6.52 80.96 8.27
## posts 0.00 0 8506 8506 26.52 852.38 3.74
## playlists 1.48 0 118 118 28.84 1244.31 0.04
## shouts 11.86 0 65872 65872 52.52 2969.09 19.47
## adopter 0.00 1 1 0 NaN NaN 0.00
## tenure 20.76 0 111 111 0.02 -0.62 0.34
## good_country 0.00 0 1 1 0.94 -1.12 0.01
#Take log of variables where values are too large compared to the others
High_Note <- High_Note %>% mutate(ln_songsListened = log(songsListened + 1))
High_Note <- High_Note %>% mutate(ln_lovedTracks = log(lovedTracks + 1))
#Start with some visualizations
ggcorr(High_Note, palette = "RdBu", label = TRUE)
pairs(~age+friend_cnt+ln_songsListened+posts, col=High_Note$adopter, data=High_Note, main="Scatterplot Matrix")
#Demographics
Age_plot <- ggplot(High_Note, aes(x = factor(adopter), y = age)) + geom_boxplot(aes(fill = factor(adopter))) + labs(x = "Adopter")
Age_plot
Age_plotbygender <- ggplot(High_Note, aes(x=factor(adopter), y=age, color = factor(male))) + geom_boxplot() + labs(x = "Adopter")
Age_plotbygender
#Peer influence
friend_cnt_plot <- ggplot(High_Note, aes(x = factor(adopter), y = friend_cnt)) + geom_boxplot(aes(fill = factor(adopter))) +ylim(0, 1250) +
labs(x = "Adopter")
friend_cnt_plot
## Warning: Removed 13 rows containing non-finite values (stat_boxplot).
subscriber_friend_cnt_plot <- ggplot(High_Note, aes(x = factor(adopter), y = subscriber_friend_cnt)) +
geom_boxplot(aes(fill = factor(adopter))) + ylim(0, 50) + labs(x = "Adopter")
subscriber_friend_cnt_plot
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
#user engagement
ln_songsListened_plot <- ggplot(High_Note, aes(x = factor(adopter), y = ln_songsListened)) + geom_boxplot(aes(fill = factor(adopter)))
ln_songsListened_plot
ln_lovedTracks_plot <- ggplot(High_Note, aes(x = factor(adopter), y = ln_lovedTracks)) + geom_boxplot(aes(fill = factor(adopter)))
ln_lovedTracks_plot
##Propensity Score Matching
#Transform subscriber friend count into the Treatment and control variable with Treatment being 1 and Control being 0
High_Note$Treatment <- ifelse(High_Note$subscriber_friend_cnt >= 1, 1, 0)
#For those who have 1 or more subscriber friends, on average 18% of them are premuim subsribers,
# while those who have 0 subscriber friends, on average 5% of them are premium subscribers.
High_Note %>%
group_by(Treatment) %>%
summarise(mean_adopter = mean(adopter))
## # A tibble: 2 x 2
## Treatment mean_adopter
## <dbl> <dbl>
## 1 0 0.0524
## 2 1 0.178
with(High_Note, t.test(adopter ~ Treatment))
##
## Welch Two Sample t-test
##
## data: adopter by Treatment
## t = -30.961, df = 11815, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1330281 -0.1171869
## sample estimates:
## mean in group 0 mean in group 1
## 0.05243501 0.17754250
Difference-in-means: pre-treatment covariates
Let’s calculate the mean for each covariate
High_Note_cov <- c("age","male","friend_cnt","avg_friend_age","avg_friend_male","friend_country_cnt","ln_songsListened",
"ln_lovedTracks","posts","playlists","shouts", "tenure","good_country")
High_Note %>%
group_by(Treatment) %>%
select(one_of(High_Note_cov)) %>%
summarise_all(funs(mean(., na.rm = T)))
## Adding missing grouping variables: `Treatment`
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
## # A tibble: 2 x 14
## Treatment age male friend_cnt avg_friend_age avg_friend_male
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 23.7 0.629 10.4 23.8 0.613
## 2 1 25.4 0.636 54.0 25.4 0.636
## # ... with 8 more variables: friend_country_cnt <dbl>,
## # ln_songsListened <dbl>, ln_lovedTracks <dbl>, posts <dbl>,
## # playlists <dbl>, shouts <dbl>, tenure <dbl>, good_country <dbl>
#T-test
list_of_var <- c("age","male","friend_cnt","avg_friend_age","avg_friend_male","friend_country_cnt","ln_songsListened",
"ln_lovedTracks","posts","playlists","shouts", "tenure","good_country")
lapply(list_of_var, function(v) {
t.test(High_Note[, v] ~ High_Note$Treatment)
})
## [[1]]
##
## Welch Two Sample t-test
##
## data: High_Note[, v] by High_Note$Treatment
## t = -20.841, df = 14645, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.778544 -1.472749
## sample estimates:
## mean in group 0 mean in group 1
## 23.74756 25.37321
##
##
## [[2]]
##
## Welch Two Sample t-test
##
## data: High_Note[, v] by High_Note$Treatment
## t = -1.3459, df = 15986, p-value = 0.1784
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.018236129 0.003388028
## sample estimates:
## mean in group 0 mean in group 1
## 0.6288378 0.6362618
##
##
## [[3]]
##
## Welch Two Sample t-test
##
## data: High_Note[, v] by High_Note$Treatment
## t = -33.707, df = 9903.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -46.12459 -41.05469
## sample estimates:
## mean in group 0 mean in group 1
## 10.43133 54.02097
##
##
## [[4]]
##
## Welch Two Sample t-test
##
## data: High_Note[, v] by High_Note$Treatment
## t = -27.658, df = 15667, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.744514 -1.513611
## sample estimates:
## mean in group 0 mean in group 1
## 23.76137 25.39043
##
##
## [[5]]
##
## Welch Two Sample t-test
##
## data: High_Note[, v] by High_Note$Treatment
## t = -7.7114, df = 23020, p-value = 1.294e-14
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.02846397 -0.01692672
## sample estimates:
## mean in group 0 mean in group 1
## 0.6131124 0.6358077
##
##
## [[6]]
##
## Welch Two Sample t-test
##
## data: High_Note[, v] by High_Note$Treatment
## t = -65.05, df = 10372, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.861271 -6.459857
## sample estimates:
## mean in group 0 mean in group 1
## 2.725062 9.385626
##
##
## [[7]]
##
## Welch Two Sample t-test
##
## data: High_Note[, v] by High_Note$Treatment
## t = -72.169, df = 24545, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.702958 -1.612902
## sample estimates:
## mean in group 0 mean in group 1
## 7.944104 9.602034
##
##
## [[8]]
##
## Welch Two Sample t-test
##
## data: High_Note[, v] by High_Note$Treatment
## t = -64.938, df = 15507, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.557214 -1.465962
## sample estimates:
## mean in group 0 mean in group 1
## 2.446598 3.958186
##
##
## [[9]]
##
## Welch Two Sample t-test
##
## data: High_Note[, v] by High_Note$Treatment
## t = -7.3649, df = 9933.6, p-value = 1.914e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -22.76492 -13.19424
## sample estimates:
## mean in group 0 mean in group 1
## 2.543377 20.522956
##
##
## [[10]]
##
## Welch Two Sample t-test
##
## data: High_Note[, v] by High_Note$Treatment
## t = -10.492, df = 11238, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2546958 -0.1745100
## sample estimates:
## mean in group 0 mean in group 1
## 0.5294671 0.7440700
##
##
## [[11]]
##
## Welch Two Sample t-test
##
## data: High_Note[, v] by High_Note$Treatment
## t = -11.426, df = 9888.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -100.04703 -70.74591
## sample estimates:
## mean in group 0 mean in group 1
## 16.42304 101.81951
##
##
## [[12]]
##
## Welch Two Sample t-test
##
## data: High_Note[, v] by High_Note$Treatment
## t = -14.696, df = 15805, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.792309 -2.899752
## sample estimates:
## mean in group 0 mean in group 1
## 43.20268 46.54871
##
##
## [[13]]
##
## Welch Two Sample t-test
##
## data: High_Note[, v] by High_Note$Treatment
## t = 2.0956, df = 16030, p-value = 0.03613
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.0007383591 0.0220968020
## sample estimates:
## mean in group 0 mean in group 1
## 0.3546936 0.3432760
Propensity score estimation We estimate the propensity score by running a logit model (probit also works) where the outcome variable is a binary variable indicating treatment status. what covariates should we include? For the matching to give you a causal estimate in the end, you need to include any covariate that is related to both the treatment assignment and potential outcomes. Therefore at this moment I choose to include all variables
m_ps <- glm(Treatment ~ age + male + friend_cnt + avg_friend_age + avg_friend_male + friend_country_cnt +
ln_songsListened + ln_lovedTracks + posts + playlists + shouts + tenure + good_country,
family = binomial(), data = High_Note)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m_ps)
##
## Call:
## glm(formula = Treatment ~ age + male + friend_cnt + avg_friend_age +
## avg_friend_male + friend_country_cnt + ln_songsListened +
## ln_lovedTracks + posts + playlists + shouts + tenure + good_country,
## family = binomial(), data = High_Note)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.1675 -0.5870 -0.4007 -0.1806 6.8050
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.7989512 0.1066723 -63.737 < 2e-16 ***
## age 0.0211928 0.0028963 7.317 2.53e-13 ***
## male 0.0127828 0.0303040 0.422 0.67316
## friend_cnt 0.0281816 0.0010230 27.549 < 2e-16 ***
## avg_friend_age 0.0836847 0.0036034 23.224 < 2e-16 ***
## avg_friend_male 0.2322512 0.0524685 4.426 9.58e-06 ***
## friend_country_cnt 0.1021891 0.0046711 21.877 < 2e-16 ***
## ln_songsListened 0.1700614 0.0088009 19.323 < 2e-16 ***
## ln_lovedTracks 0.1282609 0.0076995 16.658 < 2e-16 ***
## posts 0.0006760 0.0002331 2.900 0.00373 **
## playlists -0.0108768 0.0131725 -0.826 0.40897
## shouts -0.0004121 0.0001525 -2.703 0.00688 **
## tenure -0.0031864 0.0007966 -4.000 6.33e-05 ***
## good_country 0.0032863 0.0293230 0.112 0.91077
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 46640 on 43826 degrees of freedom
## Residual deviance: 33588 on 43813 degrees of freedom
## AIC: 33616
##
## Number of Fisher Scoring iterations: 25
Using this model, we can now calculate the propensity score for each user It is simply the user’s predicted probability of being Treated, given the estimates from the logit model.
prs_df <- data.frame(pr_score = predict(m_ps, type = "response"),
Treatment = m_ps$model$Treatment)
head(prs_df)
## pr_score Treatment
## 1 0.12150210 0
## 2 0.03447076 0
## 3 0.04815195 0
## 4 0.22831497 1
## 5 0.65092056 0
## 6 0.18205340 0
Examining the region of common support After estimating the propensity score, it is useful to plot histograms of the estimated propensity scores by treatment status
labs <- paste("Treatment type:", c("1 or more friends", "0 friends"))
prs_df %>%
mutate(SubscriberType = ifelse(Treatment == 1, labs[1], labs[2])) %>%
ggplot(aes(x = pr_score)) +
geom_histogram(color = "white", binwidth = 0.025) +
facet_wrap(~SubscriberType) +
xlab("Probability of Treatment") +
theme_bw()
The method we use below is to find pairs of observations that have very similar propensity scores, but that differ in their treatment status. We use the package MatchIt for this. This package estimates the propensity score in the background and then matches observations based on the method of choice (“nearest” in this case)
High_Note_nomiss <- High_Note %>% # MatchIt does not allow missing values
select(adopter, Treatment, one_of(High_Note_cov)) %>%
na.omit()
mod_match <- matchit(Treatment ~ age + male + friend_cnt + avg_friend_age + avg_friend_male + friend_country_cnt +
ln_songsListened + ln_lovedTracks + posts + playlists + shouts + tenure + good_country,
method = "nearest", data = High_Note_nomiss)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
We can get some information about how successful the matching was using summary(mod_match) and plot(mod_match)
summary(mod_match)
##
## Call:
## matchit(formula = Treatment ~ age + male + friend_cnt + avg_friend_age +
## avg_friend_male + friend_country_cnt + ln_songsListened +
## ln_lovedTracks + posts + playlists + shouts + tenure + good_country,
## data = High_Note_nomiss, method = "nearest")
##
## Summary of balance for all data:
## Means Treated Means Control SD Control Mean Diff
## distance 0.4689 0.1532 0.1496 0.3157
## age 25.3732 23.7476 6.2245 1.6256
## male 0.6363 0.6288 0.4831 0.0074
## friend_cnt 54.0210 10.4313 15.2769 43.5896
## avg_friend_age 25.3904 23.7614 5.0577 1.6291
## avg_friend_male 0.6358 0.6131 0.3343 0.0227
## friend_country_cnt 9.3856 2.7251 3.1024 6.6606
## ln_songsListened 9.6020 7.9441 2.7002 1.6579
## ln_lovedTracks 3.9582 2.4466 1.9781 1.5116
## posts 20.5230 2.5434 33.7947 17.9796
## playlists 0.7441 0.5295 0.9673 0.2146
## shouts 101.8195 16.4230 79.7381 85.3965
## tenure 46.5487 43.2027 19.7212 3.3460
## good_country 0.3433 0.3547 0.4784 -0.0114
## eQQ Med eQQ Mean eQQ Max
## distance 0.2772 0.3157 0.6429
## age 1.0000 1.6296 5.0000
## male 0.0000 0.0074 1.0000
## friend_cnt 22.0000 43.5838 4794.0000
## avg_friend_age 1.5909 1.6369 11.5000
## avg_friend_male 0.0738 0.0958 0.3636
## friend_country_cnt 5.0000 6.6598 95.0000
## ln_songsListened 1.2880 1.6583 6.0283
## ln_lovedTracks 1.5315 1.5115 2.8332
## posts 0.0000 17.8829 9535.0000
## playlists 0.0000 0.2092 26.0000
## shouts 15.0000 85.1764 59168.0000
## tenure 3.0000 3.3473 10.0000
## good_country 0.0000 0.0114 1.0000
##
##
## Summary of balance for matched data:
## Means Treated Means Control SD Control Mean Diff
## distance 0.4689 0.3151 0.1854 0.1538
## age 25.3732 25.7794 7.5478 -0.4062
## male 0.6363 0.6585 0.4743 -0.0222
## friend_cnt 54.0210 21.5264 23.3770 32.4946
## avg_friend_age 25.3904 26.0051 6.4510 -0.6146
## avg_friend_male 0.6358 0.6478 0.2587 -0.0120
## friend_country_cnt 9.3856 5.0678 4.6172 4.3178
## ln_songsListened 9.6020 9.4839 1.6281 0.1181
## ln_lovedTracks 3.9582 3.7265 1.8645 0.2317
## posts 20.5230 6.2683 60.7389 14.2546
## playlists 0.7441 0.6678 0.9905 0.0762
## shouts 101.8195 36.7657 134.5236 65.0539
## tenure 46.5487 47.1905 19.1378 -0.6418
## good_country 0.3433 0.3602 0.4801 -0.0169
## eQQ Med eQQ Mean eQQ Max
## distance 0.1147 0.1538 0.4184
## age 0.0000 0.4599 6.0000
## male 0.0000 0.0222 1.0000
## friend_cnt 12.0000 32.4946 4794.0000
## avg_friend_age 0.3333 0.8619 14.0000
## avg_friend_male 0.0119 0.0258 0.1463
## friend_country_cnt 2.0000 4.3178 95.0000
## ln_songsListened 0.1447 0.1653 2.1972
## ln_lovedTracks 0.3081 0.2664 0.8255
## posts 0.0000 14.2546 9535.0000
## playlists 0.0000 0.1194 95.0000
## shouts 9.0000 65.0539 59168.0000
## tenure 1.0000 0.9592 3.0000
## good_country 0.0000 0.0169 1.0000
##
## Percent Balance Improvement:
## Mean Diff. eQQ Med eQQ Mean eQQ Max
## distance 51.2952 58.6333 51.2929 34.9201
## age 75.0137 100.0000 71.7766 -20.0000
## male -198.9313 0.0000 -198.6301 0.0000
## friend_cnt 25.4535 45.4545 25.4436 0.0000
## avg_friend_age 62.2701 79.0476 47.3451 -21.7391
## avg_friend_male 47.2679 83.8776 73.0291 59.7722
## friend_country_cnt 35.1733 60.0000 35.1656 0.0000
## ln_songsListened 92.8737 88.7676 90.0310 63.5514
## ln_lovedTracks 84.6695 79.8827 82.3770 70.8626
## posts 20.7178 0.0000 20.2893 0.0000
## playlists 64.4694 0.0000 42.9197 -265.3846
## shouts 23.8214 40.0000 23.6246 0.0000
## tenure 80.8203 66.6667 71.3452 70.0000
## good_country -48.0096 0.0000 -48.2143 0.0000
##
## Sample sizes:
## Control Treated
## All 34004 9823
## Matched 9823 9823
## Unmatched 24181 0
## Discarded 0 0
plot(mod_match)
To create a dataframe containing only the matched observations, use the match.data() function
dta_m <- match.data(mod_match)
dim(dta_m)
## [1] 19646 17
Examining covariate balance in the matched sample Visual Inspection
fn_bal <- function(dta, variable) {
dta$variable <- dta[, variable]
dta$Treatment <- as.factor(dta$Treatment)
support <- c(min(dta$variable), max(dta$variable))
ggplot(dta, aes(x = distance, y = variable, color = Treatment)) +
geom_point(alpha = 0.2, size = 1.3) +
geom_smooth(method = "loess", se = F) +
xlab("Propensity score") +
ylab(variable) +
theme_bw() +
ylim(support)
}
grid.arrange(
fn_bal(dta_m, "age"),
fn_bal(dta_m, "male") + theme(legend.position = "none"),
fn_bal(dta_m, "friend_cnt"),
fn_bal(dta_m, "avg_friend_age") + theme(legend.position = "none"),
fn_bal(dta_m, "friend_country_cnt"),
fn_bal(dta_m, "ln_songsListened") + theme(legend.position = "none"),
fn_bal(dta_m, "ln_lovedTracks"),
fn_bal(dta_m, "playlists") + theme(legend.position = "none"),
fn_bal(dta_m, "tenure"),
fn_bal(dta_m, "good_country") + theme(legend.position = "none"),
nrow = 6, widths = c(1, 0.8)
)
Difference of Means
dta_m %>%
group_by(Treatment) %>%
select(one_of(High_Note_cov)) %>%
summarise_all(funs(mean))
## Adding missing grouping variables: `Treatment`
## # A tibble: 2 x 14
## Treatment age male friend_cnt avg_friend_age avg_friend_male
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 25.8 0.658 21.5 26.0 0.648
## 2 1 25.4 0.636 54.0 25.4 0.636
## # ... with 8 more variables: friend_country_cnt <dbl>,
## # ln_songsListened <dbl>, ln_lovedTracks <dbl>, posts <dbl>,
## # playlists <dbl>, shouts <dbl>, tenure <dbl>, good_country <dbl>
lapply(High_Note_cov, function(v) {
t.test(dta_m[, v] ~ dta_m$Treatment)
})
## [[1]]
##
## Welch Two Sample t-test
##
## data: dta_m[, v] by dta_m$Treatment
## t = 3.9186, df = 19521, p-value = 8.937e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.2030131 0.6093660
## sample estimates:
## mean in group 0 mean in group 1
## 25.77940 25.37321
##
##
## [[2]]
##
## Welch Two Sample t-test
##
## data: dta_m[, v] by dta_m$Treatment
## t = 3.2559, df = 19640, p-value = 0.001132
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.008832644 0.035552982
## sample estimates:
## mean in group 0 mean in group 1
## 0.6584546 0.6362618
##
##
## [[3]]
##
## Welch Two Sample t-test
##
## data: dta_m[, v] by dta_m$Treatment
## t = -24.769, df = 10477, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -35.06619 -29.92292
## sample estimates:
## mean in group 0 mean in group 1
## 21.52642 54.02097
##
##
## [[4]]
##
## Welch Two Sample t-test
##
## data: dta_m[, v] by dta_m$Treatment
## t = 7.3709, df = 18749, p-value = 1.765e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.4511959 0.7780907
## sample estimates:
## mean in group 0 mean in group 1
## 26.00507 25.39043
##
##
## [[5]]
##
## Welch Two Sample t-test
##
## data: dta_m[, v] by dta_m$Treatment
## t = 3.428, df = 19374, p-value = 0.0006094
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.005124659 0.018810817
## sample estimates:
## mean in group 0 mean in group 1
## 0.6477754 0.6358077
##
##
## [[6]]
##
## Welch Two Sample t-test
##
## data: dta_m[, v] by dta_m$Treatment
## t = -38.82, df = 13820, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.535843 -4.099808
## sample estimates:
## mean in group 0 mean in group 1
## 5.067800 9.385626
##
##
## [[7]]
##
## Welch Two Sample t-test
##
## data: dta_m[, v] by dta_m$Treatment
## t = -4.8926, df = 19535, p-value = 1.003e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1654825 -0.0708156
## sample estimates:
## mean in group 0 mean in group 1
## 9.483885 9.602034
##
##
## [[8]]
##
## Welch Two Sample t-test
##
## data: dta_m[, v] by dta_m$Treatment
## t = -8.294, df = 19474, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2864991 -0.1769691
## sample estimates:
## mean in group 0 mean in group 1
## 3.726452 3.958186
##
##
## [[9]]
##
## Welch Two Sample t-test
##
## data: dta_m[, v] by dta_m$Treatment
## t = -5.6784, df = 11062, p-value = 1.394e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -19.175269 -9.333944
## sample estimates:
## mean in group 0 mean in group 1
## 6.26835 20.52296
##
##
## [[10]]
##
## Welch Two Sample t-test
##
## data: dta_m[, v] by dta_m$Treatment
## t = -3.4421, df = 14535, p-value = 0.0005789
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.11967088 -0.03282836
## sample estimates:
## mean in group 0 mean in group 1
## 0.6678204 0.7440700
##
##
## [[11]]
##
## Welch Two Sample t-test
##
## data: dta_m[, v] by dta_m$Treatment
## t = -8.5779, df = 10471, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -79.91979 -50.18792
## sample estimates:
## mean in group 0 mean in group 1
## 36.76565 101.81951
##
##
## [[12]]
##
## Welch Two Sample t-test
##
## data: dta_m[, v] by dta_m$Treatment
## t = 2.3025, df = 19612, p-value = 0.02132
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.09544044 1.18807784
## sample estimates:
## mean in group 0 mean in group 1
## 47.19047 46.54871
##
##
## [[13]]
##
## Welch Two Sample t-test
##
## data: dta_m[, v] by dta_m$Treatment
## t = 2.4805, df = 19642, p-value = 0.01313
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.003545363 0.030252866
## sample estimates:
## mean in group 0 mean in group 1
## 0.3601751 0.3432760
Estimating treatment effects
Estimating the treatment effect is simple once we have a matched sample that we are happy with. We can use a t-test:
with(dta_m, t.test(adopter ~ Treatment))
##
## Welch Two Sample t-test
##
## data: adopter by Treatment
## t = -16.725, df = 18453, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.09098592 -0.07189711
## sample estimates:
## mean in group 0 mean in group 1
## 0.09610099 0.17754250
Or we can use OLS with or without covariates:
glm_treat1 <- glm(adopter ~ Treatment, data = dta_m, family = "binomial")
summary(glm_treat1)
##
## Call:
## glm(formula = adopter ~ Treatment, family = "binomial", data = dta_m)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6252 -0.6252 -0.4495 -0.4495 2.1644
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.24132 0.03423 -65.48 <2e-16 ***
## Treatment 0.70823 0.04323 16.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 15683 on 19645 degrees of freedom
## Residual deviance: 15404 on 19644 degrees of freedom
## AIC: 15408
##
## Number of Fisher Scoring iterations: 4
glm_treat2 <- glm(adopter ~ age + male + friend_cnt + avg_friend_age + avg_friend_male + friend_country_cnt +
ln_songsListened + ln_lovedTracks + posts + playlists + shouts + tenure + good_country + Treatment,
data = dta_m, family = "binomial")
summary(glm_treat2)
##
## Call:
## glm(formula = adopter ~ age + male + friend_cnt + avg_friend_age +
## avg_friend_male + friend_country_cnt + ln_songsListened +
## ln_lovedTracks + posts + playlists + shouts + tenure + good_country +
## Treatment, family = "binomial", data = dta_m)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8606 -0.5791 -0.4414 -0.3078 3.0933
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.432e+00 2.287e-01 -28.118 < 2e-16 ***
## age 1.562e-02 4.257e-03 3.670 0.000243 ***
## male 2.999e-01 4.886e-02 6.138 8.38e-10 ***
## friend_cnt 2.092e-04 2.780e-04 0.752 0.451845
## avg_friend_age 2.881e-02 5.681e-03 5.071 3.96e-07 ***
## avg_friend_male 9.103e-02 9.660e-02 0.942 0.346005
## friend_country_cnt -4.700e-03 3.701e-03 -1.270 0.204087
## ln_songsListened 2.035e-01 1.928e-02 10.554 < 2e-16 ***
## ln_lovedTracks 2.672e-01 1.342e-02 19.916 < 2e-16 ***
## posts 1.653e-04 9.384e-05 1.762 0.078131 .
## playlists 3.776e-02 1.315e-02 2.872 0.004084 **
## shouts 9.645e-05 7.059e-05 1.366 0.171798
## tenure -3.914e-03 1.262e-03 -3.101 0.001928 **
## good_country -3.986e-01 4.780e-02 -8.339 < 2e-16 ***
## Treatment 6.379e-01 4.628e-02 13.783 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 15683 on 19645 degrees of freedom
## Residual deviance: 14366 on 19631 degrees of freedom
## AIC: 14396
##
## Number of Fisher Scoring iterations: 5
In a logistic regression the response being modeled is the log(odds) that Y = 1 The regression coefficient gives the change in log(odds) in the response for a unit change in the predictor variable because log(odds) are difficult to interpret, we can exponentiate them
We can see that the odds of being a premium subscriber are increased by a factor of 1.499 when and individual is male
exp(coef(glm_treat2))
## (Intercept) age male
## 0.001609923 1.015744376 1.349707685
## friend_cnt avg_friend_age avg_friend_male
## 1.000209180 1.029229423 1.095303373
## friend_country_cnt ln_songsListened ln_lovedTracks
## 0.995310871 1.225697366 1.306303144
## posts playlists shouts
## 1.000165322 1.038485243 1.000096459
## tenure good_country Treatment
## 0.996093907 0.671239518 1.892417465