Peer Influence for a music-streaming company

Peer Influence for a music-streaming company

# Load relevant packages
library(dplyr)
library(ggplot2)
library(corrplot)
library(MatchIt)
library(psych)
library(GGally)
library(gridExtra)
#read the dataset
High_Note <- read.csv("Data/peer_influence.csv", header = TRUE)
describeBy(High_Note, group = High_Note$adopter, mat = FALSE, digits=2)
## 
##  Descriptive statistics by group 
## group: 0
##                       vars     n     mean       sd   median  trimmed
## ï..ID                    1 40300 20150.50 11633.75 20150.50 20150.50
## age                      2 40300    23.95     6.37    23.00    23.09
## male                     3 40300     0.62     0.48     1.00     0.65
## friend_cnt               4 40300    18.49    57.48     7.00    10.28
## avg_friend_age           5 40300    24.01     5.10    23.00    23.40
## avg_friend_male          6 40300     0.62     0.32     0.67     0.65
## friend_country_cnt       7 40300     3.96     5.76     2.00     2.66
## subscriber_friend_cnt    8 40300     0.42     2.42     0.00     0.13
## songsListened            9 40300 17589.44 28416.02  7440.00 11817.64
## lovedTracks             10 40300    86.82   263.58    14.00    36.35
## posts                   11 40300     5.29   104.31     0.00     0.23
## playlists               12 40300     0.55     1.07     0.00     0.45
## shouts                  13 40300    29.97   150.69     4.00     8.84
## adopter                 14 40300     0.00     0.00     0.00     0.00
## tenure                  15 40300    43.81    19.79    44.00    43.72
## good_country            16 40300     0.36     0.48     0.00     0.32
##                            mad min     max   range  skew kurtosis     se
## ï..ID                 14937.19   1   40300   40299  0.00    -1.20  57.95
## age                       4.45   8      79      71  1.97     6.80   0.03
## male                      0.00   0       1       1 -0.50    -1.75   0.00
## friend_cnt                7.41   1    4957    4956 32.67  2087.42   0.29
## avg_friend_age            3.95   8      77      69  1.84     7.15   0.03
## avg_friend_male           0.35   0       1       1 -0.52    -0.72   0.00
## friend_country_cnt        1.48   0     129     129  4.74    38.29   0.03
## subscriber_friend_cnt     0.00   0     309     309 72.19  8024.62   0.01
## songsListened         10576.87   0 1000000 1000000  6.05   105.85 141.55
## lovedTracks              20.76   0   12522   12522 13.12   335.93   1.31
## posts                     0.00   0   12309   12309 73.92  7005.34   0.52
## playlists                 0.00   0      98      98 28.21  1945.28   0.01
## shouts                    4.45   0    7736    7736 22.53   779.12   0.75
## adopter                   0.00   0       0       0   NaN      NaN   0.00
## tenure                   22.24   1     111     110  0.05    -0.70   0.10
## good_country              0.00   0       1       1  0.59    -1.65   0.00
## -------------------------------------------------------- 
## group: 1
##                       vars    n     mean       sd   median  trimmed
## ï..ID                    1 3527 42064.00  1018.30 42064.00 42064.00
## age                      2 3527    25.98     6.84    24.00    25.05
## male                     3 3527     0.73     0.44     1.00     0.79
## friend_cnt               4 3527    39.73   117.27    16.00    23.69
## avg_friend_age           5 3527    25.44     5.21    24.36    24.83
## avg_friend_male          6 3527     0.64     0.25     0.67     0.65
## friend_country_cnt       7 3527     7.19     8.86     4.00     5.36
## subscriber_friend_cnt    8 3527     1.64     5.85     0.00     0.84
## songsListened            9 3527 33758.04 43592.73 20908.00 25811.69
## lovedTracks             10 3527   264.34   491.43   108.00   161.68
## posts                   11 3527    21.20   221.99     0.00     1.44
## playlists               12 3527     0.90     2.56     1.00     0.59
## shouts                  13 3527    99.44  1156.07     9.00    23.89
## adopter                 14 3527     1.00     0.00     1.00     1.00
## tenure                  15 3527    45.58    20.04    46.00    45.60
## good_country            16 3527     0.29     0.45     0.00     0.23
##                            mad   min    max  range  skew kurtosis     se
## ï..ID                  1307.65 40301  43827   3526  0.00    -1.20  17.15
## age                       4.45     8     73     65  1.68     4.39   0.12
## male                      0.00     0      1      1 -1.03    -0.94   0.01
## friend_cnt               17.79     1   5089   5088 26.04  1013.79   1.97
## avg_friend_age            3.91    12     62     50  1.68     5.05   0.09
## avg_friend_male           0.25     0      1      1 -0.54    -0.05   0.00
## friend_country_cnt        4.45     0    136    136  3.61    24.53   0.15
## subscriber_friend_cnt     0.00     0    287    287 34.05  1609.52   0.10
## songsListened         23276.82     0 817290 817290  4.71    46.64 734.03
## lovedTracks             140.85     0  10220  10220  6.52    80.96   8.27
## posts                     0.00     0   8506   8506 26.52   852.38   3.74
## playlists                 1.48     0    118    118 28.84  1244.31   0.04
## shouts                   11.86     0  65872  65872 52.52  2969.09  19.47
## adopter                   0.00     1      1      0   NaN      NaN   0.00
## tenure                   20.76     0    111    111  0.02    -0.62   0.34
## good_country              0.00     0      1      1  0.94    -1.12   0.01
#Take log of variables where values are too large compared to the others
High_Note <- High_Note %>% mutate(ln_songsListened = log(songsListened + 1))
High_Note <- High_Note %>% mutate(ln_lovedTracks = log(lovedTracks + 1))
#Start with some visualizations
ggcorr(High_Note, palette = "RdBu", label = TRUE)

pairs(~age+friend_cnt+ln_songsListened+posts, col=High_Note$adopter, data=High_Note, main="Scatterplot Matrix")

#Demographics
Age_plot <- ggplot(High_Note, aes(x = factor(adopter), y = age)) + geom_boxplot(aes(fill = factor(adopter))) + labs(x = "Adopter")
Age_plot

Age_plotbygender <- ggplot(High_Note, aes(x=factor(adopter), y=age, color = factor(male))) + geom_boxplot() + labs(x = "Adopter")
Age_plotbygender

#Peer influence
friend_cnt_plot <- ggplot(High_Note, aes(x = factor(adopter), y = friend_cnt)) + geom_boxplot(aes(fill = factor(adopter))) +ylim(0, 1250) +
  labs(x = "Adopter")
friend_cnt_plot
## Warning: Removed 13 rows containing non-finite values (stat_boxplot).

subscriber_friend_cnt_plot <- ggplot(High_Note, aes(x = factor(adopter), y = subscriber_friend_cnt)) + 
  geom_boxplot(aes(fill = factor(adopter))) + ylim(0, 50)  + labs(x = "Adopter")
subscriber_friend_cnt_plot
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).

#user engagement 
ln_songsListened_plot <- ggplot(High_Note, aes(x = factor(adopter), y = ln_songsListened)) + geom_boxplot(aes(fill = factor(adopter)))
ln_songsListened_plot

ln_lovedTracks_plot <- ggplot(High_Note, aes(x = factor(adopter), y = ln_lovedTracks)) + geom_boxplot(aes(fill = factor(adopter)))
ln_lovedTracks_plot

##Propensity Score Matching

#Transform subscriber friend count into the Treatment and control variable with Treatment being 1 and Control being 0
High_Note$Treatment <- ifelse(High_Note$subscriber_friend_cnt >= 1, 1, 0)
#For those who have 1 or more subscriber friends, on average 18% of them are premuim subsribers, 
# while those who have 0 subscriber friends, on average 5%  of them are premium subscribers. 

High_Note %>%
  group_by(Treatment) %>%
  summarise(mean_adopter = mean(adopter))
## # A tibble: 2 x 2
##   Treatment mean_adopter
##       <dbl>        <dbl>
## 1         0       0.0524
## 2         1       0.178
with(High_Note, t.test(adopter ~ Treatment))
## 
##  Welch Two Sample t-test
## 
## data:  adopter by Treatment
## t = -30.961, df = 11815, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1330281 -0.1171869
## sample estimates:
## mean in group 0 mean in group 1 
##      0.05243501      0.17754250

Difference-in-means: pre-treatment covariates

Let’s calculate the mean for each covariate

High_Note_cov <- c("age","male","friend_cnt","avg_friend_age","avg_friend_male","friend_country_cnt","ln_songsListened", 
              "ln_lovedTracks","posts","playlists","shouts", "tenure","good_country")
High_Note %>%
  group_by(Treatment) %>%
  select(one_of(High_Note_cov)) %>%
  summarise_all(funs(mean(., na.rm = T)))
## Adding missing grouping variables: `Treatment`
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
## # A tibble: 2 x 14
##   Treatment   age  male friend_cnt avg_friend_age avg_friend_male
##       <dbl> <dbl> <dbl>      <dbl>          <dbl>           <dbl>
## 1         0  23.7 0.629       10.4           23.8           0.613
## 2         1  25.4 0.636       54.0           25.4           0.636
## # ... with 8 more variables: friend_country_cnt <dbl>,
## #   ln_songsListened <dbl>, ln_lovedTracks <dbl>, posts <dbl>,
## #   playlists <dbl>, shouts <dbl>, tenure <dbl>, good_country <dbl>
#T-test
list_of_var <- c("age","male","friend_cnt","avg_friend_age","avg_friend_male","friend_country_cnt","ln_songsListened", 
              "ln_lovedTracks","posts","playlists","shouts", "tenure","good_country")
lapply(list_of_var, function(v) {
  t.test(High_Note[, v] ~ High_Note$Treatment)
})
## [[1]]
## 
##  Welch Two Sample t-test
## 
## data:  High_Note[, v] by High_Note$Treatment
## t = -20.841, df = 14645, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.778544 -1.472749
## sample estimates:
## mean in group 0 mean in group 1 
##        23.74756        25.37321 
## 
## 
## [[2]]
## 
##  Welch Two Sample t-test
## 
## data:  High_Note[, v] by High_Note$Treatment
## t = -1.3459, df = 15986, p-value = 0.1784
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.018236129  0.003388028
## sample estimates:
## mean in group 0 mean in group 1 
##       0.6288378       0.6362618 
## 
## 
## [[3]]
## 
##  Welch Two Sample t-test
## 
## data:  High_Note[, v] by High_Note$Treatment
## t = -33.707, df = 9903.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -46.12459 -41.05469
## sample estimates:
## mean in group 0 mean in group 1 
##        10.43133        54.02097 
## 
## 
## [[4]]
## 
##  Welch Two Sample t-test
## 
## data:  High_Note[, v] by High_Note$Treatment
## t = -27.658, df = 15667, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.744514 -1.513611
## sample estimates:
## mean in group 0 mean in group 1 
##        23.76137        25.39043 
## 
## 
## [[5]]
## 
##  Welch Two Sample t-test
## 
## data:  High_Note[, v] by High_Note$Treatment
## t = -7.7114, df = 23020, p-value = 1.294e-14
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.02846397 -0.01692672
## sample estimates:
## mean in group 0 mean in group 1 
##       0.6131124       0.6358077 
## 
## 
## [[6]]
## 
##  Welch Two Sample t-test
## 
## data:  High_Note[, v] by High_Note$Treatment
## t = -65.05, df = 10372, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6.861271 -6.459857
## sample estimates:
## mean in group 0 mean in group 1 
##        2.725062        9.385626 
## 
## 
## [[7]]
## 
##  Welch Two Sample t-test
## 
## data:  High_Note[, v] by High_Note$Treatment
## t = -72.169, df = 24545, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.702958 -1.612902
## sample estimates:
## mean in group 0 mean in group 1 
##        7.944104        9.602034 
## 
## 
## [[8]]
## 
##  Welch Two Sample t-test
## 
## data:  High_Note[, v] by High_Note$Treatment
## t = -64.938, df = 15507, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.557214 -1.465962
## sample estimates:
## mean in group 0 mean in group 1 
##        2.446598        3.958186 
## 
## 
## [[9]]
## 
##  Welch Two Sample t-test
## 
## data:  High_Note[, v] by High_Note$Treatment
## t = -7.3649, df = 9933.6, p-value = 1.914e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -22.76492 -13.19424
## sample estimates:
## mean in group 0 mean in group 1 
##        2.543377       20.522956 
## 
## 
## [[10]]
## 
##  Welch Two Sample t-test
## 
## data:  High_Note[, v] by High_Note$Treatment
## t = -10.492, df = 11238, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.2546958 -0.1745100
## sample estimates:
## mean in group 0 mean in group 1 
##       0.5294671       0.7440700 
## 
## 
## [[11]]
## 
##  Welch Two Sample t-test
## 
## data:  High_Note[, v] by High_Note$Treatment
## t = -11.426, df = 9888.1, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -100.04703  -70.74591
## sample estimates:
## mean in group 0 mean in group 1 
##        16.42304       101.81951 
## 
## 
## [[12]]
## 
##  Welch Two Sample t-test
## 
## data:  High_Note[, v] by High_Note$Treatment
## t = -14.696, df = 15805, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3.792309 -2.899752
## sample estimates:
## mean in group 0 mean in group 1 
##        43.20268        46.54871 
## 
## 
## [[13]]
## 
##  Welch Two Sample t-test
## 
## data:  High_Note[, v] by High_Note$Treatment
## t = 2.0956, df = 16030, p-value = 0.03613
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.0007383591 0.0220968020
## sample estimates:
## mean in group 0 mean in group 1 
##       0.3546936       0.3432760

Propensity score estimation We estimate the propensity score by running a logit model (probit also works) where the outcome variable is a binary variable indicating treatment status. what covariates should we include? For the matching to give you a causal estimate in the end, you need to include any covariate that is related to both the treatment assignment and potential outcomes. Therefore at this moment I choose to include all variables

m_ps <- glm(Treatment ~ age + male + friend_cnt + avg_friend_age + avg_friend_male + friend_country_cnt +  
               ln_songsListened + ln_lovedTracks + posts + playlists + shouts + tenure + good_country,
             family = binomial(), data = High_Note)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m_ps)
## 
## Call:
## glm(formula = Treatment ~ age + male + friend_cnt + avg_friend_age + 
##     avg_friend_male + friend_country_cnt + ln_songsListened + 
##     ln_lovedTracks + posts + playlists + shouts + tenure + good_country, 
##     family = binomial(), data = High_Note)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.1675  -0.5870  -0.4007  -0.1806   6.8050  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -6.7989512  0.1066723 -63.737  < 2e-16 ***
## age                 0.0211928  0.0028963   7.317 2.53e-13 ***
## male                0.0127828  0.0303040   0.422  0.67316    
## friend_cnt          0.0281816  0.0010230  27.549  < 2e-16 ***
## avg_friend_age      0.0836847  0.0036034  23.224  < 2e-16 ***
## avg_friend_male     0.2322512  0.0524685   4.426 9.58e-06 ***
## friend_country_cnt  0.1021891  0.0046711  21.877  < 2e-16 ***
## ln_songsListened    0.1700614  0.0088009  19.323  < 2e-16 ***
## ln_lovedTracks      0.1282609  0.0076995  16.658  < 2e-16 ***
## posts               0.0006760  0.0002331   2.900  0.00373 ** 
## playlists          -0.0108768  0.0131725  -0.826  0.40897    
## shouts             -0.0004121  0.0001525  -2.703  0.00688 ** 
## tenure             -0.0031864  0.0007966  -4.000 6.33e-05 ***
## good_country        0.0032863  0.0293230   0.112  0.91077    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 46640  on 43826  degrees of freedom
## Residual deviance: 33588  on 43813  degrees of freedom
## AIC: 33616
## 
## Number of Fisher Scoring iterations: 25

Using this model, we can now calculate the propensity score for each user It is simply the user’s predicted probability of being Treated, given the estimates from the logit model.

prs_df <- data.frame(pr_score = predict(m_ps, type = "response"),
                     Treatment = m_ps$model$Treatment)
head(prs_df)
##     pr_score Treatment
## 1 0.12150210         0
## 2 0.03447076         0
## 3 0.04815195         0
## 4 0.22831497         1
## 5 0.65092056         0
## 6 0.18205340         0

Examining the region of common support After estimating the propensity score, it is useful to plot histograms of the estimated propensity scores by treatment status

labs <- paste("Treatment type:", c("1 or more friends", "0 friends"))
prs_df %>%
  mutate(SubscriberType = ifelse(Treatment == 1, labs[1], labs[2])) %>%
  ggplot(aes(x = pr_score)) +
  geom_histogram(color = "white", binwidth = 0.025) +
  facet_wrap(~SubscriberType) +
  xlab("Probability of Treatment") +
  theme_bw()

The method we use below is to find pairs of observations that have very similar propensity scores, but that differ in their treatment status. We use the package MatchIt for this. This package estimates the propensity score in the background and then matches observations based on the method of choice (“nearest” in this case)

High_Note_nomiss <- High_Note %>%  # MatchIt does not allow missing values
  select(adopter, Treatment, one_of(High_Note_cov)) %>%
  na.omit()

mod_match <- matchit(Treatment ~ age + male + friend_cnt + avg_friend_age + avg_friend_male + friend_country_cnt +  
                       ln_songsListened + ln_lovedTracks + posts + playlists + shouts + tenure + good_country,
                     method = "nearest", data = High_Note_nomiss)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

We can get some information about how successful the matching was using summary(mod_match) and plot(mod_match)

summary(mod_match)
## 
## Call:
## matchit(formula = Treatment ~ age + male + friend_cnt + avg_friend_age + 
##     avg_friend_male + friend_country_cnt + ln_songsListened + 
##     ln_lovedTracks + posts + playlists + shouts + tenure + good_country, 
##     data = High_Note_nomiss, method = "nearest")
## 
## Summary of balance for all data:
##                    Means Treated Means Control SD Control Mean Diff
## distance                  0.4689        0.1532     0.1496    0.3157
## age                      25.3732       23.7476     6.2245    1.6256
## male                      0.6363        0.6288     0.4831    0.0074
## friend_cnt               54.0210       10.4313    15.2769   43.5896
## avg_friend_age           25.3904       23.7614     5.0577    1.6291
## avg_friend_male           0.6358        0.6131     0.3343    0.0227
## friend_country_cnt        9.3856        2.7251     3.1024    6.6606
## ln_songsListened          9.6020        7.9441     2.7002    1.6579
## ln_lovedTracks            3.9582        2.4466     1.9781    1.5116
## posts                    20.5230        2.5434    33.7947   17.9796
## playlists                 0.7441        0.5295     0.9673    0.2146
## shouts                  101.8195       16.4230    79.7381   85.3965
## tenure                   46.5487       43.2027    19.7212    3.3460
## good_country              0.3433        0.3547     0.4784   -0.0114
##                    eQQ Med eQQ Mean    eQQ Max
## distance            0.2772   0.3157     0.6429
## age                 1.0000   1.6296     5.0000
## male                0.0000   0.0074     1.0000
## friend_cnt         22.0000  43.5838  4794.0000
## avg_friend_age      1.5909   1.6369    11.5000
## avg_friend_male     0.0738   0.0958     0.3636
## friend_country_cnt  5.0000   6.6598    95.0000
## ln_songsListened    1.2880   1.6583     6.0283
## ln_lovedTracks      1.5315   1.5115     2.8332
## posts               0.0000  17.8829  9535.0000
## playlists           0.0000   0.2092    26.0000
## shouts             15.0000  85.1764 59168.0000
## tenure              3.0000   3.3473    10.0000
## good_country        0.0000   0.0114     1.0000
## 
## 
## Summary of balance for matched data:
##                    Means Treated Means Control SD Control Mean Diff
## distance                  0.4689        0.3151     0.1854    0.1538
## age                      25.3732       25.7794     7.5478   -0.4062
## male                      0.6363        0.6585     0.4743   -0.0222
## friend_cnt               54.0210       21.5264    23.3770   32.4946
## avg_friend_age           25.3904       26.0051     6.4510   -0.6146
## avg_friend_male           0.6358        0.6478     0.2587   -0.0120
## friend_country_cnt        9.3856        5.0678     4.6172    4.3178
## ln_songsListened          9.6020        9.4839     1.6281    0.1181
## ln_lovedTracks            3.9582        3.7265     1.8645    0.2317
## posts                    20.5230        6.2683    60.7389   14.2546
## playlists                 0.7441        0.6678     0.9905    0.0762
## shouts                  101.8195       36.7657   134.5236   65.0539
## tenure                   46.5487       47.1905    19.1378   -0.6418
## good_country              0.3433        0.3602     0.4801   -0.0169
##                    eQQ Med eQQ Mean    eQQ Max
## distance            0.1147   0.1538     0.4184
## age                 0.0000   0.4599     6.0000
## male                0.0000   0.0222     1.0000
## friend_cnt         12.0000  32.4946  4794.0000
## avg_friend_age      0.3333   0.8619    14.0000
## avg_friend_male     0.0119   0.0258     0.1463
## friend_country_cnt  2.0000   4.3178    95.0000
## ln_songsListened    0.1447   0.1653     2.1972
## ln_lovedTracks      0.3081   0.2664     0.8255
## posts               0.0000  14.2546  9535.0000
## playlists           0.0000   0.1194    95.0000
## shouts              9.0000  65.0539 59168.0000
## tenure              1.0000   0.9592     3.0000
## good_country        0.0000   0.0169     1.0000
## 
## Percent Balance Improvement:
##                    Mean Diff.  eQQ Med  eQQ Mean   eQQ Max
## distance              51.2952  58.6333   51.2929   34.9201
## age                   75.0137 100.0000   71.7766  -20.0000
## male                -198.9313   0.0000 -198.6301    0.0000
## friend_cnt            25.4535  45.4545   25.4436    0.0000
## avg_friend_age        62.2701  79.0476   47.3451  -21.7391
## avg_friend_male       47.2679  83.8776   73.0291   59.7722
## friend_country_cnt    35.1733  60.0000   35.1656    0.0000
## ln_songsListened      92.8737  88.7676   90.0310   63.5514
## ln_lovedTracks        84.6695  79.8827   82.3770   70.8626
## posts                 20.7178   0.0000   20.2893    0.0000
## playlists             64.4694   0.0000   42.9197 -265.3846
## shouts                23.8214  40.0000   23.6246    0.0000
## tenure                80.8203  66.6667   71.3452   70.0000
## good_country         -48.0096   0.0000  -48.2143    0.0000
## 
## Sample sizes:
##           Control Treated
## All         34004    9823
## Matched      9823    9823
## Unmatched   24181       0
## Discarded       0       0
plot(mod_match)

To create a dataframe containing only the matched observations, use the match.data() function

dta_m <- match.data(mod_match)
dim(dta_m)
## [1] 19646    17

Examining covariate balance in the matched sample Visual Inspection

fn_bal <- function(dta, variable) {
  dta$variable <- dta[, variable]
  dta$Treatment <- as.factor(dta$Treatment)
  support <- c(min(dta$variable), max(dta$variable))
  ggplot(dta, aes(x = distance, y = variable, color = Treatment)) +
    geom_point(alpha = 0.2, size = 1.3) +
    geom_smooth(method = "loess", se = F) +
    xlab("Propensity score") +
    ylab(variable) +
    theme_bw() +
    ylim(support)
}

grid.arrange(
  fn_bal(dta_m, "age"),
  fn_bal(dta_m, "male") + theme(legend.position = "none"),
  fn_bal(dta_m, "friend_cnt"),
  fn_bal(dta_m, "avg_friend_age") + theme(legend.position = "none"),
  fn_bal(dta_m, "friend_country_cnt"),
  fn_bal(dta_m, "ln_songsListened") + theme(legend.position = "none"),
  fn_bal(dta_m, "ln_lovedTracks"),
  fn_bal(dta_m, "playlists") + theme(legend.position = "none"),
  fn_bal(dta_m, "tenure"),
  fn_bal(dta_m, "good_country") + theme(legend.position = "none"),
  nrow = 6, widths = c(1, 0.8)
)

Difference of Means

dta_m %>%
  group_by(Treatment) %>%
  select(one_of(High_Note_cov)) %>%
  summarise_all(funs(mean))
## Adding missing grouping variables: `Treatment`
## # A tibble: 2 x 14
##   Treatment   age  male friend_cnt avg_friend_age avg_friend_male
##       <dbl> <dbl> <dbl>      <dbl>          <dbl>           <dbl>
## 1         0  25.8 0.658       21.5           26.0           0.648
## 2         1  25.4 0.636       54.0           25.4           0.636
## # ... with 8 more variables: friend_country_cnt <dbl>,
## #   ln_songsListened <dbl>, ln_lovedTracks <dbl>, posts <dbl>,
## #   playlists <dbl>, shouts <dbl>, tenure <dbl>, good_country <dbl>
lapply(High_Note_cov, function(v) {
  t.test(dta_m[, v] ~ dta_m$Treatment)
})
## [[1]]
## 
##  Welch Two Sample t-test
## 
## data:  dta_m[, v] by dta_m$Treatment
## t = 3.9186, df = 19521, p-value = 8.937e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.2030131 0.6093660
## sample estimates:
## mean in group 0 mean in group 1 
##        25.77940        25.37321 
## 
## 
## [[2]]
## 
##  Welch Two Sample t-test
## 
## data:  dta_m[, v] by dta_m$Treatment
## t = 3.2559, df = 19640, p-value = 0.001132
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.008832644 0.035552982
## sample estimates:
## mean in group 0 mean in group 1 
##       0.6584546       0.6362618 
## 
## 
## [[3]]
## 
##  Welch Two Sample t-test
## 
## data:  dta_m[, v] by dta_m$Treatment
## t = -24.769, df = 10477, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -35.06619 -29.92292
## sample estimates:
## mean in group 0 mean in group 1 
##        21.52642        54.02097 
## 
## 
## [[4]]
## 
##  Welch Two Sample t-test
## 
## data:  dta_m[, v] by dta_m$Treatment
## t = 7.3709, df = 18749, p-value = 1.765e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.4511959 0.7780907
## sample estimates:
## mean in group 0 mean in group 1 
##        26.00507        25.39043 
## 
## 
## [[5]]
## 
##  Welch Two Sample t-test
## 
## data:  dta_m[, v] by dta_m$Treatment
## t = 3.428, df = 19374, p-value = 0.0006094
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.005124659 0.018810817
## sample estimates:
## mean in group 0 mean in group 1 
##       0.6477754       0.6358077 
## 
## 
## [[6]]
## 
##  Welch Two Sample t-test
## 
## data:  dta_m[, v] by dta_m$Treatment
## t = -38.82, df = 13820, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -4.535843 -4.099808
## sample estimates:
## mean in group 0 mean in group 1 
##        5.067800        9.385626 
## 
## 
## [[7]]
## 
##  Welch Two Sample t-test
## 
## data:  dta_m[, v] by dta_m$Treatment
## t = -4.8926, df = 19535, p-value = 1.003e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1654825 -0.0708156
## sample estimates:
## mean in group 0 mean in group 1 
##        9.483885        9.602034 
## 
## 
## [[8]]
## 
##  Welch Two Sample t-test
## 
## data:  dta_m[, v] by dta_m$Treatment
## t = -8.294, df = 19474, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.2864991 -0.1769691
## sample estimates:
## mean in group 0 mean in group 1 
##        3.726452        3.958186 
## 
## 
## [[9]]
## 
##  Welch Two Sample t-test
## 
## data:  dta_m[, v] by dta_m$Treatment
## t = -5.6784, df = 11062, p-value = 1.394e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -19.175269  -9.333944
## sample estimates:
## mean in group 0 mean in group 1 
##         6.26835        20.52296 
## 
## 
## [[10]]
## 
##  Welch Two Sample t-test
## 
## data:  dta_m[, v] by dta_m$Treatment
## t = -3.4421, df = 14535, p-value = 0.0005789
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.11967088 -0.03282836
## sample estimates:
## mean in group 0 mean in group 1 
##       0.6678204       0.7440700 
## 
## 
## [[11]]
## 
##  Welch Two Sample t-test
## 
## data:  dta_m[, v] by dta_m$Treatment
## t = -8.5779, df = 10471, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -79.91979 -50.18792
## sample estimates:
## mean in group 0 mean in group 1 
##        36.76565       101.81951 
## 
## 
## [[12]]
## 
##  Welch Two Sample t-test
## 
## data:  dta_m[, v] by dta_m$Treatment
## t = 2.3025, df = 19612, p-value = 0.02132
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.09544044 1.18807784
## sample estimates:
## mean in group 0 mean in group 1 
##        47.19047        46.54871 
## 
## 
## [[13]]
## 
##  Welch Two Sample t-test
## 
## data:  dta_m[, v] by dta_m$Treatment
## t = 2.4805, df = 19642, p-value = 0.01313
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.003545363 0.030252866
## sample estimates:
## mean in group 0 mean in group 1 
##       0.3601751       0.3432760

Estimating treatment effects

Estimating the treatment effect is simple once we have a matched sample that we are happy with. We can use a t-test:

with(dta_m, t.test(adopter ~ Treatment))
## 
##  Welch Two Sample t-test
## 
## data:  adopter by Treatment
## t = -16.725, df = 18453, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.09098592 -0.07189711
## sample estimates:
## mean in group 0 mean in group 1 
##      0.09610099      0.17754250

Or we can use OLS with or without covariates:

glm_treat1 <- glm(adopter ~ Treatment, data = dta_m, family = "binomial")
summary(glm_treat1)
## 
## Call:
## glm(formula = adopter ~ Treatment, family = "binomial", data = dta_m)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6252  -0.6252  -0.4495  -0.4495   2.1644  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.24132    0.03423  -65.48   <2e-16 ***
## Treatment    0.70823    0.04323   16.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 15683  on 19645  degrees of freedom
## Residual deviance: 15404  on 19644  degrees of freedom
## AIC: 15408
## 
## Number of Fisher Scoring iterations: 4
glm_treat2 <- glm(adopter ~ age + male + friend_cnt + avg_friend_age + avg_friend_male + friend_country_cnt +  
                    ln_songsListened + ln_lovedTracks + posts + playlists + shouts + tenure + good_country + Treatment, 
                  data = dta_m, family = "binomial")
summary(glm_treat2)
## 
## Call:
## glm(formula = adopter ~ age + male + friend_cnt + avg_friend_age + 
##     avg_friend_male + friend_country_cnt + ln_songsListened + 
##     ln_lovedTracks + posts + playlists + shouts + tenure + good_country + 
##     Treatment, family = "binomial", data = dta_m)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8606  -0.5791  -0.4414  -0.3078   3.0933  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -6.432e+00  2.287e-01 -28.118  < 2e-16 ***
## age                 1.562e-02  4.257e-03   3.670 0.000243 ***
## male                2.999e-01  4.886e-02   6.138 8.38e-10 ***
## friend_cnt          2.092e-04  2.780e-04   0.752 0.451845    
## avg_friend_age      2.881e-02  5.681e-03   5.071 3.96e-07 ***
## avg_friend_male     9.103e-02  9.660e-02   0.942 0.346005    
## friend_country_cnt -4.700e-03  3.701e-03  -1.270 0.204087    
## ln_songsListened    2.035e-01  1.928e-02  10.554  < 2e-16 ***
## ln_lovedTracks      2.672e-01  1.342e-02  19.916  < 2e-16 ***
## posts               1.653e-04  9.384e-05   1.762 0.078131 .  
## playlists           3.776e-02  1.315e-02   2.872 0.004084 ** 
## shouts              9.645e-05  7.059e-05   1.366 0.171798    
## tenure             -3.914e-03  1.262e-03  -3.101 0.001928 ** 
## good_country       -3.986e-01  4.780e-02  -8.339  < 2e-16 ***
## Treatment           6.379e-01  4.628e-02  13.783  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 15683  on 19645  degrees of freedom
## Residual deviance: 14366  on 19631  degrees of freedom
## AIC: 14396
## 
## Number of Fisher Scoring iterations: 5

In a logistic regression the response being modeled is the log(odds) that Y = 1 The regression coefficient gives the change in log(odds) in the response for a unit change in the predictor variable because log(odds) are difficult to interpret, we can exponentiate them

We can see that the odds of being a premium subscriber are increased by a factor of 1.499 when and individual is male

exp(coef(glm_treat2))
##        (Intercept)                age               male 
##        0.001609923        1.015744376        1.349707685 
##         friend_cnt     avg_friend_age    avg_friend_male 
##        1.000209180        1.029229423        1.095303373 
## friend_country_cnt   ln_songsListened     ln_lovedTracks 
##        0.995310871        1.225697366        1.306303144 
##              posts          playlists             shouts 
##        1.000165322        1.038485243        1.000096459 
##             tenure       good_country          Treatment 
##        0.996093907        0.671239518        1.892417465