Amazon Network Analysis

Amazon Network Analysis

# Load relevant packages
library(car)
library(dplyr)
library(tidyr)
library(igraph)
library(ggplot2)
library(corrplot)
head(products)
##   id                                                             title
## 1  1                           Patterns of Preaching: A Sermon Sampler
## 2  2                                        Candlemas: Feast of Flames
## 3  3                  World War II Allied Fighter Planes Trading Cards
## 4  4      Life Application Bible Commentary: 1 and 2 Timothy and Titus
## 5  5                   Prayers That Avail Much for Business: Executive
## 6  6 How the Other Half Lives: Studies Among the Tenements of New York
##   group salesrank review_cnt downloads rating
## 1  Book    396585          2         2    5.0
## 2  Book    168596         12        12    4.5
## 3  Book   1270652          1         1    5.0
## 4  Book    631289          1         1    4.0
## 5  Book    455160          0         0    0.0
## 6  Book    188784         17        17    4.0
head(copurchase)
##   Source Target
## 1      1      2
## 2      1      4
## 3      1      5
## 4      1     15
## 5      2     11
## 6      2     12
#We are onluy interested in a subset of this dataset, we want to look for Books 
books.products <- filter(products, group == "Book" 
                         & salesrank <= 150000 & salesrank >= 0)

books.copurchase <- filter(copurchase, Source %in% books.products$id
                           & Target %in% books.products$id)
#We want to find the ID with the highest indegree (For a vertex, the number of head ends adjacent to a vertex is called the indegree of the vertex)
indegree.df <- summarize(group_by(books.copurchase, Target), indegree = n()) %>% arrange(desc(indegree))
names(indegree.df)[1]<-"id"
head(indegree.df)
## # A tibble: 6 x 2
##      id indegree
##   <int>    <int>
## 1  4429       46
## 2    33       44
## 3   244       35
## 4   302       22
## 5  5913       20
## 6   626       16
##We want to find the ID with the highest outdegree (the number of tail ends adjacent to a vertex is its outdegree)
outdegree.df <- summarize(group_by(books.copurchase, Source), outdegree = n()) %>% arrange(desc(outdegree))
names(outdegree.df)[1]<-"id"
head(outdegree.df)
## # A tibble: 6 x 2
##       id outdegree
##    <int>     <int>
## 1 126396         5
## 2 151687         5
## 3   4993         4
## 4  26268         4
## 5  28040         4
## 6  29680         4
books.graph <- merge(books.products, indegree.df, by="id", all.x = TRUE)  #Merge the Dataframes
books.graph <- merge(books.graph, outdegree.df, by="id", all.x = TRUE)  #Merge the Dataframes
books.graph$indegree[is.na(books.graph$indegree)] <- 0     #ssign 0 to those who are na
books.graph$outdegree[is.na(books.graph$outdegree)] <- 0  #assign 0 to those who are na
books.graph <- mutate(books.graph, degree = indegree + outdegree) # sum indegree and outdegree to create degree
head(books.graph)
##   id
## 1 12
## 2 33
## 3 39
## 4 45
## 5 74
## 6 77
##                                                                                                      title
## 1 Fantastic Food with Splenda : 160 Great Recipes for Meals Low in Sugar, Carbohydrates, Fat, and Calories
## 2                                                                           Double Jeopardy (T*Witches, 6)
## 3                                                                           Night of Many Dreams : A Novel
## 4                                                                     Beginning ASP.NET Databases using C#
## 5                                                      Service Delivery (It Infrastructure Library Series)
## 6                                                                                     Water Touching Stone
##   group salesrank review_cnt downloads rating indegree outdegree degree
## 1  Book     24741         12        12    4.5        5         1      6
## 2  Book     97166          4         4    5.0       44         0     44
## 3  Book     57186         22        22    3.5        4         0      4
## 4  Book     48408          4         4    4.0        0         0      0
## 5  Book     27507          2         2    4.0        1         1      2
## 6  Book     27012         11        11    4.5        3         1      4
#We are looking for the book with highest degree
filter(books.graph, degree == max(books.graph$degree))
##     id                                    title group salesrank review_cnt
## 1 4429 Harley-Davidson Panheads, 1948-1965/M418  Book    147799          3
##   downloads rating indegree outdegree degree
## 1         3    4.5       46         1     47
g <- graph_from_data_frame(books.copurchase, directed = TRUE)    #we creathe a directed graph
sg <- induced_subgraph(g, subcomponent(g, "4429", "all"), impl = "auto")  #we are only interested in id "4429"
sg <- simplify(sg, remove.multiple = F, remove.loops = T)
V(sg)
## + 756/756 vertices, named, from 2bed862:
##   [1] 77     130    148    187    193    224    321    322    422    556   
##  [11] 577    626    724    1051   1644   1817   1822   1851   1971   2071  
##  [21] 2210   2279   2285   2326   2330   2332   2343   2423   2470   2501  
##  [31] 2505   2558   2572   2657   2658   2806   2807   2959   3032   3119  
##  [41] 3191   3217   3306   3588   3670   3737   3861   3909   4002   4014  
##  [51] 4068   4099   4140   4174   4184   4185   4222   4223   4345   4429  
##  [61] 4977   4993   4994   5018   5163   5164   5293   5355   5388   5623  
##  [71] 5638   5639   5655   5670   5821   5851   5875   6012   6014   6392  
##  [81] 6411   6445   6546   6711   6713   6817   6942   7196   7198   7222  
##  [91] 7233   7325   7376   7406   7544   7743   7754   7775   7839   7841  
## + ... omitted several vertices
E(sg)
## + 986/986 edges from 2bed862 (vertex names):
##  [1] 77  ->422  130 ->78   148 ->302  187 ->321  187 ->322  187 ->78  
##  [7] 193 ->224  224 ->193  224 ->33   321 ->187  321 ->322  321 ->78  
## [13] 322 ->187  322 ->321  322 ->78   422 ->77   422 ->1644 556 ->78  
## [19] 577 ->33   626 ->33   724 ->302  1051->302  1644->422  1644->5293
## [25] 1817->976  1822->193  1822->724  1851->78   1971->193  2071->3155
## [31] 2210->2279 2210->2285 2279->2210 2279->2326 2285->2330 2326->193 
## [37] 2326->2210 2330->2343 2330->2345 2332->4140 2343->2285 2343->2330
## [43] 2423->5410 2470->556  2501->3588 2505->2501 2558->33   2572->4184
## [49] 2572->4185 2657->2658 2658->77   2806->2807 2807->302  2959->1673
## [55] 3032->2558 3119->976  3191->2279 3217->4319 3306->2071 3306->4345
## + ... omitted several edges
diameter <- get_diameter(sg)
diameter
## + 10/756 vertices, named, from 2bed862:
##  [1] 37895 27936 21584 10889 11080 14111 4429  2501  3588  6676
#Plot the graph for id "4429"

V(sg)$color <- ifelse(V(sg)$name %in% diameter$name, "red", "lightblue")
V(sg)["4429"]$color <- "green"
V(sg)["33"]$color <- "gold"
E(sg)$color <- "darkgray"
E(sg,path=diameter)$color <- "red"
E(sg)$width <- 1
E(sg,path=diameter)$width <- 3
options(repr.plot.width = 100, repr.plot.height = 100)
plot(sg, layout=layout_with_fr, vertex.size=1, vertex.label=NA, edge.arrow.size=0.05)