Amazon Network Analysis
# Load relevant packages
library(car)
library(dplyr)
library(tidyr)
library(igraph)
library(ggplot2)
library(corrplot)
head(products)
## id title
## 1 1 Patterns of Preaching: A Sermon Sampler
## 2 2 Candlemas: Feast of Flames
## 3 3 World War II Allied Fighter Planes Trading Cards
## 4 4 Life Application Bible Commentary: 1 and 2 Timothy and Titus
## 5 5 Prayers That Avail Much for Business: Executive
## 6 6 How the Other Half Lives: Studies Among the Tenements of New York
## group salesrank review_cnt downloads rating
## 1 Book 396585 2 2 5.0
## 2 Book 168596 12 12 4.5
## 3 Book 1270652 1 1 5.0
## 4 Book 631289 1 1 4.0
## 5 Book 455160 0 0 0.0
## 6 Book 188784 17 17 4.0
head(copurchase)
## Source Target
## 1 1 2
## 2 1 4
## 3 1 5
## 4 1 15
## 5 2 11
## 6 2 12
#We are onluy interested in a subset of this dataset, we want to look for Books
books.products <- filter(products, group == "Book"
& salesrank <= 150000 & salesrank >= 0)
books.copurchase <- filter(copurchase, Source %in% books.products$id
& Target %in% books.products$id)
#We want to find the ID with the highest indegree (For a vertex, the number of head ends adjacent to a vertex is called the indegree of the vertex)
indegree.df <- summarize(group_by(books.copurchase, Target), indegree = n()) %>% arrange(desc(indegree))
names(indegree.df)[1]<-"id"
head(indegree.df)
## # A tibble: 6 x 2
## id indegree
## <int> <int>
## 1 4429 46
## 2 33 44
## 3 244 35
## 4 302 22
## 5 5913 20
## 6 626 16
##We want to find the ID with the highest outdegree (the number of tail ends adjacent to a vertex is its outdegree)
outdegree.df <- summarize(group_by(books.copurchase, Source), outdegree = n()) %>% arrange(desc(outdegree))
names(outdegree.df)[1]<-"id"
head(outdegree.df)
## # A tibble: 6 x 2
## id outdegree
## <int> <int>
## 1 126396 5
## 2 151687 5
## 3 4993 4
## 4 26268 4
## 5 28040 4
## 6 29680 4
books.graph <- merge(books.products, indegree.df, by="id", all.x = TRUE) #Merge the Dataframes
books.graph <- merge(books.graph, outdegree.df, by="id", all.x = TRUE) #Merge the Dataframes
books.graph$indegree[is.na(books.graph$indegree)] <- 0 #ssign 0 to those who are na
books.graph$outdegree[is.na(books.graph$outdegree)] <- 0 #assign 0 to those who are na
books.graph <- mutate(books.graph, degree = indegree + outdegree) # sum indegree and outdegree to create degree
head(books.graph)
## id
## 1 12
## 2 33
## 3 39
## 4 45
## 5 74
## 6 77
## title
## 1 Fantastic Food with Splenda : 160 Great Recipes for Meals Low in Sugar, Carbohydrates, Fat, and Calories
## 2 Double Jeopardy (T*Witches, 6)
## 3 Night of Many Dreams : A Novel
## 4 Beginning ASP.NET Databases using C#
## 5 Service Delivery (It Infrastructure Library Series)
## 6 Water Touching Stone
## group salesrank review_cnt downloads rating indegree outdegree degree
## 1 Book 24741 12 12 4.5 5 1 6
## 2 Book 97166 4 4 5.0 44 0 44
## 3 Book 57186 22 22 3.5 4 0 4
## 4 Book 48408 4 4 4.0 0 0 0
## 5 Book 27507 2 2 4.0 1 1 2
## 6 Book 27012 11 11 4.5 3 1 4
#We are looking for the book with highest degree
filter(books.graph, degree == max(books.graph$degree))
## id title group salesrank review_cnt
## 1 4429 Harley-Davidson Panheads, 1948-1965/M418 Book 147799 3
## downloads rating indegree outdegree degree
## 1 3 4.5 46 1 47
g <- graph_from_data_frame(books.copurchase, directed = TRUE) #we creathe a directed graph
sg <- induced_subgraph(g, subcomponent(g, "4429", "all"), impl = "auto") #we are only interested in id "4429"
sg <- simplify(sg, remove.multiple = F, remove.loops = T)
V(sg)
## + 756/756 vertices, named, from 2bed862:
## [1] 77 130 148 187 193 224 321 322 422 556
## [11] 577 626 724 1051 1644 1817 1822 1851 1971 2071
## [21] 2210 2279 2285 2326 2330 2332 2343 2423 2470 2501
## [31] 2505 2558 2572 2657 2658 2806 2807 2959 3032 3119
## [41] 3191 3217 3306 3588 3670 3737 3861 3909 4002 4014
## [51] 4068 4099 4140 4174 4184 4185 4222 4223 4345 4429
## [61] 4977 4993 4994 5018 5163 5164 5293 5355 5388 5623
## [71] 5638 5639 5655 5670 5821 5851 5875 6012 6014 6392
## [81] 6411 6445 6546 6711 6713 6817 6942 7196 7198 7222
## [91] 7233 7325 7376 7406 7544 7743 7754 7775 7839 7841
## + ... omitted several vertices
E(sg)
## + 986/986 edges from 2bed862 (vertex names):
## [1] 77 ->422 130 ->78 148 ->302 187 ->321 187 ->322 187 ->78
## [7] 193 ->224 224 ->193 224 ->33 321 ->187 321 ->322 321 ->78
## [13] 322 ->187 322 ->321 322 ->78 422 ->77 422 ->1644 556 ->78
## [19] 577 ->33 626 ->33 724 ->302 1051->302 1644->422 1644->5293
## [25] 1817->976 1822->193 1822->724 1851->78 1971->193 2071->3155
## [31] 2210->2279 2210->2285 2279->2210 2279->2326 2285->2330 2326->193
## [37] 2326->2210 2330->2343 2330->2345 2332->4140 2343->2285 2343->2330
## [43] 2423->5410 2470->556 2501->3588 2505->2501 2558->33 2572->4184
## [49] 2572->4185 2657->2658 2658->77 2806->2807 2807->302 2959->1673
## [55] 3032->2558 3119->976 3191->2279 3217->4319 3306->2071 3306->4345
## + ... omitted several edges
diameter <- get_diameter(sg)
diameter
## + 10/756 vertices, named, from 2bed862:
## [1] 37895 27936 21584 10889 11080 14111 4429 2501 3588 6676
#Plot the graph for id "4429"
V(sg)$color <- ifelse(V(sg)$name %in% diameter$name, "red", "lightblue")
V(sg)["4429"]$color <- "green"
V(sg)["33"]$color <- "gold"
E(sg)$color <- "darkgray"
E(sg,path=diameter)$color <- "red"
E(sg)$width <- 1
E(sg,path=diameter)$width <- 3
options(repr.plot.width = 100, repr.plot.height = 100)
plot(sg, layout=layout_with_fr, vertex.size=1, vertex.label=NA, edge.arrow.size=0.05)
