Beer Recommendation System
Prepare Data
library(readr)
library(plyr)
library(dplyr)
data <- read_csv("~/Github/Business_Statistics_41000/Lecture/Bonus/beer_reviews/beer_reviews.csv")
head(data)
## # A tibble: 6 x 13
## brewery_id brewery_name review_time review_overall review_aroma
## <int> <chr> <int> <dbl> <dbl>
## 1 10325 Vecchio Birraio 1234817823 1.5 2
## 2 10325 Vecchio Birraio 1235915097 3 2.5
## 3 10325 Vecchio Birraio 1235916604 3 2.5
## 4 10325 Vecchio Birraio 1234725145 3 3
## 5 1075 Caldera Brewing Comp… 1293735206 4 4.5
## 6 1075 Caldera Brewing Comp… 1325524659 3 3.5
## # ... with 8 more variables: review_appearance <dbl>,
## # review_profilename <chr>, beer_style <chr>, review_palate <dbl>,
## # review_taste <dbl>, beer_name <chr>, beer_abv <dbl>, beer_beerid <int>
glimpse(data)
## Observations: 1,586,614
## Variables: 13
## $ brewery_id <int> 10325, 10325, 10325, 10325, 1075, 1075, 107...
## $ brewery_name <chr> "Vecchio Birraio", "Vecchio Birraio", "Vecc...
## $ review_time <int> 1234817823, 1235915097, 1235916604, 1234725...
## $ review_overall <dbl> 1.5, 3.0, 3.0, 3.0, 4.0, 3.0, 3.5, 3.0, 4.0...
## $ review_aroma <dbl> 2.0, 2.5, 2.5, 3.0, 4.5, 3.5, 3.5, 2.5, 3.0...
## $ review_appearance <dbl> 2.5, 3.0, 3.0, 3.5, 4.0, 3.5, 3.5, 3.5, 3.5...
## $ review_profilename <chr> "stcules", "stcules", "stcules", "stcules",...
## $ beer_style <chr> "Hefeweizen", "English Strong Ale", "Foreig...
## $ review_palate <dbl> 1.5, 3.0, 3.0, 2.5, 4.0, 3.0, 4.0, 2.0, 3.5...
## $ review_taste <dbl> 1.5, 3.0, 3.0, 3.0, 4.5, 3.5, 4.0, 3.5, 4.0...
## $ beer_name <chr> "Sausa Weizen", "Red Moon", "Black Horse Bl...
## $ beer_abv <dbl> 5.0, 6.2, 6.5, 5.0, 7.7, 4.7, 4.7, 4.7, 4.7...
## $ beer_beerid <int> 47986, 48213, 48215, 47969, 64883, 52159, 5...
Find common review
common_reviewers_by_id <- function(beer1, beer2) {
reviews1 <- subset(data, beer_beerid==beer1)
reviews2 <- subset(data, beer_beerid==beer2)
reviewers_sameset <- intersect(reviews1[,'review_profilename'],
reviews2[,'review_profilename'])
if (length(reviewers_sameset$review_profilename)==0) {
return(NA)
} else {
return(reviewers_sameset$review_profilename)
}
}
beer_lookup <- data[,c("beer_beerid", "beer_name")]
beer_lookup <- beer_lookup[duplicated(beer_lookup)==FALSE,]
common_reviewers_by_name <- function(name1, name2) {
beer1 <- subset(beer_lookup, beer_name==name1)$beer_beerid
beer2 <- subset(beer_lookup, beer_name==name2)$beer_beerid
common_reviewers_by_id(beer1, beer2)
}
beer_name_to_id <- function(name){
return(subset(beer_lookup, beer_name==name)$beer_beerid)
}
common_reviewers=common_reviewers_by_id(34146, 837)
common_reviewers_by_name("Founders Double Trouble", "Coors Light")
## [1] "JoEBoBpr" "ZAP" "connecticutpoet"
## [4] "brewdlyhooked13" "northyorksammy" "Foxman"
## [7] "NJpadreFan" "Phyl21ca" "garymuchow"
## [10] "WVbeergeek" "NeroFiddled" "Suds"
## [13] "cokes" "Brent" "coldmeat23"
## [16] "Reaper16" "notchucknorris" "Rifugium"
## [19] "FosterJM" "psuKinger" "Slatetank"
## [22] "ZenAgnostic" "BierFan" "stewart124"
## [25] "heebes" "NODAK" "Shrews629"
## [28] "bnes09" "JayS2629" "greenmonstah"
## [31] "vandemonian" "beerprovedwright" "Vdubb86"
## [34] "TheKingofWichita" "scottfrie" "lacqueredmouse"
## [37] "xnicknj" "akorsak" "TMoney2591"
## [40] "perrymarcus" "kbutler1" "Beerandraiderfan"
## [43] "kwjd" "DrewCapzz" "DrJay"
## [46] "jdhilt" "Bitterbill" "Soonami"
## [49] "Chico1985" "MrHungryMonkey" "lovindahops"
## [52] "wcintula" "Gtreid" "wordemupg"
## [55] "Bfarr" "buschbeer" "MrStark"
## [58] "jmich24" "garuda" "Tilley4"
## [61] "zimm421" "drabmuh" "tigg924"
## [64] "Wasatch" "gtermi" "Scotchboy"
## [67] "rhoadsrage" "Clydesdale" "DNICE555"
## [70] "hopheadjuice" "jdklks" "katan"
## [73] "Gavage" "hardy008" "Strix"
## [76] "alleykatking" "rfgetz" "womencantsail"
## [79] "woosterbill" "scottyshades" "jimj21"
## [82] "projectflam86" "pmcadamis" "jrallen34"
## [85] "jjanega08" "dasenebler" "Jesse13713"
## [88] "JoeyBeerBelly" "Jimmys" "Onenote81"
## [91] "civilizedpsycho" "Blakaeris" "bashiba"
## [94] "Mdog" "ColForbinBC" "nlmartin"
## [97] "ChopperSmith" "Duhast500" "mothman"
## [100] "sarahspat" "biboergosum" "LordAdmNelson"
## [103] "Metalmonk" "BeerFMAndy" "Thorpe429"
## [106] "BDLbrewster" "sonicdescent" "CHickman"
## [109] "Hojaminbag" "champ103" "youngleo"
## [112] "woodske1" "BirdFlu" "Stunner97"
## [115] "onix1agr" "cnally" "match1112"
## [118] "WesWes" "KBoudreau66" "youngblood"
## [121] "CampusCrew" "BradLikesBrew" "ChainGangGuy"
## [124] "flipper2gv" "Mistofminn" "WakeandBake"
## [127] "colts9016" "largadeer" "BeerCon5"
## [130] "ThreeWiseMen" "PerzentRizen" "bark"
## [133] "illidurit" "Goliath" "oakbluff"
## [136] "philbe311" "cvstrickland" "beerthulhu"
## [139] "PatrickJR" "biglobo8971" "jayhawk73"
## [142] "argock" "Badbobx" "Haybeerman"
## [145] "happygnome" "TheManiacalOne" "PhillyStyle"
## [148] "ckeegan04" "ibbjamin" "Lexluthor33"
## [151] "CrellMoset" "Doomcifer" "cdkrenz"
## [154] "buckeyesox" "daledeee" "baos"
## [157] "johnnnniee" "Overlord" "clayrock81"
## [160] "dsa7783" "berserker256" "Huhzubendah"
## [163] "ktrillionaire" "kirok1999" "Brad007"
## [166] "TheBierBand" "drpimento" "MrMcGibblets"
## [169] "DarkerTheBetter" "zdk9" "beveritt"
## [172] "prototypic" "mikesgroove" "soupyman10"
## [175] "Gmann" "Risser09" "RoamingGnome"
## [178] "UCLABrewN84" "rayjay" "zeff80"
## [181] "gunnerman" "jwc215" "neenerzig"
## [184] "AltBock" "AmericanGothic" "mdaschaf"
## [187] "mdfb79" "ummswimmin" "Drew966"
## [190] "Viggo" "aforbes10" "feloniousmonk"
## [193] "JerzDevl2000" "tempest" "BEERchitect"
## [196] "chilidog" "BuckeyeNation" "ommegangpbr"
## [199] "Billolick" "gskitt"
A first visualization
library(ggplot2)
library(reshape2)
visual_beer_scatterplots <- function(name1, name2){
beer1 = beer_name_to_id(name1)
beer2 = beer_name_to_id(name2)
common_reviewers=common_reviewers_by_id(beer1, beer2)
reviews_beer1=get_review_metrics(beer1, common_reviewers)
reviews_beer2=get_review_metrics(beer2, common_reviewers)
reviews_beer1_reshape = melt(reviews_beer1, id.vars = c("review_profilename", "beer_name"))
reviews_beer2_reshape = melt(reviews_beer2, id.vars = c("review_profilename", "beer_name"))
dt_plot = data.frame(reviews_profilename = reviews_beer1_reshape$review_profilename, review_type = reviews_beer1_reshape$variable, x = reviews_beer1_reshape$value, y = reviews_beer2_reshape$value)
ggplot(dt_plot, aes(x, y, color = review_type)) + geom_count() + scale_size_area(max_size = 4) + facet_wrap(~review_type, ncol=2) + geom_abline(slope = 1) + xlab(reviews_beer1[1,1]$beer_name) + ylab(reviews_beer2[1,1]$beer_name)
}
Coors Light vs Founders Double Trouble
Calculating similarity among beers
calc_similarity <- function(b1, b2) {
common_users <- common_reviewers_by_id(b1, b2)
if (is.na(common_users)) {
return (NA)
}
beer1.reviews <- get_review_metrics(b1, common_users)
beer2.reviews <- get_review_metrics(b2, common_users)
#this can be more complex; we're just taking a weighted average
weights <- c(2, 1, 1, 1)
corrs <- sapply(names(beer1.reviews)[3:6], function(metric) {
cor(beer1.reviews[metric], beer2.reviews[metric])
})
sum(corrs * weights, na.rm=TRUE)
}
Fat Tire Amber Ale vs Dale’s Pale Ale
name1 = "Fat Tire Amber Ale"
name2 = "Dale's Pale Ale"
visual_beer_scatterplots(name1, name2)
b1 <- beer_name_to_id("Fat Tire Amber Ale")
b2 <- beer_name_to_id("Dale's Pale Ale")
calc_similarity(b1, b2)
## Warning in if (is.na(common_users)) {: the condition has length > 1 and
## only the first element will be used
## [1] 0.7294634
Fat Tire Amber Ale vs Michelob Ultra
name1 = "Fat Tire Amber Ale"
name2 = "Michelob Ultra"
visual_beer_scatterplots(name1, name2)
b1 <- beer_name_to_id("Fat Tire Amber Ale")
b2 <- beer_name_to_id("Michelob Ultra")
calc_similarity(b1, b2)
## Warning in if (is.na(common_users)) {: the condition has length > 1 and
## only the first element will be used
## [1] 0.2099679
Beer recommendation system
beer.counts <- ddply(data, .(beer_name, brewery_name, beer_style), nrow)
o <- order(-beer.counts$V1)
# get the 20 most commonly reviewed beers
K = 30
all.beers <- head(beer.counts[o,], K)$beer_name
beer.pairs <- expand.grid(beer1=all.beers, beer2=all.beers)
beer.pairs <- subset(beer.pairs, beer1!=beer2)
results <- ddply(beer.pairs, .(beer1, beer2), function(x) {
b1 <- beer_name_to_id(x$beer1)
b2 <- beer_name_to_id(x$beer2)
c("sim"=calc_similarity(b1, b2))
})
# ,.progress="text")
find_similar_beers <- function(mybeer, style=NULL, n=5) {
similar <- subset(results, beer1==mybeer)
similar <- merge(beer.counts, similar, by.x="beer_name", by.y="beer2")
if (!is.null(style)) {
similar <- subset(similar, beer_style==style)
}
similar <- similar[order(-similar$sim),]
n <- min(n, nrow(similar))
similar <- similar[1:n,c("brewery_name", "beer_name", "beer_style", "sim")]
similar
}
mybeer = "90 Minute IPA"
find_similar_beers(mybeer)
## brewery_name beer_name
## 1 Dogfish Head Brewery 60 Minute IPA
## 23 Stone Brewing Co. Stone IPA (India Pale Ale)
## 25 Victory Brewing Company Storm King Stout
## 26 Brasserie de Rochefort Trappistes Rochefort 10
## 13 North Coast Brewing Co. Old Rasputin Russian Imperial Stout
## beer_style sim
## 1 American IPA 1.0386640
## 23 American IPA 0.7691263
## 25 Russian Imperial Stout 0.7594290
## 26 Quadrupel (Quad) 0.7426568
## 13 Russian Imperial Stout 0.7369112
Clusters visualization
set.seed(1)
sim_mat = matrix(0, nrow = K, ncol = K)
colnames(sim_mat) = head(beer.counts[o,], K)$beer_name
rownames(sim_mat) = head(beer.counts[o,], K)$beer_name
for(i in 1:length(results$sim)){
sim_mat[results$beer1[i], results$beer2[i]] = results$sim[i]
}
r = eigen(sim_mat)
points.matrix = r$vectors[,2:4]
kclust = kmeans(points.matrix, 3)
data_plot_spectral = data.frame(colnames(sim_mat), r$vectors[,2:3], as.factor(kclust$cluster), head(beer.counts[o,], K)$V1)
colnames(data_plot_spectral) = c("beer_name", "eig2", "eig3", "cluster","n_reviews")
ggplot(data_plot_spectral, aes(eig2, eig3, color=cluster)) + geom_point(aes(size=n_reviews)) + geom_text(aes(label=beer_name), hjust = -0.05, vjust = 0.05)