Beer Recommendation System

Prepare Data

library(readr)
library(plyr)
library(dplyr)
data <- read_csv("~/Github/Business_Statistics_41000/Lecture/Bonus/beer_reviews/beer_reviews.csv")
head(data)
## # A tibble: 6 x 13
##   brewery_id brewery_name          review_time review_overall review_aroma
##        <int> <chr>                       <int>          <dbl>        <dbl>
## 1      10325 Vecchio Birraio        1234817823            1.5          2  
## 2      10325 Vecchio Birraio        1235915097            3            2.5
## 3      10325 Vecchio Birraio        1235916604            3            2.5
## 4      10325 Vecchio Birraio        1234725145            3            3  
## 5       1075 Caldera Brewing Comp…  1293735206            4            4.5
## 6       1075 Caldera Brewing Comp…  1325524659            3            3.5
## # ... with 8 more variables: review_appearance <dbl>,
## #   review_profilename <chr>, beer_style <chr>, review_palate <dbl>,
## #   review_taste <dbl>, beer_name <chr>, beer_abv <dbl>, beer_beerid <int>
glimpse(data)
## Observations: 1,586,614
## Variables: 13
## $ brewery_id         <int> 10325, 10325, 10325, 10325, 1075, 1075, 107...
## $ brewery_name       <chr> "Vecchio Birraio", "Vecchio Birraio", "Vecc...
## $ review_time        <int> 1234817823, 1235915097, 1235916604, 1234725...
## $ review_overall     <dbl> 1.5, 3.0, 3.0, 3.0, 4.0, 3.0, 3.5, 3.0, 4.0...
## $ review_aroma       <dbl> 2.0, 2.5, 2.5, 3.0, 4.5, 3.5, 3.5, 2.5, 3.0...
## $ review_appearance  <dbl> 2.5, 3.0, 3.0, 3.5, 4.0, 3.5, 3.5, 3.5, 3.5...
## $ review_profilename <chr> "stcules", "stcules", "stcules", "stcules",...
## $ beer_style         <chr> "Hefeweizen", "English Strong Ale", "Foreig...
## $ review_palate      <dbl> 1.5, 3.0, 3.0, 2.5, 4.0, 3.0, 4.0, 2.0, 3.5...
## $ review_taste       <dbl> 1.5, 3.0, 3.0, 3.0, 4.5, 3.5, 4.0, 3.5, 4.0...
## $ beer_name          <chr> "Sausa Weizen", "Red Moon", "Black Horse Bl...
## $ beer_abv           <dbl> 5.0, 6.2, 6.5, 5.0, 7.7, 4.7, 4.7, 4.7, 4.7...
## $ beer_beerid        <int> 47986, 48213, 48215, 47969, 64883, 52159, 5...

Find common review

common_reviewers_by_id <- function(beer1, beer2) {
  reviews1 <- subset(data, beer_beerid==beer1)
  reviews2 <- subset(data, beer_beerid==beer2)
  reviewers_sameset <- intersect(reviews1[,'review_profilename'],
                                        reviews2[,'review_profilename'])
  if (length(reviewers_sameset$review_profilename)==0) {
    return(NA)
  } else {
    return(reviewers_sameset$review_profilename)
  }
}

beer_lookup <- data[,c("beer_beerid", "beer_name")]
beer_lookup <- beer_lookup[duplicated(beer_lookup)==FALSE,]

common_reviewers_by_name <- function(name1, name2) {
  beer1 <- subset(beer_lookup, beer_name==name1)$beer_beerid
  beer2 <- subset(beer_lookup, beer_name==name2)$beer_beerid
  common_reviewers_by_id(beer1, beer2)
}

beer_name_to_id <- function(name){
  return(subset(beer_lookup, beer_name==name)$beer_beerid)
}

common_reviewers=common_reviewers_by_id(34146, 837)
common_reviewers_by_name("Founders Double Trouble", "Coors Light")
##   [1] "JoEBoBpr"         "ZAP"              "connecticutpoet" 
##   [4] "brewdlyhooked13"  "northyorksammy"   "Foxman"          
##   [7] "NJpadreFan"       "Phyl21ca"         "garymuchow"      
##  [10] "WVbeergeek"       "NeroFiddled"      "Suds"            
##  [13] "cokes"            "Brent"            "coldmeat23"      
##  [16] "Reaper16"         "notchucknorris"   "Rifugium"        
##  [19] "FosterJM"         "psuKinger"        "Slatetank"       
##  [22] "ZenAgnostic"      "BierFan"          "stewart124"      
##  [25] "heebes"           "NODAK"            "Shrews629"       
##  [28] "bnes09"           "JayS2629"         "greenmonstah"    
##  [31] "vandemonian"      "beerprovedwright" "Vdubb86"         
##  [34] "TheKingofWichita" "scottfrie"        "lacqueredmouse"  
##  [37] "xnicknj"          "akorsak"          "TMoney2591"      
##  [40] "perrymarcus"      "kbutler1"         "Beerandraiderfan"
##  [43] "kwjd"             "DrewCapzz"        "DrJay"           
##  [46] "jdhilt"           "Bitterbill"       "Soonami"         
##  [49] "Chico1985"        "MrHungryMonkey"   "lovindahops"     
##  [52] "wcintula"         "Gtreid"           "wordemupg"       
##  [55] "Bfarr"            "buschbeer"        "MrStark"         
##  [58] "jmich24"          "garuda"           "Tilley4"         
##  [61] "zimm421"          "drabmuh"          "tigg924"         
##  [64] "Wasatch"          "gtermi"           "Scotchboy"       
##  [67] "rhoadsrage"       "Clydesdale"       "DNICE555"        
##  [70] "hopheadjuice"     "jdklks"           "katan"           
##  [73] "Gavage"           "hardy008"         "Strix"           
##  [76] "alleykatking"     "rfgetz"           "womencantsail"   
##  [79] "woosterbill"      "scottyshades"     "jimj21"          
##  [82] "projectflam86"    "pmcadamis"        "jrallen34"       
##  [85] "jjanega08"        "dasenebler"       "Jesse13713"      
##  [88] "JoeyBeerBelly"    "Jimmys"           "Onenote81"       
##  [91] "civilizedpsycho"  "Blakaeris"        "bashiba"         
##  [94] "Mdog"             "ColForbinBC"      "nlmartin"        
##  [97] "ChopperSmith"     "Duhast500"        "mothman"         
## [100] "sarahspat"        "biboergosum"      "LordAdmNelson"   
## [103] "Metalmonk"        "BeerFMAndy"       "Thorpe429"       
## [106] "BDLbrewster"      "sonicdescent"     "CHickman"        
## [109] "Hojaminbag"       "champ103"         "youngleo"        
## [112] "woodske1"         "BirdFlu"          "Stunner97"       
## [115] "onix1agr"         "cnally"           "match1112"       
## [118] "WesWes"           "KBoudreau66"      "youngblood"      
## [121] "CampusCrew"       "BradLikesBrew"    "ChainGangGuy"    
## [124] "flipper2gv"       "Mistofminn"       "WakeandBake"     
## [127] "colts9016"        "largadeer"        "BeerCon5"        
## [130] "ThreeWiseMen"     "PerzentRizen"     "bark"            
## [133] "illidurit"        "Goliath"          "oakbluff"        
## [136] "philbe311"        "cvstrickland"     "beerthulhu"      
## [139] "PatrickJR"        "biglobo8971"      "jayhawk73"       
## [142] "argock"           "Badbobx"          "Haybeerman"      
## [145] "happygnome"       "TheManiacalOne"   "PhillyStyle"     
## [148] "ckeegan04"        "ibbjamin"         "Lexluthor33"     
## [151] "CrellMoset"       "Doomcifer"        "cdkrenz"         
## [154] "buckeyesox"       "daledeee"         "baos"            
## [157] "johnnnniee"       "Overlord"         "clayrock81"      
## [160] "dsa7783"          "berserker256"     "Huhzubendah"     
## [163] "ktrillionaire"    "kirok1999"        "Brad007"         
## [166] "TheBierBand"      "drpimento"        "MrMcGibblets"    
## [169] "DarkerTheBetter"  "zdk9"             "beveritt"        
## [172] "prototypic"       "mikesgroove"      "soupyman10"      
## [175] "Gmann"            "Risser09"         "RoamingGnome"    
## [178] "UCLABrewN84"      "rayjay"           "zeff80"          
## [181] "gunnerman"        "jwc215"           "neenerzig"       
## [184] "AltBock"          "AmericanGothic"   "mdaschaf"        
## [187] "mdfb79"           "ummswimmin"       "Drew966"         
## [190] "Viggo"            "aforbes10"        "feloniousmonk"   
## [193] "JerzDevl2000"     "tempest"          "BEERchitect"     
## [196] "chilidog"         "BuckeyeNation"    "ommegangpbr"     
## [199] "Billolick"        "gskitt"

Extract review feature

features <- c("beer_name", "review_profilename", "review_overall", "review_aroma", 
    "review_palate", "review_taste")

get_review_metrics <- function(beer, userset) {
    beer.data <- subset(data, beer_beerid == beer & review_profilename %in% 
        userset)
    ord <- order(beer.data$review_profilename)
    beer.data <- beer.data[ord, ]
    dups <- duplicated(beer.data$review_profilename) == FALSE
    beer.data <- beer.data[dups, ]
    # this can return more than 1 type of metric
    return(beer.data[, features])
}

reviews_beer1 = get_review_metrics(837, common_reviewers)
reviews_beer2 = get_review_metrics(34146, common_reviewers)
head(reviews_beer1)
## # A tibble: 6 x 6
##   beer_name   review_profilename review_overall review_aroma review_palate
##   <chr>       <chr>                       <dbl>        <dbl>         <dbl>
## 1 Coors Light aforbes10                     2            2             2  
## 2 Coors Light akorsak                       3.5          3             3  
## 3 Coors Light alleykatking                  1.5          1.5           2  
## 4 Coors Light AltBock                       1.5          1             1.5
## 5 Coors Light AmericanGothic                2            2             2  
## 6 Coors Light argock                        3            2             2  
## # ... with 1 more variable: review_taste <dbl>
head(reviews_beer2)
## # A tibble: 6 x 6
##   beer_name     review_profilen… review_overall review_aroma review_palate
##   <chr>         <chr>                     <dbl>        <dbl>         <dbl>
## 1 Founders Dou… aforbes10                   4            3.5           4  
## 2 Founders Dou… akorsak                     3.5          4             3.5
## 3 Founders Dou… alleykatking                4.5          3.5           3.5
## 4 Founders Dou… AltBock                     4            4             3.5
## 5 Founders Dou… AmericanGothic              4.5          4             4.5
## 6 Founders Dou… argock                      4            4.5           4  
## # ... with 1 more variable: review_taste <dbl>

A first visualization

library(ggplot2)
library(reshape2)

visual_beer_scatterplots <- function(name1, name2){
  beer1 = beer_name_to_id(name1)
  beer2 = beer_name_to_id(name2)
  common_reviewers=common_reviewers_by_id(beer1, beer2)
  reviews_beer1=get_review_metrics(beer1, common_reviewers)
  reviews_beer2=get_review_metrics(beer2, common_reviewers)
  reviews_beer1_reshape = melt(reviews_beer1, id.vars = c("review_profilename", "beer_name"))
  reviews_beer2_reshape = melt(reviews_beer2, id.vars = c("review_profilename", "beer_name"))
  dt_plot = data.frame(reviews_profilename = reviews_beer1_reshape$review_profilename, review_type = reviews_beer1_reshape$variable, x = reviews_beer1_reshape$value, y = reviews_beer2_reshape$value)
  ggplot(dt_plot, aes(x, y, color = review_type)) + geom_count() + scale_size_area(max_size = 4) + facet_wrap(~review_type, ncol=2) + geom_abline(slope = 1) + xlab(reviews_beer1[1,1]$beer_name) + ylab(reviews_beer2[1,1]$beer_name)
}

Coors Light vs Founders Double Trouble

Calculating similarity among beers

calc_similarity <- function(b1, b2) {
  common_users <- common_reviewers_by_id(b1, b2)
  if (is.na(common_users)) {
    return (NA)
  }
  beer1.reviews <- get_review_metrics(b1, common_users)
  beer2.reviews <- get_review_metrics(b2, common_users)
  #this can be more complex; we're just taking a weighted average
  weights <- c(2, 1, 1, 1)
  corrs <- sapply(names(beer1.reviews)[3:6], function(metric) {
    cor(beer1.reviews[metric], beer2.reviews[metric])
  })
  sum(corrs * weights, na.rm=TRUE)
}

Fat Tire Amber Ale vs Dale’s Pale Ale

name1 = "Fat Tire Amber Ale"
name2 = "Dale's Pale Ale"
visual_beer_scatterplots(name1, name2)

b1 <- beer_name_to_id("Fat Tire Amber Ale")
b2 <- beer_name_to_id("Dale's Pale Ale")
calc_similarity(b1, b2)
## Warning in if (is.na(common_users)) {: the condition has length > 1 and
## only the first element will be used
## [1] 0.7294634

Fat Tire Amber Ale vs Michelob Ultra

name1 = "Fat Tire Amber Ale"
name2 = "Michelob Ultra"
visual_beer_scatterplots(name1, name2)

b1 <- beer_name_to_id("Fat Tire Amber Ale")
b2 <- beer_name_to_id("Michelob Ultra")
calc_similarity(b1, b2)
## Warning in if (is.na(common_users)) {: the condition has length > 1 and
## only the first element will be used
## [1] 0.2099679

Beer recommendation system

beer.counts <- ddply(data, .(beer_name, brewery_name, beer_style), nrow)
o <- order(-beer.counts$V1)
# get the 20 most commonly reviewed beers
K = 30
all.beers <- head(beer.counts[o,], K)$beer_name

beer.pairs <- expand.grid(beer1=all.beers, beer2=all.beers)
beer.pairs <- subset(beer.pairs, beer1!=beer2)
results <- ddply(beer.pairs, .(beer1, beer2), function(x) {
  b1 <- beer_name_to_id(x$beer1)
  b2 <- beer_name_to_id(x$beer2)
  c("sim"=calc_similarity(b1, b2))
})
# ,.progress="text")

find_similar_beers <- function(mybeer, style=NULL, n=5) {
  similar <- subset(results, beer1==mybeer)
  similar <- merge(beer.counts, similar, by.x="beer_name", by.y="beer2")
  if (!is.null(style)) {
    similar <- subset(similar, beer_style==style)
  }
  similar <- similar[order(-similar$sim),]
  n <- min(n, nrow(similar))
  similar <- similar[1:n,c("brewery_name", "beer_name", "beer_style", "sim")]
  similar
}
mybeer = "90 Minute IPA"
find_similar_beers(mybeer)
##               brewery_name                           beer_name
## 1     Dogfish Head Brewery                       60 Minute IPA
## 23       Stone Brewing Co.          Stone IPA (India Pale Ale)
## 25 Victory Brewing Company                    Storm King Stout
## 26  Brasserie de Rochefort             Trappistes Rochefort 10
## 13 North Coast Brewing Co. Old Rasputin Russian Imperial Stout
##                beer_style       sim
## 1            American IPA 1.0386640
## 23           American IPA 0.7691263
## 25 Russian Imperial Stout 0.7594290
## 26       Quadrupel (Quad) 0.7426568
## 13 Russian Imperial Stout 0.7369112

Clusters visualization

set.seed(1)
sim_mat = matrix(0, nrow = K, ncol = K)
colnames(sim_mat) = head(beer.counts[o,], K)$beer_name
rownames(sim_mat) = head(beer.counts[o,], K)$beer_name
for(i in 1:length(results$sim)){
  sim_mat[results$beer1[i], results$beer2[i]] = results$sim[i]
}

r = eigen(sim_mat)
points.matrix = r$vectors[,2:4]
kclust = kmeans(points.matrix, 3)

data_plot_spectral = data.frame(colnames(sim_mat), r$vectors[,2:3], as.factor(kclust$cluster), head(beer.counts[o,], K)$V1)
colnames(data_plot_spectral) = c("beer_name", "eig2", "eig3", "cluster","n_reviews")
ggplot(data_plot_spectral, aes(eig2, eig3, color=cluster)) + geom_point(aes(size=n_reviews)) + geom_text(aes(label=beer_name), hjust = -0.05, vjust = 0.05)