Foreword

  • Output options: the ‘tango’ syntax and the ‘readable’ theme.
  • Snippets and results.
  • Sources: ‘R, Yelp and the Search for Good Indian Food’ from DataCamp.

1. Importing Data and Plotting Star Review Modifications

Exploring the data

The data comes in three separate data sets.

  1. reviews contains user_id's, business_id's and star reviews.
  2. users is a list of user_id's and user_name's.
  3. businesses contains information about the businesses on yelp like the business location, the category of food, the number of reviews and the average star review.

reviews is made of 229907 observations. We have to load in seven .csv files, bind the data together and format the variables.

library(readr)

# The review data is split across seven .csv files; read each one and
# stack them into a single data frame.
review_files <- c('reviews.csv', '65.csv', '90.csv', '120.csv',
                  '150.csv', '180.csv', '210.csv')
reviews <- do.call(rbind, lapply(review_files, read_csv))

# Format the variables: ids become factors, star ratings integers
reviews$user_id <- as.factor(reviews$user_id)
reviews$business_id <- as.factor(reviews$business_id)
reviews$stars <- as.integer(reviews$stars)
# Drop the row-number column carried over from the .csv files
reviews$no <- NULL

# Check it out
dim(reviews)
## [1] 229907      3

We have 229907 observations and three variables. Print the top observations.

# Preview the first six rows of the combined reviews data
head(reviews)
## # A tibble: 6 x 3
##                  user_id            business_id stars
##                   <fctr>                 <fctr> <int>
## 1 rLtl8ZkDX5vH5nAx9C3q5Q 9yKzy9PApeiPPOUJEtnvkg     5
## 2 0a2KyEL0d3Yb1V6aivbIuQ ZRJwVLyzEJq1VAihDhYiow     5
## 3 0hT2KtfLiobPvh6cDC8JQg 6oRAC4uyJCsJl1X0WZpVSA     4
## 4 uZetl9T0NcROGOyFfughhg _1QQZuf4zZOyFCvXc0o6Vg     5
## 5 vYmM4KTsC8ZfQBg-j5MWkw 6ozycU1RpktNG2-1BroVtw     5
## 6 sqYN3lNgvPbPCTRsMFu27g  yxfBYGB6SEqszmxJxd97A     4

Next, load in the users and businesses.

# Load the users table and convert the id to a factor
users <- read_csv('users.csv')

users$user_id <- as.factor(users$user_id)

# Check it out
dim(users)
## [1] 43873     2
head(users)
## # A tibble: 6 x 2
##                  user_id user_name
##                   <fctr>     <chr>
## 1 CR2y7yEm4X035ZMzrTtN9Q       Jim
## 2 _9GXoHhdxc30ujPaQwh6Ew     Kelle
## 3 8mM-nqxjg6pT04kwcjMbsw Stephanie
## 4 Ch6CdTR2IVaVANr-RglMOg         T
## 5 NZrLmHRyiHmyT1JrfzkCOA      Beth
## 6 mWx5Sxt_dx-sYBZg6RgJHQ       Amy
# businesses.csv is semicolon-delimited, hence read_csv2
businesses <- read_csv2('businesses.csv')

businesses$business_id <- as.factor(businesses$business_id)
businesses$categories <- as.factor(businesses$categories)

# Check it out
dim(businesses)
## [1] 4503    6
head(businesses)
## # A tibble: 6 x 6
##              business_id        city          business_name
##                   <fctr>       <chr>                  <chr>
## 1 PzOqRohWw7F7YEPBz6AubA Glendale Az      Hot Bagels & Deli
## 2 qarobAbxGSHI7ygf1f7a_Q     Gilbert     Jersey Mike's Subs
## 3 gA5CuBxF-0CnOpGnryWJdQ     Phoenix La Paloma Mexican Food
## 4 JxVGJ9Nly2FFIs_WpJvkug  Scottsdale                  Sauce
## 5 Jj7bcQ6NDfKoz4TXwvYfMg     Phoenix            Fuddruckers
## 6 yOYFhiTjT-SM4spKtDk92w     Phoenix  China Chan Restaurant
## # ... with 3 more variables: categories <fctr>, review_count <int>,
## #   avg_stars <dbl>

We can also extract the descriptive statistics.

# Explore the `reviews` dataset
# (for factor columns, summary() lists the most frequent levels;
#  for character columns it only reports length and class)
summary(reviews)
##                    user_id                       business_id    
##  fczQCSmaWF78toLEmb0Zsw:   517   hW0Ne_HTHEAgGF1rAdmR-g:   738  
##  90a6z--_CUrl84aCzZyPsg:   433   VVeogjZya58oiTxK7qUjAQ:   686  
##  0CMz8YaO3f8xu4KqQgKb9Q:   391   JokKtdXU7zXHcr20Lrk29A:   637  
##  4ozupHULqGyO42s3zNUzOQ:   372   ntN85eu27C04nwyPa8IHtw:   600  
##  joIzw_aUiNvBTuGoytrH7g:   346   EWMwV5V9BxNs_U6nNVMeqw:   558  
##  0bNXP9quoJEgyVZu9ipGgQ:   334   sC66z4SO3tR7nFCjfQwuQ :   540  
##  (Other)               :227514   (Other)               :226148  
##      stars      
##  Min.   :1.000  
##  1st Qu.:3.000  
##  Median :4.000  
##  Mean   :3.767  
##  3rd Qu.:5.000  
##  Max.   :5.000  
## 
# Explore the `users` dataset
summary(users)
##                    user_id       user_name        
##  00dakCMMozVoqbO3ljgVIg:    1   Length:43873      
##  00DeWETLJWNTx4okutHVeA:    1   Class :character  
##  00IVoTJHcLi9EgDQsFJAiQ:    1   Mode  :character  
##  00KxB2nOCoIYU7FCG1I4Rg:    1                     
##  00PjBksrk0DYsG1LiJBnHQ:    1                     
##  00rEiH5Z7hnPjj0bQdG2vA:    1                     
##  (Other)               :43867
# Explore the `businesses` dataset
summary(businesses)
##                  business_id       city           business_name     
##  01euuGhBwvcDhl9KcPTang:   1   Length:4503        Length:4503       
##  04HakCR6gEeRYNAoKxkppQ:   1   Class :character   Class :character  
##  04Jrd2zSzwHwmBqdKQDmIw:   1   Mode  :character   Mode  :character  
##  04SKYBXRAqeYIuuVDzwKNg:   1                                        
##  06kfoeRs9Acj82Yl3i9p_w:   1                                        
##  07OyvsOpqTWuyTtvE4_gcA:   1                                        
##  (Other)               :4497                                        
##                                      categories    review_count   
##  ['Mexican', 'Restaurants']               : 528   Min.   :  3.00  
##  ['Pizza', 'Restaurants']                 : 281   1st Qu.:  5.00  
##  ['Restaurants']                          : 202   Median : 13.00  
##  ['Chinese', 'Restaurants']               : 198   Mean   : 35.63  
##  ['American (Traditional)', 'Restaurants']: 183   3rd Qu.: 40.00  
##  ['Sandwiches', 'Restaurants']            : 139   Max.   :803.00  
##  (Other)                                  :2972                   
##    avg_stars    
##  Min.   :1.000  
##  1st Qu.:3.000  
##  Median :3.500  
##  Mean   :3.476  
##  3rd Qu.:4.000  
##  Max.   :5.000  
## 

Combining data into one dataset

Combine the three datasets.

reviews, users, and businesses are data frames. We can combine data frames in many ways, but must first clean the missing data (a business without a review).

Use the inner_join function from the dplyr package to combine the three datasets.


Pointer

The dplyr package, along with tidyr and data.table, have functionalities for molding the data and extracting very specific subsets just like with SQL; or NumPy and pandas in Python.


The dplyr cheat sheet


Combining data into one dataset (cont’d)

The function inner_join combines two datasets by finding columns with identical labels and then only combining the rows that are found in both independent datasets.

library(dplyr)

# Combine the reviews and users data sets.
# inner_join keeps only the rows whose key appears in both tables
# (the intersection of the Venn diagram); name the join key explicitly
# with `by =` instead of relying on dplyr's natural-join guess.
ru  <- inner_join(reviews, users, by = "user_id")

# Combine the newly created data set with the businesses data set
rub <- inner_join(ru, businesses, by = "business_id")

# Take a look at the combined data frame
summary(rub)
##    user_id          business_id            stars       user_name        
##  Length:128648      Length:128648      Min.   :1.00   Length:128648     
##  Class :character   Class :character   1st Qu.:3.00   Class :character  
##  Mode  :character   Mode  :character   Median :4.00   Mode  :character  
##                                        Mean   :3.74                     
##                                        3rd Qu.:5.00                     
##                                        Max.   :5.00                     
##                                                                         
##      city           business_name     
##  Length:128648      Length:128648     
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
##                                       
##                                      categories     review_count  
##  ['Mexican', 'Restaurants']               :12384   Min.   :  3.0  
##  ['American (New)', 'Restaurants']        : 6491   1st Qu.: 39.0  
##  ['Pizza', 'Restaurants']                 : 5290   Median : 89.0  
##  ['American (Traditional)', 'Restaurants']: 4068   Mean   :138.8  
##  ['Chinese', 'Restaurants']               : 3544   3rd Qu.:182.0  
##  ['Restaurants', 'Italian']               : 3397   Max.   :803.0  
##  (Other)                                  :93474                  
##    avg_stars    
##  Min.   :1.000  
##  1st Qu.:3.500  
##  Median :4.000  
##  Mean   :3.745  
##  3rd Qu.:4.000  
##  Max.   :5.000  
## 
dim(rub)
## [1] 128648      9

From 229907 observations, we now have 128648 obs (and 9 variables).

Isolating Indian restaurants

The dataset rub is large and covers many genres of cuisine. In order to simplify the task, we will only look at reviews for Indian restaurants.

We need to filter out all of the non-Indian reviews.

# Flag reviews of Indian restaurants.
# grepl already returns a logical vector (TRUE where 'Indian' appears
# in the categories string), so comparing it with `== TRUE` is redundant.
rub$is_indian <- grepl('Indian', rub$categories)

# Select only reviews for Indian restaurants
indian <- subset(rub, is_indian)

dim(rub)
## [1] 128648     10
dim(indian)
## [1] 1333   10

We now have 1333 observations (from 128648 obs).

Finding number of reviews per user

We now have a manageable data set with just one type of cuisine. We can begin adapting the Yelp star reviews to create a new review that gives more weight to those who have reviewed more restaurants of the same cuisine.

Create a new data frame with the number of reviews each reviewer has made for the collection of Indian restaurants in the original data set.

# Generate a new data frame with the number of reviews by each reviewer
# data %>% select %>% group_by %>% summarise
# n() counts the rows (one per review) within each user_id group
number_reviews_indian <- indian %>% 
  select(c(user_id, user_name)) %>%
  group_by(user_id) %>% 
  summarise(total_reviews = n())

# Check it out
str(number_reviews_indian)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1048 obs. of  2 variables:
##  $ user_id      : chr  "01pI8lIEwSAzzfPxUby-yw" "_01X4a-sS1Oc2yZuMMuCEQ" "03kllIN9ASWGmecQPLPqZA" "0aQyjVmcytcjkiabuhkIgA" ...
##  $ total_reviews: int  1 1 1 1 1 1 8 1 1 1 ...
# Print the frequency table of total_reviews
table(number_reviews_indian$total_reviews)
## 
##   1   2   3   4   5   6   7   8   9  12  13  15 
## 882 114  32  10   2   1   2   1   1   1   1   1
# Print the average number of reviews per user
mean(number_reviews_indian$total_reviews)
## [1] 1.271947

Adding review count to the dataset

Before we create a weighted star review for each restaurant, add the number_reviews_indian to the larger data frame indian.

# Combine number of Indian reviews with original data frame of Indian restaurant reviews.
# Join explicitly on user_id rather than relying on dplyr's natural-join guess.
indian_plus_number <- inner_join(indian, number_reviews_indian, by = "user_id")

# Check it out
head(indian)
## # A tibble: 6 x 10
##                  user_id            business_id stars user_name       city
##                    <chr>                  <chr> <int>     <chr>      <chr>
## 1 3sJ62Mkavx69FBec71agYg NCbHGtOP5yJBJsPPaE3X5g     5      Mark    Phoenix
## 2 hTKFGpi3ltCV4B-XDFRT-A 3GAPcBG8SowgrpS6UHlDeQ     3    Amanda    Phoenix
## 3 Y_ZITL9cYCKgBgDCwUQrow LYyGQgL60VKdV-p_9OxmWQ     4     Edgar    Phoenix
## 4 P2kVk4cIWyK4e4h14RhK-Q V9i9LnTg9H2XvzqCVBSOXg     4       Jim Scottsdale
## 5 LMlBCXFVAHdPnSA94jc6PQ d_8bMNQd0mesbEUeq1U2kQ     3      Mini    Phoenix
## 6 xH5ETGmT8IUaPjbwZmj-Ow y3V2jqKmvWjyo9Mc-Ipn4g     4      Kurt      Tempe
## # ... with 5 more variables: business_name <chr>, categories <fctr>,
## #   review_count <int>, avg_stars <dbl>, is_indian <lgl>
head(number_reviews_indian)
## # A tibble: 6 x 2
##                  user_id total_reviews
##                    <chr>         <int>
## 1 01pI8lIEwSAzzfPxUby-yw             1
## 2 _01X4a-sS1Oc2yZuMMuCEQ             1
## 3 03kllIN9ASWGmecQPLPqZA             1
## 4 0aQyjVmcytcjkiabuhkIgA             1
## 5 0BBO-d1_Dr16Tc55x--ejA             1
## 6 0beZngJ03EmIQfVcnlVK4w             1
head(indian_plus_number)
## # A tibble: 6 x 11
##                  user_id            business_id stars user_name       city
##                    <chr>                  <chr> <int>     <chr>      <chr>
## 1 3sJ62Mkavx69FBec71agYg NCbHGtOP5yJBJsPPaE3X5g     5      Mark    Phoenix
## 2 hTKFGpi3ltCV4B-XDFRT-A 3GAPcBG8SowgrpS6UHlDeQ     3    Amanda    Phoenix
## 3 Y_ZITL9cYCKgBgDCwUQrow LYyGQgL60VKdV-p_9OxmWQ     4     Edgar    Phoenix
## 4 P2kVk4cIWyK4e4h14RhK-Q V9i9LnTg9H2XvzqCVBSOXg     4       Jim Scottsdale
## 5 LMlBCXFVAHdPnSA94jc6PQ d_8bMNQd0mesbEUeq1U2kQ     3      Mini    Phoenix
## 6 xH5ETGmT8IUaPjbwZmj-Ow y3V2jqKmvWjyo9Mc-Ipn4g     4      Kurt      Tempe
## # ... with 6 more variables: business_name <chr>, categories <fctr>,
## #   review_count <int>, avg_stars <dbl>, is_indian <lgl>,
## #   total_reviews <int>
# Display column names for the new data frame
names(indian_plus_number)
##  [1] "user_id"       "business_id"   "stars"         "user_name"    
##  [5] "city"          "business_name" "categories"    "review_count" 
##  [9] "avg_stars"     "is_indian"     "total_reviews"
# Finally, confirm the join kept all 1333 rows and added one column
dim(indian)
## [1] 1333   10
dim(indian_plus_number)
## [1] 1333   11

We added a new column to the data.

Generating weighted star reviews

Use the combined dataset, indian_plus_number, to create weighted star reviews for each user. Multiply the unweighted restaurant review variable stars by the total number of reviews variable total_reviews for each user.

# Weight each star review by the reviewer's total number of
# Indian-restaurant reviews
indian_plus_number$weighted_stars <- with(indian_plus_number,
                                          stars * total_reviews)

head(indian_plus_number$stars)
## [1] 5 3 4 4 3 4
head(indian_plus_number$total_reviews)
## [1] 1 1 1 1 4 1

Generate weighted star reviews.

# Recompute each restaurant's star rating, weighting reviewers by how
# many Indian restaurants they have reviewed
# data %>% select %>% group_by %>% summarise
new_review_indian <- indian_plus_number %>% 
  select(city, business_name, avg_stars, stars, total_reviews, weighted_stars) %>%
  group_by(city, business_name, avg_stars) %>%
  summarise(count = n(),                                      # reviews per restaurant
            avg = mean(stars),                                # unweighted average
            new = sum(weighted_stars) / sum(total_reviews),   # weighted average
            diff = new - avg)                                 # effect of the weighting

# Check it out
str(new_review_indian)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame':  39 obs. of  7 variables:
##  $ city         : chr  "Avondale" "Chandler" "Chandler" "Chandler" ...
##  $ business_name: chr  "India Garden" "Cafe Krishna" "Copper Kettle" "Curry House" ...
##  $ avg_stars    : num  4 3.5 3.5 2.5 3.5 4 2 3.5 4 3.5 ...
##  $ count        : int  39 40 12 8 34 56 3 42 54 16 ...
##  $ avg          : num  4.18 3.4 3.58 2.88 3.47 ...
##  $ new          : num  3.93 3.27 3.83 2.38 2.95 ...
##  $ diff         : num  -0.253 -0.131 0.244 -0.49 -0.523 ...
##  - attr(*, "vars")= chr  "city" "business_name"
##  - attr(*, "drop")= logi TRUE

Detecting & plotting modification effects

Detect the effects of the modifications.

# Plot the distribution of changes to reviews 
hist(new_review_indian$diff, main = "Changes in Star Reviews", xlab = "Change")

# Plot the changes in review per restaurant 
library(ggplot2)

ggplot(new_review_indian, aes(x = 1:nrow(new_review_indian), y = diff, fill = city)) +
    geom_bar(stat="identity", position = position_dodge()) + 
    theme_classic() + 
    scale_fill_grey() + 
    xlab("Businesses ID") + 
    ylab("Change in Star Review")

# Sort restaurants by the size of the change so the plot reads left-to-right
new_review_indian2 <- new_review_indian[order(new_review_indian$diff),]

# Plot the changes in review per restaurant 
ggplot(new_review_indian2, aes(x = 1:nrow(new_review_indian2), y = diff, fill = city)) +
    geom_bar(stat="identity", position = position_dodge()) + 
    theme_classic() + 
    scale_fill_grey() + 
    xlab("Businesses ID") + 
    ylab("Change in Star Review")

# Display a summary of the new weighted reviews
summary(new_review_indian)
##      city           business_name        avg_stars         count       
##  Length:39          Length:39          Min.   :1.000   Min.   :  3.00  
##  Class :character   Class :character   1st Qu.:3.500   1st Qu.: 10.00  
##  Mode  :character   Mode  :character   Median :3.500   Median : 31.00  
##                                        Mean   :3.513   Mean   : 34.18  
##                                        3rd Qu.:4.000   3rd Qu.: 51.50  
##                                        Max.   :4.500   Max.   :105.00  
##       avg             new             diff        
##  Min.   :1.000   Min.   :1.000   Min.   :-0.7857  
##  1st Qu.:3.388   1st Qu.:3.150   1st Qu.:-0.3019  
##  Median :3.609   Median :3.429   Median :-0.1246  
##  Mean   :3.531   Mean   :3.381   Mean   :-0.1504  
##  3rd Qu.:3.959   3rd Qu.:3.905   3rd Qu.: 0.0000  
##  Max.   :4.333   Max.   :4.267   Max.   : 0.3689

2. Generating Authentic Star Reviews

Exploring the data

This method looks to adapt the star reviews with the perception that Yelp reviewers with Indian heritage would provide more accurate and authentic reviews for Indian cuisine. The strategy for manipulating the star reviews involves selecting only the reviewers with Indian names for the aggregate restaurant star review.

Select the users with native Indian names (we skip this step and we load in data that are already prepared).

Load in the text file.

# Read Indian names into a list
# scan() splits the text file on whitespace, giving one name per element
indian_names <- scan('indian_names.txt', what = character())

# Show the first names from the indian_names list
head(indian_names)
## [1] "Aayush"     "Abhi"       "Abhijeet"   "Abhijit"    "Abhilash"  
## [6] "Abhinandan"
length(indian_names)
## [1] 627

We have 627 names.

Cleaning the names list

The list was taken from an online resource and may contain names that don’t make sense or aren’t useful.

Take a look at the list and see if any names don’t fit. There are a few names that could select users that would be hard to tell whether they were native Indian or not.

The single character names like A., C. or K. may select users that we don’t want. Remove those names from the list before using it to select the native Indian users.

Find a regular expression that will find all the names with a single character followed by a .. The regular expression [A-Za-z]\\. should do the trick (note that the shorter-looking [A-z] would also match the ASCII punctuation characters that sit between ‘Z’ and ‘a’). Combine the regular expression with the grep function to locate the names that we want to eliminate. These locations can be used to eliminate just the names we don’t want.

# Locate the names that we want to eliminate.
# NOTE: the character class [A-z] spans the ASCII range from 'A' to 'z',
# which also matches the punctuation characters [ \ ] ^ _ ` sitting
# between 'Z' and 'a'; [A-Za-z] matches letters only.
indian_names_remove <- grep("[A-Za-z]\\.", indian_names, perl = TRUE)

# Check to make sure they are the correct names
indian_names[indian_names_remove]
## [1] "A." "K." "C."
# Eliminate them from the indian_names list (negative indices drop elements)
indian_names_clean <- indian_names[-indian_names_remove]

# Show the first names from the indian_names_clean list
head(indian_names_clean)
## [1] "Aayush"     "Abhi"       "Abhijeet"   "Abhijit"    "Abhilash"  
## [6] "Abhinandan"
length(indian_names_clean)
## [1] 624

Finding authentic users

Select just the reviews from the users that have a name that is part of this list.

The subset function will make this task simple. Split the indian data set by supplying a logical condition as the second argument of the subset function. Using the %in% operator, we can keep only the rows whose value in a chosen column matches one of a set of values. In this case, we look for authentic Indian names within the user_name column.

# Keep only the reviews written by users whose name appears in the
# cleaned list of native Indian names
authentic_users <- indian[indian$user_name %in% indian_names_clean, ]

Generate a table of the authentic Indian users to get a sense of the size of the data.

# Find the number of users in each city
# data %>% select %>% group_by %>% summarise
# n() counts the rows (reviews by authentic users) within each city
number_authentic_city <- authentic_users %>%
  select(city,user_name) %>%
  group_by(city) %>%
  summarise(users = n())

# Print the number of users per city
number_authentic_city
## # A tibble: 6 x 2
##         city users
##        <chr> <int>
## 1   Avondale     1
## 2   Chandler    10
## 3   Glendale     6
## 4    Phoenix    15
## 5 Scottsdale    16
## 6      Tempe    27

How many authentic users?

Take a look at the authentic_users and number_authentic_city datasets. Calculate the total number of authentic users.

# Inspect the structure of both data sets before totalling
str(authentic_users)
## Classes 'tbl_df', 'tbl' and 'data.frame':    75 obs. of  10 variables:
##  $ user_id      : chr  "1pbDBfeOWnRVrotNPRzAzw" "hdGOqtk3Z9GModnuDkaD2w" "Sw3A6WoyVZAy1lIoVaPQqQ" "AkJFqLqHHAKY3H5R8p7cPQ" ...
##  $ business_id  : chr  "V9i9LnTg9H2XvzqCVBSOXg" "x3ws2L-TlI-JPcEQjjEC9g" "Ew6YfU0XcFGM-S4zncZ5Dg" "y3V2jqKmvWjyo9Mc-Ipn4g" ...
##  $ stars        : int  5 5 2 2 4 2 1 5 3 5 ...
##  $ user_name    : chr  "Suman" "Ravi" "Ajay" "Vijay" ...
##  $ city         : chr  "Scottsdale" "Tempe" "Avondale" "Tempe" ...
##  $ business_name: chr  "Tandoori Times Indian Bistro" "Curry Corner" "India Garden" "Pasand" ...
##  $ categories   : Factor w/ 782 levels "['Active Life', 'American (New)', 'Golf', 'Restaurants']",..: 547 546 545 625 521 625 625 546 759 416 ...
##  $ review_count : int  115 44 52 39 46 78 78 44 131 52 ...
##  $ avg_stars    : num  3.5 4.5 4 3 4 3.5 3.5 4.5 3.5 3.5 ...
##  $ is_indian    : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
str(number_authentic_city)
## Classes 'tbl_df', 'tbl' and 'data.frame':    6 obs. of  2 variables:
##  $ city : chr  "Avondale" "Chandler" "Glendale" "Phoenix" ...
##  $ users: int  1 10 6 15 16 27
# Calculate the total number of authentic users across all cities
sum(number_authentic_city$users)
## [1] 75

Generating average authentic reviews

We’ve now selected our authentic Indian users and can use their reviews to generate average authentic star reviews.

With the dplyr package, use the select, group_by, summarise and mutate functions to add new variables to the larger dataset.

The select function allows us to isolate the variables we wish to use to create the new values. The group_by, %>% and summarise functions allow for separate calculations to be performed within the unique values of the variable or variables being grouped.

We should create a new star review column called new_star and a column of the difference between the original average star reviews and the new star reviews. Assign the column of differences to diff.

# Average star review using only the authentic reviewers, plus the gap
# between the new rating and Yelp's original average
# data %>% select %>% group_by %>% summarise %>% mutate
avg_review_indian <- authentic_users %>% 
    select(business_id, business_name, city, stars, 
           avg_stars, is_indian, user_name) %>%
    group_by(city, business_name, avg_stars) %>%
    summarise(count = n(),            # authentic reviews per restaurant
              new_stars = mean(stars)) %>%
    mutate(diff = new_stars - avg_stars)

Detecting manipulation effect

Detect the effects of the modifications.

# Create a histogram of the original average star reviews (avg_stars)
hist(avg_review_indian$avg_stars)

# Create a histogram of the new authentic star reviews (new_stars)
hist(avg_review_indian$new_stars)

# Plot the distribution of changes to reviews 
hist(avg_review_indian$diff, main = "Changes in Star Reviews", xlab = "Change")

# Plot the change per restaurant 
ggplot(avg_review_indian, aes(x = 1:nrow(avg_review_indian), y = diff, fill = city)) +
    geom_bar(stat="identity", position = position_dodge()) + 
    theme_classic() + scale_fill_grey() + xlab("Businesses ID") + ylab("Change in Star Review")

# Sort restaurants by the size of the change so the plot reads left-to-right
avg_review_indian2 <- avg_review_indian[order(avg_review_indian$diff),]

# Plot the change per restaurant, sorted
ggplot(avg_review_indian2, aes(x = 1:nrow(avg_review_indian2), y = diff, fill = city)) +
    geom_bar(stat="identity", position = position_dodge()) + 
    theme_classic() + scale_fill_grey() + xlab("Businesses ID") + ylab("Change in Star Review")