Introduction

What factors influence the popularity of songs? The goal is to construct a model that predicts song ratings from the features in the dataset while minimizing RMSE.

# Install qdap only when it is missing, so re-knitting the document does not
# re-download the package on every run.
if (!requireNamespace("qdap", quietly = TRUE)) {
  install.packages("qdap", repos = "https://cloud.r-project.org/")
}
## 
## The downloaded binary packages are in
##  /var/folders/lg/5q0wpmsd1dd1vqj9_6j28zwh0000gn/T//RtmpYW9DFE/downloaded_packages
library(caret)
library(rpart)
library(rpart.plot)
library(dplyr)
library(tidyr)
library(tidyverse)
library(ggplot2)
library(gridExtra)
library(ranger)
library(qdap)

Load Data

The dataset characterizes around 20,000 songs based on auditory characteristics, such as loudness and speed, as well as performer and genre.

# df: training data (includes the rating target); score_df: scoring data
# (no rating column), the set we must predict.
# NOTE(review): absolute local paths — consider file.path()/relative paths
# for portability.
df <- read.csv('/Users/cathy/Documents/Columbia Sem 1/5200_FRAMEWORKS & METHOD/Kaggle/lalasongs22/analysisData.csv')
score_df <- read.csv("/Users/cathy/Documents/Columbia Sem 1/5200_FRAMEWORKS & METHOD/Kaggle/lalasongs22/scoringData.csv")

Explore Data

Data Structure

We train the model on the df dataset.

# Inspect the structure of the training data: 19,485 songs, 18 features
# plus the rating target.
str(df)
## 'data.frame':    19485 obs. of  19 variables:
##  $ id              : int  94500 64901 28440 19804 83560 16501 58033 67048 48848 95622 ...
##  $ performer       : chr  "Andy Williams" "Sandy Nelson" "Britney Spears" "Taylor Swift" ...
##  $ song            : chr  "......And Roses And Roses" "...And Then There Were Drums" "...Baby One More Time" "...Ready For It?" ...
##  $ genre           : chr  "['adult standards', 'brill building pop', 'easy listening', 'mellow gold']" "['rock-and-roll', 'space age pop', 'surf music']" "['dance pop', 'pop', 'post-teen pop']" "['pop', 'post-teen pop']" ...
##  $ track_duration  : num  166106 172066 211066 208186 182080 ...
##  $ track_explicit  : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ danceability    : num  0.154 0.588 0.759 0.613 0.45 0.57 0.612 0.253 0.575 0.615 ...
##  $ energy          : num  0.185 0.672 0.699 0.764 0.294 0.629 0.542 0.232 0.434 0.497 ...
##  $ key             : int  5 11 0 2 7 9 5 0 5 7 ...
##  $ loudness        : num  -14.06 -17.28 -5.75 -6.51 -12.02 ...
##  $ mode            : int  1 0 0 1 1 0 1 1 1 1 ...
##  $ speechiness     : num  0.0315 0.0361 0.0307 0.136 0.0318 0.0331 0.0264 0.0318 0.0312 0.439 ...
##  $ acousticness    : num  0.911 0.00256 0.202 0.0527 0.832 0.593 0.0781 0.805 0.735 0.016 ...
##  $ instrumentalness: num  2.67e-04 7.45e-01 1.31e-04 0.00 3.53e-05 1.36e-04 0.00 1.80e-04 6.59e-05 0.00 ...
##  $ liveness        : num  0.112 0.145 0.443 0.197 0.108 0.77 0.0763 0.0939 0.105 0.312 ...
##  $ valence         : num  0.15 0.801 0.907 0.417 0.146 0.308 0.433 0.307 0.348 0.769 ...
##  $ tempo           : num  84 122 93 160 141 ...
##  $ time_signature  : int  4 4 4 4 4 4 4 3 4 3 ...
##  $ rating          : int  36 16 70 64 19 34 44 34 47 26 ...

As you can see, there is no ‘rating’ variable in the score_df dataset, so this is the dataset for which we want to forecast ratings.

# Inspect the structure of the scoring data: same features as df but
# without the rating column.
str(score_df)
## 'data.frame':    4844 obs. of  18 variables:
##  $ id              : int  50400 96747 1824 67597 86944 85423 5500 82675 5926 57666 ...
##  $ performer       : chr  "Paul Davis" "Luther Vandross" "The Olympics" "Maxine Nightingale" ...
##  $ song            : chr  "'65 Love Affair" "'Til My Baby Comes Home" "(Baby) Hully Gully" "(Bringing Out) The Girl In Me" ...
##  $ genre           : chr  "['album rock', 'bubblegum pop', 'country rock', 'folk rock', 'mellow gold', 'new wave pop', 'soft rock', 'yacht rock']" "['funk', 'motown', 'neo soul', 'new jack swing', 'quiet storm', 'r&b', 'soul', 'urban contemporary']" "['doo-wop', 'rhythm and blues']" "['classic uk pop']" ...
##  $ track_duration  : num  219813 332226 127829 210973 180133 ...
##  $ track_explicit  : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ danceability    : num  0.647 0.804 0.744 0.712 0.665 0.442 0.61 0.478 0.499 0.801 ...
##  $ energy          : num  0.686 0.714 0.47 0.753 0.552 0.717 0.377 0.298 0.249 0.875 ...
##  $ key             : int  2 11 7 9 6 9 9 2 5 7 ...
##  $ loudness        : num  -4.25 -6.71 -10.29 -7.67 -10.18 ...
##  $ mode            : int  0 0 1 1 1 0 1 1 1 1 ...
##  $ speechiness     : num  0.0274 0.183 0.035 0.0399 0.0307 0.0395 0.0935 0.0342 0.0422 0.0995 ...
##  $ acousticness    : num  0.432 0.0567 0.659 0.756 0.493 0.518 0.414 0.74 0.733 0.00366 ...
##  $ instrumentalness: num  6.19e-06 6.21e-06 0.00 1.58e-06 2.72e-02 2.62e-01 0.00 1.11e-05 0.00 4.25e-01 ...
##  $ liveness        : num  0.133 0.0253 0.256 0.169 0.421 0.184 0.0734 0.139 0.105 0.182 ...
##  $ valence         : num  0.952 0.802 0.908 0.953 0.899 0.376 0.519 0.376 0.539 0.637 ...
##  $ tempo           : num  155.7 139.7 116.2 116.2 96.9 ...
##  $ time_signature  : int  4 4 4 4 4 4 4 4 4 4 ...

Features

Examine the correlation between each feature and rating.

Track Duration

# Pearson correlation of each candidate audio feature with the target rating.
cor_track_duration <- cor(df$track_duration, df$rating)

Danceability

cor_danceability <- cor(df$danceability, df$rating)

Energy

cor_energy <- cor(df$energy, df$rating)

Key

cor_key <- cor(df$key, df$rating)

Loudness

cor_loudness <- cor(df$loudness, df$rating)

Mode

cor_mode <- cor(df$mode, df$rating)

Speechiness

cor_speechiness <- cor(df$speechiness, df$rating)

Acousticness

cor_acousticness <- cor(df$acousticness, df$rating)

Instrumentalness

cor_instrumentalness <- cor(df$instrumentalness, df$rating)

Liveness

cor_liveness <- cor(df$liveness, df$rating)

Valence

cor_valence <- cor(df$valence, df$rating)

Tempo

cor_tempo <- cor(df$tempo, df$rating)

Time Signature

cor_time_signature <- cor(df$time_signature, df$rating)

Overview of Correlation

# Collect every feature's Pearson correlation with rating in one table.
# Recomputed directly from df with vapply so this table does not depend on
# the 13 individually named cor_* variables above.
features <- c("track_duration", "danceability", "energy", "key", "loudness",
              "mode", "speechiness", "acousticness", "instrumentalness",
              "liveness", "valence", "tempo", "time_signature")
correlation_data <- data.frame(
  Features = features,
  # unname() keeps the default 1..13 row names, matching the original output.
  Correlation_with_Rating = unname(
    vapply(features, function(f) cor(df[[f]], df$rating), numeric(1))
  )
)
print(correlation_data)
##            Features Correlation_with_Rating
## 1    track_duration             0.146732322
## 2      danceability             0.138889626
## 3            energy             0.102413789
## 4               key             0.001075925
## 5          loudness             0.196450769
## 6              mode            -0.064136001
## 7       speechiness             0.076755595
## 8      acousticness            -0.197898859
## 9  instrumentalness            -0.088368234
## 10         liveness            -0.058780344
## 11          valence            -0.093921139
## 12            tempo             0.012671405
## 13   time_signature             0.092397211

Visual of Correlation

We can observe the relationship between all attributes and ratings graphically.

# One linear-trend plot per feature against rating, on a shared y scale.
# FIX: the original energy panel alone kept the confidence band (it was
# missing se = FALSE); building all panels in a loop makes them consistent
# and removes 13 copy-pasted ggplot calls.
plot_features <- c("track_duration", "danceability", "energy", "key",
                   "loudness", "mode", "speechiness", "acousticness",
                   "instrumentalness", "liveness", "valence", "tempo",
                   "time_signature")
feature_plots <- lapply(plot_features, function(f) {
  ggplot(df, aes(.data[[f]], rating)) +
    geom_smooth(method = "lm", se = FALSE) +
    labs(x = f) +
    scale_y_continuous(limits = c(0, 90))
})
combined_plots <- do.call(grid.arrange, c(feature_plots, nrow = 4))

Clean Data

Genre is an important factor influencing ratings, so I convert genre into dummy variables so that it can be included in the prediction model.

# Turn the raw genre strings (e.g. "['dance pop', 'pop']") into a list of
# clean genre tokens per song. One regex character class replaces the four
# separate gsub() calls, and the helper removes the df/score_df duplication.
clean_genre <- function(genre) {
  # Strip spaces, square brackets and single quotes, then split on commas.
  strsplit(gsub("[][' ]", "", genre), ",")
}

df$genre <- clean_genre(df$genre)
score_df$genre <- clean_genre(score_df$genre)

# One dummy (indicator) column per genre, via qdap::mtabulate.
df <- cbind(df, mtabulate(df$genre))
score_df <- cbind(score_df, mtabulate(score_df$genre))

# Keep only genres present in BOTH files so the model's predictors exist in
# the scoring data; df additionally keeps its rating target.
shared_columns <- intersect(names(df), names(score_df))
score_df <- select(score_df, all_of(shared_columns))
df <- select(df, c("rating", all_of(shared_columns)))

Split Data

# 70/30 train/test split stratified on rating (groups = 40 quantile bins
# keeps the rating distribution similar across the two sets).
set.seed(1031)
# FIX: spell out FALSE instead of the reassignable shorthand F.
split <- createDataPartition(y = df$rating, p = 0.7, list = FALSE, groups = 40)
train <- df[split, ]
test <- df[-split, ]

Predictive Model- Tuned Ranger Forest

To derive value from ranger, it is important to tune model hyperparameters. Here we are going to tune mtry, splitrule and min.node.size with 5-fold cross-validation using the caret framework.

# 5-fold cross-validation over mtry, splitrule and min.node.size for a
# ranger forest, using the caret framework.
trControl <- trainControl(method = "cv", number = 5)
tuneGrid <- expand.grid(
  mtry = 3:10,
  splitrule = c("variance", "extratrees", "maxstat"),
  min.node.size = c(2, 3, 4, 5, 10)
)
set.seed(1031)
# NOTE: the original formula listed poprock twice and carried stray "+ +"
# unary-plus terms; duplicate terms are dropped by the model matrix, so
# removing them does not change the fitted model.
cvModel <- train(
  rating ~ track_duration + danceability + key + loudness + speechiness +
    tempo + time_signature + contemporarycountry + countryroad + dancepop +
    disco + poprap + poprock + pop + hippop + electropop + edm + rap + rock +
    softrock + soul + trap + vocaljazz + newjackswing + motown + latin +
    hardrock + hardcorehiphop + metal,
  data = train,
  method = "ranger",
  num.trees = 200,
  trControl = trControl,
  tuneGrid = tuneGrid
)

# Best hyperparameter combination found by cross-validation.
cvModel$bestTune
##    mtry splitrule min.node.size
## 19    4  variance             5

Now that we have the best combination of hyperparameters, we can use it to fit a ranger forest model and make predictions.

# Fit the final ranger forest with the CV-selected hyperparameters.
set.seed(1031)
# NOTE: the original formula listed poprock twice and carried stray "+ +"
# unary-plus terms; duplicates are dropped by the model matrix, so the
# cleaned formula fits the same model.
tuned_forest_ranger <- ranger(
  rating ~ track_duration + danceability + key + loudness + speechiness +
    tempo + time_signature + contemporarycountry + countryroad + dancepop +
    disco + poprap + poprock + pop + hippop + electropop + edm + rap + rock +
    softrock + soul + trap + vocaljazz + newjackswing + motown + latin +
    hardrock + hardcorehiphop + metal,
  data = train,
  num.trees = 200,
  mtry = cvModel$bestTune$mtry,
  min.node.size = cvModel$bestTune$min.node.size,
  splitrule = cvModel$bestTune$splitrule
)

Predict Train Data

Fit the model with the best hyperparameters, resulting with a RMSE on train data of 12.08.

#rmse_train_forest_ranger
pred_train = predict(tuned_forest_ranger, data = train, num.trees = 200)
rmse_train_tuned_forest_ranger = sqrt(mean((pred_train$predictions - train$rating)^2))
rmse_train_tuned_forest_ranger
## [1] 12.08452

Predict Test Data

The RMSE of test data is 14.7 and is similar to the final result when I predict the final data, demonstrating that the predictive model is effective.

#rmse_test_forest_ranger
pred = predict(tuned_forest_ranger, data = test, num.trees = 200)
rmse_tuned_forest_ranger = sqrt(mean((pred$predictions - test$rating)^2))
rmse_tuned_forest_ranger
## [1] 14.77343

Predict Final Data

# Predict ratings for the scoring set and write the submission file.
pred_final <- predict(tuned_forest_ranger, data = score_df, num.trees = 200)
# BUG FIX: extract $predictions — passing the whole ranger.prediction object
# to data.frame() yields wrong column names (rating.predictions, ...), which
# breaks the expected id/rating submission format.
submissionFile <- data.frame(id = score_df$id,
                             rating = pred_final$predictions)
write.csv(submissionFile, "submission_tuned_forest_ranger.csv",
          row.names = FALSE)

Final Thought

After experimenting with other models, the tuned ranger forest model is the one that best predicts this data. If I could go back in time, I would update the train model to incorporate the performer feature because it may be an important factor that influences the rating. This research taught me that, in addition to training the model, understanding what to put in the model is critical, because the brainstorming process may result in more combinations of characteristics that help forecast the model better.