What factors influence the popularity of songs? The goal is to construct a model to predict rating and get a lower RMSE based on features of songs in the dataset.
# Install qdap (provides mtabulate, used below to build genre dummy variables).
# NOTE(review): installing packages inside an analysis script re-runs on every
# render; consider guarding with requireNamespace() — left as-is here.
install.packages('qdap', repos = "https://cloud.r-project.org/")
##
## The downloaded binary packages are in
## /var/folders/lg/5q0wpmsd1dd1vqj9_6j28zwh0000gn/T//RtmpYW9DFE/downloaded_packages
library(caret)
library(rpart)
library(rpart.plot)
library(dplyr)
library(tidyr)
library(tidyverse)
library(ggplot2)
library(gridExtra)
library(ranger)
library(qdap)
The dataset characterizes around 20,000 songs based on auditory characteristics, such as loudness and speed, as well as performer and genre.
# Load the training data (analysisData: has the rating outcome) and the
# scoring data (scoringData: ratings to be predicted).
# The extraction had displaced the assignment targets; reconstructed here.
# Paths are machine-specific — adjust as needed.
df <- read.csv("/Users/cathy/Documents/Columbia Sem 1/5200_FRAMEWORKS & METHOD/Kaggle/lalasongs22/analysisData.csv")
score_df <- read.csv("/Users/cathy/Documents/Columbia Sem 1/5200_FRAMEWORKS & METHOD/Kaggle/lalasongs22/scoringData.csv")
We train the model on the df (analysisData) dataset.
# Inspect the structure of the training data (19,485 songs, 19 variables).
str(df)
## 'data.frame': 19485 obs. of 19 variables:
## $ id : int 94500 64901 28440 19804 83560 16501 58033 67048 48848 95622 ...
## $ performer : chr "Andy Williams" "Sandy Nelson" "Britney Spears" "Taylor Swift" ...
## $ song : chr "......And Roses And Roses" "...And Then There Were Drums" "...Baby One More Time" "...Ready For It?" ...
## $ genre : chr "['adult standards', 'brill building pop', 'easy listening', 'mellow gold']" "['rock-and-roll', 'space age pop', 'surf music']" "['dance pop', 'pop', 'post-teen pop']" "['pop', 'post-teen pop']" ...
## $ track_duration : num 166106 172066 211066 208186 182080 ...
## $ track_explicit : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ danceability : num 0.154 0.588 0.759 0.613 0.45 0.57 0.612 0.253 0.575 0.615 ...
## $ energy : num 0.185 0.672 0.699 0.764 0.294 0.629 0.542 0.232 0.434 0.497 ...
## $ key : int 5 11 0 2 7 9 5 0 5 7 ...
## $ loudness : num -14.06 -17.28 -5.75 -6.51 -12.02 ...
## $ mode : int 1 0 0 1 1 0 1 1 1 1 ...
## $ speechiness : num 0.0315 0.0361 0.0307 0.136 0.0318 0.0331 0.0264 0.0318 0.0312 0.439 ...
## $ acousticness : num 0.911 0.00256 0.202 0.0527 0.832 0.593 0.0781 0.805 0.735 0.016 ...
## $ instrumentalness: num 2.67e-04 7.45e-01 1.31e-04 0.00 3.53e-05 1.36e-04 0.00 1.80e-04 6.59e-05 0.00 ...
## $ liveness : num 0.112 0.145 0.443 0.197 0.108 0.77 0.0763 0.0939 0.105 0.312 ...
## $ valence : num 0.15 0.801 0.907 0.417 0.146 0.308 0.433 0.307 0.348 0.769 ...
## $ tempo : num 84 122 93 160 141 ...
## $ time_signature : int 4 4 4 4 4 4 4 3 4 3 ...
## $ rating : int 36 16 70 64 19 34 44 34 47 26 ...
As you can see, there is no ‘rating’ variable in the score_df dataset, so this is the dataset for which we want to forecast ratings.
# Inspect the structure of the scoring data (4,844 songs; no rating column).
str(score_df)
## 'data.frame': 4844 obs. of 18 variables:
## $ id : int 50400 96747 1824 67597 86944 85423 5500 82675 5926 57666 ...
## $ performer : chr "Paul Davis" "Luther Vandross" "The Olympics" "Maxine Nightingale" ...
## $ song : chr "'65 Love Affair" "'Til My Baby Comes Home" "(Baby) Hully Gully" "(Bringing Out) The Girl In Me" ...
## $ genre : chr "['album rock', 'bubblegum pop', 'country rock', 'folk rock', 'mellow gold', 'new wave pop', 'soft rock', 'yacht rock']" "['funk', 'motown', 'neo soul', 'new jack swing', 'quiet storm', 'r&b', 'soul', 'urban contemporary']" "['doo-wop', 'rhythm and blues']" "['classic uk pop']" ...
## $ track_duration : num 219813 332226 127829 210973 180133 ...
## $ track_explicit : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ danceability : num 0.647 0.804 0.744 0.712 0.665 0.442 0.61 0.478 0.499 0.801 ...
## $ energy : num 0.686 0.714 0.47 0.753 0.552 0.717 0.377 0.298 0.249 0.875 ...
## $ key : int 2 11 7 9 6 9 9 2 5 7 ...
## $ loudness : num -4.25 -6.71 -10.29 -7.67 -10.18 ...
## $ mode : int 0 0 1 1 1 0 1 1 1 1 ...
## $ speechiness : num 0.0274 0.183 0.035 0.0399 0.0307 0.0395 0.0935 0.0342 0.0422 0.0995 ...
## $ acousticness : num 0.432 0.0567 0.659 0.756 0.493 0.518 0.414 0.74 0.733 0.00366 ...
## $ instrumentalness: num 6.19e-06 6.21e-06 0.00 1.58e-06 2.72e-02 2.62e-01 0.00 1.11e-05 0.00 4.25e-01 ...
## $ liveness : num 0.133 0.0253 0.256 0.169 0.421 0.184 0.0734 0.139 0.105 0.182 ...
## $ valence : num 0.952 0.802 0.908 0.953 0.899 0.376 0.519 0.376 0.539 0.637 ...
## $ tempo : num 155.7 139.7 116.2 116.2 96.9 ...
## $ time_signature : int 4 4 4 4 4 4 4 4 4 4 ...
Examine the correlation between each feature and rating.
# Correlation of each numeric feature with rating.
# A single vapply over the feature names replaces the thirteen copy-pasted
# cor() calls of the original (whose assignment targets had also been
# displaced by the extraction). Output is identical.
feature_names <- c("track_duration", "danceability", "energy", "key",
                   "loudness", "mode", "speechiness", "acousticness",
                   "instrumentalness", "liveness", "valence", "tempo",
                   "time_signature")
correlation_data <- data.frame(
  Features = feature_names,
  Correlation_with_Rating = vapply(
    feature_names,
    function(f) cor(df[[f]], df$rating),
    numeric(1),
    USE.NAMES = FALSE  # keep default 1..13 row names in the data frame
  )
)
print(correlation_data)
## Features Correlation_with_Rating
## 1 track_duration 0.146732322
## 2 danceability 0.138889626
## 3 energy 0.102413789
## 4 key 0.001075925
## 5 loudness 0.196450769
## 6 mode -0.064136001
## 7 speechiness 0.076755595
## 8 acousticness -0.197898859
## 9 instrumentalness -0.088368234
## 10 liveness -0.058780344
## 11 valence -0.093921139
## 12 tempo 0.012671405
## 13 time_signature 0.092397211
We can observe the relationship between all attributes and ratings graphically.
# Visualize the (linear) relationship between each feature and rating.
# One fitted-line plot per feature, arranged in a 4-row grid.
# A helper removes the thirteen near-identical ggplot calls; the original's
# energy plot inconsistently used formula = y ~ x without se = FALSE —
# normalized here to match the others.
plot_feature <- function(feat) {
  ggplot(df, aes(.data[[feat]], rating)) +
    geom_smooth(method = "lm", se = FALSE) +
    scale_y_continuous(limits = c(0, 90))
}
combined_plots <- grid.arrange(
  grobs = lapply(
    c("track_duration", "danceability", "energy", "key", "loudness",
      "mode", "speechiness", "acousticness", "instrumentalness",
      "liveness", "valence", "tempo", "time_signature"),
    plot_feature
  ),
  nrow = 4
)
Genre is an important factor that influences ratings, so I convert genre into dummy variables so that it may be included in the prediction model.
# Turn the raw genre strings into one 0/1 dummy column per genre, in both
# data frames. Raw values look like "['dance pop', 'pop']" — strip the
# brackets, quotes and spaces, leaving a comma-separated list.
# One character class replaces the original's four sequential gsub calls.
clean_genre <- function(genre) {
  gsub("[][' ]", "", genre)
}
df$genre <- clean_genre(df$genre)
score_df$genre <- clean_genre(score_df$genre)

# Split genres by ',' in both data frames (each cell becomes a character
# vector of genres).
df$genre <- strsplit(df$genre, ",")
score_df$genre <- strsplit(score_df$genre, ",")

# Tabulate the genre lists into dummy variables (qdap::mtabulate gives one
# count column per genre) and bind them onto the original columns.
df <- cbind(df, mtabulate(df$genre))
score_df <- cbind(score_df, mtabulate(score_df$genre))

# Keep only columns present in BOTH data frames, so every predictor used in
# training also exists at scoring time; df additionally keeps rating.
shared_columns <- intersect(names(df), names(score_df))
score_df <- select(score_df, all_of(shared_columns))
df <- select(df, c("rating", all_of(shared_columns)))
# Reproducible 70/30 train/test split, stratified on rating
# (groups = 40 bins, since rating is numeric).
set.seed(1031)
split <- createDataPartition(y = df$rating, p = 0.7, list = FALSE, groups = 40)
train <- df[split, ]
test <- df[-split, ]
To derive value from ranger, it is important to tune model hyperparameters. Here we are going to tune mtry, splitrule and min.node.size with 5-fold cross-validation using the caret framework.
# Tune a ranger random forest via caret: 5-fold cross-validation over
# mtry, splitrule and min.node.size.
trControl <- trainControl(method = "cv", number = 5)
tuneGrid <- expand.grid(
  mtry = 3:10,
  splitrule = c("variance", "extratrees", "maxstat"),
  min.node.size = c(2, 3, 4, 5, 10)
)
set.seed(1031)
# NOTE(review): the original formula listed poprock twice; the duplicate
# term is dropped here (duplicates in a formula are redundant).
cvModel <- train(
  rating ~ track_duration + danceability + key + loudness + speechiness +
    tempo + time_signature + contemporarycountry + countryroad + dancepop +
    disco + poprap + poprock + pop + hippop + electropop + edm + rap +
    rock + softrock + soul + trap + vocaljazz + newjackswing + motown +
    latin + hardrock + hardcorehiphop + metal,
  data = train,
  method = "ranger",
  num.trees = 200,
  trControl = trControl,
  tuneGrid = tuneGrid
)
# Best hyperparameter combination found by cross-validation.
cvModel$bestTune
## mtry splitrule min.node.size
## 19 4 variance 5
Now that we have the best combination of hyperparameters, we can use it to fit a ranger forest model and make predictions.
# Fit the final ranger forest using the CV-selected hyperparameters.
# NOTE(review): the original formula listed poprock twice; the duplicate
# term is dropped here.
set.seed(1031)
tuned_forest_ranger <- ranger(
  rating ~ track_duration + danceability + key + loudness + speechiness +
    tempo + time_signature + contemporarycountry + countryroad + dancepop +
    disco + poprap + poprock + pop + hippop + electropop + edm + rap +
    rock + softrock + soul + trap + vocaljazz + newjackswing + motown +
    latin + hardrock + hardcorehiphop + metal,
  data = train,
  num.trees = 200,
  mtry = cvModel$bestTune$mtry,
  min.node.size = cvModel$bestTune$min.node.size,
  splitrule = cvModel$bestTune$splitrule
)
Fitting the model with the best hyperparameters results in an RMSE on the training data of 12.08.
# RMSE on the training data (ranger's predict returns an object; the
# numeric predictions live in $predictions).
pred_train <- predict(tuned_forest_ranger, data = train, num.trees = 200)
rmse_train_tuned_forest_ranger <- sqrt(mean((pred_train$predictions - train$rating)^2))
rmse_train_tuned_forest_ranger
## [1] 12.08452
The RMSE on the test data is 14.77, similar to the final result when predicting the scoring data, demonstrating that the predictive model generalizes well.
# RMSE on the held-out test data — the honest estimate of out-of-sample error.
pred <- predict(tuned_forest_ranger, data = test, num.trees = 200)
rmse_tuned_forest_ranger <- sqrt(mean((pred$predictions - test$rating)^2))
rmse_tuned_forest_ranger
## [1] 14.77343
# Predict the scoring data and write the Kaggle submission file.
pred_final <- predict(tuned_forest_ranger, data = score_df, num.trees = 200)
# Use $predictions (a plain numeric vector), consistent with the RMSE code
# above; passing the ranger prediction object itself into data.frame()
# would produce a mis-named column ("rating.predictions").
submissionFile <- data.frame(id = score_df$id, rating = pred_final$predictions)
write.csv(submissionFile, "submission_tuned_forest_ranger.csv", row.names = FALSE)
After experimenting with other models, the tuned ranger forest model is the one that best predicts this data. If I could go back in time, I would update the train model to incorporate the performer feature because it may be an important factor that influences the rating. This research taught me that, in addition to training the model, understanding what to put in the model is critical, because the brainstorming process may result in more combinations of characteristics that help forecast the model better.