
What factors influence the popularity of songs? The goal is to build a model that predicts a song's rating from the features in the dataset, with as low an RMSE as possible.

Load Data

The dataset characterizes around 20,000 songs based on auditory characteristics, such as loudness and speed, as well as performer and genre.

# Load the analysis data (contains `rating`, used for training) and the
# scoring data (no `rating`, used for the final submission).
df <- read.csv(
  "/Users/cathy/Documents/Columbia Sem 1/5200_FRAMEWORKS & METHOD/Kaggle/lalasongs22/analysisData.csv"
)
score_df <- read.csv(
  "/Users/cathy/Documents/Columbia Sem 1/5200_FRAMEWORKS & METHOD/Kaggle/lalasongs22/scoringData.csv"
)

Explore Data

Data Structure

We train the model on the df dataset.

As you can see, the score_df dataset has no ‘rating’ variable; it is the dataset for which we want to predict ratings.

Examine the correlation between each feature and rating.

Track Duration

# Correlation of each numeric song feature with `rating` (cor() defaults to
# Pearson). Columns are extracted with [[ ]] so names must match exactly.

# Track duration
cor_track_duration <- cor(df[["track_duration"]], df[["rating"]])

# Danceability
cor_danceability <- cor(df[["danceability"]], df[["rating"]])

# Energy
cor_energy <- cor(df[["energy"]], df[["rating"]])

# Key
cor_key <- cor(df[["key"]], df[["rating"]])

# Loudness
cor_loudness <- cor(df[["loudness"]], df[["rating"]])

# Mode
cor_mode <- cor(df[["mode"]], df[["rating"]])

# Speechiness
cor_speechiness <- cor(df[["speechiness"]], df[["rating"]])

# Acousticness
cor_acousticness <- cor(df[["acousticness"]], df[["rating"]])

# Instrumentalness
cor_instrumentalness <- cor(df[["instrumentalness"]], df[["rating"]])

# Liveness
cor_liveness <- cor(df[["liveness"]], df[["rating"]])

# Valence
cor_valence <- cor(df[["valence"]], df[["rating"]])

# Tempo
cor_tempo <- cor(df[["tempo"]], df[["rating"]])

Time Signature

# Correlation between time signature and rating.
cor_time_signature <- cor(df[["time_signature"]], df[["rating"]])

Overview of Correlation

# Collect the per-feature correlations into one table. The two vectors are
# listed in the same order so each coefficient lines up with its feature name.
correlation_data <- data.frame(
  Features = c(
    "track_duration", "danceability", "energy", "key", "loudness",
    "mode", "speechiness", "acousticness", "instrumentalness", "liveness",
    "valence", "tempo", "time_signature"
  ),
  Correlation_with_Rating = c(
    cor_track_duration, cor_danceability, cor_energy, cor_key,
    cor_loudness, cor_mode, cor_speechiness, cor_acousticness,
    cor_instrumentalness, cor_liveness, cor_valence, cor_tempo,
    cor_time_signature
  )
)

# Display the table.
correlation_data
##            Features Correlation_with_Rating
## 1    track_duration             0.146732322
## 2      danceability             0.138889626
## 3            energy             0.102413789
## 4               key             0.001075925
## 5          loudness             0.196450769
## 6              mode            -0.064136001
## 7       speechiness             0.076755595
## 8      acousticness            -0.197898859
## 9  instrumentalness            -0.088368234
## 10         liveness            -0.058780344
## 11          valence            -0.093921139
## 12            tempo             0.012671405
## 13   time_signature             0.092397211

Visual of Correlation

We can observe the relationship between all attributes and ratings graphically.

# Build one rating-vs-feature panel that shows only the fitted linear trend.
#   feature: name of a column in `df`, given as a string.
#   se:      whether to draw the confidence band around the fitted line.
# The y axis is fixed to 0-90 so every panel shares the same scale.
plot_rating_trend <- function(feature, se = FALSE) {
  ggplot(df, aes(x = .data[[feature]], y = rating)) +
    # Spelling out the formula avoids the "using formula" message per panel.
    geom_smooth(method = "lm", formula = y ~ x, se = se) +
    scale_y_continuous(limits = c(0, 90)) +
    labs(x = feature)
}

trend_features <- c(
  "track_duration", "danceability", "energy", "key", "loudness", "mode",
  "speechiness", "acousticness", "instrumentalness", "liveness", "valence",
  "tempo", "time_signature"
)

trend_panels <- lapply(trend_features, function(feature) {
  # NOTE(review): only the energy panel drew a confidence band in the
  # original code; kept as-is here -- confirm whether that was intended.
  plot_rating_trend(feature, se = identical(feature, "energy"))
})

# Arrange the 13 panels on a grid with 4 rows.
combined_plots <- grid.arrange(grobs = trend_panels, nrow = 4)

Clean Data

Genre is an important factor that influences ratings, so I convert genre into dummy variables so that it can be included in the prediction model.

# Remove the list markup around the genre labels: spaces, square brackets and
# single quotes. Inside the character class, `]` comes first so that it is
# taken literally.
strip_genre_markup <- function(x) {
  gsub("[][' ]", "", x)
}

df$genre <- strip_genre_markup(df$genre)
score_df$genre <- strip_genre_markup(score_df$genre)

# Split the comma-separated genres into one character vector per song,
# for both the analysis and the scoring data.
df$genre <- strsplit(df$genre, ",")
score_df$genre <- strsplit(score_df$genre, ",")

# Turn the genre vectors into dummy variables (one column per genre) and
# append them to each data set.
df <- cbind(df, mtabulate(df$genre))
score_df <- cbind(score_df, mtabulate(score_df$genre))

# Keep only the columns (and therefore genres) present in both data sets;
# the analysis data additionally keeps the `rating` outcome.
shared_columns <- intersect(names(df), names(score_df))
score_df <- select(score_df, all_of(shared_columns))
df <- select(df, c("rating", all_of(shared_columns)))

Split Data

# Fix the RNG seed so the train/test partition, and every RMSE reported
# below, can be reproduced from run to run.
set.seed(1031)

# 70/30 split; `groups = 40` controls how many quantile groups of `rating`
# caret uses when balancing the numeric outcome across the two sets.
split <- createDataPartition(y = df$rating, p = 0.7, list = FALSE, groups = 40)
train <- df[split, ]
test <- df[-split, ]

Predictive Model- Tuned Ranger Forest

To derive value from ranger, it is important to tune model hyperparameters. Here we are going to tune mtry, splitrule and min.node.size with 5-fold cross-validation using the caret framework.

# Fix the seed so the cross-validation folds are reproducible.
set.seed(1031)

# 5-fold cross-validation, as described in the text above.
trControl <- trainControl(method = "cv", number = 5)

# Candidate hyperparameters: every combination of mtry, splitrule and
# min.node.size is evaluated.
tuneGrid <- expand.grid(
  mtry = 3:10,
  splitrule = c("variance", "extratrees", "maxstat"),
  min.node.size = c(2, 3, 4, 5, 10)
)

# Tune a ranger forest through caret. num.trees is forwarded to ranger and
# matches the 200 trees used for the final model.
cvModel <- train(
  rating ~ track_duration + danceability + key + loudness + speechiness +
    tempo + time_signature + contemporarycountry + countryroad + dancepop +
    disco + poprap + poprock + pop + hippop + electropop + edm + rap + rock +
    softrock + soul + trap + vocaljazz,
  data = train,
  method = "ranger",
  num.trees = 200,
  trControl = trControl,
  tuneGrid = tuneGrid
)

# Best hyperparameter combination found by cross-validation.
cvModel$bestTune

##    mtry splitrule min.node.size
## 19    4  variance             5

Now that we have the best combination of hyperparameters, we can use it to fit a ranger forest model and make predictions.

# Fit the final ranger forest on the training data using all three
# hyperparameters selected by cross-validation (mtry, min.node.size,
# splitrule).
tuned_forest_ranger <- ranger(
  rating ~ track_duration + danceability + key + loudness + speechiness +
    tempo + time_signature + contemporarycountry + countryroad + dancepop +
    disco + poprap + poprock + pop + hippop + electropop + edm + rap + rock +
    softrock + soul + trap + vocaljazz,
  data = train,
  num.trees = 200,
  mtry = cvModel$bestTune$mtry,
  min.node.size = cvModel$bestTune$min.node.size,
  # expand.grid() may store splitrule as a factor; ranger expects a string.
  splitrule = as.character(cvModel$bestTune$splitrule),
  # Fixed seed so the fitted forest is reproducible.
  seed = 1031
)

Predict Train Data

Fit the model with the best hyperparameters, resulting in an RMSE on the train data of 12.08.

# In-sample check: predict the training rows and compute the RMSE.
pred_train <- predict(tuned_forest_ranger, data = train, num.trees = 200)
rmse_train_tuned_forest_ranger <- sqrt(
  mean((pred_train$predictions - train$rating)^2)
)

# Display the train RMSE.
rmse_train_tuned_forest_ranger
## [1] 12.08452

Predict Test Data

The RMSE on the test data is 14.77, which is similar to the result obtained when I predict the final scoring data, indicating that the predictive model is effective.

# Out-of-sample check: predict the held-out test rows and compute the RMSE.
pred <- predict(tuned_forest_ranger, data = test, num.trees = 200)
rmse_tuned_forest_ranger <- sqrt(mean((pred$predictions - test$rating)^2))

# Display the test RMSE.
rmse_tuned_forest_ranger
## [1] 14.77343

Predict Final Data

# Predict ratings for the scoring data and write the submission file.
pred_final <- predict(tuned_forest_ranger, data = score_df, num.trees = 200)

# predict() returns a prediction object; the numeric ratings are in its
# `predictions` element, which is what the train/test RMSE code also uses.
submissionFile <- data.frame(
  id = score_df$id,
  rating = pred_final$predictions
)
write.csv(
  submissionFile,
  "submission_tuned_forest_ranger.csv",
  row.names = FALSE
)

Final Thought

After experimenting with other models, I found that the tuned ranger forest model is the one that best predicts this data. If I could go back in time, I would update the model to incorporate the performer feature, because it may be an important factor that influences the rating. This research taught me that, in addition to training the model, understanding what to put in the model is critical, because the brainstorming process may surface more combinations of features that help the model forecast better.