In this project, I analyze trends in the Billboard Hot 100 music charts over the last 61 years, and integrate data from Spotify’s API to make predictions about songs on the charts.

SPOTIFY API In this chunk, I create three functions which I use throughout my code to make calls to Spotify’s API. The first, spotifyAPI(), takes in a URL meant to be an “endpoint” in Spotify’s API, which will return data in a JSON format. The second, getTrackIDFromQuery(), takes in a character query and searches for it through Spotify’s API, then returns the first track ID found in the results. I use this to get the spotify track IDs of songs on the Billboard charts by entering the track title and the artist’s name as query parameters. The final function takes a spotify track id and returns a JSON containing data describing the audio features of that track. This includes many descriptors of the content of the track itself. More can be found at https://developer.spotify.com/console/get-audio-features-track/.

library(jsonlite)
library(httr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rvest)
## Loading required package: xml2
clientID <- "f37c56713eee4eb390b4fa3d0b963ffd"
secret <- "3488df63939040ff8d4715a694c60fbd"

accessToken <- content(POST(
  "https://accounts.spotify.com/api/token",
  accept_json(),
  authenticate(clientID, secret),
  body = list(grant_type = "client_credentials"),
  encode = "form",
  verbose()
))$access_token
authentication <- paste0("Bearer ", accessToken)

spotifyAPI <- function(URL) {
  return(GET(url = URL, add_headers(Authorization = authentication)))
}

getTrackIDFromQuery <- function(query) {
  return(fromJSON(content(spotifyAPI(paste0("https://api.spotify.com/v1/search?q=", mgsub(pattern = " ", replacement = "%20", x = query), "&type=track")), "text"))[["tracks"]][["items"]][["id"]][1])
}

getAudioFeatures <- function(trackID) {
  if (is.na(trackID)) {
    return(NA)
  } 
  out <- fromJSON(content(spotifyAPI(paste0("https://api.spotify.com/v1/audio-features/", trackID)), "text"))
  if (out[[1]][[1]] == 404) {
    return(NA)
  }
  return(out)
}

WEB SCRAPING TOOL This chunk is a helper function for scraping Billboard’s website. It takes in the output of rvest::read_html(url) and correctly parses the nodes to extract the track title, track artist, and ranking for the week. I use my spotify method getTrackIDFromQuery to link the spotify track ID to each track. I then return the newly-created data frame.

library(textclean)
extractData <- function(webpage) {
  newDF <- data.frame(track_title = character(), track_artist = character(), track_rank = numeric())
  newDF <- rbind(newDF, data.frame(
    mgsub(pattern = c("\"", "\n", "`", "%"), replacement = "", x = as.character(html_text(html_nodes(webpage, ".chart-list-item__title-text , .chart-number-one__title")))), 
    mgsub(pattern = c("\"", "\n", "`", "%"), replacement = "", x = as.character(html_text(html_nodes(webpage, ".chart-number-one__artist , .chart-list-item__artist")))), 
    as.numeric(mgsub(pattern = c("\"", "\n", "`", "%"), replacement = "", x = as.numeric(html_text(html_nodes(webpage, ".chart-list-item__rank , .chart-number-one__rank img")))))
    ))
  names(newDF) <- c("track_title", "track_artist", "track_rank")
  newDF$track_rank[1] <- 1
  
  i <- 1
  spotifyTrackID <- c()
  for (i in 1:nrow(newDF)) {
    newID <- getTrackIDFromQuery(paste(newDF$track_title[i], newDF$track_artist[i]))
    if (is.null(newID)) {
      newID <- NA
    }
    spotifyTrackID <- c(spotifyTrackID, newID)
  }
  
  newDF <- cbind(newDF, spotifyTrackID)
  return(newDF)
}

Web Scraping Billboard First, I create a vector of all dates from the first Hot 100 chart until today, separated by a year. Each Hot 100 chart can be found at the rootURL + a date. I call read_html on each combination of the root and dates, and bind each result to a single data frame. I also attatch the date to each record.

dateVector <- seq(as.Date("1958-08-04"), as.Date("2018-12-08"), "1 year")

rootURL <- "https://www.billboard.com/charts/hot-100/"

i <- 1
billboardData <- extractData(read_html(paste0(rootURL, dateVector[i]))) 
billboardData <- cbind(billboardData, rep(x = dateVector[i], times = 100))
for (i in 2:61) {
  newData <- extractData(read_html(paste0(rootURL, dateVector[i])))
  newData <- cbind(newData, rep(x = dateVector[i], times = 100))
  billboardData <- rbind(billboardData, newData)
}
colnames(billboardData)[5] <- "chart_date"

Creating Audiofeatures Dataframe From the data frame created above, I create a second data frame by compiling the audio features of each track with my helper getAudioFeatures, using the Spotify track IDs found earlier. Some tracks that are represented in the BillboardData df are not represented here, because not every song on the Billboard Hot 100 charts is on spotify, and therefore I cannot get the audio features for every single track.

i <- 1
audiofeatures <- data.frame(getAudioFeatures(billboardData$spotifyTrackID[i]), stringsAsFactors = FALSE)
for (i in 2:nrow(billboardData)) {
  newData <- getAudioFeatures(billboardData$spotifyTrackID[i])
  if (!is.na(newData))  {
    code <- newData[[1]][[1]][[1]]
    
    if (length(code)) {
      if (code == 429) {
        Sys.sleep(10)
        newData <- getAudioFeatures(billboardData$spotifyTrackID[i])
      }
    }
    
    audiofeatures <- rbind(audiofeatures, newData)
  }
}
## Warning in if (!is.na(newData)) {: the condition has length > 1 and only
## the first element will be used

Here, I plot the distribution of each of the audio features given by the spotify API for each track. I remove outliers from the data by removing entire tracks that have an outlier in one of these variables, because there is enough data to do so.

library(ggplot2)
ggplot(data = audiofeatures, aes(audiofeatures$loudness)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = audiofeatures, aes(audiofeatures$danceability)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = audiofeatures, aes(audiofeatures$energy)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = audiofeatures, aes(audiofeatures$key)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = audiofeatures, aes(audiofeatures$loudness)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = audiofeatures, aes(audiofeatures$mode)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = audiofeatures, aes(audiofeatures$speechiness)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = audiofeatures, aes(audiofeatures$acousticness)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = audiofeatures, aes(audiofeatures$instrumentalness)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = audiofeatures, aes(audiofeatures$liveness)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = audiofeatures, aes(audiofeatures$valence)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = audiofeatures, aes(audiofeatures$tempo)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

outliers <- function(col) {
  zscores <- scale(col)
  for (i in 1:length(col)) {
    if (abs(zscores[i]) > 3) {
      col[i] <- NA
    }
  }
  return(col)
}

audiofeatures$danceability <- outliers(audiofeatures$danceability)
audiofeatures$energy <- outliers(audiofeatures$energy)
audiofeatures$key <- outliers(audiofeatures$key)
audiofeatures$loudness <- outliers(audiofeatures$loudness)
audiofeatures$mode <- outliers(audiofeatures$mode)
audiofeatures$speechiness <- outliers(audiofeatures$speechiness)
audiofeatures$acousticness <- outliers(audiofeatures$acousticness)
audiofeatures$instrumentalness <- outliers(audiofeatures$instrumentalness)
audiofeatures$liveness <- outliers(audiofeatures$liveness)
audiofeatures$valence <- outliers(audiofeatures$valence)

audiofeatures <- audiofeatures[complete.cases(audiofeatures),]

DATA STORAGE & RETRIEVAL After removing outliers, write the two data frames that I created above to two tables in a SQL database. I call the two select * queries to ensure that the data was properly inserted into a table. Each table has the spotifyTrackId for each track, so I am able to join the tables together on this value. I use the sql MIN function to find the peak rank of each track on the charts in the data, and return that value along with the combination of all variables from both tables.

library("RSQLite")

database <- dbConnect(SQLite(), dbname = "db.sqlite")

dbWriteTable(database, "audiofeatures", audiofeatures, overwrite = TRUE)
billboardData$chart_date <- as.character(billboardData$chart_date)
dbWriteTable(database, "billboard", billboardData, overwrite = TRUE)

dbGetQuery(database, statement = "SELECT * FROM audiofeatures")
dbGetQuery(database, statement = "SELECT * FROM billboard")
data <- dbGetQuery(database, "SELECT MIN(billboard.track_rank) as peak_rank, * FROM audiofeatures LEFT JOIN billboard ON billboard.spotifyTrackID = audiofeatures.id GROUP BY billboard.spotifyTrackID")

Here, I plot the data I retreive from my database above. This allows me to see correlations between the two different tables and how the data changes over time. I plot each of the audio features against the dates of the songs feature on the Billboard charts, which allows me to visualize changes in these features over the last 60 years. In general, these graphs show that music is getting louder, more danceable, more energetic, and less acoustic.

ggplot(data = data, aes(x = as.Date(chart_date), y = loudness)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data = data, aes(x = as.Date(chart_date), y = instrumentalness)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data = data, aes(x = as.Date(chart_date), y = valence)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data = data, aes(x = as.Date(chart_date), y = liveness)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data = data, aes(x = as.Date(chart_date), y = acousticness)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data = data, aes(x = as.Date(chart_date), y = speechiness)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data = data, aes(x = as.Date(chart_date), y = energy)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

ggplot(data = data, aes(x = as.Date(chart_date), y = danceability)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

MODEL CONSTRUCTION AND EVALUATION First, I split the data into training and testing sets. Then, I create a multiple linear regression model with each of the ten audio features included in the formula for determining a track’s peak rank. I then use the step function to step backward through the variables and determine the ideal model for determining peak rank. Of the 10 original variables, valence, liveness, mode, danceability, and key are used in the ideal model. Finally, I use predict to predict values for the test set, and see if those values are within +- 1 of the actual values below. Overall, my model had a 3.55% success rate, showing that more information is needed to determine a song’s peak rank than just some descriptors of the audio.

library(caTools)
splitVector <- sample.split(data$track_title, SplitRatio = .7)
train <- data[splitVector, ]
test <- data[!splitVector, ]

model <- lm(formula = peak_rank ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo, data = train)

ideal.model <- step(model, direction = "backward")
## Start:  AIC=21592.14
## peak_rank ~ danceability + energy + key + loudness + mode + speechiness + 
##     acousticness + instrumentalness + liveness + valence + tempo
## 
##                    Df Sum of Sq     RSS   AIC
## - acousticness      1      19.5 2691421 21590
## - loudness          1     323.5 2691725 21591
## - instrumentalness  1     430.8 2691832 21591
## - tempo             1    1374.1 2692775 21592
## - energy            1    1562.5 2692964 21592
## - danceability      1    1574.7 2692976 21592
## - mode              1    1645.8 2693047 21592
## - key               1    1680.2 2693081 21592
## <none>                          2691401 21592
## - valence           1    2285.3 2693686 21593
## - speechiness       1    3260.9 2694662 21594
## - liveness          1    4066.1 2695467 21595
## 
## Step:  AIC=21590.16
## peak_rank ~ danceability + energy + key + loudness + mode + speechiness + 
##     instrumentalness + liveness + valence + tempo
## 
##                    Df Sum of Sq     RSS   AIC
## - loudness          1     340.2 2691761 21589
## - instrumentalness  1     441.1 2691862 21589
## - tempo             1    1417.2 2692838 21590
## - danceability      1    1592.0 2693013 21590
## - mode              1    1628.9 2693050 21590
## - key               1    1677.5 2693098 21590
## <none>                          2691421 21590
## - energy            1    2264.5 2693685 21591
## - valence           1    2505.3 2693926 21591
## - speechiness       1    3260.6 2694681 21592
## - liveness          1    4047.9 2695468 21593
## 
## Step:  AIC=21588.56
## peak_rank ~ danceability + energy + key + mode + speechiness + 
##     instrumentalness + liveness + valence + tempo
## 
##                    Df Sum of Sq     RSS   AIC
## - instrumentalness  1     569.9 2692331 21587
## - tempo             1    1398.2 2693159 21588
## - mode              1    1601.6 2693362 21589
## - key               1    1672.5 2693433 21589
## <none>                          2691761 21589
## - danceability      1    1711.4 2693472 21589
## - valence           1    2170.4 2693931 21589
## - energy            1    2470.4 2694231 21590
## - speechiness       1    3316.3 2695077 21591
## - liveness          1    4097.7 2695859 21591
## 
## Step:  AIC=21587.24
## peak_rank ~ danceability + energy + key + mode + speechiness + 
##     liveness + valence + tempo
## 
##                Df Sum of Sq     RSS   AIC
## - tempo         1    1444.4 2693775 21587
## - mode          1    1543.5 2693874 21587
## - danceability  1    1627.9 2693959 21587
## - key           1    1679.6 2694010 21587
## <none>                      2692331 21587
## - valence       1    2181.4 2694512 21588
## - energy        1    2447.9 2694779 21588
## - speechiness   1    3382.2 2695713 21589
## - liveness      1    3995.5 2696326 21590
## 
## Step:  AIC=21586.96
## peak_rank ~ danceability + energy + key + mode + speechiness + 
##     liveness + valence
## 
##                Df Sum of Sq     RSS   AIC
## - mode          1    1620.1 2695395 21587
## <none>                      2693775 21587
## - key           1    1712.7 2695488 21587
## - valence       1    1757.6 2695533 21587
## - danceability  1    2681.0 2696456 21588
## - energy        1    2942.9 2696718 21589
## - speechiness   1    3022.2 2696797 21589
## - liveness      1    3756.1 2697531 21589
## 
## Step:  AIC=21586.88
## peak_rank ~ danceability + energy + key + speechiness + liveness + 
##     valence
## 
##                Df Sum of Sq     RSS   AIC
## - key           1    1300.2 2696696 21586
## - valence       1    1673.5 2697069 21587
## <none>                      2695395 21587
## - energy        1    2711.6 2698107 21588
## - speechiness   1    3133.6 2698529 21589
## - danceability  1    3268.5 2698664 21589
## - liveness      1    3863.9 2699259 21590
## 
## Step:  AIC=21586.43
## peak_rank ~ danceability + energy + speechiness + liveness + 
##     valence
## 
##                Df Sum of Sq     RSS   AIC
## - valence       1    1669.3 2698365 21586
## <none>                      2696696 21586
## - energy        1    2694.3 2699390 21588
## - speechiness   1    3059.5 2699755 21588
## - danceability  1    3331.9 2700027 21588
## - liveness      1    3777.4 2700473 21589
## 
## Step:  AIC=21586.41
## peak_rank ~ danceability + energy + speechiness + liveness
## 
##                Df Sum of Sq     RSS   AIC
## - energy        1    1638.8 2700004 21586
## <none>                      2698365 21586
## - speechiness   1    2697.0 2701062 21588
## - liveness      1    3452.4 2701817 21589
## - danceability  1    7153.2 2705518 21593
## 
## Step:  AIC=21586.36
## peak_rank ~ danceability + speechiness + liveness
## 
##                Df Sum of Sq     RSS   AIC
## <none>                      2700004 21586
## - speechiness   1    2199.2 2702203 21587
## - liveness      1    4030.8 2704034 21589
## - danceability  1    6387.5 2706391 21592
test$peak_rank_prediction <- predict(ideal.model, newdata = test, type = "response")

accurateCount <- 0
for (i in 1:nrow(test)) {
  if (round(test$peak_rank_prediction[i]) == test$peak_rank[i]
      || round(test$peak_rank_prediction[i]) - 1 == test$peak_rank[i]
      || round(test$peak_rank_prediction[i]) + 1 == test$peak_rank[i]) {
    accurateCount <- accurateCount + 1
  }
}
print(accurateCount/nrow(test))
## [1] 0.0407866

Here, I construct a logistic regression model to try to predict the likelihood that a song breaks into the top 50 of the Billboard Hot 100.

splitVector2 <- sample.split(data$track_title, SplitRatio = .7)
train2 <- data[splitVector2,]
test2 <- data[!splitVector2,]

top50 <- c()
for (i in 1:nrow(train2)) {
  if (train2$peak_rank[i] <= 50) {
    top50 <- c(top50, TRUE)
  } else {
    top50 <- c(top50, FALSE)
  }
}
train2$top50 <- top50

top50Model <- glm(formula = top50 ~ danceability + energy + key + loudness + mode + speechiness + acousticness + instrumentalness + liveness + valence + tempo, family = "binomial", data = train2)
optimal <- step(top50Model, direction = "backward")
## Start:  AIC=4440.58
## top50 ~ danceability + energy + key + loudness + mode + speechiness + 
##     acousticness + instrumentalness + liveness + valence + tempo
## 
##                    Df Deviance    AIC
## - tempo             1   4416.6 4438.6
## - acousticness      1   4416.6 4438.6
## - speechiness       1   4416.7 4438.7
## - liveness          1   4417.1 4439.1
## - mode              1   4417.1 4439.1
## - loudness          1   4417.3 4439.3
## - instrumentalness  1   4417.7 4439.7
## - valence           1   4418.1 4440.1
## - energy            1   4418.3 4440.3
## <none>                  4416.6 4440.6
## - key               1   4418.9 4440.9
## - danceability      1   4421.7 4443.7
## 
## Step:  AIC=4438.63
## top50 ~ danceability + energy + key + loudness + mode + speechiness + 
##     acousticness + instrumentalness + liveness + valence
## 
##                    Df Deviance    AIC
## - acousticness      1   4416.7 4436.7
## - speechiness       1   4416.8 4436.8
## - liveness          1   4417.2 4437.2
## - mode              1   4417.2 4437.2
## - loudness          1   4417.3 4437.3
## - instrumentalness  1   4417.8 4437.8
## - valence           1   4418.3 4438.3
## - energy            1   4418.3 4438.3
## <none>                  4416.6 4438.6
## - key               1   4419.0 4439.0
## - danceability      1   4421.9 4441.9
## 
## Step:  AIC=4436.7
## top50 ~ danceability + energy + key + loudness + mode + speechiness + 
##     instrumentalness + liveness + valence
## 
##                    Df Deviance    AIC
## - speechiness       1   4416.8 4434.8
## - liveness          1   4417.3 4435.3
## - mode              1   4417.3 4435.3
## - loudness          1   4417.3 4435.3
## - instrumentalness  1   4417.8 4435.8
## - valence           1   4418.3 4436.3
## - energy            1   4418.5 4436.5
## <none>                  4416.7 4436.7
## - key               1   4419.0 4437.0
## - danceability      1   4422.6 4440.6
## 
## Step:  AIC=4434.82
## top50 ~ danceability + energy + key + loudness + mode + instrumentalness + 
##     liveness + valence
## 
##                    Df Deviance    AIC
## - mode              1   4417.4 4433.4
## - liveness          1   4417.4 4433.4
## - loudness          1   4417.4 4433.4
## - instrumentalness  1   4417.9 4433.9
## - valence           1   4418.5 4434.5
## - energy            1   4418.8 4434.8
## <none>                  4416.8 4434.8
## - key               1   4419.1 4435.1
## - danceability      1   4422.6 4438.6
## 
## Step:  AIC=4433.4
## top50 ~ danceability + energy + key + loudness + instrumentalness + 
##     liveness + valence
## 
##                    Df Deviance    AIC
## - loudness          1   4418.0 4432.0
## - liveness          1   4418.1 4432.1
## - instrumentalness  1   4418.5 4432.5
## - valence           1   4419.0 4433.0
## - energy            1   4419.2 4433.2
## <none>                  4417.4 4433.4
## - key               1   4419.5 4433.5
## - danceability      1   4423.9 4437.9
## 
## Step:  AIC=4432
## top50 ~ danceability + energy + key + instrumentalness + liveness + 
##     valence
## 
##                    Df Deviance    AIC
## - liveness          1   4418.7 4430.7
## - valence           1   4419.1 4431.1
## - energy            1   4419.3 4431.3
## - instrumentalness  1   4419.3 4431.3
## <none>                  4418.0 4432.0
## - key               1   4420.1 4432.1
## - danceability      1   4424.9 4436.9
## 
## Step:  AIC=4430.65
## top50 ~ danceability + energy + key + instrumentalness + valence
## 
##                    Df Deviance    AIC
## - valence           1   4419.7 4429.7
## - instrumentalness  1   4419.9 4429.9
## - energy            1   4420.1 4430.1
## <none>                  4418.7 4430.7
## - key               1   4420.7 4430.7
## - danceability      1   4426.2 4436.2
## 
## Step:  AIC=4429.7
## top50 ~ danceability + energy + key + instrumentalness
## 
##                    Df Deviance    AIC
## - energy            1   4420.6 4428.6
## - instrumentalness  1   4421.0 4429.0
## <none>                  4419.7 4429.7
## - key               1   4421.8 4429.8
## - danceability      1   4432.6 4440.6
## 
## Step:  AIC=4428.57
## top50 ~ danceability + key + instrumentalness
## 
##                    Df Deviance    AIC
## - instrumentalness  1   4421.8 4427.8
## <none>                  4420.6 4428.6
## - key               1   4422.6 4428.6
## - danceability      1   4432.7 4438.7
## 
## Step:  AIC=4427.84
## top50 ~ danceability + key
## 
##                Df Deviance    AIC
## <none>              4421.8 4427.8
## - key           1   4423.9 4427.9
## - danceability  1   4433.8 4437.8
top50 <- c()
for (i in 1:nrow(test2)) {
  if (test2$peak_rank[i] <= 50) {
    top50 <- c(top50, TRUE)
  } else {
    top50 <- c(top50, FALSE)
  }
}
test2$top50 <- top50

predictions <- predict(object = optimal, newdata = test2, type = "response")
for (i in 1:length(predictions)) {
  if (predictions[i] > .5) {
    predictions[i] = TRUE
  } else {
    predictions[i] = FALSE
  }
}
test2$top50predictions <- predictions


accurate <- 0
for (i in 1:nrow(test2)) {
  if (test2$top50predictions[i] == test2$top50[i]) {
    accurate <- accurate + 1
  }
}
print(accurate/nrow(test2))
## [1] 0.5433358