NBA Game Simulation
NBA Simulation
Trevor Johnson
3/11/2020
Intro
Simulating NBA games and predicting wins using Monte Carlo Simulation
Libraries needed
libraries <- c("XML", "rvest", "dplyr", "data.table", "DT", "ggplot2", "patchwork")
sapply(libraries, require, character.only = T)
Gathering Data
Scrape real time data from basketball-reference.com.
# https://www.basketball-reference.com/leagues/NBA_2020_totals.html
# you can change the year to get historical stats
# libraries
libraries <- c("XML", "rvest", "dplyr", "data.table")
sapply(libraries, require, character.only = T)
# Scraping the regular player data
url <- "https://www.basketball-reference.com/leagues/NBA_2020_totals.html"
css <- "td" # other CSS options: "#div_totals_stats", ".right , .left , .center"
stats <- rvest::html_text(rvest::html_nodes(xml2::read_html(url), css))
# each df should have these 29 fields:
colHeaders <- c("Player", "Pos", "Age", "Tm", "G", "GS", "MP", "FG", "FGA", "FG%", "3P", "3PA", "3P%", "2P", "2PA", "2P%", "eFG%", "FT", "FTA", "FT%", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS")
# Every 29 elements in the large "stats" vector is a new row, so looping through to convert to data.frame format
df <- cbind(stats[seq(1, 600*29, by = 29)], stats[seq(2, 600*29, by = 29)])
for(i in 3:29){
df <- cbind(df, stats[seq(i, 600*29, by = 29)])
}
df <- data.frame(df)
names(df) <- colHeaders
# cleaning up the data.frame
df <- df[!is.na(df$Player),] # deleting extra rows
# changing column types
df[colHeaders[1:29]] <- sapply(df[colHeaders[1:29]], as.character)
df[df == ""] <- "0" # replacing missing values w/ 0's
df[colHeaders[c(3, 5:29)]] <- sapply(df[colHeaders[c(3, 5:29)]], as.numeric)
totals <- df
# now scraping the advanced table
# Scraping the data
url <- "https://www.basketball-reference.com/leagues/NBA_2020_advanced.html"
css <- "td" # other CSS options: "#div_totals_stats", ".right , .left , .center"
stats <- rvest::html_text(rvest::html_nodes(xml2::read_html(url), css))
# there are 2 columns with blank data in the dataset
colHeaders <- c("Player", "Pos", "Age", "Tm", "G", 'MP', "PER", "TS%", "3PAr", "FTr", "ORB%", "DRB%", "TRB%", "AST%", "STL%", "BLK%", "TOV%", "USG%", "blank1", "OWS", "DWS", "WS", "WS/48", "blank2", "OBPM", "DBPM", "BPM", "VORP")
# new player every 28 records
df <- cbind(stats[seq(1, 600*28, by = 28)], stats[seq(2, 600*28, by = 28)])
for(i in 3:28){
df <- cbind(df, stats[seq(i, 600*28, by = 28)])
}
df <- data.frame(df)
names(df) <- colHeaders
# data clean
df <- df[!is.na(df$Player),] # deleting extra rows
df[colHeaders[1:28]] <- sapply(df[colHeaders[c(1:28)]], as.character)
df[df == ""] <- "0" # replacing missing values w/ 0's
df[colHeaders[c(3, 5:28)]] <- sapply(df[colHeaders[c(3, 5:28)]], as.numeric)
advanced <- df
#----------------------------------------------------------------------------------------------------
# joining the two tables
players_all <- left_join(totals, select(advanced, Player, Tm, PER, `TS%`, DWS), by = c("Player", "Tm"))
#Saving the output for reproducable results
#write.csv(players_all, "/Users/tj/Documents/Data/NBA/PlayerStats_2020.03.10.csv", row.names = FALSE)
Data Cleaning
players_all <- fread("/Users/tj/Documents/Data/NBA/PlayerStats_2020.03.10.csv")
players <- players_all %>% select(Player, Tm, `3PA`, `3P%`, `2PA`, `2P%`, FTA, `FT%`, TOV, ORB, DRB, DWS, G, MP) %>% data.table()
# getting each players probability of playing within each team
players$DWS_zscore <- (players$DWS - mean(players$DWS))/sd(players$DWS)
players$DWS_scaled <- (players$DWS_zscore - min(players$DWS_zscore)) / (max(players$DWS_zscore) - min(players$DWS_zscore))
players$DWS_scaled <- players$DWS_scaled - mean(players$DWS_scaled)
players$DWS_zscore <- NULL
Possession Function
This function takes two teams as inputs, and simulates one possession round.
# possession function
## read this as team A is on offense, team B is on defense
# need to initialize this outside of the function to reset, then loop the possession function
playByPlay <- data.frame("Team" = character(), "Player" = character(), "Event" = character(), "Points" = double(), stringsAsFactors = F)
# team_A = team w/ the ball
# team_B = team defending
# printPlayByPlay = bool, if you run the function once, it's nice to set this to TRUE
# def_multiplier = some number, 0 means defense has no impact, 1 means a regular impact, 2 means 2x, 3 = 3x, etc.
possession <- function(team_A, team_B, printPlayByPlay = TRUE, def_multiplier = 1){
team_A1 = players[Tm == team_A]
team_B1 = players[Tm == team_B]
# using total season minutes played (MP) as play probability
team_A5 = team_A1[Player %in% sample(team_A1$Player, size = 5, replace = FALSE, prob = team_A1$MP)]
team_B5 = team_B1[Player %in% sample(team_B1$Player, size = 5, replace = FALSE, prob = team_B1$MP)]
# action
possible_actions <- c(paste0(1:5, "_2P%"), paste0(1:5, "_3P%"), paste0(1:5, "_FT%"), paste0(1:5, "_TOV"))
prob_weights <- c(team_A5$`2PA`, team_A5$`3PA`, team_A5$FTA, team_A5$TOV)
rebound_tbl <- data.table(Player = c(team_A5$Player, team_B5$Player), Tm = c(team_A5$Tm, team_B5$Tm), Rebounds = c(team_A5$ORB, team_B5$DRB))
# away team's defensive impact
defImpact <- mean(team_B5$DWS_scaled)*-1*def_multiplier
# what happens?
# Initialize these variables to begin, and they will eventually reset
# event <- "miss"
rebound <- "homeRebound"
# the actual play
while(rebound == "homeRebound"){
theAction <- sample(possible_actions, size = 1, prob = prob_weights)
theAction2 <- c(substr(theAction, 1, 1), substr(theAction, 3, nchar(theAction)))
prb <- (team_A5[as.numeric(theAction2[1]),] %>% select(theAction2[2]))*(1+defImpact) #def impact happens here, only impacts shooting, is already inherent in rebound counts
if(theAction2[2] == "TOV"){
rebound <- "awayRebound" # to end the while loop
assign("playByPlay", rbind(playByPlay, data.frame("Team" = team_A, "Player" = team_A5[as.numeric(theAction2[1]),]$Player, "Event" = "Turnover", "Points" = 0, stringsAsFactors = F)), envir = parent.frame()) # save event
}else if(theAction2[2] == "2P%"){
pt <- sample(c(0, 2), size = 1, prob = c(1-prb, prb))
assign("playByPlay", rbind(playByPlay, data.frame("Team" = team_A, "Player" = team_A5[as.numeric(theAction2[1]),]$Player, "Event" = "2pt", "Points" = pt, stringsAsFactors = F)), envir = parent.frame()) # save event
if(pt == 0){
rebound_ind <- sample(1:10, size = 1, prob = rebound_tbl$Rebounds)
if(rebound_ind <= 5){rebound <- "homeRebound"}else{rebound <- "awayRebound"}
assign("playByPlay", rbind(playByPlay, data.frame("Team" = rebound_tbl[rebound_ind,]$Tm, "Player" = rebound_tbl[rebound_ind,]$Player, "Event" = "Rebound", "Points" = 0, stringsAsFactors = F)), envir = parent.frame())
}else{
rebound <- "awayRebound" # to end while loop
}
}else if(theAction2[2] == "3P%"){
pt <- sample(c(0, 3), size = 1, prob = c(1-prb, prb))
assign("playByPlay", rbind(playByPlay, data.frame("Team" = team_A, "Player" = team_A5[as.numeric(theAction2[1]),]$Player, "Event" = "3pt", "Points" = pt, stringsAsFactors = F)), envir = parent.frame()) # save event
if(pt == 0){
rebound_ind <- sample(1:10, size = 1, prob = rebound_tbl$Rebounds)
if(rebound_ind <= 5){rebound <- "homeRebound"}else{rebound <- "awayRebound"}
assign("playByPlay", rbind(playByPlay, data.frame("Team" = rebound_tbl[rebound_ind,]$Tm, "Player" = rebound_tbl[rebound_ind,]$Player, "Event" = "Rebound", "Points" = 0, stringsAsFactors = F)), envir = parent.frame())
}else{
rebound <- "awayRebound"
}
}else if(theAction2[2] == "FT%"){
pt <- sample(c(0, 1), size = 1, prob = c(1-prb, prb))
assign("playByPlay", rbind(playByPlay, data.frame("Team" = team_A, "Player" = team_A5[as.numeric(theAction2[1]),]$Player, "Event" = "FT", "Points" = pt, stringsAsFactors = F)), envir = parent.frame()) # save event
pt <- sample(c(0, 1), size = 1, prob = c(1-prb, prb))
assign("playByPlay", rbind(playByPlay, data.frame("Team" = team_A, "Player" = team_A5[as.numeric(theAction2[1]),]$Player, "Event" = "FT", "Points" = pt, stringsAsFactors = F)), envir = parent.frame()) # save event
if(pt == 0){
rebound_ind <- sample(1:10, size = 1, prob = rebound_tbl$Rebounds)
if(rebound_ind <= 5){rebound <- "homeRebound"}else{rebound <- "awayRebound"}
assign("playByPlay", rbind(playByPlay, data.frame("Team" = rebound_tbl[rebound_ind,]$Tm, "Player" = rebound_tbl[rebound_ind,]$Player, "Event" = "Rebound", "Points" = 0, stringsAsFactors = F)), envir = parent.frame())
}else{
rebound <- "awayRebound" # tto end while loop
}
}
}# end while
if(printPlayByPlay == TRUE){print(playByPlay)}
}
Testing it out
# testing it out
set.seed(123)
playByPlay <- data.frame("Team" = character(), "Player" = character(), "Event" = character(), "Points" = double(), stringsAsFactors = F)
possession(team_A = "UTA", team_B = "TOR", printPlayByPlay = F, def_multiplier = 1)
possession(team_A = "TOR", team_B = "UTA", printPlayByPlay = T, def_multiplier = 1)
## Team Player Event Points
## 1 UTA Tony Bradley FT 0
## 2 UTA Tony Bradley FT 1
## 3 TOR Norman Powell Turnover 0
# looping it for 1 game
playByPlay <- data.frame("Team" = character(), "Player" = character(), "Event" = character(), "Points" = double(), stringsAsFactors = F)
for(i in 1:100){
possession(team_A = "UTA", team_B = "TOR", printPlayByPlay = F, def_multiplier = 1)
possession(team_A = "TOR", team_B = "UTA", printPlayByPlay = F, def_multiplier = 1)
}
#datatable(playByPlay)
head(playByPlay, 20)
## Team Player Event Points
## 1 UTA Rudy Gobert 2pt 2
## 2 TOR Kyle Lowry 3pt 0
## 3 UTA Rudy Gobert Rebound 0
## 4 UTA Donovan Mitchell 3pt 0
## 5 TOR Serge Ibaka Rebound 0
## 6 TOR Serge Ibaka FT 1
## 7 TOR Serge Ibaka FT 1
## 8 UTA Donovan Mitchell 3pt 3
## 9 TOR Kyle Lowry 3pt 0
## 10 UTA Donovan Mitchell Rebound 0
## 11 UTA Mike Conley Turnover 0
## 12 TOR Pascal Siakam 3pt 0
## 13 UTA Royce O'Neale Rebound 0
## 14 UTA Donovan Mitchell 3pt 0
## 15 TOR Norman Powell Rebound 0
## 16 TOR Pascal Siakam 2pt 0
## 17 UTA Bojan Bogdanović Rebound 0
## 18 UTA Donovan Mitchell 3pt 0
## 19 UTA Rudy Gobert Rebound 0
## 20 UTA Bojan Bogdanović 3pt 0
Monte Carlo Simulation
Simulate NBA games by running the posession function many times. Assume that each game has 100 possessions. Then repeat this process 20 times to simulate 20 games.
# Monte Carlo Simulation
home_team <- "LAL"
away_team <- "UTA"
nPossessions <- 100
nGames <- 20
home_team_scores <- NULL
away_team_scores <- NULL
playByPlay_overall <- data.frame("Team" = character(), "Player" = character(), "Event" = character(), "Points" = double(), stringsAsFactors = F, game = double())
for(j in 1:nGames){
playByPlay <- data.frame("Team" = character(), "Player" = character(), "Event" = character(), "Points" = double(), stringsAsFactors = F)
for(i in 1:nPossessions){
possession(team_A = home_team, team_B = away_team, printPlayByPlay = F, def_multiplier = 1)
possession(team_A = away_team, team_B = home_team, printPlayByPlay = F, def_multiplier = 1)
}
playByPlay_overall <- rbind(playByPlay_overall, cbind(playByPlay, data.frame(game = j)))
#print(paste0(j, "/", nGames))
}
playByPlay_overall2 <- data.table(playByPlay_overall)
x <- playByPlay_overall2[,.(sum(Points)), by = c("Team", "game")] %>% dcast.data.table(game ~ Team)
print(x)
## game LAL UTA
## 1: 1 112 110
## 2: 2 92 100
## 3: 3 101 67
## 4: 4 109 96
## 5: 5 79 89
## 6: 6 120 72
## 7: 7 105 94
## 8: 8 103 93
## 9: 9 95 98
## 10: 10 89 84
## 11: 11 87 96
## 12: 12 103 78
## 13: 13 89 67
## 14: 14 112 99
## 15: 15 101 88
## 16: 16 82 73
## 17: 17 91 75
## 18: 18 99 96
## 19: 19 102 83
## 20: 20 94 84
See which team won more games and by how much
x$spread <- x$LAL - x$UTA
x2 <- x[,win := ifelse(spread > 0, 1, 0)]
Lakers won 16/20 games against the Jazz, and win by 11.15 points.
Barplot below displays the average points per game per player across 20 games.
scores <- playByPlay_overall2[,.(sum(Points)), by = c("Team", "Player")]
scores$PPG <- scores$V1/nGames
p1 <- scores[Team == home_team,] %>% ggplot(aes(x = Player, y = PPG)) + geom_col() + coord_flip() + theme_bw() + theme(axis.title = element_blank(), panel.border = element_blank())
p2 <- scores[Team == away_team,] %>% ggplot(aes(x = Player, y = PPG)) + geom_col() + coord_flip() + theme_bw() + theme(axis.title = element_blank(), panel.border = element_blank())
p1 | p2