NBA Game Simulation

8 minute read

NBA Simulation

Intro

Simulating NBA games and predicting wins using Monte Carlo Simulation

Libraries needed

libraries <- c("XML", "rvest", "dplyr", "data.table", "DT", "ggplot2", "patchwork")
sapply(libraries, require, character.only = T)

Gathering Data

Scrape real time data from basketball-reference.com.

# https://www.basketball-reference.com/leagues/NBA_2020_totals.html
# you can change the year to get historical stats

# libraries
libraries <- c("XML", "rvest", "dplyr", "data.table")
sapply(libraries, require, character.only = T)


# Scraping the regular player data
url <- "https://www.basketball-reference.com/leagues/NBA_2020_totals.html"
css <- "td" # other CSS options: "#div_totals_stats", ".right , .left , .center"
stats <- rvest::html_text(rvest::html_nodes(xml2::read_html(url), css))

# each df should have these 29 fields:
colHeaders <- c("Player", "Pos", "Age", "Tm", "G", "GS", "MP", "FG", "FGA", "FG%", "3P", "3PA", "3P%", "2P", "2PA", "2P%", "eFG%", "FT", "FTA", "FT%", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS")

# Every 29 elements in the large "stats" vector is a new row, so looping through to convert to data.frame format
df <- cbind(stats[seq(1, 600*29, by = 29)], stats[seq(2, 600*29, by = 29)])
for(i in 3:29){
  df <- cbind(df, stats[seq(i, 600*29, by = 29)])
}
df <- data.frame(df)
names(df) <- colHeaders

# cleaning up the data.frame
df <- df[!is.na(df$Player),] # deleting extra rows
# changing column types
df[colHeaders[1:29]] <- sapply(df[colHeaders[1:29]], as.character)
df[df == ""] <- "0" # replacing missing values w/ 0's
df[colHeaders[c(3, 5:29)]] <- sapply(df[colHeaders[c(3, 5:29)]], as.numeric)
totals <- df



# now scraping the advanced table
# Scraping the data
url <- "https://www.basketball-reference.com/leagues/NBA_2020_advanced.html"
css <- "td" # other CSS options: "#div_totals_stats", ".right , .left , .center"
stats <- rvest::html_text(rvest::html_nodes(xml2::read_html(url), css))

# there are 2 columns with blank data in the dataset
colHeaders <- c("Player", "Pos", "Age", "Tm", "G", 'MP', "PER", "TS%", "3PAr", "FTr", "ORB%", "DRB%", "TRB%", "AST%", "STL%", "BLK%", "TOV%", "USG%", "blank1", "OWS", "DWS", "WS", "WS/48", "blank2", "OBPM", "DBPM", "BPM", "VORP")

# new player every 28 records
df <- cbind(stats[seq(1, 600*28, by = 28)], stats[seq(2, 600*28, by = 28)])
for(i in 3:28){
  df <- cbind(df, stats[seq(i, 600*28, by = 28)])
}
df <- data.frame(df)
names(df) <- colHeaders

# data clean
df <- df[!is.na(df$Player),] # deleting extra rows
df[colHeaders[1:28]] <- sapply(df[colHeaders[c(1:28)]], as.character)
df[df == ""] <- "0" # replacing missing values w/ 0's
df[colHeaders[c(3, 5:28)]] <- sapply(df[colHeaders[c(3, 5:28)]], as.numeric)
advanced <- df

#----------------------------------------------------------------------------------------------------
# joining the two tables
players_all <- left_join(totals, select(advanced, Player, Tm, PER, `TS%`, DWS), by = c("Player", "Tm"))

#Saving the output for reproducable results
#write.csv(players_all, "/Users/tj/Documents/Data/NBA/PlayerStats_2020.03.10.csv", row.names = FALSE)

Data Cleaning

players_all <- fread("/Users/tj/Documents/Data/NBA/PlayerStats_2020.03.10.csv")
players <- players_all %>% select(Player, Tm, `3PA`, `3P%`, `2PA`, `2P%`, FTA, `FT%`, TOV, ORB, DRB, DWS, G, MP) %>% data.table()
# getting each players probability of playing within each team
players$DWS_zscore <- (players$DWS - mean(players$DWS))/sd(players$DWS)
players$DWS_scaled <- (players$DWS_zscore - min(players$DWS_zscore)) / (max(players$DWS_zscore) - min(players$DWS_zscore))
players$DWS_scaled <- players$DWS_scaled - mean(players$DWS_scaled)
players$DWS_zscore <- NULL

Possession Function

This function takes two teams as inputs, and simulates one possession round.

# possession function
## read this as team A is on offense, team B is on defense

# need to initialize this outside of the function to reset, then loop the possession function
playByPlay <- data.frame("Team" = character(), "Player" = character(), "Event" = character(), "Points" = double(), stringsAsFactors = F) 

# team_A = team w/ the ball
# team_B = team defending
# printPlayByPlay = bool, if you run the function once, it's nice to set this to TRUE
# def_multiplier = some number, 0 means defense has no impact, 1 means a regular impact, 2 means 2x, 3 = 3x, etc.

possession <- function(team_A, team_B, printPlayByPlay = TRUE, def_multiplier = 1){
  
  team_A1 = players[Tm == team_A]
  team_B1 = players[Tm == team_B]
  
  # using total season minutes played (MP) as play probability
  team_A5 = team_A1[Player %in% sample(team_A1$Player, size = 5, replace = FALSE, prob = team_A1$MP)]
  team_B5 = team_B1[Player %in% sample(team_B1$Player, size = 5, replace = FALSE, prob = team_B1$MP)]
  
  
  # action
  possible_actions <- c(paste0(1:5, "_2P%"), paste0(1:5, "_3P%"), paste0(1:5, "_FT%"), paste0(1:5, "_TOV"))
  prob_weights <- c(team_A5$`2PA`, team_A5$`3PA`, team_A5$FTA, team_A5$TOV)
  
  rebound_tbl <- data.table(Player = c(team_A5$Player, team_B5$Player), Tm = c(team_A5$Tm, team_B5$Tm), Rebounds = c(team_A5$ORB, team_B5$DRB))
  
  # away team's defensive impact
  defImpact <- mean(team_B5$DWS_scaled)*-1*def_multiplier
  
  # what happens? 
  # Initialize these variables to begin, and they will eventually reset
  # event <- "miss"
  rebound <- "homeRebound"
  
  # the actual play
  while(rebound == "homeRebound"){
    
    theAction <- sample(possible_actions, size = 1, prob = prob_weights)
    theAction2 <- c(substr(theAction, 1, 1), substr(theAction, 3, nchar(theAction))) 
    prb <- (team_A5[as.numeric(theAction2[1]),] %>% select(theAction2[2]))*(1+defImpact) #def impact happens here, only impacts shooting, is already inherent in rebound counts
    
    if(theAction2[2] == "TOV"){
      rebound <- "awayRebound" # to end the while loop
      assign("playByPlay", rbind(playByPlay, data.frame("Team" = team_A, "Player" = team_A5[as.numeric(theAction2[1]),]$Player, "Event" = "Turnover", "Points" = 0, stringsAsFactors = F)), envir = parent.frame()) # save event
      
    }else if(theAction2[2] == "2P%"){
      
      pt <- sample(c(0, 2), size = 1, prob = c(1-prb, prb))
      assign("playByPlay", rbind(playByPlay, data.frame("Team" = team_A, "Player" = team_A5[as.numeric(theAction2[1]),]$Player, "Event" = "2pt", "Points" = pt, stringsAsFactors = F)), envir = parent.frame()) # save event
      
      if(pt == 0){
        
        rebound_ind <- sample(1:10, size = 1, prob = rebound_tbl$Rebounds)
        if(rebound_ind <= 5){rebound <- "homeRebound"}else{rebound <- "awayRebound"}
        assign("playByPlay", rbind(playByPlay, data.frame("Team" = rebound_tbl[rebound_ind,]$Tm, "Player" = rebound_tbl[rebound_ind,]$Player, "Event" = "Rebound", "Points" = 0, stringsAsFactors = F)), envir = parent.frame()) 
        
      }else{
        rebound <- "awayRebound" # to end while loop
      }
      
    }else if(theAction2[2] == "3P%"){
      
      pt <- sample(c(0, 3), size = 1, prob = c(1-prb, prb))
      assign("playByPlay", rbind(playByPlay, data.frame("Team" = team_A, "Player" = team_A5[as.numeric(theAction2[1]),]$Player, "Event" = "3pt", "Points" = pt, stringsAsFactors = F)), envir = parent.frame()) # save event
      if(pt == 0){
        
        rebound_ind <- sample(1:10, size = 1, prob = rebound_tbl$Rebounds)
        if(rebound_ind <= 5){rebound <- "homeRebound"}else{rebound <- "awayRebound"}
        assign("playByPlay", rbind(playByPlay, data.frame("Team" = rebound_tbl[rebound_ind,]$Tm, "Player" = rebound_tbl[rebound_ind,]$Player, "Event" = "Rebound", "Points" = 0, stringsAsFactors = F)), envir = parent.frame()) 
        
      }else{
        rebound <- "awayRebound"
      }
      
    }else if(theAction2[2] == "FT%"){
      
      pt <- sample(c(0, 1), size = 1, prob = c(1-prb, prb))
      assign("playByPlay", rbind(playByPlay, data.frame("Team" = team_A, "Player" = team_A5[as.numeric(theAction2[1]),]$Player, "Event" = "FT", "Points" = pt, stringsAsFactors = F)), envir = parent.frame()) # save event
      
      pt <- sample(c(0, 1), size = 1, prob = c(1-prb, prb))
      assign("playByPlay", rbind(playByPlay, data.frame("Team" = team_A, "Player" = team_A5[as.numeric(theAction2[1]),]$Player, "Event" = "FT", "Points" = pt, stringsAsFactors = F)), envir = parent.frame()) # save event
      if(pt == 0){
        
        rebound_ind <- sample(1:10, size = 1, prob = rebound_tbl$Rebounds)
        if(rebound_ind <= 5){rebound <- "homeRebound"}else{rebound <- "awayRebound"}
        assign("playByPlay", rbind(playByPlay, data.frame("Team" = rebound_tbl[rebound_ind,]$Tm, "Player" = rebound_tbl[rebound_ind,]$Player, "Event" = "Rebound", "Points" = 0, stringsAsFactors = F)), envir = parent.frame()) 
        
      }else{
        rebound <- "awayRebound" # tto end while loop
      }
    }
  }# end while
  
  if(printPlayByPlay == TRUE){print(playByPlay)}
  
}

Testing it out

# testing it out
set.seed(123)
playByPlay <- data.frame("Team" = character(), "Player" = character(), "Event" = character(), "Points" = double(), stringsAsFactors = F) 
possession(team_A = "UTA", team_B = "TOR", printPlayByPlay = F, def_multiplier = 1)
possession(team_A = "TOR", team_B = "UTA", printPlayByPlay = T, def_multiplier = 1)
##   Team        Player    Event Points
## 1  UTA  Tony Bradley       FT      0
## 2  UTA  Tony Bradley       FT      1
## 3  TOR Norman Powell Turnover      0
# looping it for 1 game
playByPlay <- data.frame("Team" = character(), "Player" = character(), "Event" = character(), "Points" = double(), stringsAsFactors = F) 
for(i in 1:100){
  possession(team_A = "UTA", team_B = "TOR", printPlayByPlay = F, def_multiplier = 1)
  possession(team_A = "TOR", team_B = "UTA", printPlayByPlay = F, def_multiplier = 1)
}
#datatable(playByPlay)
head(playByPlay, 20)
##    Team           Player    Event Points
## 1   UTA      Rudy Gobert      2pt      2
## 2   TOR       Kyle Lowry      3pt      0
## 3   UTA      Rudy Gobert  Rebound      0
## 4   UTA Donovan Mitchell      3pt      0
## 5   TOR      Serge Ibaka  Rebound      0
## 6   TOR      Serge Ibaka       FT      1
## 7   TOR      Serge Ibaka       FT      1
## 8   UTA Donovan Mitchell      3pt      3
## 9   TOR       Kyle Lowry      3pt      0
## 10  UTA Donovan Mitchell  Rebound      0
## 11  UTA      Mike Conley Turnover      0
## 12  TOR    Pascal Siakam      3pt      0
## 13  UTA    Royce O'Neale  Rebound      0
## 14  UTA Donovan Mitchell      3pt      0
## 15  TOR    Norman Powell  Rebound      0
## 16  TOR    Pascal Siakam      2pt      0
## 17  UTA Bojan Bogdanović  Rebound      0
## 18  UTA Donovan Mitchell      3pt      0
## 19  UTA      Rudy Gobert  Rebound      0
## 20  UTA Bojan Bogdanović      3pt      0

Monte Carlo Simulation

Simulate NBA games by running the posession function many times. Assume that each game has 100 possessions. Then repeat this process 20 times to simulate 20 games.

# Monte Carlo Simulation
home_team <- "LAL"
away_team <- "UTA"
nPossessions <- 100
nGames <- 20

home_team_scores <- NULL
away_team_scores <- NULL
playByPlay_overall <- data.frame("Team" = character(), "Player" = character(), "Event" = character(), "Points" = double(), stringsAsFactors = F, game = double()) 

for(j in 1:nGames){
  playByPlay <- data.frame("Team" = character(), "Player" = character(), "Event" = character(), "Points" = double(), stringsAsFactors = F) 
  for(i in 1:nPossessions){
    possession(team_A = home_team, team_B = away_team, printPlayByPlay = F, def_multiplier = 1)
    possession(team_A = away_team, team_B = home_team, printPlayByPlay = F, def_multiplier = 1)
  }
  playByPlay_overall <- rbind(playByPlay_overall, cbind(playByPlay, data.frame(game = j)))
  #print(paste0(j, "/", nGames))
}
playByPlay_overall2 <- data.table(playByPlay_overall)
x <- playByPlay_overall2[,.(sum(Points)), by = c("Team", "game")] %>% dcast.data.table(game ~ Team)

print(x)
##     game LAL UTA
##  1:    1 112 110
##  2:    2  92 100
##  3:    3 101  67
##  4:    4 109  96
##  5:    5  79  89
##  6:    6 120  72
##  7:    7 105  94
##  8:    8 103  93
##  9:    9  95  98
## 10:   10  89  84
## 11:   11  87  96
## 12:   12 103  78
## 13:   13  89  67
## 14:   14 112  99
## 15:   15 101  88
## 16:   16  82  73
## 17:   17  91  75
## 18:   18  99  96
## 19:   19 102  83
## 20:   20  94  84

See which team won more games and by how much

x$spread <- x$LAL - x$UTA
x2 <- x[,win := ifelse(spread > 0, 1, 0)]

Lakers won 16/20 games against the Jazz, and win by 11.15 points.

Barplot below displays the average points per game per player across 20 games.

scores <- playByPlay_overall2[,.(sum(Points)), by = c("Team", "Player")]
scores$PPG <- scores$V1/nGames
p1 <- scores[Team == home_team,] %>% ggplot(aes(x = Player, y = PPG)) + geom_col() + coord_flip() + theme_bw() + theme(axis.title = element_blank(), panel.border = element_blank())
p2 <- scores[Team == away_team,] %>% ggplot(aes(x = Player, y = PPG)) + geom_col() + coord_flip() + theme_bw() + theme(axis.title = element_blank(), panel.border = element_blank())
p1 | p2