WNBA Draft Analysis
  • Home
  • EDA
    • Will
    • Nox
    • Hamza
  • Appendices
    • Proposal
    • Case Study

Member 2 Nox Matse

  • Show All Code
  • Hide All Code

  • View Source

Your exploratory data analysis of the team datasets go here.

Code
library(tidyverse)
wnba2 <- read.csv("../data/processed/WNBA_Stats_1997_2026.csv")
Code
head(wnba2)
  Year       Player.Name Position Team  College...Origin Year.Drafted
1 1997  Adrienne Johnson        G  CLE        Ohio State         1999
2 1997 Andrea Congreaves      F-C  CHA Mercer University         1997
3 1997    Andrea Stinson        G  CHA          NC State         1997
4 1997     Anita Maxwell        F  CLE  New Mexico State         1997
5 1997    Bridget Pettis        G  PHO           Florida         1997
6 1997  Bridgette Gordon        F  SAC         Tennessee         1997
  Draft.Pick Height Weight Minutes.Per.Game Points.Per.Game Rebounds.Per.Game
1          8   5-10  154lb              7.8             2.1               0.9
2         26    6-2  183lb             23.5             6.7               4.8
3              5-10  158lb             36.1            15.7               5.5
4         29                            7.0             2.1               1.3
5          7    5-9                    30.1            12.6               3.8
6               6-0  172lb             35.0            13.0               4.8
  Assists.Per.Game Blocks.Per.Game Steals.Per.Game Turnovers.Per.Game   FG.
1              0.4             0.0             0.2                1.1 0.379
2              1.5             0.2             0.6                1.1 0.500
3              4.4             0.8             1.5                3.5 0.447
4              0.9             0.0             0.4                0.7 0.320
5              2.8             0.4             1.8                2.9 0.334
6              2.8             0.3             1.4                3.0 0.433
   X3P.   FT. Win.Shares
1 0.500 0.778       -0.5
2 0.409 0.768        3.1
3 0.325 0.674        3.8
4    NA 0.375       -0.1
5 0.306 0.898        3.2
6 0.275 0.785        1.9
Code
library(dplyr)

wnba2$Draft.Pick[wnba2$Draft.Pick == "Undrafted"] <- NA
wnba2$Draft.Pick <- as.numeric(wnba2$Draft.Pick)
wnba2 <- wnba2[!is.na(wnba2$Draft.Pick), ]
Code
# library(randomForest)
# library(dplyr)
# 
# # Clean and select variables
# wnba_clean <- wnba2 %>%
#   select(Draft.Pick, Points.Per.Game, College...Origin, Win.Shares, Assists.Per.Game, Rebounds.Per.Game, Minutes.Per.Game, Height) %>%
#   na.omit()
# 
# # Make sure overall_pick is numeric
# wnba_clean$overall_pick <- as.numeric(wnba_clean$Draft.Pick)
# 
# # Train model
# rf_model <- randomForest(
#   Draft.Pick ~ .,
#   data = wnba_clean,
#   importance = TRUE
# )
# 
# # View importance
# importance(rf_model)
# varImpPlot(rf_model)
Code
# library(glmnet)
# library(dplyr)
# 
# # Clean data
# wnba_clean2 <- wnba2 %>%
#   select(Draft.Pick, Points.Per.Game, College...Origin, Win.Shares, Assists.Per.Game, Rebounds.Per.Game, Minutes.Per.Game, Height) %>%
#   na.omit()
# 
# # X and Y
# x <- as.matrix(wnba_clean %>% select(-Draft.Pick))
# y <- wnba_clean$Draft.Pick
# 
# # LASSO
# cv_model <- cv.glmnet(x, y, alpha = 1)
# lasso_model <- glmnet(x, y, alpha = 1, lambda = cv_model$lambda.min)
# 
# # Convert coefficients to dataframe
# coef_df <- as.matrix(coef(lasso_model)) %>%
#   as.data.frame()
# 
# # Rename the coefficient column safely
# colnames(coef_df)[1] <- "coefficient"
# 
# # Add variable names + rank
# coef_df <- coef_df %>%
#   mutate(variable = rownames(.)) %>%
#   arrange(desc(abs(coefficient)))
# 
# # View results
# coef_df
Code
library(dplyr)
library(glmnet)

# Step 1: clean data
clean_data <- wnba2 %>%
  mutate(
    Draft.Pick = as.character(Draft.Pick),
    Draft.Pick[Draft.Pick %in% c("Undrafted", "")] <- NA,
    Draft.Pick = as.numeric(Draft.Pick)
  ) %>%
  filter(!is.na(Draft.Pick)) %>%
  select(where(is.numeric)) %>%
  na.omit()   # ⚠️ THIS is key: removes any remaining NA rows

# Step 2: create X and y from SAME dataset
x <- model.matrix(Draft.Pick ~ ., clean_data)[, -1]
y <- clean_data$Draft.Pick

# Step 3: run LASSO
cv_model <- cv.glmnet(x, y, alpha = 1)
lasso_model <- glmnet(x, y, alpha = 1, lambda = cv_model$lambda.min)

# Step 4: extract important variables
important_vars <- coef(lasso_model)
important_vars[important_vars != 0]
 [1] 449.87417945  -0.21357143  -0.64337016  -0.07384913  -0.13357690
 [6]  -1.63063561   0.43600870  -1.00145202   1.53330522  -2.02640879
[11]   0.09786055
Code
library(dplyr)

coef_df <- as.matrix(coef(lasso_model)) %>%
  as.data.frame()

# Rename column
colnames(coef_df)[1] <- "coefficient"

# Add variable names and clean up
coef_df <- coef_df %>%
  mutate(variable = rownames(.)) %>%
  filter(coefficient != 0) %>%   # keep only important variables
  arrange(desc(abs(coefficient)))

# View
coef_df
                    coefficient           variable
(Intercept)        449.87417945        (Intercept)
FT.                 -2.02640879                FT.
Blocks.Per.Game     -1.63063561    Blocks.Per.Game
X3P.                 1.53330522               X3P.
Turnovers.Per.Game  -1.00145202 Turnovers.Per.Game
Points.Per.Game     -0.64337016    Points.Per.Game
Steals.Per.Game      0.43600870    Steals.Per.Game
Year                -0.21357143               Year
Assists.Per.Game    -0.13357690   Assists.Per.Game
Win.Shares           0.09786055         Win.Shares
Rebounds.Per.Game   -0.07384913  Rebounds.Per.Game
Code
library(dplyr)
library(randomForest)

# Step 1: clean data (same logic as before)
clean_data <- wnba2 %>%
  mutate(
    Draft.Pick = as.character(Draft.Pick),
    Draft.Pick[Draft.Pick %in% c("Undrafted", "")] <- NA,
    Draft.Pick = as.numeric(Draft.Pick)
  ) %>%
  filter(!is.na(Draft.Pick)) %>%
  select(where(is.numeric)) %>%
  na.omit()

# Step 2: run random forest
set.seed(123)

rf_model <- randomForest(
  Draft.Pick ~ ., 
  data = clean_data,
  importance = TRUE,
  ntree = 500
)

# Step 3: extract variable importance
importance_df <- as.data.frame(importance(rf_model))
importance_df$variable <- rownames(importance_df)

# Sort by importance
importance_df <- importance_df %>%
  arrange(desc(IncNodePurity))

# View results
importance_df
                    %IncMSE IncNodePurity           variable
Points.Per.Game    46.16407      54441.30    Points.Per.Game
Year.Drafted       60.47708      42769.57       Year.Drafted
Minutes.Per.Game   26.22497      37146.72   Minutes.Per.Game
Rebounds.Per.Game  38.46880      28773.92  Rebounds.Per.Game
Year               41.95568      28526.85               Year
Turnovers.Per.Game 36.20875      25126.84 Turnovers.Per.Game
FG.                31.06777      24920.98                FG.
FT.                29.36691      23636.38                FT.
Assists.Per.Game   38.93775      22308.18   Assists.Per.Game
X3P.               28.13583      21756.51               X3P.
Win.Shares         23.59044      21344.32         Win.Shares
Steals.Per.Game    26.44115      16384.42    Steals.Per.Game
Blocks.Per.Game    31.94281      13973.31    Blocks.Per.Game
Code
varImpPlot(rf_model)

%IncMSE Measures how much prediction error increases if a variable is removed

Higher = more important

IncNodePurity (right plot)

Measures how much a variable improves tree splits

both plots show that points per game, year drafted, year and minutes per minute are important from random forest

Code
ggplot(clean_data, aes(x = FT., y = Blocks.Per.Game)) +
  geom_bin2d(bins = 25) +
  scale_fill_viridis_c(option = "C") +
  geom_smooth(method = "loess", se = FALSE, color = "white", linewidth = 1) +
  labs(
    title = "Relationship Between Free Throw % and Blocks per Game",
    x = "Free Throw Percentage (FT%)",
    y = "Blocks Per Game",
    fill = "Player Count"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold"),
    panel.grid.minor = element_blank()
  )

Code
library(ggplot2)
library(dplyr)
library(tidyr)

top_vars <- clean_data %>%
  select(FT., Blocks.Per.Game, X3P., Turnovers.Per.Game, Points.Per.Game) %>%
  pivot_longer(cols = everything(),
               names_to = "variable",
               values_to = "value")

ggplot(top_vars, aes(x = value)) +
  geom_density(fill = "#2C7FB8", alpha = 0.6) +
  facet_wrap(~variable, scales = "free") +
  labs(
    title = "Distribution of Key Performance Variables",
    x = "Value",
    y = "Density"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    strip.text = element_text(face = "bold"),
    plot.title = element_text(face = "bold")
  )

Source Code
---
title: "Member 2 Nox Matse"
---

Your exploratory data analysis of the team datasets go here.

```{r}
library(tidyverse)
wnba2 <- read.csv("../data/processed/WNBA_Stats_1997_2026.csv")
```

```{r}
head(wnba2)
```

```{r}
library(dplyr)

wnba2$Draft.Pick[wnba2$Draft.Pick == "Undrafted"] <- NA
wnba2$Draft.Pick <- as.numeric(wnba2$Draft.Pick)
wnba2 <- wnba2[!is.na(wnba2$Draft.Pick), ]

```

```{r}
# library(randomForest)
# library(dplyr)
# 
# # Clean and select variables
# wnba_clean <- wnba2 %>%
#   select(Draft.Pick, Points.Per.Game, College...Origin, Win.Shares, Assists.Per.Game, Rebounds.Per.Game, Minutes.Per.Game, Height) %>%
#   na.omit()
# 
# # Make sure overall_pick is numeric
# wnba_clean$overall_pick <- as.numeric(wnba_clean$Draft.Pick)
# 
# # Train model
# rf_model <- randomForest(
#   Draft.Pick ~ .,
#   data = wnba_clean,
#   importance = TRUE
# )
# 
# # View importance
# importance(rf_model)
# varImpPlot(rf_model)
```

```{r}
# library(glmnet)
# library(dplyr)
# 
# # Clean data
# wnba_clean2 <- wnba2 %>%
#   select(Draft.Pick, Points.Per.Game, College...Origin, Win.Shares, Assists.Per.Game, Rebounds.Per.Game, Minutes.Per.Game, Height) %>%
#   na.omit()
# 
# # X and Y
# x <- as.matrix(wnba_clean %>% select(-Draft.Pick))
# y <- wnba_clean$Draft.Pick
# 
# # LASSO
# cv_model <- cv.glmnet(x, y, alpha = 1)
# lasso_model <- glmnet(x, y, alpha = 1, lambda = cv_model$lambda.min)
# 
# # Convert coefficients to dataframe
# coef_df <- as.matrix(coef(lasso_model)) %>%
#   as.data.frame()
# 
# # Rename the coefficient column safely
# colnames(coef_df)[1] <- "coefficient"
# 
# # Add variable names + rank
# coef_df <- coef_df %>%
#   mutate(variable = rownames(.)) %>%
#   arrange(desc(abs(coefficient)))
# 
# # View results
# coef_df
```

```{r}
library(dplyr)
library(glmnet)

# Step 1: clean data
clean_data <- wnba2 %>%
  mutate(
    Draft.Pick = as.character(Draft.Pick),
    Draft.Pick[Draft.Pick %in% c("Undrafted", "")] <- NA,
    Draft.Pick = as.numeric(Draft.Pick)
  ) %>%
  filter(!is.na(Draft.Pick)) %>%
  select(where(is.numeric)) %>%
  na.omit()   # ⚠️ THIS is key: removes any remaining NA rows

# Step 2: create X and y from SAME dataset
x <- model.matrix(Draft.Pick ~ ., clean_data)[, -1]
y <- clean_data$Draft.Pick

# Step 3: run LASSO
cv_model <- cv.glmnet(x, y, alpha = 1)
lasso_model <- glmnet(x, y, alpha = 1, lambda = cv_model$lambda.min)

# Step 4: extract important variables
important_vars <- coef(lasso_model)
important_vars[important_vars != 0]

library(dplyr)

coef_df <- as.matrix(coef(lasso_model)) %>%
  as.data.frame()

# Rename column
colnames(coef_df)[1] <- "coefficient"

# Add variable names and clean up
coef_df <- coef_df %>%
  mutate(variable = rownames(.)) %>%
  filter(coefficient != 0) %>%   # keep only important variables
  arrange(desc(abs(coefficient)))

# View
coef_df


```
```{r}
library(dplyr)
library(randomForest)

# Step 1: clean data (same logic as before)
clean_data <- wnba2 %>%
  mutate(
    Draft.Pick = as.character(Draft.Pick),
    Draft.Pick[Draft.Pick %in% c("Undrafted", "")] <- NA,
    Draft.Pick = as.numeric(Draft.Pick)
  ) %>%
  filter(!is.na(Draft.Pick)) %>%
  select(where(is.numeric)) %>%
  na.omit()

# Step 2: run random forest
set.seed(123)

rf_model <- randomForest(
  Draft.Pick ~ ., 
  data = clean_data,
  importance = TRUE,
  ntree = 500
)

# Step 3: extract variable importance
importance_df <- as.data.frame(importance(rf_model))
importance_df$variable <- rownames(importance_df)

# Sort by importance
importance_df <- importance_df %>%
  arrange(desc(IncNodePurity))

# View results
importance_df

```

```{r}
varImpPlot(rf_model)

```

%IncMSE
Measures how much prediction error increases if a variable is removed

Higher = more important

 IncNodePurity (right plot)

Measures how much a variable improves tree splits

both plots show that points per game, year drafted, year and minutes per minute are important from random forest 

```{r}
ggplot(clean_data, aes(x = FT., y = Blocks.Per.Game)) +
  geom_bin2d(bins = 25) +
  scale_fill_viridis_c(option = "C") +
  geom_smooth(method = "loess", se = FALSE, color = "white", linewidth = 1) +
  labs(
    title = "Relationship Between Free Throw % and Blocks per Game",
    x = "Free Throw Percentage (FT%)",
    y = "Blocks Per Game",
    fill = "Player Count"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(face = "bold"),
    panel.grid.minor = element_blank()
  )

```

```{r}
library(ggplot2)
library(dplyr)
library(tidyr)

top_vars <- clean_data %>%
  select(FT., Blocks.Per.Game, X3P., Turnovers.Per.Game, Points.Per.Game) %>%
  pivot_longer(cols = everything(),
               names_to = "variable",
               values_to = "value")

ggplot(top_vars, aes(x = value)) +
  geom_density(fill = "#2C7FB8", alpha = 0.6) +
  facet_wrap(~variable, scales = "free") +
  labs(
    title = "Distribution of Key Performance Variables",
    x = "Value",
    y = "Density"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    strip.text = element_text(face = "bold"),
    plot.title = element_text(face = "bold")
  )

```