This server is intended for Academic Classwork related Git repositories only. Projects/repositories will generally be removed 6 months after the close of the semester. Inactive repositories from previous semesters are now archived after 365 days without activity: they are renamed and marked as 'archived'. After 90 days in that state they will be removed from the system completely.

Commit 14e13d2c authored by Drew Willey's avatar Drew Willey

mining methods, regression, nn, decision trees

parent 89f768b2
library(dplyr)
library(purrr)
library(rpart.plot)
library(arules)
library(neuralnet)
source("../src/utils.R")
# import all data
# Paths are relative to the directory the script is run from; string columns
# are read as factors.
mlb_train <- read.csv(
  file = "../data/mlb_train.csv",
  stringsAsFactors = TRUE
)
mlb_test <- read.csv(
  file = "../data/mlb_test.csv",
  stringsAsFactors = TRUE
)

# add wind strength column
mlb_train <- add_wind_strength_col(mlb_train)
mlb_test <- add_wind_strength_col(mlb_test)

# add pitching adv column
mlb_train <- add_pitching_adv_col(mlb_train)
mlb_test <- add_pitching_adv_col(mlb_test)

# convert pitching adv column to numeric (TRUE -> 1, FALSE -> 0, NA stays NA)
# as.numeric() replaces ifelse(x == T, 1, 0): `T` is an ordinary variable that
# can be reassigned, so comparing against it is fragile.
mlb_train$pitching_adv <- as.numeric(mlb_train$pitching_adv)
mlb_test$pitching_adv <- as.numeric(mlb_test$pitching_adv)

# add high_error_rate col
mlb_train <- add_high_error_rate_col(mlb_train)
mlb_test <- add_high_error_rate_col(mlb_test)
# bin the numeric wx columns
wx_fields <- c(
  "hourly_relative_humidity",
  "hourly_wind_speed",
  "hourly_sea_level_pressure",
  "hourly_dry_bulb_temperature",
  "hourly_dew_point_temperature"
)
# print(summary(mlb_train))
print(summary(discretizeDF(mlb_train[wx_fields])))

# Derive the cut points from the training data only and reuse them for the
# test data. Discretizing each set independently gives the two sets different
# interval boundaries (and therefore different factor levels), so a model
# fitted on mlb_train could not be applied consistently to mlb_test.
# NOTE(review): after this loop the wx columns are factors; any later
# arithmetic on them (e.g. multiplying hourly_wind_speed) will yield NA.
for (field in wx_fields) {
  # infinity = TRUE opens the outermost bins to -Inf / Inf so test values
  # outside the training range still fall into a bin instead of becoming NA.
  cuts <- discretize(
    mlb_train[, field],
    breaks = 10,
    infinity = TRUE,
    onlycuts = TRUE
  )
  mlb_train[, field] <- discretize(
    mlb_train[, field],
    method = "fixed",
    breaks = cuts
  )
  mlb_test[, field] <- discretize(
    mlb_test[, field],
    method = "fixed",
    breaks = cuts
  )
}
print(mlb_train$hourly_relative_humidity)
print('end')
#' Fit a regression tree on the training data and predict on the test data
#'
#' Reads the file-level data frames `mlb_train` and `mlb_test`. The tree
#' models `total_e_per_9` from dry-bulb temperature, relative humidity and
#' `wind_strength`; the other weather fields are intentionally left out.
#'
#' @return The result of `predict()` for the fitted tree on `mlb_test`.
decision_tree <- function() {
  # Excluded for now: hourly_sea_level_pressure,
  # hourly_dew_point_temperature, hourly_wind_speed.
  tree_formula <- total_e_per_9 ~ hourly_dry_bulb_temperature +
    hourly_relative_humidity +
    wind_strength

  # Very permissive growth settings: single-row splits/leaves, small cp.
  growth_control <- rpart.control(minsplit = 1, minbucket = 1, cp = 0.001)

  tree <- rpart(tree_formula, data = mlb_train, control = growth_control)
  # prp(tree)
  predict(tree, mlb_test)
}
pred <- decision_tree()
......@@ -60,40 +60,45 @@ groupby_wind_dir <- function() {
era_pred <- predict.lm(era_fit, mlb_test)
print(mean(era_pred - mlb_test$total_era))
# print(mean(era_pred - mlb_test$total_era))
# do it again, but with wind speed as well
# do it again, but with wind speed as well, jk use humidity
mlb_train$wind_adjusted <- mlb_train$wind_dir_az * mlb_train$hourly_wind_speed
mlb_test$wind_adjusted <- mlb_test$wind_dir_az * mlb_test$hourly_wind_speed
era_fit2 <- lm(total_era ~ wind_adjusted, mlb_train)
era_fit2 <- lm(total_hr_per_9 ~ hourly_relative_humidity + hourly_dry_bulb_temperature, mlb_train)
era_summary2 <- summary(era_fit2)
print(era_summary2)
era_pred2 <- predict.lm(era_fit2, mlb_test)
print(mean(era_pred2 - mlb_test$total_era))
print('just humidity on hr')
# print(mlb_test$total_hr_per_9)
print(mean(era_pred2 - mlb_test$total_hr_per_9))
print(sd(era_pred2 - mlb_test$total_hr_per_9))
# do mlr on all of the factors, including wind
fit <- lm(
total_era ~ hourly_dry_bulb_temperature
total_e_per_9 ~ hourly_dry_bulb_temperature
+ hourly_sea_level_pressure
+ hourly_dew_point_temperature
+ hourly_relative_humidity
+ hourly_wind_speed
+ wind_adjusted,
mlb_train
)
sm <- summary(fit)
# print(sm)
print(sm)
# print the average difference between prediction and actual
pred <- predict.lm(fit, mlb_test)
print(mean(pred - mlb_test$total_era))
print(mean(pred - mlb_test$total_e_per_9))
}
# groupby_wind_dir()
neural_net <- function() {
groupby_wind_dir()
# Placeholder for a per-ballpark analysis; not implemented yet.
# The body contains only the planned steps as comments, so calling this
# function currently does nothing and returns NULL.
by_ball_park <- function() {
# group data by location
# for each group, run regression on it
# return a table of accuracy
}
neural_net()
# by_ball_park()
library(dplyr)
library(purrr)
library(neuralnet)
source("../src/utils.R")
# import all data
# Paths are relative to the directory the script is run from; string columns
# are read as factors.
mlb_train <- read.csv(
  file = "../data/mlb_train.csv",
  stringsAsFactors = TRUE
)
mlb_test <- read.csv(
  file = "../data/mlb_test.csv",
  stringsAsFactors = TRUE
)

# add wind strength column
mlb_train <- add_wind_strength_col(mlb_train)
mlb_test <- add_wind_strength_col(mlb_test)

# add pitching adv column
mlb_train <- add_pitching_adv_col(mlb_train)
mlb_test <- add_pitching_adv_col(mlb_test)

# convert pitching adv column to numeric (TRUE -> 1, FALSE -> 0, NA stays NA)
# as.numeric() replaces ifelse(x == T, 1, 0): `T` is an ordinary variable that
# can be reassigned, so comparing against it is fragile.
mlb_train$pitching_adv <- as.numeric(mlb_train$pitching_adv)
mlb_test$pitching_adv <- as.numeric(mlb_test$pitching_adv)

# add high_error_rate col
mlb_train <- add_high_error_rate_col(mlb_train)
mlb_test <- add_high_error_rate_col(mlb_test)

# convert high error rate column to numeric (same 1 / 0 / NA encoding)
mlb_train$high_error_rate <- as.numeric(mlb_train$high_error_rate)
mlb_test$high_error_rate <- as.numeric(mlb_test$high_error_rate)
# print(summary(mlb_train$pitching_adv))
#' Train a neural network that predicts the `high_error_rate` flag
#'
#' Reads the file-level data frame `mlb_train`. The response is the logical
#' expression `high_error_rate == 1`; predictors are dry-bulb temperature,
#' relative humidity, wind speed and `wind_strength`.
#'
#' @return The fitted object returned by `neuralnet()`.
neural_net <- function() {
  net <- neuralnet(
    high_error_rate == 1 ~ hourly_dry_bulb_temperature +
      # hourly_sea_level_pressure +
      # hourly_dew_point_temperature +
      hourly_relative_humidity +
      hourly_wind_speed +
      wind_strength,
    data = mlb_train,
    hidden = c(3, 2),
    lifesign = "minimal",
    stepmax = 1e+05,
    threshold = 0.05,
    rep = 1,
    act.fct = "logistic",
    # FALSE rather than F: `F` is an ordinary variable and can be reassigned.
    linear.output = FALSE
  )
  # plot(net, rep = "best")
  # pred <- predict(net, mlb_test)
  # NOTE(review): the line below tabulates pitching_adv although the network
  # is trained on high_error_rate -- confirm which column is intended.
  # table(mlb_test$pitching_adv == 1, pred[, 1] > 0.5)
  net
}
net <- neural_net()
#' Append a `wind_strength` column derived from `wind_direction`
#'
#' @param dfrm A data frame with a `wind_direction` column.
#' @return `dfrm` with `wind_strength` set to the result of
#'   `wind_dir_to_batter_azimuth()` applied to `wind_direction`.
add_wind_strength_col <- function(dfrm) {
  batter_azimuth <- wind_dir_to_batter_azimuth(dfrm$wind_direction)
  dfrm$wind_strength <- batter_azimuth
  dfrm
}
#' Flag rows whose `total_era` is below a cutoff
#'
#' @param dfrm A data frame with a `total_era` column.
#' @param threshold Cutoff for `total_era`. Defaults to 4.51, which the
#'   original author noted as the 2019 league-average ERA.
#' @return `dfrm` with a logical `pitching_adv` column: `TRUE` where
#'   `total_era < threshold`, `FALSE` otherwise, `NA` where `total_era` is NA.
add_pitching_adv_col <- function(dfrm, threshold = 4.51) {
  # The comparison already yields TRUE/FALSE/NA, so wrapping it in
  # ifelse(..., T, F) added nothing and depended on the reassignable T/F.
  dfrm$pitching_adv <- dfrm$total_era < threshold
  dfrm
}
#' Flag rows whose `total_e_per_9` exceeds a cutoff
#'
#' @param dfrm A data frame with a `total_e_per_9` column.
#' @param threshold Cutoff for `total_e_per_9`. Defaults to 0.59, the value
#'   previously hard-coded here.
#' @return `dfrm` with a logical `high_error_rate` column: `TRUE` where
#'   `total_e_per_9 > threshold`, `FALSE` otherwise, `NA` where the input is NA.
add_high_error_rate_col <- function(dfrm, threshold = 0.59) {
  # The comparison already yields TRUE/FALSE/NA, so wrapping it in
  # ifelse(..., T, F) added nothing and depended on the reassignable T/F.
  dfrm$high_error_rate <- dfrm$total_e_per_9 > threshold
  dfrm
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment