This server is intended for Git repositories related to academic classwork only. Projects/repositories will generally be removed 6 months after the close of the semester. Inactive repositories from previous semesters are now archived after 365 days without activity: they are renamed and marked as 'archived'. After 90 days in that state, they are removed from the system completely.

Commit 373ad2d8 authored by Drew Willey

mlr groupings

parent 5094c0c7
......@@ -2,34 +2,93 @@ library(dplyr)
library(purrr)
# Import the pre-split training and test sets.
#
# The files are expected to already exist under ../data. An earlier revision
# of this script produced them with write.csv() from a 70/30 random split
# (set.seed(1337)) of ../data/mlb_games_and_weather_full_dataset.csv.
# Strings are read as factors, as in that earlier revision.
mlb_train <- read.csv(
  file = "../data/mlb_train.csv",
  stringsAsFactors = TRUE
)
mlb_test <- read.csv(
  file = "../data/mlb_test.csv",
  stringsAsFactors = TRUE
)
basic_mlr <- function() {
  # Baseline multiple linear regression of ERA on the hourly weather columns.
  # Reads the file-level data frames `mlb_train` (fit) and `mlb_test`
  # (evaluation) and prints its results; takes no arguments.
  #
  # Response choice: er, era and era+ were each tried as the dependent
  # variable; era gave the best results, so it is the one modelled here.
  weather_model <- lm(
    formula = total_era ~
      hourly_dry_bulb_temperature +
      hourly_sea_level_pressure +
      hourly_dew_point_temperature +
      hourly_relative_humidity +
      hourly_wind_speed,
    data = mlb_train
  )
  model_summary <- summary(weather_model)
  print(model_summary)

  # Sum of squared residuals of the fit on the training data.
  squared_residuals <- model_summary$residuals^2
  print(sum(squared_residuals))

  # Mean signed difference between predicted and actual ERA on the test set.
  predicted_era <- predict.lm(weather_model, mlb_test)
  prediction_error <- predicted_era - mlb_test$total_era
  print(mean(prediction_error))
}
# basic_mlr()
# Convert a wind-direction label into a signed fraction of 1, based on the
# cosine of the angle off the batter.
#
#   1      wind straight in at the batter ("In From CF")
#   0.525  wind in from a corner field ("In From LF" / "In From RF")
#   0      crosswind ("L To R" / "R To L") or any unrecognised label
#  -0.525  wind out to a corner field ("Out To LF" / "Out To RF")
#  -1      wind straight out ("Out To CF")
#
# "Out To CF" was previously mapped to +1, the same as "In From CF", which
# contradicted the negative sign used for the other "Out To" directions.
#
# Args:
#   wind_dir_words: character vector or factor of wind-direction labels.
#
# Returns:
#   Numeric vector of the same length. NA inputs stay NA; labels that are
#   not in the table map to 0.
wind_dir_to_batter_azimuth <- function(wind_dir_words) {
  azimuth_by_direction <- c(
    "L To R" = 0,
    "R To L" = 0,
    "In From RF" = 0.525,
    "In From CF" = 1,
    "In From LF" = 0.525,
    "Out To LF" = -0.525,
    "Out To CF" = -1,
    "Out To RF" = -0.525
  )

  # as.character() so factor input is matched by label, not by integer code.
  direction <- as.character(wind_dir_words)
  azimuth <- unname(azimuth_by_direction[direction])

  # A failed lookup yields NA; only genuine NA inputs should remain NA.
  azimuth[is.na(azimuth) & !is.na(direction)] <- 0
  azimuth
}
groupby_wind_dir <- function() {
  # Fit three ERA models that use wind direction and print, for each one, the
  # mean signed difference between predicted and actual ERA on the test set.
  #
  # Reads the file-level data frames `mlb_train` and `mlb_test`. The columns
  # added below are assigned inside this function, so they are added to local
  # copies and the file-level data frames are left unchanged.
  #
  # Leftover statements from the previous revision were removed: they
  # referenced `fit` before it was assigned, plus `test`, `all_data` and
  # `train`, none of which this script defines.

  # print(unique(mlb_train$wind_direction))
  # Convert the wind direction label to a signed direction factor.
  mlb_train$wind_dir_az <- wind_dir_to_batter_azimuth(mlb_train$wind_direction)
  mlb_test$wind_dir_az <- wind_dir_to_batter_azimuth(mlb_test$wind_direction)

  # Model 1: wind direction factor only.
  era_fit <- lm(total_era ~ wind_dir_az, mlb_train)
  era_pred <- predict.lm(era_fit, mlb_test)
  print(mean(era_pred - mlb_test$total_era))

  # Model 2: direction factor scaled by the hourly wind speed.
  mlb_train$wind_adjusted <- mlb_train$wind_dir_az * mlb_train$hourly_wind_speed
  mlb_test$wind_adjusted <- mlb_test$wind_dir_az * mlb_test$hourly_wind_speed
  era_fit2 <- lm(total_era ~ wind_adjusted, mlb_train)
  era_pred2 <- predict.lm(era_fit2, mlb_test)
  print(mean(era_pred2 - mlb_test$total_era))

  # Model 3: all hourly weather columns plus the direction-adjusted wind.
  fit <- lm(
    total_era ~ hourly_dry_bulb_temperature
      + hourly_sea_level_pressure
      + hourly_dew_point_temperature
      + hourly_relative_humidity
      + hourly_wind_speed
      + wind_adjusted,
    mlb_train
  )
  pred <- predict.lm(fit, mlb_test)
  print(mean(pred - mlb_test$total_era))
}
# Run the wind-direction analysis when this script is executed.
groupby_wind_dir()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment