This server is intended for use only with Git repositories related to academic classwork. Projects and repositories will generally be removed 6 months after the close of the semester. Inactive repositories from previous semesters are now archived once they have had no activity for 365 days: they are renamed and marked as 'archived'. After 90 days in that state, they are removed from the system completely.

...
 
Commits (2)
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -5,10 +5,10 @@ library(httr)
library(parsedate)
# import game data
game_data <- read.csv(
file = "../data/2019_mlb_game_data.csv",
stringsAsFactors = TRUE
)
# game_data <- read.csv(
# file = "../data/2019_mlb_game_data.csv",
# stringsAsFactors = TRUE
# )
date_to_isodate <- function(date) {
strptime(date, format = "%Y-%m-%d %H:%M:%S") %>% format("%Y-%m-%dT%H:%M:%S")
......@@ -49,42 +49,43 @@ wx_fields <- c("HourlyDryBulbTemperature",
# Fetch hourly weather observations for every game in `game_data` and append
# them as new columns (one column per field in the global `wx_fields`).
#
# game_data: data frame with one row per game; must carry the columns that
#   fetch_wx_data() reads (city, mlb_game_id, start/end local times).
# Returns: game_data with the averaged weather columns appended and the
#   first (index) column dropped.
#
# NOTE(review): fetch_wx_data() was refactored to take individual arguments
# (city, mlb_game_id, ...), but this call still passes a whole row as the
# first argument — confirm the call site matches the current signature.
add_wx_data <- function(game_data) {
  # Split into one single-row data frame per game, then for each game fetch
  # the raw hourly observations and collapse them to per-field means.
  wx_cols <- game_data %>%
    split(as.numeric(rownames(game_data))) %>%
    map(compose(
      function(x) colMeans(x, na.rm = TRUE),
      function(x) fetch_wx_data(x, wx_fields)))
  # Append the per-game averages as new columns: each list element is one
  # row of the resulting matrix (hence byrow = TRUE).
  game_data[wx_fields] <- data.frame(
    matrix(unlist(wx_cols), nrow = length(wx_cols), byrow = TRUE))
  # Drop the leading index column before returning/saving.
  game_data <- game_data[, -c(1)]
  game_data
}
fetch_wx_data <- function(game_row) {
fetch_wx_data <- function(
city,
mlb_game_id,
actual_start_local_time,
end_local_time,
wx_fields,
time_buffer_hours = 1) {
# lookup station id
print(game_row$city)
print(game_row$mlb_game_id)
wxid <- get_wxid[game_row$city]
print(city)
print(mlb_game_id)
wxid <- get_wxid[city]
print(wxid)
# use starttime - 1hr and endtime + 1hr to account for dst ambiguity
# (since its an average) almost none of the season was outside dst anyway
# if we go there, dst began on march 10 and ended nov 3 in 2019
start_time <- strsplit(
format_iso_8601(parse_iso_8601(game_row$actual_start_local_time) - 60 * 60),
format_iso_8601(
parse_iso_8601(actual_start_local_time) -
60 * 60 * time_buffer_hours),
split = "+", fixed = TRUE
)[[1]][[1]]
end_time <- strsplit(
format_iso_8601(parse_iso_8601(game_row$end_local_time) + 60 * 60),
format_iso_8601(
parse_iso_8601(end_local_time) +
60 * 60 * time_buffer_hours),
split = "+", fixed = TRUE
)[[1]][[1]]
# If endtime is NA, make it 5 hours from start time.
end_time <-
if (is.na(end_time))
strsplit(
format_iso_8601(parse_iso_8601(start_time) + 60 *60 * 5),
split = "+",
fixed = TRUE
)[[1]][[1]]
else
end_time
print(end_time)
params <- paste("?", "startDate=", start_time, "&",
"endDate=", end_time, "&",
"format=csv", "&",
......@@ -104,9 +105,10 @@ fetch_wx_data <- function(game_row) {
res <- sapply(bdy, function(x) as.numeric(as.character(x)))
# preserve original dimensions
dim(res) <- d
print(colMeans(res))
print(res)
# print(colMeans(res, na.rm = T))
res
}
# write.csv(add_wx_data(game_data[game_data$mlb_game_id == 565895 | game_data$mlb_game_id == 567456,]), "../data/2019_mlb_game_data_wx_bak.csv", row.names = FALSE)
write.csv(add_wx_data(game_data), "../data/2019_mlb_game_data_wx.csv", row.names = FALSE)
# write.csv(add_wx_data(game_data), "../data/2019_mlb_game_data_wx.csv", row.names = FALSE)
......@@ -2,6 +2,7 @@ library(dplyr)
library(purrr)
library(httr)
library(parsedate)
source("addwx.R")
# merge perf data with wx data by game id
merge_data <- function() {
......@@ -20,44 +21,77 @@ merge_data <- function() {
select(-c(2:17)) %>%
distinct()
# think sql left join
new_df <- merge(mlb_data, just_wx_data, by = "mlb_game_id", all.x = TRUE)
# check the lengths
print(length(unique(wx_data$mlb_game_id)))
print(length(unique(mlb_data$mlb_game_id)))
print(nrow(new_df))
print(nrow(mlb_data))
# get the game ids we don't have yet
print(mlb_data[, "mlb_game_id"][!mlb_data[, "mlb_game_id"] %in% just_wx_data[, "mlb_game_id"]])
write.csv(new_df, "../data/new_data.csv", row.names = FALSE)
}
merge_data()
# merge_data()
reimport <- function() {
# enable lookup by data col name
wx_names_to_wx_col_names <- c(
"hourly_dry_bulb_temperature",
"hourly_sea_level_pressure",
"hourly_dew_point_temperature",
"hourly_relative_humidity",
"hourly_wind_speed"
)
names(wx_names_to_wx_col_names) <- wx_fields
names(wx_fields) <- wx_names_to_wx_col_names
# Report which games still lack weather data after a backfill pass: prints
# the mlb_game_id of every row whose hourly_sea_level_pressure is NA.
#
# NOTE(review): `col_name` is currently unused — the NA check below is
# hardcoded to hourly_sea_level_pressure, while the commented-out summary
# line suggests col_name was meant to drive the check. Confirm intent before
# relying on the argument (callers pass a specific column name).
check_new_data <- function(col_name = wx_names_to_wx_col_names) {
# import all data
all_data <- read.csv(
file = "../data/mlb_games_and_weather_full_dataset.csv",
stringsAsFactors = TRUE
)
# print(summary(all_data[, col_name]))
# Print the game ids whose sea-level-pressure reading is still missing.
print(all_data[is.na(all_data$hourly_sea_level_pressure), ]$mlb_game_id)
}
just_wx_data <- all_data %>%
select(-c(1:(ncol(all_data) - 5)))
reimport <- function() {
# import all data
all_data <- read.csv(
file = "../data/mlb_games_and_weather_full_dataset_fix.csv",
stringsAsFactors = TRUE
)
# take a look at nas
print(summary(just_wx_data))
# print(summary(just_wx_data))
get_new_wx <- function(a, b, d, e, f) {
print(paste(a , b , d , e))
f
# Backfill missing values in a single weather column by re-fetching station
# data for each game row where that column is NA.
#
# all_data: game+weather data frame, one row per game.
# wx_col_name: name of the weather column to repair (a name of the global
#   `wx_fields` lookup).
# Side effects: writes the repaired frame to the _fix.csv file and prints a
#   check of remaining NAs via check_new_data().
# Returns: the repaired data frame.
fix_wx_col <- function(all_data, wx_col_name) {
  print(nrow(all_data))
  # seq_len() instead of 1:nrow() — avoids iterating c(1, 0) on an empty frame
  for (i in seq_len(nrow(all_data))) {
    irow <- all_data[i, ]
    if (is.na(irow[[wx_col_name]])) {
      new_val <- fetch_wx_data(
        irow$city,
        irow$mlb_game_id,
        irow$actual_start_local_time,
        irow$end_local_time,
        wx_fields[wx_col_name],
        1 # hour buffer (this data is missing, so we accept less accuracy)
      ) %>% (function(x) colMeans(x, na.rm = TRUE))
      print(new_val)
      all_data[i, wx_col_name] <- new_val
      print(all_data[i, wx_col_name])
    }
  }
  write.csv(all_data, "../data/mlb_games_and_weather_full_dataset_fix.csv", row.names = FALSE)
  check_new_data(wx_col_name)
  all_data
}
all_data$hourly_dry_bulb_temperature <- ifelse(
is.na(all_data$hourly_dry_bulb_temperature),
get_new_wx(
all_data$city,
all_data$mlb_data,
all_data$actual_start_local_time,
all_data$end_local_time,
all_data$hourly_dry_bulb_temperature
),
all_data$hourly_dry_bulb_temperature
)
all_data <- fix_wx_col(all_data, wx_names_to_wx_col_names[5])
}
# reimport()
check_new_data()