3 Fitting GPD to moving windows
The methodology used here is described in . If you are fitting more rolling windows than described in the paper this material accompanies, you will need to run the code in this section to generate the object “window”.
3.1 Setup and Declustering
# Recall: PRCP (precipitation) is recorded in tenths of mm (254 = 1 inch)
# To get inches: x/254
# To get mm: x/10
library(lubridate)
library(extRemes)
library(stats)
library(dplyr)
library(xts)
library(gnFit)
library(ismev)
# library(tseries)
# library(trend)
# library(astsa)
# library(ggmap)
# Load in the precipitation data
precip <- read.csv("../scrape-NOAA-data/Data/all_stations_precip_UDP_updated_2021.csv")
### Declustering
precip_dclust = precip
for (i in seq(from=2, to=length(precip[1,]))){   # column 1 is the Date; columns 2+ are stations
  station = precip[,i]
  non_na = which(station>=0)      # indexes with recorded (non-negative) values
  station[is.na(station)] <- 0    # temporarily set NAs to 0 so the series can be declustered

  dec = extRemes::decluster(station, threshold=0, clusterfun = "max", replace.with = 0)
  stat_clus = as.numeric(dec)

  station = stat_clus
  precip_dclust[non_na,i] = station[non_na]   # write declustered values back only where data were recorded
}
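# A minimal illustration (toy values, not station data) of the decluster call above:
# the idea is that within each run of consecutive values above the threshold,
# clusterfun = "max" keeps only the run maximum and the other values are
# replaced with replace.with (0 here).
toy <- c(0, 5, 12, 7, 0, 0, 3, 9, 0)   # hypothetical daily series
as.numeric(extRemes::decluster(toy, threshold = 0, clusterfun = "max", replace.with = 0))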
### format
precip_dclust[,1] <- lubridate::ymd(as.character(precip_dclust[,1])) # put Date column into date format
precip_dclust <- as.data.frame(precip_dclust) # make it a data frame
### Set threshold
thresh <- 253 # for 1-day thresh
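As a quick unit check (a sketch using the conversions noted at the top of this section), the threshold of 253 is in tenths of mm, i.e. just under one inch, so exceedances are days with at least one inch of precipitation:
thresh / 10   # 25.3 mm
thresh / 254  # ~0.996 inches (254 tenths of mm = 1 inch)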
3.2 Fit GPD to entire series by station
## NOTE: suppressing some warnings and errors which do not impact the analysis-- they are an artifact of the extent/format of the data (e.g. NAs from before stations were collecting data)
# Distribution Fits -------------------------------------------------------
numstat <- numrow <- 601 #stations
#numcol <- 79 #moving windows (this number will change with the new data since we have more years, old data was 1900-2017)
numcol <- (2020-39)-1900 +1
#2020-39 will get the starting year for the last window
#1900 is the starting year but it is not counted in the math, hence the +1
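# i.e. numcol = (2020-39)-1900+1 = 1981-1900+1 = 82 windows, whose end years run from 1939 through 2020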
source("Code/HelperFunctions_GPD/fitgpdR.R")
# Fit to entire station length --------------------------------------------
# Fitting GPD to entire length of data for each station
fullfits <- fitgpdR(precip_dclust, thresh)
# which(is.na(fullfits)) # only a few are NA
source("Code/HelperFunctions_GPD/gof.R")
source("Code/HelperFunctions_GPD/data_no_na.R")
source("Code/HelperFunctions_GPD/gof_fix_error.R")
fulldata_no_na <- data_no_na(precip_dclust, thresh) # saving data corresponding to fullfits
## Goodness of fit measures
CVMp <- ADp <- NULL
for(h in c(1:numstat)){ # for some reason 481 was not working with gof fn, so use gof_fix_error # but it has <1 year in length so will be eliminated anyway
  if(!sum(is.na(fullfits[[h]]))){
    gof_h <- gof_fix_error(fulldata_no_na[[h]], dist="gpd", pr=fullfits[[h]]$results$par, threshold=thresh)
    CVMp[h] <- gof_h$Wpval
    ADp[h] <- gof_h$Apval
  } else{
    CVMp[h] <- ADp[h] <- NA
  }
}
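For example (a sketch only; the 0.05 cutoff and the object name flagged_full are assumptions for illustration, not values specified here), stations whose full-series fit shows evidence of lack of fit could be flagged from these p-values:
flagged_full <- which(CVMp < 0.05 | ADp < 0.05)  # hypothetical; flagged by either test at an assumed 0.05 level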
3.3 GPD moving window fits
This code chunk generates the “window” file, which is the starting point for the PARE model. It will take some time to run, and the resulting file is about 1.6 GB.
# All Stations - 40-Year Windows ------------------------------------------
# Eventually we will want these fits for all of the 40-year windows, but for now we can just fit to the most recent 40 years.
### Uncomment if you want to run from scratch- Note: will take about an hour
### You can instead load the .rds file below to load a saved copy (see readRDS line below)
# ### 40-YEAR MOVING WINDOWS ///////////////
#numcol <- 79 #windows of 40
numcol <- (2020-39)-1900 +1
startday <- "-01-01"
endday <- "-12-31"
window <- list()
labelyr <- NULL # end year of window
# to access window j, gpdfit for station i, use window[[j]][[i]]
startyr <- 1900
## Takes a while to run
# for(j in 1:numcol){
# start <- paste0(startyr, startday)
# endyr <- startyr + 39
# end <- paste0(endyr, endday)
# labelyr[j] <- lubridate::year(as.Date(end))
#
# start <- which(precip_dclust$Date==start) #finding indexes corresponding to start & end dates
# end <- which(precip_dclust$Date==end)
# sub <- precip_dclust[start:end, ] #subset data to those 40 years
# window[[j]] <- fitgpdR(sub, thresh) # GPD fit with extRemes package
#
# startyr <- startyr + 1
# }
#
## Large file-- takes time to save/load
#saveRDS(window, file="Data/window_1day_dclust_updated.rds")
#window <- readRDS(file="Data/window_1day_dclust_updated.rds")
## file too big for quarto to load-- could try some of the solutions listed
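Once “window” is available (generated above or loaded from the saved .rds), individual fits can be accessed as noted in the comments, e.g. (a sketch):
length(window)       # number of 40-year windows (numcol)
length(window[[1]])  # number of stations (numstat)
window[[1]][[1]]     # GPD fit (or NA) for station 1 in the first window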
3.4 Calculate return levels
The GPD parameter estimates from each window fit are used to calculate the return level estimates.
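As a quick sketch of the underlying call (station 1 is arbitrary and assumed to have a non-NA fit; the same idea is applied per window below), several return levels can be read off a single fit and converted from tenths of mm to mm:
extRemes::return.level(fullfits[[1]], return.period = c(25, 100, 500)) / 10  # in mm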
# Saving Return Levels for Easy Plotting ----------------------------------
# in mm
# this is set up differently for easier plotting
# saving vector of RLs for each station across all moving windows
trend_RL <- list()
for(i in 1:numstat){
  RL <- NULL
  for(j in 1:numcol){
    fit <- window[[j]][[i]]
    if(!sum(is.na(fit)) == TRUE){
      RL[j] <- extRemes::return.level(fit, return.period=100)/10 # divide by 10 to get mm
    } else{
      RL[j] <- NA
    }
  }
  trend_RL[[i]] <- RL
}
# trend_RL[[588]] # Hobby
trend_RL_25 <- list()
trend_RL_100 <- list()
trend_RL_500 <- list()
for(i in 1:numstat){
  RL_25 <- RL_100 <- RL_500 <- NULL
  for(j in 1:numcol){
    fit <- window[[j]][[i]]
    if(!sum(is.na(fit)) == TRUE){
      RL_25[j] <- extRemes::return.level(fit, return.period=25)/10 # divide by 10 to get mm
      RL_100[j] <- extRemes::return.level(fit, return.period=100)/10 # divide by 10 to get mm
      RL_500[j] <- extRemes::return.level(fit, return.period=500)/10 # divide by 10 to get mm
    } else{
      RL_25[j] <- RL_100[j] <- RL_500[j] <- NA
    }
  }
  trend_RL_25[[i]] <- RL_25
  trend_RL_100[[i]] <- RL_100
  trend_RL_500[[i]] <- RL_500
}
#labelyr <- 1939:2017
labelyr <- 1939:2020
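For example (a sketch; station index 1 is arbitrary), the saved return level vectors can be plotted against the window end years:
plot(labelyr, trend_RL_100[[1]], type = "l",
     xlab = "window end year", ylab = "100-year return level (mm)")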
3.5 Goodness of fit measures
This code chunk performs goodness-of-fit tests to determine, for each rolling window, which stations achieve a good fit to the GPD. These goodness-of-fit results are used as part of the data cleaning for the extreme value model fitting: the spatial models are applied only to stations that show no evidence of a lack of GPD fit for a particular window.
# Goodness of Fit ---------------------------------------------------------
# Now going to do GOF for ALL stations, for EACH WINDOW FIT
### Should save RDS to access saved output instead of having to run again
window_CVM <- matrix(nrow = numstat, ncol = numcol)
window_AD <- matrix(nrow = numstat, ncol = numcol)
for(j in 1:numcol){
  CVMp <- ADp <- NULL
  for(i in 1:numstat){
    fit <- window[[j]][[i]]
    if(!sum(is.na(fit)) == TRUE){
      gof_h <- gof_fix_error(fit$x, dist="gpd", pr=fit$results$par, threshold=thresh) # New gof function which corrects for errors by rounding
      CVMp[i] <- gof_h$Wpval
      ADp[i] <- gof_h$Apval
    } else{
      CVMp[i] <- ADp[i] <- NA
    }
  }
  window_CVM[, j] <- CVMp
  window_AD[, j] <- ADp
}
window_CVM <- as.data.frame(window_CVM)
window_AD <- as.data.frame(window_AD)
# saveRDS(window_CVM, file="Data/window_CVM_updated.rds")
# saveRDS(window_AD, file="Data/window_AD_updated.rds")
window_CVM <- readRDS("Data/window_CVM_updated.rds")
window_AD <- readRDS("Data/window_AD_updated.rds")
# window_CVM[i, j] gives the Cramer-von Mises (CVM) p-value for station i, window j; window_AD[i, j] gives the corresponding Anderson-Darling (AD) p-value
# window[[j]][[i]] gives gpd fit output for station i, window j
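For example (a sketch only; the 0.05 cutoff and the names j and keep_j are assumptions for illustration), the stations retained for a given window in the later spatial modelling could be identified as those not showing evidence of lack of fit by either test:
j <- 1  # hypothetical window index
keep_j <- which(window_CVM[, j] >= 0.05 & window_AD[, j] >= 0.05)  # assumed 0.05 level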