Load packages

require(dplyr)
require(zooper)
require(lubridate)
require(readr)
require(tidyr)
require(ggplot2)
require(sf)
require(readxl)
require(stringr)
require(mgcv)
require(purrr)

Load zoop data

zoop_data<-Zoopsynther(Data_type="Community", Sources=c("EMP", "STN", "20mm", "FMWT"), Time_consistency = TRUE)
## [1] "These species have no relatives in their size class common to all datasets and have been removed from one or more size classes: Ostracoda Adult (Meso), Cumacea Adult (Meso), Annelida Adult (Meso), Gammarus Adult (Meso), Orientomysis aspera Adult (Meso), Chironomidae Larva (Meso), Insecta Larva (Meso)"

Read in zoop mass conversions

zoop_mass_conversions<-read_excel("~/ZoopSynth/Data paper/Biomass conversions.xlsx", sheet="Micro and Meso-zooplankton")%>%
  mutate(Taxname=case_when(Taxname=="Sinocalanus"~"Sinocalanus doerrii", # Change to help this match to zoop data
                           TRUE ~ Taxname),
         Taxlifestage=paste(Taxname, Lifestage))%>%
  select(Taxlifestage, CarbonWeight_ug)

Read in zoop groupings

zoop_groups<-read_csv("Data/zoopcrosswalk2.csv", col_types=cols_only(Taxlifestage="c", IBMR="c"))%>%
  distinct()

Load Mysid biomass data

zoop_mysid<-read_excel("Data/1972-2020MysidBPUEMatrix.xlsx", # EMP
                       sheet="Mysid_BPUE_matrix_1972-2020", na = "NA",
                       col_types = c(rep("numeric", 4), "date", "text", "text", rep("text", 7), rep("numeric", 8)))%>%
  select(Date=SampleDate, Station=StationNZ, BPUE=`Hyperacanthomysis longirostris`)%>% # Only select Hyperacanthomysis longirostris
  mutate(Source="EMP")%>%
  bind_rows(read_csv("Data/FMWT STN 2007to2019 Mysid BPUE.csv", # FMWT/STN
                     col_types=cols_only(Station="c", SampleDate="c", Project="c", `Hyperacanthomysis longirostris`="d"))%>% 
              rename(Date=SampleDate, Source=Project, BPUE=`Hyperacanthomysis longirostris`)%>% # Only select Hyperacanthomysis longirostris
              mutate(Date=mdy(Date),
                     Station=recode(Station, MONT="Mont", HONK="Honk")))%>% #Get station names to match to main dataset
  mutate(BPUE_mysid=BPUE*1000, # Convert to ug
         Taxlifestage="Hyperacanthomysis longirostris Adult",
         SampleID=paste(Source, Station, Date),
         SizeClass="Macro")%>%
  select(SampleID, Taxlifestage, SizeClass, BPUE_mysid)

Start processing the zoop data

zoop_data_mass<-zoop_data%>%
  mutate(Taxlifestage=str_remove(Taxlifestage, fixed("_UnID")))%>%
  filter(
    !(SizeClass=="Meso" & #eliminating species which are counted in meso and micro and retained better in the micro net from the meso calcs
        
        Taxlifestage%in%c("Asplanchna Adult", "Copepoda Larva","Cyclopoida Juvenile", "Eurytemora Larva", "Harpacticoida Undifferentiated",
                          "Keratella Adult", "Limnoithona Adult", "Limnoithona Juvenile", "Limnoithona sinenesis Adult", "Limnoithona tetraspina
                                    Adult", "Oithona Adult", "Oithona Juvenile", "Oithona davisae Adult", "Polyarthra Adult","Pseudodiaptomus Larva", 
                          "Rotifera Adult", "Sinocalanus doerrii Larva", "Synchaeta Adult", "Synchaeta bicornis Adult", "Trichocerca Adult")) &
      
      !(SizeClass=="Micro" &Taxlifestage%in%c("Cirripedia Larva", "Cyclopoida Adult", "Oithona similis")) & #removing categories better retained in meso net from micro net matrix
      Order!="Amphipoda" & # Remove amphipods
      (Order!="Mysida" | Taxlifestage=="Hyperacanthomysis longirostris Adult"))%>% #Only retain Hyperacanthomysis longirostris
  mutate(Taxlifestage=recode(Taxlifestage, `Synchaeta bicornis Adult`="Synchaeta Adult", # Change some names to match to biomass conversion dataset
                             `Pseudodiaptomus Adult`="Pseudodiaptomus forbesi Adult",
                             `Acanthocyclops vernalis Adult`="Acanthocyclops Adult"))%>%
  left_join(zoop_mass_conversions, by="Taxlifestage")%>% # Add biomass conversions
  left_join(zoop_mysid, by=c("SampleID", "Taxlifestage", "SizeClass"))%>% # Add mysid biomass
  left_join(zoop_groups, by="Taxlifestage")%>% # Add IBMR categories
  mutate(BPUE=if_else(Taxlifestage=="Hyperacanthomysis longirostris Adult", BPUE_mysid, CPUE*CarbonWeight_ug))%>% # Create 1 BPUE variable
  filter(!is.na(BPUE) & !is.na(Latitude) & !is.na(Longitude) & !is.na(SalSurf))%>% # Removes any data without BPUE, which is currently restricted to Decapod Larvae, and H. longirostris from STN. Also removes 20mm and EMP EZ stations without coordinates
  group_by(IBMR)%>%
  mutate(flag=if_else(all(c("Micro", "Meso")%in%SizeClass), "Remove", "Keep"))%>% # This and the next 2 lines are meant to ensure that all categories are consistent across the surveys. Since only EMP samples microzoops, only EMP data can be used for categories that include both micro and mesozoops.
  ungroup()%>%
  filter(!(flag=="Remove" & Source!="EMP"))%>%
  select(SampleID, Station, Latitude, Longitude, SalSurf, Date, Year, IBMR, BPUE)%>%
  group_by(across(-BPUE))%>%
  summarise(BPUE=sum(BPUE), .groups="drop")%>% # Sum each IBMR categories
  st_as_sf(coords=c("Longitude", "Latitude"), crs=4326)%>%
  st_transform(crs=st_crs(deltamapr::R_EDSM_Subregions_Mahardja))%>% # This and next 4 lines to filter data to Suisun Marsh
  st_join(deltamapr::R_EDSM_Subregions_Mahardja%>%
            select(SubRegion))%>%
  st_drop_geometry()%>%
  filter(SubRegion=="Suisun Marsh")%>%
  mutate(doy=yday(Date), #Day of year
         Year_fac=factor(Year), # Factor year for model random effect
         Station_fac=factor(Station), # Factor station for model random effect
         across(c(SalSurf, doy), list(s=~(.x-mean(.x))/sd(.x))), # Center and standardize predictors
         BPUE_log1p=log(BPUE+1)) # log1p transform BPUE for model

write_csv(zoop_data_mass, "Outputs/zoop_data_mass.csv") # Save data for Rosie

Set up prediction data for model

newdata<-expand_grid(date=mdy(paste(1:12, 15, 2001, sep="/")), # The 15th of each month on a non-leap year
                     SalSurf=seq(min(round(zoop_data_mass$SalSurf), 1), 
                                 round(max(zoop_data_mass$SalSurf), 1), by=0.1))%>% # Salinity sequence nicely rounded to 1 decimal
  mutate(Month=month(date),
         doy=yday(date), # Day of year
         SalSurf_s=(SalSurf-mean(zoop_data_mass$SalSurf))/sd(zoop_data_mass$SalSurf), # center and standardize salinity to match data
         doy_s=(doy-mean(zoop_data_mass$doy))/sd(zoop_data_mass$doy), # center and standardize doy to match data
         Year_fac="2010", # Set arbitrary year (won't be used)
         Station_fac="NZ032")%>% # Set arbitrary station (won't be used)
  select(Month, doy, doy_s, Year_fac, Station_fac, SalSurf, SalSurf_s)

model

sal_model<-function(group){
  par(mfrow=c(2,2))
  cat("<<<<<<<<<<<<<<<<<<<<<<< modeling", group, ">>>>>>>>>>>>>>>>>>>>>>>>>\n\n")
  
  model<-gam(BPUE_log1p ~ te(SalSurf_s, doy_s, k=c(5,5), bs=c("cs", "cc")) + 
               s(Year_fac, bs="re") + s(Station_fac, bs="re"),
             data=filter(zoop_data_mass, IBMR==group & Year>=2000), 
             method="REML")
  
  cat("-------------gam check-------------\n")
  gam.check(model)
  
  cat("\n\n-------------summary-------------\n")
  print(summary(model))
  
  sal<-predict(model, type="terms", terms="te(SalSurf_s,doy_s)", newdata=newdata, se.fit=T)%>%
    as_tibble()%>%
    mutate(across(everything(), as.vector))%>% # Make everything tidy
    rename(fit=starts_with("fit"), se=starts_with("se.fit"))%>%
    bind_cols(newdata%>% # Add covariate columns before these columns
                select(-Year_fac, -Station_fac, -doy_s, -SalSurf_s), 
              .)
  
  out<-list(model=model, sal=sal)
  return(out)
}

Apply model to all groups

sal_models<-map(set_names(unique(zoop_data_mass$IBMR)), sal_model)
## <<<<<<<<<<<<<<<<<<<<<<< modeling acartela >>>>>>>>>>>>>>>>>>>>>>>>>
## 
## -------------gam check-------------

## 
## Method: REML   Optimizer: outer newton
## full convergence after 6 iterations.
## Gradient range [-0.0001530318,0.0001289009]
## (score 2406.445 & scale 1.608608).
## Hessian positive definite, eigenvalue range [1.371107,703.6882].
## Model rank =  49 / 49 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                        k'   edf k-index             p-value    
## te(SalSurf_s,doy_s) 19.00 15.43    0.93 <0.0000000000000002 ***
## s(Year_fac)         21.00 19.18      NA                  NA    
## s(Station_fac)       8.00  5.67      NA                  NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## -------------summary-------------
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## BPUE_log1p ~ te(SalSurf_s, doy_s, k = c(5, 5), bs = c("cs", "cc")) + 
##     s(Year_fac, bs = "re") + s(Station_fac, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   3.7895     0.2326   16.29 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df       F             p-value    
## te(SalSurf_s,doy_s) 15.435     19 1006.21 <0.0000000000000002 ***
## s(Year_fac)         19.179     20   23.97 <0.0000000000000002 ***
## s(Station_fac)       5.668      7   10.74 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.802   Deviance explained = 80.8%
## -REML = 2406.4  Scale est. = 1.6086    n = 1408
## <<<<<<<<<<<<<<<<<<<<<<< modeling daphnia >>>>>>>>>>>>>>>>>>>>>>>>>
## 
## -------------gam check-------------

## 
## Method: REML   Optimizer: outer newton
## full convergence after 8 iterations.
## Gradient range [-0.000003802226,0.000002679368]
## (score 1894.986 & scale 2.037998).
## Hessian positive definite, eigenvalue range [0.8837161,521.671].
## Model rank =  47 / 47 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                       k'  edf k-index p-value   
## te(SalSurf_s,doy_s) 19.0 12.5    0.95    0.01 **
## s(Year_fac)         21.0 16.3      NA      NA   
## s(Station_fac)       6.0  3.1      NA      NA   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## -------------summary-------------
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## BPUE_log1p ~ te(SalSurf_s, doy_s, k = c(5, 5), bs = c("cs", "cc")) + 
##     s(Year_fac, bs = "re") + s(Station_fac, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   1.2868     0.1342   9.589 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df      F              p-value    
## te(SalSurf_s,doy_s) 12.524     19 59.064 < 0.0000000000000002 ***
## s(Year_fac)         16.276     20  4.312 < 0.0000000000000002 ***
## s(Station_fac)       3.097      5  2.102              0.00872 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.486   Deviance explained = 50.2%
## -REML =   1895  Scale est. = 2.038     n = 1044
## <<<<<<<<<<<<<<<<<<<<<<< modeling eurytem >>>>>>>>>>>>>>>>>>>>>>>>>
## 
## -------------gam check-------------

## 
## Method: REML   Optimizer: outer newton
## full convergence after 7 iterations.
## Gradient range [-0.000009770776,0.000008031399]
## (score 2233.818 & scale 2.472308).
## Hessian positive definite, eigenvalue range [1.241239,582.1755].
## Model rank =  49 / 49 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                        k'   edf k-index             p-value    
## te(SalSurf_s,doy_s) 19.00 14.20    0.76 <0.0000000000000002 ***
## s(Year_fac)         21.00 16.74      NA                  NA    
## s(Station_fac)       8.00  4.56      NA                  NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## -------------summary-------------
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## BPUE_log1p ~ te(SalSurf_s, doy_s, k = c(5, 5), bs = c("cs", "cc")) + 
##     s(Year_fac, bs = "re") + s(Station_fac, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   4.3239     0.2032   21.28 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df       F             p-value    
## te(SalSurf_s,doy_s) 14.196     19 193.868 <0.0000000000000002 ***
## s(Year_fac)         16.735     20   4.897 <0.0000000000000002 ***
## s(Station_fac)       4.558      7   9.354 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.667   Deviance explained = 67.7%
## -REML = 2233.8  Scale est. = 2.4723    n = 1165
## <<<<<<<<<<<<<<<<<<<<<<< modeling othcalad >>>>>>>>>>>>>>>>>>>>>>>>>
## 
## -------------gam check-------------

## 
## Method: REML   Optimizer: outer newton
## full convergence after 6 iterations.
## Gradient range [-0.0006356981,0.0005147218]
## (score 2440.351 & scale 2.001391).
## Hessian positive definite, eigenvalue range [1.704383,672.178].
## Model rank =  49 / 49 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                        k'   edf k-index             p-value    
## te(SalSurf_s,doy_s) 19.00 16.06    0.92 <0.0000000000000002 ***
## s(Year_fac)         21.00 17.61      NA                  NA    
## s(Station_fac)       8.00  5.78      NA                  NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## -------------summary-------------
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## BPUE_log1p ~ te(SalSurf_s, doy_s, k = c(5, 5), bs = c("cs", "cc")) + 
##     s(Year_fac, bs = "re") + s(Station_fac, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   5.7460     0.2445    23.5 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df       F             p-value    
## te(SalSurf_s,doy_s) 16.061     19 331.388 <0.0000000000000002 ***
## s(Year_fac)         17.614     20   8.673 <0.0000000000000002 ***
## s(Station_fac)       5.777      7  10.936 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.525   Deviance explained = 53.9%
## -REML = 2440.4  Scale est. = 2.0014    n = 1345
## <<<<<<<<<<<<<<<<<<<<<<< modeling othcaljuv >>>>>>>>>>>>>>>>>>>>>>>>>
## 
## -------------gam check-------------

## 
## Method: REML   Optimizer: outer newton
## full convergence after 7 iterations.
## Gradient range [-0.0002635295,0.00007143662]
## (score 2060.153 & scale 0.9402621).
## Hessian positive definite, eigenvalue range [1.938979,718.1505].
## Model rank =  49 / 49 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                        k'   edf k-index             p-value    
## te(SalSurf_s,doy_s) 19.00 16.52    0.88 <0.0000000000000002 ***
## s(Year_fac)         21.00 15.81      NA                  NA    
## s(Station_fac)       8.00  6.49      NA                  NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## -------------summary-------------
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## BPUE_log1p ~ te(SalSurf_s, doy_s, k = c(5, 5), bs = c("cs", "cc")) + 
##     s(Year_fac, bs = "re") + s(Station_fac, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   6.7008     0.2153   31.12 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df       F             p-value    
## te(SalSurf_s,doy_s) 16.517     19 108.367 <0.0000000000000002 ***
## s(Year_fac)         15.814     20   4.031 <0.0000000000000002 ***
## s(Station_fac)       6.488      7  47.553 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.414   Deviance explained =   43%
## -REML = 2060.2  Scale est. = 0.94026   n = 1437
## <<<<<<<<<<<<<<<<<<<<<<< modeling othclad >>>>>>>>>>>>>>>>>>>>>>>>>
## 
## -------------gam check-------------

## 
## Method: REML   Optimizer: outer newton
## full convergence after 8 iterations.
## Gradient range [-0.00000176167,0.0000007777305]
## (score 1943.508 & scale 1.348785).
## Hessian positive definite, eigenvalue range [1.651184,601.6864].
## Model rank =  49 / 49 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                        k'   edf k-index p-value   
## te(SalSurf_s,doy_s) 19.00 12.82    0.95    0.01 **
## s(Year_fac)         21.00 18.29      NA      NA   
## s(Station_fac)       8.00  5.02      NA      NA   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## -------------summary-------------
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## BPUE_log1p ~ te(SalSurf_s, doy_s, k = c(5, 5), bs = c("cs", "cc")) + 
##     s(Year_fac, bs = "re") + s(Station_fac, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)    1.589      0.178   8.927 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                       edf Ref.df      F             p-value    
## te(SalSurf_s,doy_s) 12.82     19 330.89 <0.0000000000000002 ***
## s(Year_fac)         18.29     20  11.07 <0.0000000000000002 ***
## s(Station_fac)       5.02      7  10.64 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.561   Deviance explained = 57.4%
## -REML = 1943.5  Scale est. = 1.3488    n = 1204
## <<<<<<<<<<<<<<<<<<<<<<< modeling pdiapfor >>>>>>>>>>>>>>>>>>>>>>>>>
## 
## -------------gam check-------------

## 
## Method: REML   Optimizer: outer newton
## full convergence after 8 iterations.
## Gradient range [-0.001381544,0.001112387]
## (score 2560.156 & scale 1.87464).
## Hessian positive definite, eigenvalue range [1.017025,716.6863].
## Model rank =  49 / 49 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                        k'   edf k-index             p-value    
## te(SalSurf_s,doy_s) 19.00 15.08    0.88 <0.0000000000000002 ***
## s(Year_fac)         21.00 19.11      NA                  NA    
## s(Station_fac)       8.00  6.13      NA                  NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## -------------summary-------------
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## BPUE_log1p ~ te(SalSurf_s, doy_s, k = c(5, 5), bs = c("cs", "cc")) + 
##     s(Year_fac, bs = "re") + s(Station_fac, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   5.0523     0.2856   17.69 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df       F             p-value    
## te(SalSurf_s,doy_s) 15.076     19 1649.53 <0.0000000000000002 ***
## s(Year_fac)         19.110     20   20.94 <0.0000000000000002 ***
## s(Station_fac)       6.127      7   22.48 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.701   Deviance explained =   71%
## -REML = 2560.2  Scale est. = 1.8746    n = 1434
## <<<<<<<<<<<<<<<<<<<<<<< modeling allcopnaup >>>>>>>>>>>>>>>>>>>>>>>>>
## 
## -------------gam check-------------

## 
## Method: REML   Optimizer: outer newton
## full convergence after 9 iterations.
## Gradient range [-0.0002665227,0.0001511567]
## (score 1041.11 & scale 3.48814).
## Hessian positive definite, eigenvalue range [0.0002664258,245.3791].
## Model rank =  43 / 43 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                            k'       edf k-index p-value   
## te(SalSurf_s,doy_s) 19.000000  9.993763     0.9   0.005 **
## s(Year_fac)         21.000000 17.626013      NA      NA   
## s(Station_fac)       2.000000  0.000836      NA      NA   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## -------------summary-------------
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## BPUE_log1p ~ te(SalSurf_s, doy_s, k = c(5, 5), bs = c("cs", "cc")) + 
##     s(Year_fac, bs = "re") + s(Station_fac, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error t value         Pr(>|t|)    
## (Intercept)   1.8375     0.2524   7.279 0.00000000000146 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                            edf Ref.df      F             p-value    
## te(SalSurf_s,doy_s)  9.9937631     19 18.626 <0.0000000000000002 ***
## s(Year_fac)         17.6260129     20  7.318 <0.0000000000000002 ***
## s(Station_fac)       0.0008362      1  0.000               0.546    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.454   Deviance explained = 48.4%
## -REML = 1041.1  Scale est. = 3.4881    n = 491
## <<<<<<<<<<<<<<<<<<<<<<< modeling limno >>>>>>>>>>>>>>>>>>>>>>>>>
## 
## -------------gam check-------------

## 
## Method: REML   Optimizer: outer newton
## full convergence after 7 iterations.
## Gradient range [-0.000002803489,0.000002590693]
## (score 921.1552 & scale 2.085479).
## Hessian positive definite, eigenvalue range [0.474912,247.7156].
## Model rank =  43 / 43 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                         k'    edf k-index p-value  
## te(SalSurf_s,doy_s) 19.000 14.242    0.95    0.09 .
## s(Year_fac)         21.000 10.381      NA      NA  
## s(Station_fac)       2.000  0.976      NA      NA  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## -------------summary-------------
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## BPUE_log1p ~ te(SalSurf_s, doy_s, k = c(5, 5), bs = c("cs", "cc")) + 
##     s(Year_fac, bs = "re") + s(Station_fac, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   6.2650     0.4288   14.61 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df      F              p-value    
## te(SalSurf_s,doy_s) 14.2416     19 84.408 < 0.0000000000000002 ***
## s(Year_fac)         10.3811     20  1.092              0.00356 ** 
## s(Station_fac)       0.9758      1 40.575 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.73   Deviance explained = 74.4%
## -REML = 921.16  Scale est. = 2.0855    n = 496
## <<<<<<<<<<<<<<<<<<<<<<< modeling mysid >>>>>>>>>>>>>>>>>>>>>>>>>
## 
## -------------gam check-------------

## 
## Method: REML   Optimizer: outer newton
## full convergence after 6 iterations.
## Gradient range [-0.00000009163881,0.00000006254805]
## (score 1240.961 & scale 3.302209).
## Hessian positive definite, eigenvalue range [1.772933,293.8917].
## Model rank =  48 / 48 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                        k'   edf k-index p-value   
## te(SalSurf_s,doy_s) 19.00 14.46    0.92    0.01 **
## s(Year_fac)         21.00 17.82      NA      NA   
## s(Station_fac)       7.00  5.37      NA      NA   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## -------------summary-------------
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## BPUE_log1p ~ te(SalSurf_s, doy_s, k = c(5, 5), bs = c("cs", "cc")) + 
##     s(Year_fac, bs = "re") + s(Station_fac, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   5.4655     0.6154   8.881 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df     F             p-value    
## te(SalSurf_s,doy_s) 14.456     19 46.69 <0.0000000000000002 ***
## s(Year_fac)         17.818     20  7.94 <0.0000000000000002 ***
## s(Station_fac)       5.366      6 15.32 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.467   Deviance explained = 50.1%
## -REML =   1241  Scale est. = 3.3022    n = 588
## <<<<<<<<<<<<<<<<<<<<<<< modeling othcyc >>>>>>>>>>>>>>>>>>>>>>>>>
## 
## -------------gam check-------------

## 
## Method: REML   Optimizer: outer newton
## full convergence after 8 iterations.
## Gradient range [-0.0000002737279,0.0000001591475]
## (score 782.4187 & scale 1.203426).
## Hessian positive definite, eigenvalue range [0.4770929,247.7024].
## Model rank =  43 / 43 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                         k'    edf k-index p-value
## te(SalSurf_s,doy_s) 19.000 13.540    0.92    0.11
## s(Year_fac)         21.000 10.271      NA      NA
## s(Station_fac)       2.000  0.978      NA      NA
## 
## 
## -------------summary-------------
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## BPUE_log1p ~ te(SalSurf_s, doy_s, k = c(5, 5), bs = c("cs", "cc")) + 
##     s(Year_fac, bs = "re") + s(Station_fac, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   9.5247     0.3425   27.81 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df      F              p-value    
## te(SalSurf_s,doy_s) 13.5395     19 42.507 < 0.0000000000000002 ***
## s(Year_fac)         10.2705     20  1.035              0.00573 ** 
## s(Station_fac)       0.9782      1 45.167 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.591   Deviance explained = 61.1%
## -REML = 782.42  Scale est. = 1.2034    n = 496
## <<<<<<<<<<<<<<<<<<<<<<< modeling other >>>>>>>>>>>>>>>>>>>>>>>>>
## 
## -------------gam check-------------

## 
## Method: REML   Optimizer: outer newton
## full convergence after 11 iterations.
## Gradient range [-0.0002193929,0.000137932]
## (score 1034.276 & scale 3.43328).
## Hessian positive definite, eigenvalue range [0.0002193328,245.3345].
## Model rank =  43 / 43 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                            k'       edf k-index p-value
## te(SalSurf_s,doy_s) 19.000000 11.875902    1.02    0.65
## s(Year_fac)         21.000000 15.819162      NA      NA
## s(Station_fac)       2.000000  0.000715      NA      NA
## 
## 
## -------------summary-------------
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## BPUE_log1p ~ te(SalSurf_s, doy_s, k = c(5, 5), bs = c("cs", "cc")) + 
##     s(Year_fac, bs = "re") + s(Station_fac, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)   5.7537     0.1888   30.48 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                            edf Ref.df      F             p-value    
## te(SalSurf_s,doy_s) 11.8759018     19 11.915 <0.0000000000000002 ***
## s(Year_fac)         15.8191618     20  3.721 <0.0000000000000002 ***
## s(Station_fac)       0.0007154      1  0.000               0.533    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.373   Deviance explained = 40.9%
## -REML = 1034.3  Scale est. = 3.4333    n = 491

Extract salinity conversions

sal_conversions<-map_dfr(sal_models, ~.x[["sal"]], .id = "IBMR")

str(sal_conversions)
## tibble [22,608 x 6] (S3: tbl_df/tbl/data.frame)
##  $ IBMR   : chr [1:22608] "acartela" "acartela" "acartela" "acartela" ...
##  $ Month  : num [1:22608] 1 1 1 1 1 1 1 1 1 1 ...
##  $ doy    : num [1:22608] 15 15 15 15 15 15 15 15 15 15 ...
##  $ SalSurf: num [1:22608] 0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 ...
##  $ fit    : num [1:22608] -3.25 -3.11 -2.97 -2.84 -2.7 ...
##  $ se     : num [1:22608] 0.428 0.395 0.364 0.335 0.309 ...

plot salinity conversions

ggplot(sal_conversions, aes(x=SalSurf, y=fit, ymin=fit-se, ymax=fit+se, fill=IBMR))+
  geom_ribbon(alpha=0.4)+
  ylab("fit (log scale)")+
  facet_wrap(~Month)+
  scale_fill_viridis_d()+
  theme_bw()

Load in SMSCG modeled salinity

SMSCG_sal<-read_csv("Data/SMSCG salinity modeling/bdl_summer_fall_2022_psu_filtered.csv", skip = 10)%>%
  mutate(Month=month(datetime))%>%
  drop_na()%>%
  group_by(Month)%>%
  summarise(across(contains("_bn_"), ~round(mean(.x), 1)))
## Rows: 30048 Columns: 5
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl  (4): bdl_bn_noact, bdl_bn_smscg4, bdl_bn_smscg6, bdl_dry_noact
## dttm (1): datetime
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Plot SMSCG modeled salinity

ggplot(pivot_longer(SMSCG_sal, cols=contains("_bn_"), names_to="Scenario", values_to="Salinity")%>% # Prepare data for easier plotting
         mutate(Scenario=factor(Scenario, levels=c("bdl_bn_noact", "bdl_bn_smscg6", "bdl_bn_smscg4"))), 
       aes(x=Month, y=Salinity, color=Scenario))+
  geom_line()+
  scale_color_viridis_d(direction = -1)+
  scale_x_continuous(breaks=1:12)+
  theme_bw()+
  theme(legend.position = "bottom")

salinity conversion function

sal_fix<-function(data, sal_conversions, sal_target, target_column, months){
  
  salinity_conversions<-sal_conversions%>%
    filter(Month%in%months)%>% ## Only choose the months to be included
    mutate(SalSurf=as.character(SalSurf))%>% # Change to character for safer joining
    select(IBMR, Month, SalSurf, fit)
  
  # Are any target salinities outside the range of the model?
  target_outside<-setdiff(as.character(sal_target[[target_column]]), salinity_conversions$SalSurf)
  
  if(length(target_outside)>0){
    cat("Some target salinities are out of range for the prediction. They are:", sort(target_outside))
  }
  
  # select target salinities from the conversions dataset
  sal_conversions_target<-salinity_conversions%>%
    semi_join(sal_target%>%
                mutate(SalSurf=as.character(.data[[target_column]])), 
              by=c("Month", "SalSurf"))%>%
    select(IBMR, Month, fit_target=fit)
  
  if(any(duplicated(paste(sal_conversions_target$Month, sal_conversions_target$IBMR)))){
    stop("sal_conversions were duplicated in filtering out the target salinities")
  }
  
  #Make sure all months are present for each IBMR
  monthcheck<-sal_conversions_target%>%
    group_by(IBMR)%>%
    summarise(N=n_distinct(Month))
  
  if(any(monthcheck$N!=length(months))){
    stop("Some months are missing in the sal_target")
  }
  
  # Set up salinity conversion factors for data and target
  salinity_conversions_correction<-salinity_conversions%>%
    left_join(sal_conversions_target, by=c("IBMR", "Month"))%>% # Join targets to full set of conversions
    mutate(Correction=fit_target-fit)%>% # Calculate correction factor
    select(IBMR, Month, SalSurf, Correction)
  
  # Pull unique salinities for outrange (to test for any data salinities not present in model)
  sals<-unique(salinity_conversions_correction$SalSurf)
  
  # Convert salinity to a character rounded to 1 decimal for safer joining
  data<-data%>%
    mutate(Sal_merge=as.character(round(SalSurf, 1)))
  
  # Are any salinities from the data not present in the model?
  outrange<-setdiff(data$Sal_merge, sals)
  
  if(length(outrange)>0){
    cat("Some salinities are out of range for the prediction. They are:", sort(outrange))
  }
  
  # Join salinity conversions and calculate adjusted BPUE
  data<-data%>%
    left_join(salinity_conversions_correction, by=c("Month", "Sal_merge"="SalSurf", "IBMR"))%>%
    mutate(BPUE_adj=exp(BPUE_log1p+Correction)-1) # Calculate adjusted BPUE and undo the log1p transformation
  
  return(data)
}

Prepare data for conversion

# From Rosie:
#Calculate water year function
wtr_yr <- function(dates, start_month=12) {
  # Convert dates into POSIXlt
  dates.posix = as.POSIXlt(dates)
  # Year offset
  offset = ifelse(dates.posix$mon == start_month - 1, 1, 0)
  # Water year
  adj.year = dates.posix$year + 1900 + offset
  # Return the water year
  adj.year
}

# Read in water year types
year_types<-read_csv(url("https://raw.githubusercontent.com/rosehartman/DCGmodeling/main/yeartypes.csv"))%>%
  rename(Yearadj = Year)
## Rows: 52 Columns: 4
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): Yr_type, Drought
## dbl (2): Year, Index
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Choose the months to be used
months<-c(6,7,8,9,10)

# Filter data to what we want to apply salinity conversions to
zoop_data_mass_bn<-zoop_data_mass%>%
  mutate(Yearadj = wtr_yr(Date),
         Month=month(Date)) %>%
  filter(Year > 2000)%>%
  left_join(year_types) %>%
  filter(Yr_type == "Below Normal" & Month %in% months)%>%
  select(Date, Month, Year, SalSurf, IBMR, BPUE, BPUE_log1p)
## Joining, by = "Yearadj"

Apply the conversion function

zoop_corrected<-map_dfr(set_names(c("bdl_bn_noact", "bdl_bn_smscg4", "bdl_bn_smscg6")), ~sal_fix(zoop_data_mass_bn, sal_conversions, SMSCG_sal, .x, months), .id="Scenario")%>%
  mutate(Scenario=factor(Scenario, levels=c("bdl_bn_noact", "bdl_bn_smscg6", "bdl_bn_smscg4")))

Plot the result

ggplot(zoop_corrected, aes(x=Date, y=BPUE_adj, color=Scenario))+
  geom_point(alpha=0.4)+
  scale_y_continuous(trans="log1p", breaks = c(0, 10^(1:6)))+
  scale_color_viridis_d(direction = -1, end = 0.9)+
  facet_wrap(~paste(IBMR, Year), scales="free", ncol=5)+
  theme_bw()+
  theme(legend.position = "bottom")

Plot BPUE vs BPUE_adj

ggplot(zoop_corrected, aes(x=BPUE, y=BPUE_adj, color=Scenario))+
  geom_point(alpha=0.4)+
  scale_y_continuous(trans="log1p", breaks = c(0, 10^(1:6)))+
  scale_x_continuous(trans="log1p", breaks = c(0, 10^(1:6)))+
  scale_color_viridis_d(direction = -1, end = 0.9)+
  facet_wrap(~IBMR, scales="free")+
  theme_bw()+
  theme(legend.position = "bottom")

Summarise zoops by IBMR, scenario, and month

zoop_bn_sum<-zoop_corrected%>%
  group_by(IBMR, Month, Scenario)%>%
  summarise(BPUE_sum=sum(BPUE_adj), BPUE_sd=sd(BPUE_adj), .groups="drop")

write_csv(zoop_bn_sum, "Outputs/zoop_salinity_corrected_sum.csv")

Plot summarized CPUE +/- SD

ggplot(zoop_bn_sum, aes(x=Month, y=BPUE_sum, ymin=BPUE_sum-BPUE_sd, ymax=BPUE_sum+BPUE_sd, fill=Scenario))+
  geom_pointrange(position = position_dodge(width=0.6), shape=21, color="black", size=1)+
  scale_fill_viridis_d(direction = -1, end = 0.9)+
  facet_wrap(~IBMR, scales="free")+
  theme_bw()+
  theme(legend.position = "bottom")