Generating probabilities

Introduction

The aim of this document is to illustrate how to automatically generate some types of sample probabilities using functions runChecksOnSelectionAndProbs and applyGenerateProbs of the RDBEScore package.

Load the package

library(RDBEScore)

main data requirements to generate probabilities

RDBEScore currently only provides probability generation for selection method “SRSWR”,“SRSWOR” and “CENSUS”. Also, to automatically generate probabilities, it is necessary that numTotal and numSamp are declared in every sampling table. Functions are also only configured to handle for cases where “clustering==”N”. If that is not the case, changes need to be made to the data before running the functions. Such changes configure significant assumptions that should be left well visible in the data preparation section of any estimation script.

Load and validate some example data

First we’ll load some example data from the RDBES and check it’s valid. It’s a good tip to check your RDBESDataObjects are valid after any manipulations you perform. See how to import your own data in the vignette Import RDBES data In this vignette package example data pre-loaded with RDBEScore is used.


# load some H1 test data
myH1DataObject <- H1Example
     
# filter data for DEstratumName==DE_stratum1_H1 to make object smaller and easier to handle
myH1DataObject <- filterAndTidyRDBESDataObject(myH1DataObject,c("DEstratumName"), c("DE_stratum1_H1"),
                                               killOrphans=TRUE)

The functions that generate probabilities do not yet deal with lower hierarchies A and B so we rework a bit the data so it looks like lower hierarchy C.

# Temp fixes to change data to lower hierarchy C - function won't deal with A, or B yet
myH1DataObject[["BV"]] <- dplyr::distinct(myH1DataObject[["BV"]], FMid, .keep_all = TRUE)
temp <- dplyr::left_join(myH1DataObject[["BV"]][,c("BVid","FMid")], 
                         myH1DataObject[["FM"]][,c("FMid","SAid")], 
                         by="FMid")
myH1DataObject[["BV"]]$SAid <- temp$SAid
myH1DataObject[["BV"]]$FMid <- NA
myH1DataObject[["SA"]]$SAlowHierarchy <- "C"
myH1DataObject[["BV"]]$BVnumTotal <- 10
myH1DataObject[["BV"]]$BVnumSamp <- 10

# reworking stratification of VS table
myH1DataObject[["VS"]][VSencrVessCode %in% c("VDcode_5","VDcode_8","VDcode_9")]$VSstratumName <- 
  "VS_stratum1"
myH1DataObject[["VS"]][VSencrVessCode %in% c("VDcode_5","VDcode_8","VDcode_9")]$VSnumTotal <- 30
myH1DataObject[["VS"]][VSencrVessCode %in% c("VDcode_6","VDcode_7","VDcode_10")]$VSstratumName <- 
  "VS_stratum2"
myH1DataObject[["VS"]][VSstratumName %in% "VS_stratum1",]$VSnumSamp <- 5
myH1DataObject[["VS"]][VSstratumName %in% "VS_stratum2",]$VSnumSamp <- 4

# reworking FT table
myH1DataObject[["FT"]]$FTselectMeth <- "SRSWOR"

tmp<-myH1DataObject[["FT"]]
tmp$VSencrVessCode<-myH1DataObject[["VS"]]$VSencrVessCode[match(myH1DataObject[["FT"]]$VSid,
                                                                myH1DataObject[["VS"]]$VSid)]

tmp$FTnumSamp<-as.integer(table(tmp$VSencrVessCode))[match(tmp$VSencrVessCode, 
                                                           names(table(tmp$VSencrVessCode)))]

tmp[tmp$VSencrVessCode %in% c("VDcode_5"),]$FTnumTotal<-100
tmp[tmp$VSencrVessCode %in% c("VDcode_6"),]$FTnumTotal<-50
tmp[tmp$VSencrVessCode %in% c("VDcode_7"),]$FTnumTotal<-25
tmp[tmp$VSencrVessCode %in% c("VDcode_8"),]$FTnumTotal<-80
tmp[tmp$VSencrVessCode %in% c("VDcode_9"),]$FTnumTotal<-70
tmp[tmp$VSencrVessCode %in% c("VDcode_10"),]$FTnumTotal<-60

tmp$VSencrVessCode<-NULL

tmp$FTstratumName<-"U"
tmp$FTstratification<-"N"

myH1DataObject[["FT"]]<-tmp
myH1DataObject[["FO"]]$FOselectMeth<-"SRSWOR"
myH1DataObject$SA$SAselectMeth<-"SRSWOR"
myH1DataObject$SS$SSselectMeth<-"SRSWOR"
myH1DataObject$BV$BVselectMeth<-"SRSWOR"

# confirm validity
validateRDBESDataObject(myH1DataObject)

A closer look at the example data

The final data contains 10 ages in each of 243 hauls sampled from 81 trips done by 9 selected vessels.

Examining the selection methods used in the VS table it is visible that the 9 vessels were selected with replacement (SRSWR) out of two strata, one strata with a total of 30 vessels (VS_stratum1) and one with a total of 60 vessels (VS_stratum2). It is also noticeable that selection and inclusion probabilities were not declared during upload.

unique(myH1DataObject[["VS"]][,c("VSstratification","VSstratumName","VSselectMeth",
                                 "VSnumTotal","VSnumSamp","VSselProb","VSincProb")])
#>    VSstratification VSstratumName VSselectMeth VSnumTotal VSnumSamp VSselProb
#>              <char>        <char>       <char>      <int>     <int>     <num>
#> 1:                Y   VS_stratum1        SRSWR         30         5        NA
#> 2:                Y   VS_stratum2        SRSWR         60         4        NA
#>    VSincProb
#>        <num>
#> 1:        NA
#> 2:        NA

With regards to trips these were selected without replacement (SRSWOR). either 9 or 18 trips were selected from each vessel. Individual vessels registered total number of trips between 25 and 100 trips. We also see that selection and inclusion probabilities were not declared.

unique(myH1DataObject[["FT"]][,c("VSid","FTstratification","FTstratumName","FTselectMeth",
                                 "FTnumTotal","FTnumSamp","FTselProb","FTincProb")])
#>     VSid FTstratification FTstratumName FTselectMeth FTnumTotal FTnumSamp
#>    <int>           <char>        <char>       <char>      <int>     <int>
#> 1: 78006                N             U       SRSWOR        100        18
#> 2: 78007                N             U       SRSWOR         80        18
#> 3: 78008                N             U       SRSWOR         70         9
#> 4: 78009                N             U       SRSWOR         60         9
#> 5: 78010                N             U       SRSWOR         50        18
#> 6: 78011                N             U       SRSWOR         50        18
#> 7: 78012                N             U       SRSWOR        100        18
#> 8: 78013                N             U       SRSWOR         80        18
#> 9: 78014                N             U       SRSWOR         25         9
#>    FTselProb FTincProb
#>        <num>     <num>
#> 1:        NA        NA
#> 2:        NA        NA
#> 3:        NA        NA
#> 4:        NA        NA
#> 5:        NA        NA
#> 6:        NA        NA
#> 7:        NA        NA
#> 8:        NA        NA
#> 9:        NA        NA

With regards to hauls the example data indicates that 20 were done in every trip(!) from which 3 were sampled. Not very likely data, but good enough for demonstration purposes. Also here we see that selection and inclusion probabilities were not declared.

table(myH1DataObject[["FO"]]$FTid)
#> 
#> 21677 21678 21679 21680 21681 21682 21683 21684 21685 21686 21687 21688 21689 
#>     3     3     3     3     3     3     3     3     3     3     3     3     3 
#> 21690 21691 21692 21693 21694 21695 21696 21697 21698 21699 21700 21701 21702 
#>     3     3     3     3     3     3     3     3     3     3     3     3     3 
#> 21703 21704 21705 21706 21707 21708 21709 21710 21711 21712 21713 21714 21715 
#>     3     3     3     3     3     3     3     3     3     3     3     3     3 
#> 21716 21717 21718 21719 21720 21721 21722 21723 21724 21725 21726 21727 21728 
#>     3     3     3     3     3     3     3     3     3     3     3     3     3 
#> 21729 21730 21731 21732 21733 21734 21735 21736 21737 21738 21739 21740 21741 
#>     3     3     3     3     3     3     3     3     3     3     3     3     3 
#> 21742 21743 21744 21745 21746 21747 21748 21749 21750 21751 21752 21753 21754 
#>     3     3     3     3     3     3     3     3     3     3     3     3     3 
#> 21755 21756 21757 
#>     3     3     3
unique(myH1DataObject[["FO"]][,c("FOid","FOstratification","FOstratumName","FOselectMeth",
                                 "FOnumTotal","FOnumSamp","FOselProb","FOincProb")])
#> Key: <FOid>
#>       FOid FOstratification FOstratumName FOselectMeth FOnumTotal FOnumSamp
#>      <int>           <char>        <char>       <char>      <int>     <int>
#>   1: 68983                N             U       SRSWOR         20         3
#>   2: 68984                N             U       SRSWOR         20         3
#>   3: 68985                N             U       SRSWOR         20         3
#>   4: 68986                N             U       SRSWOR         20         3
#>   5: 68987                N             U       SRSWOR         20         3
#>  ---                                                                       
#> 239: 69221                N             U       SRSWOR         20         3
#> 240: 69222                N             U       SRSWOR         20         3
#> 241: 69223                N             U       SRSWOR         20         3
#> 242: 69224                N             U       SRSWOR         20         3
#> 243: 69225                N             U       SRSWOR         20         3
#>      FOselProb FOincProb
#>          <num>     <num>
#>   1:        NA        NA
#>   2:        NA        NA
#>   3:        NA        NA
#>   4:        NA        NA
#>   5:        NA        NA
#>  ---                    
#> 239:        NA        NA
#> 240:        NA        NA
#> 241:        NA        NA
#> 242:        NA        NA
#> 243:        NA        NA

generating probabilities for only one table: generateProbs

To generate probabilities for one of the tables choose what type of probabilities you want to generate (“selection” or “inclusion”) and run generateProbs.

note: To check the data for some issues related to selection methods and probabilities, you can run function runChecksOnSelectionAndProbs. But in general this is not necessary because when you run applyGenerateProbs with defaults a call to runChecksOnSelectionAndProbs is included.

myH1DataObject_uptde<-myH1DataObject
myH1DataObject_uptde[["VS"]] <- generateProbs(myH1DataObject[["VS"]], 
                                              probType="inclusion")
#> [1] "SRSWR"
# display changes
myH1DataObject_uptde[["VS"]][,c("VSstratification","VSstratumName","VSselectMeth",
                                "VSnumTotal","VSnumSamp","VSselProb","VSincProb")]
#>    VSstratification VSstratumName VSselectMeth VSnumTotal VSnumSamp VSselProb
#>              <char>        <char>       <char>      <int>     <int>     <num>
#> 1:                Y   VS_stratum1        SRSWR         30         5        NA
#> 2:                Y   VS_stratum1        SRSWR         30         5        NA
#> 3:                Y   VS_stratum1        SRSWR         30         5        NA
#> 4:                Y   VS_stratum2        SRSWR         60         4        NA
#> 5:                Y   VS_stratum2        SRSWR         60         4        NA
#> 6:                Y   VS_stratum2        SRSWR         60         4        NA
#> 7:                Y   VS_stratum1        SRSWR         30         5        NA
#> 8:                Y   VS_stratum1        SRSWR         30         5        NA
#> 9:                Y   VS_stratum2        SRSWR         60         4        NA
#>     VSincProb
#>         <num>
#> 1: 0.15591979
#> 2: 0.15591979
#> 3: 0.15591979
#> 4: 0.06501844
#> 5: 0.06501844
#> 6: 0.06501844
#> 7: 0.15591979
#> 8: 0.15591979
#> 9: 0.06501844

myH1DataObject_uptde[["VS"]] <- generateProbs(myH1DataObject_uptde[["VS"]], 
                                              probType="selection")
#> [1] "SRSWR"
# display changes
myH1DataObject_uptde[["VS"]][,c("VSstratification","VSstratumName","VSselectMeth",
                                "VSnumTotal","VSnumSamp","VSselProb","VSincProb")]
#>    VSstratification VSstratumName VSselectMeth VSnumTotal VSnumSamp  VSselProb
#>              <char>        <char>       <char>      <int>     <int>      <num>
#> 1:                Y   VS_stratum1        SRSWR         30         5 0.03333333
#> 2:                Y   VS_stratum1        SRSWR         30         5 0.03333333
#> 3:                Y   VS_stratum1        SRSWR         30         5 0.03333333
#> 4:                Y   VS_stratum2        SRSWR         60         4 0.01666667
#> 5:                Y   VS_stratum2        SRSWR         60         4 0.01666667
#> 6:                Y   VS_stratum2        SRSWR         60         4 0.01666667
#> 7:                Y   VS_stratum1        SRSWR         30         5 0.03333333
#> 8:                Y   VS_stratum1        SRSWR         30         5 0.03333333
#> 9:                Y   VS_stratum2        SRSWR         60         4 0.01666667
#>     VSincProb
#>         <num>
#> 1: 0.15591979
#> 2: 0.15591979
#> 3: 0.15591979
#> 4: 0.06501844
#> 5: 0.06501844
#> 6: 0.06501844
#> 7: 0.15591979
#> 8: 0.15591979
#> 9: 0.06501844

generating probabilities for an entire RDBES data object: applyGenerateProbs

The function applyGenerateProbs generates selection or inclusion probabilities for all selection tables of an RDBES data object in one go. Here, we avoid running the checks by setting runInitialProbChecks to FALSE.

  myH1DataObject_uptde<-applyGenerateProbs (x = myH1DataObject
                                      , probType = "inclusion"
                                      , overwrite=T
                                      , runInitialProbChecks = FALSE)

validateRDBESDataObject(myH1DataObject_uptde)

# display changes
myH1DataObject_uptde[["VS"]][,c("VSstratification","VSstratumName","VSselectMeth",
                                "VSnumTotal","VSnumSamp","VSselProb","VSincProb")]

unique(myH1DataObject_uptde[["FT"]][,c("VSid","FTstratification","FTstratumName",
                                       "FTselectMeth","FTnumTotal","FTnumSamp","FTselProb","FTincProb")])

unique(myH1DataObject_uptde[["FO"]][,c("FOid","FOstratification","FOstratumName",
                                       "FOselectMeth","FOnumTotal","FOnumSamp","FOselProb","FOincProb")])

unique(myH1DataObject_uptde[["BV"]][,c("BVid","BVfishId","BVselectMeth","BVnumTotal",
                                       "BVnumSamp","BVselProb","BVincProb")])

We could use probType = "selection" to further complete the data with selection probabilities. However, selection method in table FT is SRSWOR and so the applyGenerateProbs issues an error (see ?applyGenerateProbs for more details)

  myH1DataObject_uptde<-applyGenerateProbs (x = myH1DataObject_uptde
                                      , probType = "selection"
                                      , overwrite=T
                                      , runInitialProbChecks = FALSE)
#> [1] "========start generateProbs======="
#> [1] "VS"
#> [1] "SDid: 5438"
#> [1] "SRSWR"
#> [1] "FT"
#> [1] "VSid: 78006"
#> [1] "SRSWOR"
#> Error in generateProbs(x, probType): depends on order

The overwrite argument

To be completed