documentation!

7a01b27c · Peter Jentsch · e8a16b48 · 7a01b27c · 7a01b27c · 7a01b27c
Commit 7a01b27c authored 4 years ago by Peter Jentsch
--- a/IntervalsModel/simulation_data/hh.dat
+++ b/IntervalsModel/simulation_data/hh.dat
--- a/IntervalsModel/simulation_data/ws.dat
+++ b/IntervalsModel/simulation_data/ws.dat
--- a/IntervalsModel/src/IntervalsModel.jl
+++ b/IntervalsModel/src/IntervalsModel.jl
@@ -22,10 +22,14 @@ using Plots
 const rng = Xoroshiro128Plus()
-#lets us use nicer names for the indices
+#consts that let give us nicer names for the indices
 const YOUNG, MIDDLE,OLD = 1,2,3
 const durmax = 144
-const comparison_samples = 100
+"""
+Number of start times to sample for a given set of distribution parameters. 
+"""
+const comparison_samples = 70
 const sparam = [60, 12]
 # Swap age brackets for numbers
 const swap_dict = OrderedDict("Y" => YOUNG, "M" => MIDDLE, "O" => OLD)
@@ -98,6 +102,8 @@ function bayesian_estimation(fname, err_func, priors_list, dists, particles; alp
 end
 """
+    do_hh(particles::Integer)
 Fit HH durations. This function specifies a list of distributions to fit, and loads the priors for "hh", and then calls `bayesian_estimation` with `err_hh`. This function passes parameters to `bayesian_estimation` depending on what produces the best fit for the HH data.
 """
 function do_hh(particles)
@@ -115,6 +121,9 @@ end
 """
+    do_ws(particles::Integer)
 Fit WS durations. This function specifies a list of distributions to fit, and loads the priors for "workschool", and then calls `bayesian_estimation` with `err_ws`. This function passes parameters to `bayesian_estimation` depending on what produces the best fit for the WS data.
 """
 function do_ws(particles)
@@ -130,6 +139,9 @@ function do_ws(particles)
 end
 """
+    do_rest(particles::Integer)
 Fit rest durations. This function specifies a list of distributions to fit, and loads the priors for "rest", and then calls `bayesian_estimation` with `err_rest`. This function passes parameters to `bayesian_estimation` depending on what produces the best fit for the Rest data.
 """
 function do_rest(particles)

--- a/IntervalsModel/src/hh_durations_model.jl
+++ b/IntervalsModel/src/hh_durations_model.jl
 const HHYMO = DataFrame(CSV.File("$PACKAGE_FOLDER/network-data/Timeuse/HH/HHYMO.csv"))
-const cnst_hh = (
+const Wghttotal = sum(HHYMO[:,"WGHT_PER"])
-    subsize = size(HHYMO)[1],
-    Wghttotal = sum(HHYMO[:,"WGHT_PER"]),
-)
 function make_dat_array()
    durs = hcat(
@@ -27,28 +24,34 @@ function make_dat_array()
 end
 const dat = make_dat_array() #assign a constant data array
+"""
-#error function
+    err_hh(p,dist)
+Calculate error between observed total contact durations in HH data and simulated total contact durations based on `dist(p[i]...)`. This one works a bit differently than the WS or rest error function, since we do not have a distribution of total durations to compare to, but rather one average total duration, and we do have the explicit samples of numbers of contacts.
+"""
 function err_hh(p,dist)
    p_matrix = zeros(3,3)
    for i in 1:9
        p_matrix[i] = p[i]
    end
    age_dists = [dist(p_matrix[i,j]...) for i in YOUNG:OLD, j in YOUNG:OLD]
-    duration_subarray =  dat.durs
+    duration_subarray =  dat.durs #durations, these are call subarray because we used to only sample a subset
-    num_contacts_subarray = dat.nums
+    num_contacts_subarray = dat.nums #numbers of contacts for each duration
    # display(p_matrix)
-    AGERESP =  dat.AGERESP 
+    AGERESP =  dat.AGERESP #age of the respondent
    errsum = 0
-    @inbounds for i = 1:cnst_hh.subsize
+    @inbounds for i = 1:length(duration_subarray) #loop through entire file
        age_sample = AGERESP[i]
        @inbounds for age_j in YOUNG:OLD #for a given age_sample loop over possible contact ages
+            #get num_contacts from the HH data
            durs = trunc.(Int,rand(rng,age_dists[age_sample,age_j],num_contacts_subarray[i,age_j])) .% durmax
+            #this returns the MEAN of the distribution of total durations, given we have durations given by durs, and we sample comparison_samples from that distribution.
            expdur = tot_dur_sample(comparison_samples,durs)
-            errsum += (expdur/comparison_samples - duration_subarray[i,age_j])^2 #compute total 
+            errsum += (expdur - duration_subarray[i,age_j])^2 #compute total 
        end
    end
-    # display("2")
    return errsum
 end
\ No newline at end of file
--- a/IntervalsModel/src/interval_overlap_sampling.jl
+++ b/IntervalsModel/src/interval_overlap_sampling.jl
 # Set the underlying parameters for the intervals model
+# This is the distribution of start_times
 const startdist = Normal(sparam...)
@@ -10,7 +11,14 @@ function coverage!(cov,S_j,E_j)
        push!(cov,Interval(S_j,E_j))
    end
 end
-#compute the total duration of a sample of intervals
+"""
+    tot_dur_sample(n::Integer,durlist::Vector{Integer})
+Returns the mean of the distribution of total durations, given a set of contact durations `durlist`, and a number of samples `n`. 
+Note that this is different from `tot_dur_sample!` in that it returns only a mean and does not mutate it's arguments. This could be reducible to that one but not without losing some performance I think. Used in `err_hh` where we only need a mean. 
+"""
 function tot_dur_sample(n,durlist)
    if isempty(durlist)
        return 0
@@ -20,7 +28,6 @@ function tot_dur_sample(n,durlist)
    int_list = Vector{Interval{Int,Closed,Closed}}()
    sizehint!(int_list,numcontact*2)
    start_matrix = trunc.(Int,rand(rng,startdist,(numcontact,n)))
    @inbounds for i in 1:n  
        empty!(int_list)
@@ -32,10 +39,18 @@ function tot_dur_sample(n,durlist)
        union!(int_list)
        total_dur += mapreduce(Intervals.span,+,int_list)
    end
-    return total_dur
+    return total_dur/n 
 end
+"""
+    tot_dur_sample(sample_list::Vector{Integer},durlist::Vector{Integer})
+Mutates `sample_list` to contain samples from the distribution of total durations, given a set of contact durations `durlist`.
+This is different from `tot_dur_sample` because it modifies it's arguments. Used in `err_ws` and `err_rest` where we need a distribution.
+"""
 function tot_dur_sample!(sample_list,durlist)
    if isempty(durlist)
        sample_list .= 0.0
@@ -45,7 +60,6 @@ function tot_dur_sample!(sample_list,durlist)
    n = length(sample_list)
    int_list = Vector{Interval{Int,Closed,Closed}}()
    sizehint!(int_list,numcontact*2)
-    # @show rand(rng,startdist,(numcontact,n))
    start_matrix = trunc.(Int,rand(rng,startdist,(numcontact,n)))
    for i in 1:n  
        empty!(int_list)
@@ -58,12 +72,3 @@ function tot_dur_sample!(sample_list,durlist)
        sample_list[i] = mapreduce(Intervals.span,+,int_list)
    end
 end
-function as_symmetric_matrix(l) #turn a vector of length 6, l, into a symmetric 3x3 matrix, probably a nicer way to do this exists
-    return [
-        l[1] l[2] l[3]
-        l[2] l[4] l[5]
-        l[3] l[5] l[6]
-    ]
-end
\ No newline at end of file
--- a/IntervalsModel/src/rest_durations_model.jl
+++ b/IntervalsModel/src/rest_durations_model.jl
@@ -6,25 +6,42 @@ const kerneldensity_data_rest = (
    M = kde(rest_data.M.durs,weights = rest_data.M.weights),
    O = kde(rest_data.O.durs,weights = rest_data.O.weights),
 )
+"""
+    err_rest(p::Vector{Number},dist::Distribution)
-#error function for rest
+Calculate error between observed contact durations in rest data and simulated contact durations based on `dist(p[i]...)`.
+"""
 function err_rest(p,dist)
+    #we get the vector p as a 9 element vector from KissABC, but more convenient as a 3x3 matrix
    p_matrix = zeros(3,3)
    for i in 1:9
        p_matrix[i] = p[i]
    end
+    #sample a matrix of neighourhood sizes for distributions in rest_distributions
+    #this is actually not correct? neighbourhood sizes should be symmetric!
    neighourhoods = rand.(rng,rest_distributions)
+    #create a matrix of distributions of type dist, from the parameters in p_matrix
    age_dists = [dist(p_matrix[i,j]...) for i in YOUNG:OLD, j in YOUNG:OLD]
+    #initialize the list of samples and the total error
    sample_list = zeros(comparison_samples)
    errsum = 0
    for age_sample in YOUNG:OLD
        for age_j in YOUNG:OLD #for a given age_sample loop over possible contact ages
            if neighourhoods[age_sample,age_j] > 0
+                #get durations from our candidate distribtions for each of the contacts in neighbourhood
                durs = trunc.(Int,rand(rng,age_dists[age_sample,age_j],neighourhoods[age_sample,age_j])) .% durmax
+                #this MODIFIES sample_list to contain samples from the distribution of total_durations, given intervals of length dur.
                tot_dur_sample!(sample_list,durs)
+                #compute a kernel density from the list of samples
                kde_est = kde(sample_list)
+                #compute L1 distance between observed distribution of total durations from survey, and observed distribution from simulation
                errsum += mapreduce(+,0:1:durmax) do i
                    return abs(pdf(kde_est,i) - pdf(kerneldensity_data_ws[age_sample],i))
                end

--- a/IntervalsModel/src/utils.jl
+++ b/IntervalsModel/src/utils.jl
 import Pandas: read_csv
 using DataFrames
+"""
+    get_params(params::Vector{Number})
+This function reorganizes a Vector of 6n parameters into a 3x3 symmetric matrix of n-tuples of parameters.
+"""
 function get_params(params)
    p_list = [as_symmetric_matrix(params[i:i+5]) for i = 1:6:length(params)]
    return zip(p_list...) |> collect
 end
+"""
+Turn a vector of length 6, l, into a symmetric 3x3 matrix, probably a nicer way to do this exists but meh.
+"""
+function as_symmetric_matrix(l) 
+    return [
+        l[1] l[2] l[3]
+        l[2] l[4] l[5]
+        l[3] l[5] l[6]
+    ]
+end
+"""
+Load rest data from `network-data/Timeuse/Rest/RData`.
+"""
 function get_rest_data()
    path = "$PACKAGE_FOLDER/network-data/Timeuse/Rest/RData"
    data_table_by_age = map(collect(keys(swap_dict))) do age
@@ -18,6 +38,9 @@ function get_rest_data()
    return (;data_table_by_age...)
 end
+"""
+Load WS data from `network-data/Timeuse/WS/WorkschoolData`.
+"""
 function get_ws_data()
    path = "$PACKAGE_FOLDER/network-data/Timeuse/WS/WorkschoolData"
    data_table_by_age = map(collect(keys(swap_dict))) do age
@@ -29,6 +52,9 @@ function get_ws_data()
    return (;data_table_by_age...)
 end
+"""
+Load priors data from `IntervalsModel/network-data/POLYMOD/AALPoisPriors.csv` and format into a DataFrame with some additional columns for the index of the age class (per `swap_dict`).
+"""
 function get_priors()
    df = DataFrame(read_csv("IntervalsModel/network-data/POLYMOD/AALPoisPriors.csv"))
    df_w_indicies = transform(
@@ -41,12 +67,21 @@ function get_priors()
    return df_w_indicies
 end
-function filter_priors(key)
+"""
+    filter_priors(location_key::String)
+Filter priors dataframe based on location_key, which should be one of "rest", "workschool", or "home". Returns an array of Categorical distributions.
+"""
+function filter_priors(location_key)
    priors = get_priors()
-    if !(key in priors[!,:location])
+    if !(location_key in priors[!,:location])
        throw(ArgumentError("location key not in priors file!"))
    end
-    priors_probs = filter(row-> row["location"] == key, eachrow(priors)) |>
+    priors_probs = filter(row-> row["location"] == location_key, eachrow(priors)) |>
                    df -> map(r -> (r[:index_out],r[:index_in]) => convert(Vector,r[string.(0:143)]),df) |>
                    Dict |>
                    df -> Dict(map(v -> v => Categorical(df[v]), collect(keys(df))))

--- a/IntervalsModel/src/ws_durations_model.jl
+++ b/IntervalsModel/src/ws_durations_model.jl
-const ws_distributions = CovidAlertVaccinationModel.initial_workschool_mixing_matrix
 const ws_data = get_ws_data()
+"""
+Kernel density functions based on the durs in ws_data and their corresponding weights. This defines an interpolated density function that we can compare to the density obtained from the simulation.
+"""
 const kerneldensity_data_ws = (
    Y = kde(ws_data.Y.durs;weights = ws_data.Y.weights),
    M = kde(ws_data.M.durs;weights = ws_data.M.weights),
    O = kde(ws_data.O.durs;weights = ws_data.O.weights),
 )
+"""
+    err_ws(p::Vector{Number},dist::Distribution)
-#error function for ws
+Calculate error between observed contact durations in workschool data and simulated contact durations based on `dist(p[i]...)`. For these workschool simulations, we use a different sampling procedure for neighbourhood sizes based on `ws_sample` in CovidAlertVaccinationModel.
+"""
 function err_ws(p,dist)
-    p_matrix = zeros(3,3)
+    #we get the vector p as a 9 element vector from KissABC, but more convenient as a 3x3 matrix
+    p_matrix = zeros(3,3) 
    for i in 1:9
-        p_matrix[i] = p[i]
+        p_matrix[i] = p[i] #this will fill p_matrix with the elements of p in column-major order
    end
-    age_dists = [dist(p_matrix[i,j]...) for i in YOUNG:OLD, j in YOUNG:OLD]
+    #create a matrix of distributions of type dist, from the parameters in p_matrix
+    age_dists = [dist(p_matrix[i,j]...) for i in YOUNG:OLD, j in YOUNG:OLD] 
+    #initialize the list of samples and the total error
    sample_list = zeros(comparison_samples)
    errsum = 0
    for age_sample in YOUNG:OLD
+        #Sample neighbourhood sizes. Note that this includes the weekend/weekday coinflip, implemented in CovidAlertVaccinationModel.  
        neighourhoods = CovidAlertVaccinationModel.ws_sample(age_sample)
-        for age_j in YOUNG:OLD #for a given age_sample loop over possible contact ages
+        for age_j in YOUNG:OLD #for a given age_sample loop over possible contact ages        
            if neighourhoods[age_j] > 0
+                #get durations from our candidate distribtions for each of the contacts in neighbourhood
                durs = trunc.(Int,rand(rng,age_dists[age_sample,age_j],neighourhoods[age_j])) .% durmax
+                #this MODIFIES sample_list to contain samples from the distribution of total_durations, given intervals of length dur.
                tot_dur_sample!(sample_list,durs)
+                #compute a kernel density from the list of samples
                kde_est = kde(sample_list)
+                #compute L1 distance between observed distribution of total durations from survey, and observed distribution from simulation
                errsum += mapreduce(+,0:1:durmax) do i
                    return abs(pdf(kde_est,i) - pdf(kerneldensity_data_ws[age_sample],i))
                end