diff --git a/IntervalsModel/simulation_data/hh.dat b/IntervalsModel/simulation_data/hh.dat index eb57911038e6bf2d99523665eb316df80e6f8ec2..fea7ea8ac0d3dd37a0ab30ff5ce899e8d56515ce 100644 Binary files a/IntervalsModel/simulation_data/hh.dat and b/IntervalsModel/simulation_data/hh.dat differ diff --git a/IntervalsModel/simulation_data/ws.dat b/IntervalsModel/simulation_data/ws.dat index 687a6b2c3245be85104c1a36ed856e2dcbce8fbe..ee07dd956d8f31c12c7c6005b403b532cdc55b5b 100644 Binary files a/IntervalsModel/simulation_data/ws.dat and b/IntervalsModel/simulation_data/ws.dat differ diff --git a/IntervalsModel/src/IntervalsModel.jl b/IntervalsModel/src/IntervalsModel.jl index e79d487f5afa7320a6bfad7cec5473a5d7f96bc5..5d730605994b10c4473c7a13e065a05eae9d483e 100644 --- a/IntervalsModel/src/IntervalsModel.jl +++ b/IntervalsModel/src/IntervalsModel.jl @@ -22,10 +22,14 @@ using Plots const rng = Xoroshiro128Plus() -#lets us use nicer names for the indices +#consts that let give us nicer names for the indices const YOUNG, MIDDLE,OLD = 1,2,3 const durmax = 144 -const comparison_samples = 100 + +""" +Number of start times to sample for a given set of distribution parameters. +""" +const comparison_samples = 70 const sparam = [60, 12] # Swap age brackets for numbers const swap_dict = OrderedDict("Y" => YOUNG, "M" => MIDDLE, "O" => OLD) @@ -98,6 +102,8 @@ function bayesian_estimation(fname, err_func, priors_list, dists, particles; alp end """ + do_hh(particles::Integer) + Fit HH durations. This function specifies a list of distributions to fit, and loads the priors for "hh", and then calls `bayesian_estimation` with `err_hh`. This function passes parameters to `bayesian_estimation` depending on what produces the best fit for the HH data. """ function do_hh(particles) @@ -115,6 +121,9 @@ end """ + + do_ws(particles::Integer) + Fit WS durations. This function specifies a list of distributions to fit, and loads the priors for "workschool", and then calls `bayesian_estimation` with `err_ws`. This function passes parameters to `bayesian_estimation` depending on what produces the best fit for the WS data. """ function do_ws(particles) @@ -130,6 +139,9 @@ function do_ws(particles) end """ + + do_rest(particles::Integer) + Fit rest durations. This function specifies a list of distributions to fit, and loads the priors for "rest", and then calls `bayesian_estimation` with `err_rest`. This function passes parameters to `bayesian_estimation` depending on what produces the best fit for the Rest data. """ function do_rest(particles) diff --git a/IntervalsModel/src/hh_durations_model.jl b/IntervalsModel/src/hh_durations_model.jl index 2f2eebc038a3421939f7942279c68c12fb9cd1f8..fe80396c424e430bbef1cb03f577b171a5f36180 100644 --- a/IntervalsModel/src/hh_durations_model.jl +++ b/IntervalsModel/src/hh_durations_model.jl @@ -1,9 +1,6 @@ const HHYMO = DataFrame(CSV.File("$PACKAGE_FOLDER/network-data/Timeuse/HH/HHYMO.csv")) -const cnst_hh = ( - subsize = size(HHYMO)[1], - Wghttotal = sum(HHYMO[:,"WGHT_PER"]), -) +const Wghttotal = sum(HHYMO[:,"WGHT_PER"]) function make_dat_array() durs = hcat( @@ -27,28 +24,34 @@ function make_dat_array() end const dat = make_dat_array() #assign a constant data array +""" -#error function + err_hh(p,dist) + +Calculate error between observed total contact durations in HH data and simulated total contact durations based on `dist(p[i]...)`. This one works a bit differently than the WS or rest error function, since we do not have a distribution of total durations to compare to, but rather one average total duration, and we do have the explicit samples of numbers of contacts. +""" function err_hh(p,dist) p_matrix = zeros(3,3) for i in 1:9 p_matrix[i] = p[i] end age_dists = [dist(p_matrix[i,j]...) for i in YOUNG:OLD, j in YOUNG:OLD] - duration_subarray = dat.durs - num_contacts_subarray = dat.nums + duration_subarray = dat.durs #durations, these are call subarray because we used to only sample a subset + num_contacts_subarray = dat.nums #numbers of contacts for each duration # display(p_matrix) - AGERESP = dat.AGERESP + AGERESP = dat.AGERESP #age of the respondent errsum = 0 - @inbounds for i = 1:cnst_hh.subsize + @inbounds for i = 1:length(duration_subarray) #loop through entire file age_sample = AGERESP[i] @inbounds for age_j in YOUNG:OLD #for a given age_sample loop over possible contact ages + + #get num_contacts from the HH data durs = trunc.(Int,rand(rng,age_dists[age_sample,age_j],num_contacts_subarray[i,age_j])) .% durmax + + #this returns the MEAN of the distribution of total durations, given we have durations given by durs, and we sample comparison_samples from that distribution. expdur = tot_dur_sample(comparison_samples,durs) - errsum += (expdur/comparison_samples - duration_subarray[i,age_j])^2 #compute total + errsum += (expdur - duration_subarray[i,age_j])^2 #compute total end end - # display("2") - return errsum end \ No newline at end of file diff --git a/IntervalsModel/src/interval_overlap_sampling.jl b/IntervalsModel/src/interval_overlap_sampling.jl index 8918004c31bec9cece1d5febe62aa49ac453e239..c64d83f47cd94223ba51fa1bad083b9cfaeb5ed0 100644 --- a/IntervalsModel/src/interval_overlap_sampling.jl +++ b/IntervalsModel/src/interval_overlap_sampling.jl @@ -1,4 +1,5 @@ # Set the underlying parameters for the intervals model +# This is the distribution of start_times const startdist = Normal(sparam...) @@ -10,7 +11,14 @@ function coverage!(cov,S_j,E_j) push!(cov,Interval(S_j,E_j)) end end -#compute the total duration of a sample of intervals + +""" + + tot_dur_sample(n::Integer,durlist::Vector{Integer}) + +Returns the mean of the distribution of total durations, given a set of contact durations `durlist`, and a number of samples `n`. +Note that this is different from `tot_dur_sample!` in that it returns only a mean and does not mutate it's arguments. This could be reducible to that one but not without losing some performance I think. Used in `err_hh` where we only need a mean. +""" function tot_dur_sample(n,durlist) if isempty(durlist) return 0 @@ -20,7 +28,6 @@ function tot_dur_sample(n,durlist) int_list = Vector{Interval{Int,Closed,Closed}}() sizehint!(int_list,numcontact*2) - start_matrix = trunc.(Int,rand(rng,startdist,(numcontact,n))) @inbounds for i in 1:n empty!(int_list) @@ -32,10 +39,18 @@ function tot_dur_sample(n,durlist) union!(int_list) total_dur += mapreduce(Intervals.span,+,int_list) end - return total_dur + return total_dur/n end +""" + + tot_dur_sample(sample_list::Vector{Integer},durlist::Vector{Integer}) + +Mutates `sample_list` to contain samples from the distribution of total durations, given a set of contact durations `durlist`. + +This is different from `tot_dur_sample` because it modifies it's arguments. Used in `err_ws` and `err_rest` where we need a distribution. +""" function tot_dur_sample!(sample_list,durlist) if isempty(durlist) sample_list .= 0.0 @@ -45,7 +60,6 @@ function tot_dur_sample!(sample_list,durlist) n = length(sample_list) int_list = Vector{Interval{Int,Closed,Closed}}() sizehint!(int_list,numcontact*2) - # @show rand(rng,startdist,(numcontact,n)) start_matrix = trunc.(Int,rand(rng,startdist,(numcontact,n))) for i in 1:n empty!(int_list) @@ -58,12 +72,3 @@ function tot_dur_sample!(sample_list,durlist) sample_list[i] = mapreduce(Intervals.span,+,int_list) end end - - -function as_symmetric_matrix(l) #turn a vector of length 6, l, into a symmetric 3x3 matrix, probably a nicer way to do this exists - return [ - l[1] l[2] l[3] - l[2] l[4] l[5] - l[3] l[5] l[6] - ] -end \ No newline at end of file diff --git a/IntervalsModel/src/rest_durations_model.jl b/IntervalsModel/src/rest_durations_model.jl index e40e217e62e6793b16d1f96b66db77f0acc815be..53678dfeca5afda3b305a757e2574d7f7b19ce7b 100644 --- a/IntervalsModel/src/rest_durations_model.jl +++ b/IntervalsModel/src/rest_durations_model.jl @@ -6,25 +6,42 @@ const kerneldensity_data_rest = ( M = kde(rest_data.M.durs,weights = rest_data.M.weights), O = kde(rest_data.O.durs,weights = rest_data.O.weights), ) +""" + err_rest(p::Vector{Number},dist::Distribution) -#error function for rest +Calculate error between observed contact durations in rest data and simulated contact durations based on `dist(p[i]...)`. +""" function err_rest(p,dist) + #we get the vector p as a 9 element vector from KissABC, but more convenient as a 3x3 matrix p_matrix = zeros(3,3) for i in 1:9 p_matrix[i] = p[i] end + #sample a matrix of neighourhood sizes for distributions in rest_distributions + #this is actually not correct? neighbourhood sizes should be symmetric! neighourhoods = rand.(rng,rest_distributions) + + #create a matrix of distributions of type dist, from the parameters in p_matrix age_dists = [dist(p_matrix[i,j]...) for i in YOUNG:OLD, j in YOUNG:OLD] + + #initialize the list of samples and the total error sample_list = zeros(comparison_samples) errsum = 0 for age_sample in YOUNG:OLD for age_j in YOUNG:OLD #for a given age_sample loop over possible contact ages if neighourhoods[age_sample,age_j] > 0 + #get durations from our candidate distribtions for each of the contacts in neighbourhood durs = trunc.(Int,rand(rng,age_dists[age_sample,age_j],neighourhoods[age_sample,age_j])) .% durmax + + #this MODIFIES sample_list to contain samples from the distribution of total_durations, given intervals of length dur. tot_dur_sample!(sample_list,durs) + + #compute a kernel density from the list of samples kde_est = kde(sample_list) + + #compute L1 distance between observed distribution of total durations from survey, and observed distribution from simulation errsum += mapreduce(+,0:1:durmax) do i return abs(pdf(kde_est,i) - pdf(kerneldensity_data_ws[age_sample],i)) end diff --git a/IntervalsModel/src/utils.jl b/IntervalsModel/src/utils.jl index 5215ce85bfafa4ad5e5d8b1fa83a7fb72bd1acff..a02d85dad890a3a0ebc7df138be723d79477c2c5 100644 --- a/IntervalsModel/src/utils.jl +++ b/IntervalsModel/src/utils.jl @@ -1,12 +1,32 @@ import Pandas: read_csv using DataFrames +""" + get_params(params::Vector{Number}) + +This function reorganizes a Vector of 6n parameters into a 3x3 symmetric matrix of n-tuples of parameters. +""" function get_params(params) p_list = [as_symmetric_matrix(params[i:i+5]) for i = 1:6:length(params)] return zip(p_list...) |> collect end + +""" +Turn a vector of length 6, l, into a symmetric 3x3 matrix, probably a nicer way to do this exists but meh. +""" +function as_symmetric_matrix(l) + return [ + l[1] l[2] l[3] + l[2] l[4] l[5] + l[3] l[5] l[6] + ] +end + +""" +Load rest data from `network-data/Timeuse/Rest/RData`. +""" function get_rest_data() path = "$PACKAGE_FOLDER/network-data/Timeuse/Rest/RData" data_table_by_age = map(collect(keys(swap_dict))) do age @@ -18,6 +38,9 @@ function get_rest_data() return (;data_table_by_age...) end +""" +Load WS data from `network-data/Timeuse/WS/WorkschoolData`. +""" function get_ws_data() path = "$PACKAGE_FOLDER/network-data/Timeuse/WS/WorkschoolData" data_table_by_age = map(collect(keys(swap_dict))) do age @@ -29,6 +52,9 @@ function get_ws_data() return (;data_table_by_age...) end +""" +Load priors data from `IntervalsModel/network-data/POLYMOD/AALPoisPriors.csv` and format into a DataFrame with some additional columns for the index of the age class (per `swap_dict`). +""" function get_priors() df = DataFrame(read_csv("IntervalsModel/network-data/POLYMOD/AALPoisPriors.csv")) df_w_indicies = transform( @@ -41,12 +67,21 @@ function get_priors() return df_w_indicies end -function filter_priors(key) + +""" + + filter_priors(location_key::String) + +Filter priors dataframe based on location_key, which should be one of "rest", "workschool", or "home". Returns an array of Categorical distributions. +""" +function filter_priors(location_key) priors = get_priors() - if !(key in priors[!,:location]) + + if !(location_key in priors[!,:location]) throw(ArgumentError("location key not in priors file!")) end - priors_probs = filter(row-> row["location"] == key, eachrow(priors)) |> + + priors_probs = filter(row-> row["location"] == location_key, eachrow(priors)) |> df -> map(r -> (r[:index_out],r[:index_in]) => convert(Vector,r[string.(0:143)]),df) |> Dict |> df -> Dict(map(v -> v => Categorical(df[v]), collect(keys(df)))) diff --git a/IntervalsModel/src/ws_durations_model.jl b/IntervalsModel/src/ws_durations_model.jl index cafdb6cb82ed50d1ff60fa011df3e9a4236d7d70..91aadca465abdd4cec66f14883598903cf57c5ff 100644 --- a/IntervalsModel/src/ws_durations_model.jl +++ b/IntervalsModel/src/ws_durations_model.jl @@ -1,29 +1,49 @@ -const ws_distributions = CovidAlertVaccinationModel.initial_workschool_mixing_matrix const ws_data = get_ws_data() + +""" +Kernel density functions based on the durs in ws_data and their corresponding weights. This defines an interpolated density function that we can compare to the density obtained from the simulation. +""" const kerneldensity_data_ws = ( Y = kde(ws_data.Y.durs;weights = ws_data.Y.weights), M = kde(ws_data.M.durs;weights = ws_data.M.weights), O = kde(ws_data.O.durs;weights = ws_data.O.weights), ) +""" + err_ws(p::Vector{Number},dist::Distribution) -#error function for ws +Calculate error between observed contact durations in workschool data and simulated contact durations based on `dist(p[i]...)`. For these workschool simulations, we use a different sampling procedure for neighbourhood sizes based on `ws_sample` in CovidAlertVaccinationModel. +""" function err_ws(p,dist) - p_matrix = zeros(3,3) + #we get the vector p as a 9 element vector from KissABC, but more convenient as a 3x3 matrix + p_matrix = zeros(3,3) for i in 1:9 - p_matrix[i] = p[i] + p_matrix[i] = p[i] #this will fill p_matrix with the elements of p in column-major order end - age_dists = [dist(p_matrix[i,j]...) for i in YOUNG:OLD, j in YOUNG:OLD] + + #create a matrix of distributions of type dist, from the parameters in p_matrix + age_dists = [dist(p_matrix[i,j]...) for i in YOUNG:OLD, j in YOUNG:OLD] + + #initialize the list of samples and the total error sample_list = zeros(comparison_samples) errsum = 0 + for age_sample in YOUNG:OLD + #Sample neighbourhood sizes. Note that this includes the weekend/weekday coinflip, implemented in CovidAlertVaccinationModel. neighourhoods = CovidAlertVaccinationModel.ws_sample(age_sample) - for age_j in YOUNG:OLD #for a given age_sample loop over possible contact ages + + for age_j in YOUNG:OLD #for a given age_sample loop over possible contact ages if neighourhoods[age_j] > 0 + #get durations from our candidate distribtions for each of the contacts in neighbourhood durs = trunc.(Int,rand(rng,age_dists[age_sample,age_j],neighourhoods[age_j])) .% durmax + + #this MODIFIES sample_list to contain samples from the distribution of total_durations, given intervals of length dur. tot_dur_sample!(sample_list,durs) + + #compute a kernel density from the list of samples kde_est = kde(sample_list) + #compute L1 distance between observed distribution of total durations from survey, and observed distribution from simulation errsum += mapreduce(+,0:1:durmax) do i return abs(pdf(kde_est,i) - pdf(kerneldensity_data_ws[age_sample],i)) end