data.jl 8.07 KB
Newer Older
1 2 3 4 5 6 7 8 9 10
#Data parsing and import functions
#These functions specify output types 
#so the compiler is able to do type inference throughout the rest of the code
#I don't think it is necessary in all cases, but it's good when importing data from files to specify because type instability makes things rly slow

function get_canada_demographic_distribution()::Vector{Float64}
    f = readdlm(joinpath(PACKAGE_FOLDER,"data/csv/demographic_data.csv"), ',')
    df = DataFrame([f[1,i] => f[2:end, i] for i = 1:length(f[1,:])])
    binned_data = df[:,:demographic_data]
    source_bins = map(parse_string_as_float_pair, df[:,:demographic_data_bins])
Peter Jentsch's avatar
Peter Jentsch committed
11
    return [sum(binned_data[1:5]),sum(binned_data[6:13]),sum(binned_data[14:end])]
12 13 14 15 16 17 18 19
end

function get_canada_case_fatality()::Tuple{Vector{Tuple{Float64,Float64}},Vector{Float64}}
    f = readdlm(joinpath(PACKAGE_FOLDER,"data/csv/case_fatality_data.csv"), ',')
    df = DataFrame([f[1,i] => f[2:end, i] for i = 1:length(f[1,:])])
    return  map(parse_string_as_float_pair, df[:,:case_fatality_bins]), df[:,:case_fatality_data]
    # https://www.publichealthontario.ca/-/media/documents/ncov/epi/covid-19-severe-outcomes-ontario-epi-summary.pdf?la=en
end
Peter Jentsch's avatar
Peter Jentsch committed
20
function find_household_composition(df_row)
21
    # display(typeof(df_row))
Peter Jentsch's avatar
Peter Jentsch committed
22
    age_resp_to_bin = Dict(
23
        "Y" => 1,
Peter Jentsch's avatar
Peter Jentsch committed
24 25 26 27 28 29
        "M" => 2,
        "O" => 3,
    )
    u25_bins = [:U15CHILD,:O15CHILD,:YSPOUSE]
    m_bins = [:MPAR, :MCHILD,:MHHADULT]
    o_bins = [:OPAR, :OSPOUSE,:OHHADULT]
30 31 32 33 34
    age_distribution =[
        sum(Int(df_row[field]) for field in u25_bins),
        sum(Int(df_row[field])  for field in m_bins),
        sum(Int(df_row[field]) for field in o_bins),
    ]   
Peter Jentsch's avatar
Peter Jentsch committed
35
    age_distribution[age_resp_to_bin[df_row[:AGERESP]]] += 1
36
    return SVector{3}(age_distribution)
Peter Jentsch's avatar
Peter Jentsch committed
37
end
38
function read_household_data()
Peter Jentsch's avatar
Peter Jentsch committed
39 40 41 42
    f = readdlm(joinpath(PACKAGE_FOLDER,"data/csv/home_compositions.csv"), ',')
    df = DataFrame([f[1,i] => f[2:end, i] for i = 1:length(f[1,:])])
    weight_vector::Vector{Float64} = df[!,:WGHT_PER]/sum(df[!,:WGHT_PER])
    households = map(find_household_composition,eachrow(df))
43 44 45 46
    return (;households,weight_vector)
end

function sample_household_data(n)
47
    return sample(Random.default_rng(Threads.threadid()),household_data.households,Weights(household_data.weight_vector), n)
Peter Jentsch's avatar
Peter Jentsch committed
48
end
49

50
function get_household_data_proportions()
51
    households_by_demographic_sum = sum.([map(((l,w),)-> l[i]*w,zip(household_data.households,household_data.weight_vector)) for i in 1:3])
52 53 54 55
    return households_by_demographic_sum./sum(households_by_demographic_sum)
    # https://www.publichealthontario.ca/-/media/documents/ncov/epi/covid-19-severe-outcomes-ontario-epi-summary.pdf?la=en
end

56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
function make_workschool_mixing_matrix()# this is actually used in IntervalsModel only now

    #all geometric with means given
    ws_mixing = map(t->from_mean(t...),[
        (Geometric{Float64}, 4.104848) (Geometric{Float64},2.568782) (Geometric{Float64},0.017729)
        (Geometric{Float64}, 0.975688) (Geometric{Float64},5.057572) (Geometric{Float64},0.021307)
        (Geometric{Float64},0.001937)   (Geometric{Float64},0.00722)   (Geometric{Float64}, 0.022134) 
    ])

    #symmetrize WS mixing with respect to the population proportions in the data 
    ws_mixing_w_unemployment_symmetrized = symmetrize_means(get_household_data_proportions(),ws_mixing)

    #define a function that adjusts the means of W according to the unemployment_matrix
    ws_adjust_mean(W) = (5/7) .* (1 .- unemployment_matrix) ./ ( mean.(W) + (5/7) .*  (1 .- unemployment_matrix))

    #create a zero weighted distribution where the zero-weight is given by the unemployment_matrix, and the non-zero weight is given by ws_mixing, symmetrized, with means adjust upwards
    ws_mixing_w_unemployment_symmetrized_weekday_adjusted = ZWDist.(unemployment_matrix,Geometric.(ws_adjust_mean(ws_mixing_w_unemployment_symmetrized)))
    
    return ws_mixing_w_unemployment_symmetrized_weekday_adjusted
end
Peter Jentsch's avatar
Peter Jentsch committed
76 77


78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
function load_mixing_matrices()
    df = CSV.File(joinpath(PACKAGE_FOLDER,"data/csv/mixing_loc-freq.csv")) |> DataFrame
    workschool_mixing = (
        daily = zeros(3,3),
        twice_a_week = zeros(3,3),
        otherwise = zeros(3,3),
    )
    rest_mixing = (
        daily = zeros(3,3),
        twice_a_week = zeros(3,3),
        otherwise = zeros(3,3),
    )
    mixing = [workschool_mixing,rest_mixing]
    locations_labels = ["workschool", "rest"]
    frequency_labels = ["daily","3xweekly","justonce"]

    for r in eachrow(df)
        location_ind = findfirst(==(r["location"]),locations_labels)
        frequency_ind = findfirst(==(r["frequency"]),frequency_labels)
        for i in 0:8
            mixing[location_ind][frequency_ind]'[i+1] = r[string(i)]  
        end
    end
    return map(t -> from_mean.(Geometric{Float64},t),workschool_mixing), map(t -> from_mean.(Geometric{Float64}, t),rest_mixing)
102 103
end

104 105 106
# function make_sampler(λ)
#     return Distributions.PoissonADSampler(λ)#Distributions.DiscreteNonParametricSampler(0:durmax,[pdf(Poisson(λ),x) for x in 0:durmax])
# end
107
function load_contact_time_distributions()
108 109 110 111 112 113 114 115
    distkey = "Distributions.Poisson"
    fnames = (
        hh = "hh",
        ws = "ws",
        rest = "rest"
    )
    contact_distributions_tuple = map(fnames) do fname
        dat = deserialize(joinpath(PACKAGE_FOLDER,"intervals_model_output","simulation_output","$fname.dat"))
116
        return map(p -> Poisson(mode(p.particles)), as_symmetric_matrix(dat[distkey].P))
117 118
    end
    return contact_distributions_tuple
119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151
end
"""
Load rest data from `data/canada-network-data/Timeuse/Rest/RData`.
"""
function get_rest_data()
    path = "$PACKAGE_FOLDER/data/canada-network-data/Timeuse/Rest/RData"
    data_table_by_age = map(collect(keys(swap_dict))) do age
        data_table = CSV.File(path*"$age.csv") |> Tables.matrix
        weights = Weights(data_table[:,2])
        durs = data_table[:,3]
        Symbol(age) => (;durs, weights) 
    end
    return (;data_table_by_age...)
end

"""
Load WS data from `data/canada-network-data/Timeuse/WS/WorkschoolData`.
"""
function get_ws_data()
    path = "$PACKAGE_FOLDER/data/canada-network-data/Timeuse/WS/WorkschoolData"
    data_table_by_age = map(collect(keys(swap_dict))) do age
        data_table = CSV.File(path*"$age.csv") |> Tables.matrix
        weights = Weights(data_table[:,2])
        durs = data_table[:,3]
        Symbol(age) => (;durs, weights) 
    end
    return (;data_table_by_age...)
end

"""
Load priors data from `data/canada-network-data/POLYMOD/AALPoisPriors.csv` and format into a DataFrame with some additional columns for the index of the age class (per `swap_dict`).
"""
function get_priors()
Peter Jentsch's avatar
Peter Jentsch committed
152 153
    df = DataFrame(CSV.File("$PACKAGE_FOLDER/data/canada-network-data/POLYMOD/AALPoisPriors.csv"))
    display(df)
154 155 156 157 158 159 160
    df_w_indicies = transform(
        df,
        :Age_in => e -> map(x-> swap_dict[x],e),
        :Age_out => e -> map(x -> swap_dict[x],e),
        :Age_in,:Age_out
    )
    rename!(df_w_indicies, Dict(:Age_out_function => "index_out",:Age_in_function => "index_in"))
Peter Jentsch's avatar
Peter Jentsch committed
161
    display(df_w_indicies)
162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
    return df_w_indicies
end


"""

    filter_priors(location_key::String)

Filter priors dataframe based on location_key, which should be one of "rest", "workschool", or "home". Returns an array of Categorical distributions.
"""
function filter_priors(location_key)
    priors = get_priors()

    if !(location_key in priors[!,:location])
        throw(ArgumentError("location key not in priors file!"))
    end

    priors_dict = filter(row-> row["location"] == location_key, eachrow(priors)) |>
Peter Jentsch's avatar
Peter Jentsch committed
180
                    df -> map(r -> (r[:index_out],r[:index_in]) =>Vector{Float64}(r[string.(distribution_support)]),df) |> #this is pretty inflexible to changing support unfortunately
181 182
                    Dict

Peter Jentsch's avatar
Peter Jentsch committed
183
    display(priors_dict)
184 185 186 187 188 189 190 191 192 193

    priors_probs = Dict(map(v -> v => DiscreteNonParametric(distribution_support,priors_dict[v]), collect(keys(priors_dict))))

    output_array = Array{eltype(values(priors_probs)),2}(undef,(3,3))
    for (key,value) in priors_probs
        output_array[key...] = value
        output_array[reverse(key)...] = value #corny way to get symmetry assuming key is a 2-tuple
    end
    return symmetric_matrix_as_vector(output_array)
end