diff --git a/IntervalsModel/network-data/HH/.gitkeep b/IntervalsModel/network-data/HH/.gitkeep
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/IntervalsModel/network-data/POLYMOD/AALfixedfrac.csv b/IntervalsModel/network-data/POLYMOD/AALfixedfrac.csv
deleted file mode 100644
index 3866955f8f1a67f92daa63967ecc05fff7ba9089..0000000000000000000000000000000000000000
--- a/IntervalsModel/network-data/POLYMOD/AALfixedfrac.csv
+++ /dev/null
@@ -1,13 +0,0 @@
-,Age_in,Age_out,location,0
-0,Y,Y,workschool,0.7689040478795289
-1,Y,Y,rest,0.3694669429271049
-2,Y,M,workschool,0.4889133896851648
-3,Y,M,rest,0.24192775701830027
-4,Y,O,workschool,0.5056179775280899
-5,Y,O,rest,0.2018181818181818
-6,M,M,workschool,0.4868351565447001
-7,M,M,rest,0.155895865237366
-8,M,O,workschool,0.3559322033898305
-9,M,O,rest,0.16535671100362756
-10,O,O,workschool,0.19642857142857142
-11,O,O,rest,0.15362731152204837
diff --git a/IntervalsModel/network-data/POLYMOD/Contact-data.py b/IntervalsModel/network-data/POLYMOD/Contact-data.py
new file mode 100644
index 0000000000000000000000000000000000000000..51dd323e21020a9f4e95b0b62262e8ba40109496
--- /dev/null
+++ b/IntervalsModel/network-data/POLYMOD/Contact-data.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on 2021-03-23
+
+Here we tidy and organize the POLYMOD data and save it to a CSV
+
+@author: mark
+"""
+import numpy as np
+import pandas as pd
+import math
+from itertools import product
+
+# To start, let's load the two files: the contact and participant data
+Contact = pd.read_csv("2008_Mossong_POLYMOD_contact_common.csv")
+Part = pd.read_csv("2008_Mossong_POLYMOD_participant_common.csv")
+
+# Delete contacts missing age, duration and frequency data
+Contact = Contact.dropna(subset=['cnt_age_exact', 'duration_multi', 'frequency_multi'])
+
+# Delete contacts missing location data
+def Loctest(row):
+    emptycheck = row['cnt_home'] or row['cnt_work'] or row['cnt_school'] or row['cnt_transport'] or row['cnt_leisure'] or row['cnt_otherplace']
+    if not emptycheck:
+        return 1
+    else:
+        return 0
+Contact['Loctest'] = Contact.apply(Loctest, axis=1)
+Contact = Contact[Contact['Loctest']==0].copy(deep=True)
+
+
+# Next, we aggregate the locations
+# Define the function computing the WorkSchool value from the remaining entries
+def WorkSchool(row):
+    return row['cnt_work'] or row['cnt_school']
+# Create the new column
+Contact['cnt_workschool'] = Contact.apply(WorkSchool, axis=1)
+
+# Define the function computing the Rest value from the remaining entries
+def Rest(row):
+    return row['cnt_transport'] or row['cnt_leisure'] or row['cnt_otherplace']
+# And then create the new column
+Contact['cnt_rest'] = Contact.apply(Rest, axis=1)
+
+# We also add a column with the source age bracket. First step is to define the age-bracket value
+def Agesource(row):
+    ID = row['part_id']
+    year = Part.loc[Part['part_id'] == ID, 'part_age'].iloc[0]
+    if year < 25:
+        return 'Y'
+    elif year < 65:
+        return 'M'
+    else:
+        return 'O'
+# Then create the new source column
+Contact['age_source'] = Contact.apply(Agesource, axis=1)
+
+# Analogous function for the target age bracket
+def Agetarget(row):
+    # The contact's exact age is already stored on this row
+    year = row['cnt_age_exact']
+    if year < 25:
+        return 'Y'
+    elif year < 65:
+        return 'M'
+    else:
+        return 'O'
+# Add the new target column
+Contact['age_target'] = Contact.apply(Agetarget, axis=1)
+
+# Save a copy of the new contact dataframe
+Tosave = ['part_id', 'cont_id', 'age_source', 'age_target', 'cnt_home',
+          'cnt_workschool', 'cnt_rest', 'duration_multi', 'frequency_multi']
+Contact[Tosave].to_csv("AALContact_data.csv", index=False)
\ No newline at end of file
diff --git a/IntervalsModel/network-data/POLYMOD/Contact-distributions.ipynb b/IntervalsModel/network-data/POLYMOD/Contact-distributions.ipynb
deleted file mode 100644
index ab722b7916fb638f451c2589587527d58ae632ce..0000000000000000000000000000000000000000
--- a/IntervalsModel/network-data/POLYMOD/Contact-distributions.ipynb
+++ /dev/null
@@ -1,548 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%matplotlib notebook\n",
-    "import numpy as np\n",
-    "import matplotlib.pyplot as plt\n",
-    "import pandas as pd\n",
-    "from scipy import optimize\n",
-    "from scipy import special\n",
-    "import math\n",
-    "import random\n",
-    "import csv\n",
-    "import itertools\n",
-    "from itertools import product\n",
-    "def Flatten(A):\n",
-    "    return list(itertools.chain.from_iterable(A))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Extract contact distributions by age-age and locale from POLYMOD data\n",
-    "\n",
-    "The aim here is to find well-fitting parametric forms for distributions of daily contact numbers reported in the POLYMOD data. Contacts are stratified by the age of both respondent and contact, as well as the location in which the contact took place. \n",
-    "\n",
-    "For age, we consider 3 brackets: Y (<25), M (25-64) and O (>65). \n",
-    "\n",
-    "For location, we use 3 settings: Home, Work/school, and rest. This is done to match the timeuse data. 
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Cleaning and reorganizing survey results" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/mark/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3071: DtypeWarning: Columns (6,7,8,9,10,11) have mixed types.Specify dtype option on import or set low_memory=False.\n", - " has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n" - ] - } - ], - "source": [ - "# To start, let's load the two files: The contact and participant data\n", - "Contact = pd.read_csv(\"2008_Mossong_POLYMOD_contact_common.csv\")\n", - "Part = pd.read_csv(\"2008_Mossong_POLYMOD_participant_common.csv\")\n", - "\n", - "# Delete contacts missing age data\n", - "Contact = Contact.dropna(subset=['cnt_age_exact'])\n", - "\n", - "# Delete contacts missing location data\n", - "def Loctest(row):\n", - " emptycheck = row['cnt_home'] or row['cnt_work'] or row['cnt_school'] or row['cnt_transport'] or row['cnt_leisure'] or row['cnt_otherplace']\n", - " if emptycheck == False:\n", - " return 1\n", - " else:\n", - " return 0\n", - "Contact['Loctest'] = Contact.apply(Loctest, axis=1)\n", - "Contact = Contact[Contact['Loctest']==0]" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# Next, we aggregate the locations\n", - "# Define the function computing WorkSchool value from remaining entries\n", - "def WorkSchool(row):\n", - " return row['cnt_work'] or row['cnt_school']\n", - "# Create the new column\n", - "Contact['cnt_workschool'] = Contact.apply(WorkSchool, axis=1)\n", - "\n", - "# Define the function computed Rest value from remaining entries\n", - "def Rest(row):\n", - " return row['cnt_transport'] or row['cnt_leisure'] or row['cnt_otherplace']\n", - "# And then create the new column\n", - "Contact['cnt_rest'] = Contact.apply(Rest, axis=1)\n", - "\n", - "# We also add a column with the source age bracket. 
First step is to define age bracket value\n", - "def Agesource(row):\n", - " ID = row['part_id']\n", - " year = float(Part[Part['part_id'] ==ID]['part_age'])\n", - " if year < 25:\n", - " return 'Y'\n", - " elif year < 65:\n", - " return 'M'\n", - " else:\n", - " return 'O'\n", - "# Then create the new source column\n", - "Contact['age_source'] = Contact.apply(Agesource, axis=1)\n", - "\n", - "# Columns function for target age bracket\n", - "def Agetarget(row):\n", - " ID = row['cont_id']\n", - " year = float(Contact[Contact['cont_id'] ==ID]['cnt_age_exact'])\n", - " if year < 25:\n", - " return 'Y'\n", - " elif year < 65:\n", - " return 'M'\n", - " else:\n", - " return 'O'\n", - "# Add the new target column\n", - "Contact['age_target'] = Contact.apply(Agetarget, axis=1)\n", - "\n", - "# Before going further let's save a copy of the new contact dataframe\n", - "Tosave = ['part_id', 'cont_id', 'age_source', 'age_target', 'cnt_home', 'cnt_workschool', 'cnt_rest']\n", - "Contact[Tosave].to_csv(\"AALContact_data.csv\", index=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Age-age-location datasets" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# Read the reorganized survey to a dataframe\n", - "Contact = pd.read_csv(\"AALContact_data.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "<ipython-input-5-3980fc6f4e5f>:8: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n", - " ContAAL[(x,y,z)] = Contact[Contact['cnt_'+z]==True][Contact['age_source']== x][Contact['age_target']==y]\n" - ] - } - ], - "source": [ - "# Specify AAL categories\n", - "Ages = ['Y', 'M', 'O']\n", - "Locales = ['home', 'workschool', 'rest']\n", - "\n", - "# Dictionary separating contact data by AAL\n", - "ContAAL = {}\n", - "for x, y, z in list(product(Ages,Ages,Locales)):\n", - " ContAAL[(x,y,z)] = Contact[Contact['cnt_'+z]==True][Contact['age_source']== x][Contact['age_target']==y] \n", - "\n", - "#Participant IDs stratified by age of respondent\n", - "PartsAge = {}\n", - "for x in Ages:\n", - " PartsAge[x] = list(set(Contact[Contact['age_source']== x]['part_id']))\n", - "\n", - "# Number of contacts stratified by AAL\n", - "DataAAL = {}\n", - "for x, y, z in list(product(Ages,Ages, Locales)):\n", - " C = ContAAL[(x,y,z)]\n", - " DataAAL[(x,y,z)] = np.array([C[C['part_id']==ID]['cont_id'].count() for ID in PartsAge[x]])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data-fitting: AAL Workschool\n", - "\n", - "By playing around with the data it appears that the most natural approach is to fit them to random variables of the form X = alpha D0 + (1-alpha)Geo(p). \n", - "\n", - "There are a few options for how to handle rescaling the mean. 
One option: To rescale the mean by M, do p -> p/Sqrt(M) and alpha -> 1 - Sqrt(M)(1-alpha)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# Full Dataset, frequency curves and averages\n", - "WSDataAAL = {}\n", - "WSFreqAAL = {}\n", - "WSAvgAAL = {}\n", - "for x, y in list(product(Ages,Ages)):\n", - " C = ContAAL[(x,y,'workschool')]\n", - " D = pd.DataFrame([[ID, C[C['part_id']==ID]['cont_id'].count()] for ID in PartsAge[x]] ,columns = [\"ID\", \"deg\"])\n", - " maxdeg = max(list(D[\"deg\"]))\n", - " Deglist = [D[D['deg']==deg][\"deg\"].count() for deg in range(maxdeg)]\n", - " Datalist = list(sorted(np.array(D[\"deg\"])))\n", - " WSFreqAAL[(x,y)] = Deglist/np.sum(Deglist)\n", - " WSDataAAL[(x,y)] = np.array(Datalist)\n", - " WSAvgAAL[(x,y)] = np.average(WSDataAAL[(x,y)])" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# Define functions for the curve fitting\n", - "def WgtGeoDens(k, alpha, p):\n", - " if k == 0:\n", - " return alpha + (1-alpha)*p\n", - " else:\n", - " return (1-alpha)*p*(1-p)**k\n", - "WgtGeoDens = np.vectorize(WgtGeoDens)\n", - "\n", - "def WgtGeoError(FREQ,alpha,p):\n", - " kmax = len(FREQ)\n", - " klist = np.arange(0,kmax)\n", - " Geolist = WgtGeoDens(klist, alpha, p)\n", - " return np.sum((Geolist - FREQ)**2)\n", - "WgtGeoError = np.vectorize(WgtGeoError, excluded=[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# Extract the best fit values for all 9 distributions\n", - "alphabound = (0.01,0.99)\n", - "pbound = (0.01, .99)\n", - "WSFitAAL = {}\n", - "WSErrorAAL = {}\n", - "for x, y in list(product(Ages,Ages)):\n", - " def errfun(param):\n", - " alpha = param[0]\n", - " p = param[1]\n", - " return WgtGeoError(WSFreqAAL[(x,y)], alpha, p)\n", - " Optimize = optimize.differential_evolution(errfun, [alphabound, pbound])\n", - " WSFitAAL[(x,y)] = list(Optimize.get('x'))\n", - " WSErrorAAL[(x,y)] = float(Optimize.get('fun'))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "alphas = np.full([3,3], 0.)\n", - "swap = ['Y', 'M', 'O']\n", - "for i, j in list(product(range(3), range(3))):\n", - " alphas[i][j] = WSFitAAL[(swap[i], swap[j])][0]\n", - "\n", - "pd.DataFrame(alphas).to_csv(\"wsAlphas.csv\", index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[0.43383514 0.40632649 0.88801511]\n", - " [0.60096567 0.43059992 0.8451303 ]\n", - " [0.88739216 0.79300391 0.94047252]]\n" - ] - } - ], - "source": [ - "print(alphas)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data-fitting: AAL Rest\n", - "\n", - "End results: Pois for YO, MY, MO, OY. Geometric for rest.\n", - "\n", - "Can choose between using MLE (avg or inverse avg) or best fit params." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Full Dataset, frequency curves and averages\n", - "RDataAAL = {}\n", - "RFreqAAL = {}\n", - "RAvgAAL = {}\n", - "for x, y in list(product(Ages,Ages)):\n", - " C = ContAAL[(x,y,'rest')]\n", - " D = pd.DataFrame([[ID, C[C['part_id']==ID]['cont_id'].count()] for ID in PartsAge[x]] ,columns = [\"ID\", \"deg\"])\n", - " maxdeg = max(list(D[\"deg\"]))\n", - " Deglist = [D[D['deg']==deg][\"deg\"].count() for deg in range(maxdeg)]\n", - " Datalist = list(sorted(np.array(D[\"deg\"])))\n", - " RFreqAAL[(x,y)] = Deglist/np.sum(Deglist)\n", - " RDataAAL[(x,y)] = np.array(Datalist)\n", - " RAvgAAL[(x,y)] = np.average(RDataAAL[(x,y)])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Pois and geom functions for fitting\n", - "def PoisDens(k, lam):\n", - " return math.exp(-lam)*lam**(k)/math.factorial(k)\n", - "PoisDens = np.vectorize(PoisDens)\n", - "def GeoDens(k,p):\n", - " return p*(1-p)**k\n", - "GeoDens = np.vectorize(GeoDens)\n", - "\n", - "def PoisError(FREQ, lam):\n", - " kmax = len(FREQ)\n", - " klist = np.arange(0,kmax)\n", - " Poislist = PoisDens(klist, lam)\n", - " return np.sum((Poislist - FREQ)**2)\n", - "\n", - "def GeoError(FREQ, p):\n", - " kmax = len(FREQ)\n", - " klist = np.arange(0,kmax)\n", - " Geolist = GeoDens(klist, p)\n", - " return np.sum((Geolist - FREQ)**2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Extract the best fit values for all 9 distributions\n", - "lambound = (0.0001, 20)\n", - "pbound = (0.01, .99)\n", - "RFitAAL = {}\n", - "RErrorAAL = {}\n", - "for x, y in list(product(Ages,Ages)):\n", - " if RAvgAAL[(x,y)] < 1: # Use Poisson\n", - " tag = 'pois'\n", - " def errfun(param):\n", - " lam = param\n", - " return PoisError(RFreqAAL[(x,y)], lam)\n", - " Optimize = optimize.differential_evolution(errfun, [lambound])\n", - " else: # Use geometric\n", - " tag = 'geo'\n", - " def errfun(param):\n", - " p = param\n", - " return GeoError(RFreqAAL[(x,y)], p)\n", - " Optimize = optimize.differential_evolution(errfun, [pbound])\n", - " RFitAAL[(x,y)] = [tag,float(Optimize.get('x'))]\n", - " RErrorAAL[(x,y)] = float(Optimize.get('fun'))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(RErrorAAL)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data-fitting: AAL Home\n", - "\n", - "Poison works quite well, though YM and OM don't look perfect. 
You can decide between using the best-fit lambdas OR the empirical average (MLE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Full Dataset, frequency curves and averages\n", - "HDataAAL = {}\n", - "HFreqAAL = {}\n", - "HAvgAAL = {}\n", - "for x, y in list(product(Ages,Ages)):\n", - " C = ContAAL[(x,y,'home')]\n", - " D = pd.DataFrame([[ID, C[C['part_id']==ID]['cont_id'].count()] for ID in PartsAge[x]] ,columns = [\"ID\", \"deg\"])\n", - " maxdeg = max(list(D[\"deg\"]))\n", - " Deglist = [D[D['deg']==deg][\"deg\"].count() for deg in range(maxdeg)]\n", - " Datalist = list(sorted(np.array(D[\"deg\"])))\n", - " HFreqAAL[(x,y)] = Deglist/np.sum(Deglist)\n", - " HDataAAL[(x,y)] = np.array(Datalist)\n", - " HAvgAAL[(x,y)] = np.average(HDataAAL[(x,y)])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Extract the best fit values for all 9 distributions\n", - "lambound = (0.0001, 20)\n", - "HFitAAL = {}\n", - "HErrorAAL = {}\n", - "for x, y in list(product(Ages,Ages)):\n", - " def errfun(param):\n", - " lam = param\n", - " return PoisError(HFreqAAL[(x,y)], lam)\n", - " Optimize = optimize.differential_evolution(errfun, [lambound])\n", - " HFitAAL[(x,y)] = float(Optimize.get('x'))\n", - " HErrorAAL[(x,y)] = float(Optimize.get('fun'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Age-age contact data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Specify AA categories\n", - "Ages = ['Y', 'M', 'O']\n", - "\n", - "# Dictionary separating contact data by AA\n", - "ContAA = {}\n", - "for x, y in list(product(Ages,Ages)):\n", - " ContAA[(x,y)] = Contact[Contact['age_source']== x][Contact['age_target']==y] \n", - "\n", - "#Participant IDs stratified by age of respondent\n", - "PartsAge = {}\n", - "for x in Ages:\n", - " PartsAge[x] = list(set(Contact[Contact['age_source']== x]['part_id']))\n", - "\n", - "# Number of contacts, frequency curves and average stratified by AA\n", - "DataAA = {}\n", - "FreqAA = {}\n", - "AvgAA = {}\n", - "for x, y in list(product(Ages,Ages)):\n", - " C = ContAA[(x,y)]\n", - " D = pd.DataFrame([[ID, C[C['part_id']==ID]['cont_id'].count()] for ID in PartsAge[x]] ,columns = [\"ID\", \"deg\"])\n", - " maxdeg = max(list(D[\"deg\"]))\n", - " Deglist = [D[D['deg']==deg][\"deg\"].count() for deg in range(maxdeg)]\n", - " Datalist = list(sorted(np.array(D[\"deg\"])))\n", - " FreqAA[(x,y)] = Deglist/np.sum(Deglist)\n", - " DataAA[(x,y)] = np.array(Datalist)\n", - " AvgAA[(x,y)] = np.average(DataAA[(x,y)])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Poisson and geometric fit poorly. 
Going to try a negative binomial\n", - "def NegBiDens(k, r, p):\n", - " return special.binom(k+r-1,k)*(1-p)**r*p**k\n", - "NegBiDens = np.vectorize(NegBiDens)\n", - "\n", - "def NegBiError(FREQ, r, p):\n", - " kmax = len(FREQ)\n", - " klist = np.arange(0,kmax)\n", - " Bilist = NegBiDens(klist, r, p)\n", - " return np.sum((Bilist - FREQ)**2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "rbound = (0.00001, 20)\n", - "pbound = (0.01, 0.99)\n", - "Label = ('O', 'M')\n", - "def errfun(param):\n", - " r = param[0]\n", - " p = param[1]\n", - " return NegBiError(FreqAA[Label], r, p)\n", - "Optimize = optimize.differential_evolution(errfun, [rbound, pbound])\n", - "print(Optimize.get('fun'))\n", - "print(Optimize.get('x'))\n", - "\n", - "maxdeg = max(DataAA[Label])\n", - "deglist = np.arange(0,maxdeg)\n", - "plt.plot(deglist, FreqAA[Label], label=\"Data\")\n", - "plt.plot(deglist, PoisDens(deglist, AvgAA[Label]), label =\"Pois\")\n", - "plt.plot(deglist, GeoDens(deglist, 1/AvgAA[Label]), label =\"Geo\")\n", - "plt.plot(deglist, NegBiDens(deglist, Optimize.get('x')[0], Optimize.get('x')[1]), label = \"neg binom\")\n", - "plt.legend()\n", - "plt.show();" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/IntervalsModel/network-data/POLYMOD/Duration-priors.py b/IntervalsModel/network-data/POLYMOD/Duration-priors.py index be4f2b14463cf3ea7602981064f79762411c2862..caac4383a793dd336e91b5010e8c36daa0875111 100644 --- a/IntervalsModel/network-data/POLYMOD/Duration-priors.py +++ b/IntervalsModel/network-data/POLYMOD/Duration-priors.py @@ -19,78 +19,38 @@ import pandas as pd import math from itertools import product -# To start, let's load the two files: The contact and participant data -Contact = pd.read_csv("2008_Mossong_POLYMOD_contact_common.csv") -Part = pd.read_csv("2008_Mossong_POLYMOD_participant_common.csv") - -# Delete contacts missing age data -Contact = Contact.dropna(subset=['cnt_age_exact']) - -# Delete contacts missing location data -def Loctest(row): - emptycheck = row['cnt_home'] or row['cnt_work'] or row['cnt_school'] or row['cnt_transport'] or row['cnt_leisure'] or row['cnt_otherplace'] - if emptycheck == False: - return 1 - else: - return 0 -Contact['Loctest'] = Contact.apply(Loctest, axis=1) -Contact = Contact[Contact['Loctest']==0] - -# Next, we aggregate the locations -# Define the function computing WorkSchool value from remaining entries -def WorkSchool(row): - return row['cnt_work'] or row['cnt_school'] -# Create the new column -Contact['cnt_workschool'] = Contact.apply(WorkSchool, axis=1) - -# Define the function computed Rest value from remaining entries -def Rest(row): - return row['cnt_transport'] or row['cnt_leisure'] or row['cnt_otherplace'] -# And then create the new column -Contact['cnt_rest'] = Contact.apply(Rest, axis=1) - -# We also add a column with the source age bracket. 
First step is to define age bracket value
-def Agesource(row):
-    ID = row['part_id']
-    year = float(Part[Part['part_id'] ==ID]['part_age'])
-    if year < 25:
-        return 'Y'
-    elif year < 65:
-        return 'M'
-    else:
-        return 'O'
-# Then create the new source column
-Contact['age_source'] = Contact.apply(Agesource, axis=1)
-
-# Columns function for target age bracket
-def Agetarget(row):
-    ID = row['cont_id']
-    year = float(Contact[Contact['cont_id'] ==ID]['cnt_age_exact'])
-    if year < 25:
-        return 'Y'
-    elif year < 65:
-        return 'M'
-    else:
-        return 'O'
-# Add the new target column
-Contact['age_target'] = Contact.apply(Agetarget, axis=1)
+# Read the cleaned survey as a dataframe
+Contact = pd.read_csv("AALContact_data.csv")
 
 # Specify AAL categories
 Ages = ['Y', 'M', 'O']
+SymAge = [('Y','Y'), ('Y', 'M'), ('Y', 'O'), ('M', 'M'),('M', 'O'), ('O','O')]
 Locales = ['home', 'workschool', 'rest']
 
-# Dictionary separating contact data by AAL
+# Dictionary separating contact data by symage-location
 ContAAL = {}
-for x, y, z in list(product(Ages,Ages,Locales)):
-    ContAAL[(x,y,z)] = Contact[Contact['cnt_'+z]==True][Contact['age_source']== x][Contact['age_target']==y]
-
-
-
-# Duration distributions stratified by AAL
+columns = ['part_id', 'cont_id', 'duration_multi', 'frequency_multi']
+for symage, loc in list(product(SymAge,Locales)):
+    x,y,z = symage[0], symage[1], loc
+    C = Contact[Contact['cnt_'+z] == True].copy(deep=True)
+    def symage_check(row):
+        agesrc = row['age_source']
+        agetar = row['age_target']
+        if (agesrc == x or agesrc == y) and (agetar==x or agetar==y):
+            return True
+        else:
+            return False
+    C["check"] = C.apply(symage_check, axis=1)
+    ContAAL[(x,y,z)] = C[C["check"]][columns].copy(deep=True)  # keep only rows matching the age pair
+
+
+
+# Duration distributions stratified by SymAge-location
 DurFreqAAL = {}
-for x, y, z in list(product(Ages,Ages, Locales)):
-    C = ContAAL[(x,y,z)]
+for symage, loc in list(product(SymAge,Locales)):
+    x,y,z = symage[0], symage[1], loc
+    C = ContAAL[(x,y,z)].copy(deep=True)
     #D = pd.DataFrame([[ID, C[C['part_id']==ID]['duration_multi']] for ID in PartsAge[x]], columns = ["ID", "Dur"])
     Durlist = [C[C['duration_multi']==i]["duration_multi"].count() for i in range(1,6)]
     DurFreqAAL[(x,y,z)] = Durlist/np.sum(Durlist)
@@ -110,12 +70,11 @@ def PoisBin(lam,loc):
 def PoisErr(lam, label):
     # label is SymAge-Location
     agesrc, agetar, loc = label
     err = np.sum((PoisBin(lam, loc) - DurFreqAAL[label])**2)
-    errop = np.sum((PoisBin(lam, loc) - DurFreqAAL[(agetar, agesrc, loc)])**2)
-    return err+errop
+    return err
 PoisErr = np.vectorize(PoisErr, excluded=[1])
 
 PoisPrior={}
-SymAge = [('Y','Y'), ('Y', 'M'), ('Y', 'O'), ('M', 'M'),('M', 'O'), ('O','O')]
+
 for symage, loc in list(product(SymAge, Locales)):
     AAL = (symage[0],symage[1],loc)
     arr = [1/PoisErr(i,AAL) for i in range(1,durcutoff[loc])]
@@ -131,24 +90,24 @@ dfvals = pd.DataFrame([pd.Series(x) for x in df.col2])
 dfout = dfkeys.join(dfvals)
 dfout.to_csv("AALPoisPriors.csv")
 
-# Determine fixed frac according to symage-locales, excluding hh
-fixfracAAL={}
+# Determine frequency distributions according to symage-locales, excluding hh
+freqdistAAL={}
 locales_nohh = ['workschool', 'rest']
-SymAge = [('Y','Y'), ('Y', 'M'), ('Y', 'O'), ('M', 'M'),('M', 'O'), ('O','O')]
 for symage, loc in list(product(SymAge, locales_nohh)):
     x, y, z = symage[0],symage[1],loc
-    C = ContAAL[(x,y,z)]
-    Cop = ContAAL[(y,x,z)]
-    tot = len(C) + len(Cop)
-    fixed= C[C['frequency_multi']==1]
-    fixedop = Cop[Cop['frequency_multi']==1]
-    fixfracAAL[(x,y,z)] = (len(fixed)+ len(fixedop))/tot
+    C = ContAAL[(x,y,z)].copy(deep=True)
+    tot = len(C)
+    freqfull = np.full(5,0.)
+    for i in range(1,6):
+        count = len(C[C['frequency_multi']==i])
+        freqfull[i-1] = count/tot
+    freqdistAAL[(x,y,z)] = [freqfull[0], freqfull[1], np.sum(freqfull[2:])]
+
 
 # Save them to csv
-df = pd.DataFrame(list(fixfracAAL.items()), columns=['col1','col2'])
+df = pd.DataFrame(list(freqdistAAL.items()), columns=['col1','col2'])
 dfkeys = pd.DataFrame([pd.Series(x) for x in df.col1])
 dfkeys.columns = ["Age_in", "Age_out", "location"]
 dfvals = pd.DataFrame([pd.Series(x) for x in df.col2])
 dfout = dfkeys.join(dfvals)
-dfout.to_csv("AALfixedfrac.csv")
-
+dfout.to_csv("AALfreqdist.csv")
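
For reference, the deleted notebook above models work/school contact counts as a zero-inflated geometric, X = alpha*delta_0 + (1 - alpha)*Geom(p), and rescales its mean by a factor M via p -> p/sqrt(M), alpha -> 1 - sqrt(M)*(1 - alpha). The sketch below restates that density, its mean, and the least-squares fit; it assumes, as in the notebook's WgtGeoDens, that Geom(p) is supported on {0, 1, 2, ...}, and the function names here are illustrative rather than taken from the repository.

import numpy as np
from scipy import optimize

def wgt_geo_dens(k, alpha, p):
    """P(X = k) for X = alpha*delta_0 + (1 - alpha)*Geom(p)."""
    if k == 0:
        return alpha + (1 - alpha) * p
    return (1 - alpha) * p * (1 - p) ** k

def wgt_geo_mean(alpha, p):
    # Geom(p) on {0, 1, ...} has mean (1 - p)/p, so the mixture mean is:
    return (1 - alpha) * (1 - p) / p

def rescale_mean(alpha, p, M):
    # The notebook's rescaling rule. It multiplies the mean by
    # M*(1 - p/sqrt(M))/(1 - p), i.e. by approximately M when p is small,
    # and needs sqrt(M)*(1 - alpha) <= 1 for alpha to stay in [0, 1].
    return 1 - np.sqrt(M) * (1 - alpha), p / np.sqrt(M)

def fit_wgt_geo(freq):
    """Least-squares fit of (alpha, p) to an empirical frequency vector,
    mirroring the differential-evolution fit used in the notebook."""
    ks = np.arange(len(freq))
    def err(params):
        alpha, p = params
        dens = np.array([wgt_geo_dens(k, alpha, p) for k in ks])
        return np.sum((dens - freq) ** 2)
    result = optimize.differential_evolution(err, [(0.01, 0.99), (0.01, 0.99)])
    return result.x  # best-fit (alpha, p)

# Toy usage on a synthetic frequency curve:
alpha_hat, p_hat = fit_wgt_geo(np.array([0.55, 0.20, 0.12, 0.08, 0.05]))
alpha_2x, p_2x = rescale_mean(alpha_hat, p_hat, 2.0)

Note that the rescaling is exact only in the p -> 0 limit; for the moderate p values fitted here it changes the mean by slightly less than M.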