From 777fc36c4d6b02402efdbf0228db669703a2e6fb Mon Sep 17 00:00:00 2001
From: Mark Penney <mark.penney@uwaterloo.ca>
Date: Tue, 23 Mar 2021 17:11:32 -0400
Subject: [PATCH] Tidied up files, adding Contact-data.py as script which
 tidies the POLYMOD data.

Durations-priors updated a few ways.

First, corrected problems with symmetrizing the data. Changes priors

Second, added frequency distributions with 3 bins
---
 IntervalsModel/network-data/HH/.gitkeep       |   0
 .../network-data/POLYMOD/AALfixedfrac.csv     |  13 -
 .../network-data/POLYMOD/Contact-data.py      |  75 +++
 .../POLYMOD/Contact-distributions.ipynb       | 548 ------------------
 .../network-data/POLYMOD/Duration-priors.py   | 117 ++--
 5 files changed, 113 insertions(+), 640 deletions(-)
 delete mode 100644 IntervalsModel/network-data/HH/.gitkeep
 delete mode 100644 IntervalsModel/network-data/POLYMOD/AALfixedfrac.csv
 create mode 100644 IntervalsModel/network-data/POLYMOD/Contact-data.py
 delete mode 100644 IntervalsModel/network-data/POLYMOD/Contact-distributions.ipynb

diff --git a/IntervalsModel/network-data/HH/.gitkeep b/IntervalsModel/network-data/HH/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/IntervalsModel/network-data/POLYMOD/AALfixedfrac.csv b/IntervalsModel/network-data/POLYMOD/AALfixedfrac.csv
deleted file mode 100644
index 3866955..0000000
--- a/IntervalsModel/network-data/POLYMOD/AALfixedfrac.csv
+++ /dev/null
@@ -1,13 +0,0 @@
-,Age_in,Age_out,location,0
-0,Y,Y,workschool,0.7689040478795289
-1,Y,Y,rest,0.3694669429271049
-2,Y,M,workschool,0.4889133896851648
-3,Y,M,rest,0.24192775701830027
-4,Y,O,workschool,0.5056179775280899
-5,Y,O,rest,0.2018181818181818
-6,M,M,workschool,0.4868351565447001
-7,M,M,rest,0.155895865237366
-8,M,O,workschool,0.3559322033898305
-9,M,O,rest,0.16535671100362756
-10,O,O,workschool,0.19642857142857142
-11,O,O,rest,0.15362731152204837
diff --git a/IntervalsModel/network-data/POLYMOD/Contact-data.py b/IntervalsModel/network-data/POLYMOD/Contact-data.py
new file mode 100644
index 0000000..51dd323
--- /dev/null
+++ b/IntervalsModel/network-data/POLYMOD/Contact-data.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on 2021-03-23
+
+Here we tidy and organize the POLYMOD data and save it to a csv
+
+@author: mark
+"""
+import numpy as np
+import pandas as pd
+import math
+from itertools import product
+
+# To start, let's load the two files: The contact and participant data
+Contact = pd.read_csv("2008_Mossong_POLYMOD_contact_common.csv")
+Part = pd.read_csv("2008_Mossong_POLYMOD_participant_common.csv")
+
+# Delete contacts missing age, duration and frequency data
+Contact = Contact.dropna(subset=['cnt_age_exact', 'duration_multi', 'frequency_multi'])
+
+# Delete contacts missing location data
+def Loctest(row):
+    emptycheck = row['cnt_home'] or row['cnt_work'] or row['cnt_school'] or row['cnt_transport'] or row['cnt_leisure'] or row['cnt_otherplace']
+    if emptycheck == False:
+        return 1
+    else:
+        return 0
+Contact['Loctest'] = Contact.apply(Loctest, axis=1)
+Contact = Contact[Contact['Loctest']==0].copy(deep=True)
+
+
+# Next, we aggregate the locations
+# Define the function computing WorkSchool value from remaining entries
+def WorkSchool(row):
+    return row['cnt_work'] or row['cnt_school']
+# Create the new column
+Contact['cnt_workschool'] = Contact.apply(WorkSchool, axis=1)
+
+# Define the function computed Rest value from remaining entries
+def Rest(row):
+    return row['cnt_transport'] or row['cnt_leisure'] or row['cnt_otherplace']
+# And then create the new column
+Contact['cnt_rest'] = Contact.apply(Rest, axis=1)
+
+# We also add a column with the source age bracket. First step is to define age bracket value
+def Agesource(row):
+    ID = row['part_id']
+    year = float(Part[Part['part_id'] ==ID]['part_age'])
+    if year < 25:
+        return 'Y'
+    elif year < 65:
+        return 'M'
+    else:
+        return 'O'
+# Then create the new source column
+Contact['age_source'] = Contact.apply(Agesource, axis=1)
+
+# Columns function for target age bracket
+def Agetarget(row):
+    ID = row['cont_id']
+    year = float(Contact[Contact['cont_id'] ==ID]['cnt_age_exact'])
+    if year < 25:
+        return 'Y'
+    elif year < 65:
+        return 'M'
+    else:
+        return 'O'
+# Add the new target column
+Contact['age_target'] = Contact.apply(Agetarget, axis=1)
+
+#save a copy of the new contact dataframe
+Tosave = ['part_id', 'cont_id', 'age_source', 'age_target', 'cnt_home', 
+          'cnt_workschool', 'cnt_rest', 'duration_multi', 'frequency_multi']
+Contact[Tosave].to_csv("AALContact_data.csv", index=False)
\ No newline at end of file
diff --git a/IntervalsModel/network-data/POLYMOD/Contact-distributions.ipynb b/IntervalsModel/network-data/POLYMOD/Contact-distributions.ipynb
deleted file mode 100644
index ab722b7..0000000
--- a/IntervalsModel/network-data/POLYMOD/Contact-distributions.ipynb
+++ /dev/null
@@ -1,548 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%matplotlib notebook\n",
-    "import numpy as np\n",
-    "import matplotlib.pyplot as plt\n",
-    "import pandas as pd\n",
-    "from scipy import optimize\n",
-    "from scipy import special\n",
-    "import math\n",
-    "import random\n",
-    "import csv\n",
-    "import itertools\n",
-    "from itertools import product\n",
-    "def Flatten(A):\n",
-    "    return list(itertools.chain.from_iterable(A))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Extract contact distributions by age-age and locale from POLYMOD data\n",
-    "\n",
-    "The aim here is to find well-fitting parametric forms for distributions of daily contact numbers reported in the POLYMOD data. Contacts are stratified by the age of both respondent and contact, as well as the location in which the contact took place. \n",
-    "\n",
-    "For age, we consider 3 brackets: Y (<25), M (25-64) and O (>65). \n",
-    "\n",
-    "For location, we use 3 settings: Home, Work/school, and rest. This is done to match the timeuse data. "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Cleaning and reorganizing survey results"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/mark/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3071: DtypeWarning: Columns (6,7,8,9,10,11) have mixed types.Specify dtype option on import or set low_memory=False.\n",
-      "  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n"
-     ]
-    }
-   ],
-   "source": [
-    "# To start, let's load the two files: The contact and participant data\n",
-    "Contact = pd.read_csv(\"2008_Mossong_POLYMOD_contact_common.csv\")\n",
-    "Part = pd.read_csv(\"2008_Mossong_POLYMOD_participant_common.csv\")\n",
-    "\n",
-    "# Delete contacts missing age data\n",
-    "Contact = Contact.dropna(subset=['cnt_age_exact'])\n",
-    "\n",
-    "# Delete contacts missing location data\n",
-    "def Loctest(row):\n",
-    "    emptycheck = row['cnt_home'] or row['cnt_work'] or row['cnt_school'] or row['cnt_transport'] or row['cnt_leisure'] or row['cnt_otherplace']\n",
-    "    if emptycheck == False:\n",
-    "        return 1\n",
-    "    else:\n",
-    "        return 0\n",
-    "Contact['Loctest'] = Contact.apply(Loctest, axis=1)\n",
-    "Contact = Contact[Contact['Loctest']==0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Next, we aggregate the locations\n",
-    "# Define the function computing WorkSchool value from remaining entries\n",
-    "def WorkSchool(row):\n",
-    "    return row['cnt_work'] or row['cnt_school']\n",
-    "# Create the new column\n",
-    "Contact['cnt_workschool'] = Contact.apply(WorkSchool, axis=1)\n",
-    "\n",
-    "# Define the function computed Rest value from remaining entries\n",
-    "def Rest(row):\n",
-    "    return row['cnt_transport'] or row['cnt_leisure'] or row['cnt_otherplace']\n",
-    "# And then create the new column\n",
-    "Contact['cnt_rest'] = Contact.apply(Rest, axis=1)\n",
-    "\n",
-    "# We also add a column with the source age bracket. First step is to define age bracket value\n",
-    "def Agesource(row):\n",
-    "    ID = row['part_id']\n",
-    "    year = float(Part[Part['part_id'] ==ID]['part_age'])\n",
-    "    if year < 25:\n",
-    "        return 'Y'\n",
-    "    elif year < 65:\n",
-    "        return 'M'\n",
-    "    else:\n",
-    "        return 'O'\n",
-    "# Then create the new source column\n",
-    "Contact['age_source'] = Contact.apply(Agesource, axis=1)\n",
-    "\n",
-    "# Columns function for target age bracket\n",
-    "def Agetarget(row):\n",
-    "    ID = row['cont_id']\n",
-    "    year = float(Contact[Contact['cont_id'] ==ID]['cnt_age_exact'])\n",
-    "    if year < 25:\n",
-    "        return 'Y'\n",
-    "    elif year < 65:\n",
-    "        return 'M'\n",
-    "    else:\n",
-    "        return 'O'\n",
-    "# Add the new target column\n",
-    "Contact['age_target'] = Contact.apply(Agetarget, axis=1)\n",
-    "\n",
-    "# Before going further let's save a copy of the new contact dataframe\n",
-    "Tosave = ['part_id', 'cont_id', 'age_source', 'age_target', 'cnt_home', 'cnt_workschool', 'cnt_rest']\n",
-    "Contact[Tosave].to_csv(\"AALContact_data.csv\", index=False)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Age-age-location datasets"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Read the reorganized survey to a dataframe\n",
-    "Contact = pd.read_csv(\"AALContact_data.csv\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "<ipython-input-5-3980fc6f4e5f>:8: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
-      "  ContAAL[(x,y,z)] = Contact[Contact['cnt_'+z]==True][Contact['age_source']== x][Contact['age_target']==y]\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Specify AAL categories\n",
-    "Ages = ['Y', 'M', 'O']\n",
-    "Locales = ['home', 'workschool', 'rest']\n",
-    "\n",
-    "# Dictionary separating contact data by AAL\n",
-    "ContAAL = {}\n",
-    "for x, y, z in list(product(Ages,Ages,Locales)):\n",
-    "    ContAAL[(x,y,z)] = Contact[Contact['cnt_'+z]==True][Contact['age_source']== x][Contact['age_target']==y]  \n",
-    "\n",
-    "#Participant IDs stratified by age of respondent\n",
-    "PartsAge = {}\n",
-    "for x in Ages:\n",
-    "    PartsAge[x] =  list(set(Contact[Contact['age_source']== x]['part_id']))\n",
-    "\n",
-    "# Number of contacts stratified by AAL\n",
-    "DataAAL = {}\n",
-    "for x, y, z in list(product(Ages,Ages, Locales)):\n",
-    "    C = ContAAL[(x,y,z)]\n",
-    "    DataAAL[(x,y,z)] = np.array([C[C['part_id']==ID]['cont_id'].count() for ID in PartsAge[x]])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Data-fitting: AAL Workschool\n",
-    "\n",
-    "By playing around with the data it appears that the most natural approach is to fit them to random variables of the form X = alpha D0 + (1-alpha)Geo(p). \n",
-    "\n",
-    "There are a few options for how to handle rescaling the mean. One option: To rescale the mean by M, do p -> p/Sqrt(M) and alpha -> 1 - Sqrt(M)(1-alpha)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Full Dataset, frequency curves and averages\n",
-    "WSDataAAL = {}\n",
-    "WSFreqAAL = {}\n",
-    "WSAvgAAL = {}\n",
-    "for x, y in list(product(Ages,Ages)):\n",
-    "    C = ContAAL[(x,y,'workschool')]\n",
-    "    D = pd.DataFrame([[ID, C[C['part_id']==ID]['cont_id'].count()] for ID in PartsAge[x]] ,columns = [\"ID\", \"deg\"])\n",
-    "    maxdeg = max(list(D[\"deg\"]))\n",
-    "    Deglist = [D[D['deg']==deg][\"deg\"].count() for deg in range(maxdeg)]\n",
-    "    Datalist = list(sorted(np.array(D[\"deg\"])))\n",
-    "    WSFreqAAL[(x,y)] = Deglist/np.sum(Deglist)\n",
-    "    WSDataAAL[(x,y)] = np.array(Datalist)\n",
-    "    WSAvgAAL[(x,y)] = np.average(WSDataAAL[(x,y)])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Define functions for the curve fitting\n",
-    "def WgtGeoDens(k, alpha, p):\n",
-    "    if k == 0:\n",
-    "        return alpha + (1-alpha)*p\n",
-    "    else:\n",
-    "        return (1-alpha)*p*(1-p)**k\n",
-    "WgtGeoDens = np.vectorize(WgtGeoDens)\n",
-    "\n",
-    "def WgtGeoError(FREQ,alpha,p):\n",
-    "    kmax = len(FREQ)\n",
-    "    klist = np.arange(0,kmax)\n",
-    "    Geolist = WgtGeoDens(klist, alpha, p)\n",
-    "    return np.sum((Geolist - FREQ)**2)\n",
-    "WgtGeoError = np.vectorize(WgtGeoError, excluded=[0])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Extract the best fit values for all 9 distributions\n",
-    "alphabound = (0.01,0.99)\n",
-    "pbound = (0.01, .99)\n",
-    "WSFitAAL = {}\n",
-    "WSErrorAAL = {}\n",
-    "for x, y in list(product(Ages,Ages)):\n",
-    "    def errfun(param):\n",
-    "        alpha = param[0]\n",
-    "        p = param[1]\n",
-    "        return WgtGeoError(WSFreqAAL[(x,y)], alpha, p)\n",
-    "    Optimize = optimize.differential_evolution(errfun, [alphabound, pbound])\n",
-    "    WSFitAAL[(x,y)] = list(Optimize.get('x'))\n",
-    "    WSErrorAAL[(x,y)] = float(Optimize.get('fun'))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "alphas = np.full([3,3], 0.)\n",
-    "swap = ['Y', 'M', 'O']\n",
-    "for i, j in list(product(range(3), range(3))):\n",
-    "    alphas[i][j] = WSFitAAL[(swap[i], swap[j])][0]\n",
-    "\n",
-    "pd.DataFrame(alphas).to_csv(\"wsAlphas.csv\", index=False)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[[0.43383514 0.40632649 0.88801511]\n",
-      " [0.60096567 0.43059992 0.8451303 ]\n",
-      " [0.88739216 0.79300391 0.94047252]]\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(alphas)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Data-fitting: AAL Rest\n",
-    "\n",
-    "End results: Pois for YO, MY, MO, OY. Geometric for rest.\n",
-    "\n",
-    "Can choose between using MLE (avg or inverse avg) or best fit params."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Full Dataset, frequency curves and averages\n",
-    "RDataAAL = {}\n",
-    "RFreqAAL = {}\n",
-    "RAvgAAL = {}\n",
-    "for x, y in list(product(Ages,Ages)):\n",
-    "    C = ContAAL[(x,y,'rest')]\n",
-    "    D = pd.DataFrame([[ID, C[C['part_id']==ID]['cont_id'].count()] for ID in PartsAge[x]] ,columns = [\"ID\", \"deg\"])\n",
-    "    maxdeg = max(list(D[\"deg\"]))\n",
-    "    Deglist = [D[D['deg']==deg][\"deg\"].count() for deg in range(maxdeg)]\n",
-    "    Datalist = list(sorted(np.array(D[\"deg\"])))\n",
-    "    RFreqAAL[(x,y)] = Deglist/np.sum(Deglist)\n",
-    "    RDataAAL[(x,y)] = np.array(Datalist)\n",
-    "    RAvgAAL[(x,y)] = np.average(RDataAAL[(x,y)])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Pois and geom functions for fitting\n",
-    "def PoisDens(k, lam):\n",
-    "    return math.exp(-lam)*lam**(k)/math.factorial(k)\n",
-    "PoisDens = np.vectorize(PoisDens)\n",
-    "def GeoDens(k,p):\n",
-    "    return p*(1-p)**k\n",
-    "GeoDens = np.vectorize(GeoDens)\n",
-    "\n",
-    "def PoisError(FREQ, lam):\n",
-    "    kmax = len(FREQ)\n",
-    "    klist = np.arange(0,kmax)\n",
-    "    Poislist = PoisDens(klist, lam)\n",
-    "    return np.sum((Poislist - FREQ)**2)\n",
-    "\n",
-    "def GeoError(FREQ, p):\n",
-    "    kmax = len(FREQ)\n",
-    "    klist = np.arange(0,kmax)\n",
-    "    Geolist = GeoDens(klist, p)\n",
-    "    return np.sum((Geolist - FREQ)**2)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Extract the best fit values for all 9 distributions\n",
-    "lambound = (0.0001, 20)\n",
-    "pbound = (0.01, .99)\n",
-    "RFitAAL = {}\n",
-    "RErrorAAL = {}\n",
-    "for x, y in list(product(Ages,Ages)):\n",
-    "    if RAvgAAL[(x,y)] < 1: # Use Poisson\n",
-    "        tag = 'pois'\n",
-    "        def errfun(param):\n",
-    "            lam = param\n",
-    "            return PoisError(RFreqAAL[(x,y)], lam)\n",
-    "        Optimize = optimize.differential_evolution(errfun, [lambound])\n",
-    "    else: # Use geometric\n",
-    "        tag = 'geo'\n",
-    "        def errfun(param):\n",
-    "            p = param\n",
-    "            return GeoError(RFreqAAL[(x,y)], p)\n",
-    "        Optimize = optimize.differential_evolution(errfun, [pbound])\n",
-    "    RFitAAL[(x,y)] = [tag,float(Optimize.get('x'))]\n",
-    "    RErrorAAL[(x,y)] = float(Optimize.get('fun'))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(RErrorAAL)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Data-fitting: AAL Home\n",
-    "\n",
-    "Poison works quite well, though YM and OM don't look perfect. You can decide between using the best-fit lambdas OR the empirical average (MLE)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Full Dataset, frequency curves and averages\n",
-    "HDataAAL = {}\n",
-    "HFreqAAL = {}\n",
-    "HAvgAAL = {}\n",
-    "for x, y in list(product(Ages,Ages)):\n",
-    "    C = ContAAL[(x,y,'home')]\n",
-    "    D = pd.DataFrame([[ID, C[C['part_id']==ID]['cont_id'].count()] for ID in PartsAge[x]] ,columns = [\"ID\", \"deg\"])\n",
-    "    maxdeg = max(list(D[\"deg\"]))\n",
-    "    Deglist = [D[D['deg']==deg][\"deg\"].count() for deg in range(maxdeg)]\n",
-    "    Datalist = list(sorted(np.array(D[\"deg\"])))\n",
-    "    HFreqAAL[(x,y)] = Deglist/np.sum(Deglist)\n",
-    "    HDataAAL[(x,y)] = np.array(Datalist)\n",
-    "    HAvgAAL[(x,y)] = np.average(HDataAAL[(x,y)])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Extract the best fit values for all 9 distributions\n",
-    "lambound = (0.0001, 20)\n",
-    "HFitAAL = {}\n",
-    "HErrorAAL = {}\n",
-    "for x, y in list(product(Ages,Ages)):\n",
-    "    def errfun(param):\n",
-    "        lam = param\n",
-    "        return PoisError(HFreqAAL[(x,y)], lam)\n",
-    "    Optimize = optimize.differential_evolution(errfun, [lambound])\n",
-    "    HFitAAL[(x,y)] = float(Optimize.get('x'))\n",
-    "    HErrorAAL[(x,y)] = float(Optimize.get('fun'))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Age-age contact data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Specify AA categories\n",
-    "Ages = ['Y', 'M', 'O']\n",
-    "\n",
-    "# Dictionary separating contact data by AA\n",
-    "ContAA = {}\n",
-    "for x, y in list(product(Ages,Ages)):\n",
-    "    ContAA[(x,y)] = Contact[Contact['age_source']== x][Contact['age_target']==y]  \n",
-    "\n",
-    "#Participant IDs stratified by age of respondent\n",
-    "PartsAge = {}\n",
-    "for x in Ages:\n",
-    "    PartsAge[x] =  list(set(Contact[Contact['age_source']== x]['part_id']))\n",
-    "\n",
-    "# Number of contacts, frequency curves and average stratified by AA\n",
-    "DataAA = {}\n",
-    "FreqAA = {}\n",
-    "AvgAA = {}\n",
-    "for x, y in list(product(Ages,Ages)):\n",
-    "    C = ContAA[(x,y)]\n",
-    "    D = pd.DataFrame([[ID, C[C['part_id']==ID]['cont_id'].count()] for ID in PartsAge[x]] ,columns = [\"ID\", \"deg\"])\n",
-    "    maxdeg = max(list(D[\"deg\"]))\n",
-    "    Deglist = [D[D['deg']==deg][\"deg\"].count() for deg in range(maxdeg)]\n",
-    "    Datalist = list(sorted(np.array(D[\"deg\"])))\n",
-    "    FreqAA[(x,y)] = Deglist/np.sum(Deglist)\n",
-    "    DataAA[(x,y)] = np.array(Datalist)\n",
-    "    AvgAA[(x,y)] = np.average(DataAA[(x,y)])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Poisson and geometric fit poorly. Going to try a negative binomial\n",
-    "def NegBiDens(k, r, p):\n",
-    "    return special.binom(k+r-1,k)*(1-p)**r*p**k\n",
-    "NegBiDens = np.vectorize(NegBiDens)\n",
-    "\n",
-    "def NegBiError(FREQ, r, p):\n",
-    "    kmax = len(FREQ)\n",
-    "    klist = np.arange(0,kmax)\n",
-    "    Bilist = NegBiDens(klist, r, p)\n",
-    "    return np.sum((Bilist - FREQ)**2)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "rbound = (0.00001, 20)\n",
-    "pbound = (0.01, 0.99)\n",
-    "Label = ('O', 'M')\n",
-    "def errfun(param):\n",
-    "    r = param[0]\n",
-    "    p = param[1]\n",
-    "    return NegBiError(FreqAA[Label], r, p)\n",
-    "Optimize = optimize.differential_evolution(errfun, [rbound, pbound])\n",
-    "print(Optimize.get('fun'))\n",
-    "print(Optimize.get('x'))\n",
-    "\n",
-    "maxdeg = max(DataAA[Label])\n",
-    "deglist = np.arange(0,maxdeg)\n",
-    "plt.plot(deglist, FreqAA[Label], label=\"Data\")\n",
-    "plt.plot(deglist, PoisDens(deglist, AvgAA[Label]), label =\"Pois\")\n",
-    "plt.plot(deglist, GeoDens(deglist, 1/AvgAA[Label]), label =\"Geo\")\n",
-    "plt.plot(deglist, NegBiDens(deglist, Optimize.get('x')[0], Optimize.get('x')[1]), label = \"neg binom\")\n",
-    "plt.legend()\n",
-    "plt.show();"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/IntervalsModel/network-data/POLYMOD/Duration-priors.py b/IntervalsModel/network-data/POLYMOD/Duration-priors.py
index be4f2b1..caac438 100644
--- a/IntervalsModel/network-data/POLYMOD/Duration-priors.py
+++ b/IntervalsModel/network-data/POLYMOD/Duration-priors.py
@@ -19,78 +19,38 @@ import pandas as pd
 import math
 from itertools import product
 
-# To start, let's load the two files: The contact and participant data
-Contact = pd.read_csv("2008_Mossong_POLYMOD_contact_common.csv")
-Part = pd.read_csv("2008_Mossong_POLYMOD_participant_common.csv")
-
-# Delete contacts missing age data
-Contact = Contact.dropna(subset=['cnt_age_exact'])
-
-# Delete contacts missing location data
-def Loctest(row):
-    emptycheck = row['cnt_home'] or row['cnt_work'] or row['cnt_school'] or row['cnt_transport'] or row['cnt_leisure'] or row['cnt_otherplace']
-    if emptycheck == False:
-        return 1
-    else:
-        return 0
-Contact['Loctest'] = Contact.apply(Loctest, axis=1)
-Contact = Contact[Contact['Loctest']==0]
-
-# Next, we aggregate the locations
-# Define the function computing WorkSchool value from remaining entries
-def WorkSchool(row):
-    return row['cnt_work'] or row['cnt_school']
-# Create the new column
-Contact['cnt_workschool'] = Contact.apply(WorkSchool, axis=1)
-
-# Define the function computed Rest value from remaining entries
-def Rest(row):
-    return row['cnt_transport'] or row['cnt_leisure'] or row['cnt_otherplace']
-# And then create the new column
-Contact['cnt_rest'] = Contact.apply(Rest, axis=1)
-
-# We also add a column with the source age bracket. First step is to define age bracket value
-def Agesource(row):
-    ID = row['part_id']
-    year = float(Part[Part['part_id'] ==ID]['part_age'])
-    if year < 25:
-        return 'Y'
-    elif year < 65:
-        return 'M'
-    else:
-        return 'O'
-# Then create the new source column
-Contact['age_source'] = Contact.apply(Agesource, axis=1)
-
-# Columns function for target age bracket
-def Agetarget(row):
-    ID = row['cont_id']
-    year = float(Contact[Contact['cont_id'] ==ID]['cnt_age_exact'])
-    if year < 25:
-        return 'Y'
-    elif year < 65:
-        return 'M'
-    else:
-        return 'O'
-# Add the new target column
-Contact['age_target'] = Contact.apply(Agetarget, axis=1)
+# Read the cleaned survey as a dataframe
+Contact = pd.read_csv("AALContact_data.csv")
 
 
 # Specify AAL categories
 Ages = ['Y', 'M', 'O']
+SymAge = [('Y','Y'), ('Y', 'M'), ('Y', 'O'), ('M', 'M'),('M', 'O'), ('O','O')]
 Locales = ['home', 'workschool', 'rest']
 
-# Dictionary separating contact data by AAL
+# Dictionary separating contact data by symage-location
 ContAAL = {}
-for x, y, z in list(product(Ages,Ages,Locales)):
-    ContAAL[(x,y,z)] = Contact[Contact['cnt_'+z]==True][Contact['age_source']== x][Contact['age_target']==y]  
-
-
-
-# Duration distributions stratified by AAL
+columns = ['part_id', 'cont_id', 'duration_multi', 'frequency_multi']
+for symage, loc in list(product(SymAge,Locales)):
+    x,y,z = symage[0], symage[1], loc
+    C = Contact[Contact['cnt_'+z] == True].copy(deep=True)
+    def symage_check(row):
+        agesrc = row['age_source']
+        agetar = row['age_target']
+        if (agesrc == x or agesrc == y) and (agetar==x or agetar==y):
+            return True
+        else:
+            return False
+    C["check"] = C.apply(symage_check, axis=1)
+    ContAAL[(x,y,z)] = C[columns].copy(deep=True)
+
+
+
+# Duration distributions stratified by SymAge-location
 DurFreqAAL = {}
-for x, y, z in list(product(Ages,Ages, Locales)):
-    C = ContAAL[(x,y,z)]
+for symage, loc in list(product(SymAge,Locales)):
+    x,y,z = symage[0], symage[1], loc
+    C = ContAAL[(x,y,z)].copy(deep=True)
     #D = pd.DataFrame([[ID, C[C['part_id']==ID]['duration_multi']] for ID in PartsAge[x]], columns = ["ID", "Dur"])
     Durlist = [C[C['duration_multi']==i]["duration_multi"].count() for i in range(1,6)]
     DurFreqAAL[(x,y,z)] = Durlist/np.sum(Durlist)
@@ -110,12 +70,11 @@ def PoisBin(lam,loc):
 def PoisErr(lam, label): # label is SymAge-Location
     agesrc, agetar, loc = label
     err = np.sum((PoisBin(lam, loc) - DurFreqAAL[label])**2)
-    errop = np.sum((PoisBin(lam, loc) - DurFreqAAL[(agetar, agesrc, loc)])**2)
-    return err+errop
+    return err
 PoisErr = np.vectorize(PoisErr, excluded=[1])
 
 PoisPrior={}
-SymAge = [('Y','Y'), ('Y', 'M'), ('Y', 'O'), ('M', 'M'),('M', 'O'), ('O','O')]
+
 for symage, loc in list(product(SymAge, Locales)):
     AAL = (symage[0],symage[1],loc)
     arr = [1/PoisErr(i,AAL) for i in range(1,durcutoff[loc])]
@@ -131,24 +90,24 @@ dfvals = pd.DataFrame([pd.Series(x) for x in df.col2])
 dfout = dfkeys.join(dfvals)
 dfout.to_csv("AALPoisPriors.csv")
 
-# Determine fixed frac according to symage-locales, excluding hh
-fixfracAAL={}
+# Determine frequency distributions accord to to symage-locales, excluding hh
+freqdistAAL={}
 locales_nohh = ['workschool', 'rest']
-SymAge = [('Y','Y'), ('Y', 'M'), ('Y', 'O'), ('M', 'M'),('M', 'O'), ('O','O')]
 for symage, loc in list(product(SymAge, locales_nohh)):
     x, y, z = symage[0],symage[1],loc
-    C = ContAAL[(x,y,z)]
-    Cop = ContAAL[(y,x,z)]
-    tot = len(C) + len(Cop)
-    fixed= C[C['frequency_multi']==1]
-    fixedop = Cop[Cop['frequency_multi']==1]
-    fixfracAAL[(x,y,z)] = (len(fixed)+ len(fixedop))/tot
+    C = ContAAL[(x,y,z)].copy(deep=True)
+    tot = len(C)
+    freqfull = np.full(5,0.)
+    for i in range(1,6):
+        count = len(C[C['frequency_multi']==i])
+        freqfull[i-1] = (count)/tot
+    freqdistAAL[(x,y,z)] = [freqfull[0], freqfull[1], np.sum(freqfull[2:])]
+    
 
 # Save them to csv
-df = pd.DataFrame(list(fixfracAAL.items()), columns=['col1','col2'])
+df = pd.DataFrame(list(freqdistAAL.items()), columns=['col1','col2'])
 dfkeys = pd.DataFrame([pd.Series(x) for x in df.col1])
 dfkeys.columns = ["Age_in", "Age_out", "location"]
 dfvals = pd.DataFrame([pd.Series(x) for x in df.col2])
 dfout = dfkeys.join(dfvals)
-dfout.to_csv("AALfixedfrac.csv")  
-    
+dfout.to_csv("AALfreqdist.csv") 
-- 
GitLab