In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
import time
import numpy as np
import pylab as pl
import tqdm

# Show every column when a DataFrame is rendered (None removes the column cap),
# so the wide birth-record tables below are not truncated.
pd.options.display.max_columns = None

Q1: Does Low Birth Weight Cause Infant Mortality?

In [5]:
# Load the twin-birth records from the working directory.
# In the rows displayed below, consecutive rows share a pair_id, i.e. the two
# infants of one twin pair appear as separate rows.
twins = pd.read_csv('twins.csv')

# Report (rows, columns) before previewing the data.
print(twins.shape)

# Preview the first ten rows (five twin pairs in the displayed sample).
twins.head(10)
(59052, 35)
Out[5]:
mort csex dbirwt dmage mrace dmeduc dmar dlivord mpcb anemia cardiac lung diabetes herpes hydra hemo chyper phyper eclamp incervix pre4000 preterm renal rh uterine othermr tobacco alcohol dfage frace dfeduc infant_id pair_id term id
0 0 1 2601 22 black college 1 2 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 30 black highschool 2 3 0 1
1 0 2 3069 22 black college 1 2 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 30 black highschool 3 3 0 2
2 0 1 2948 24 white highschool 1 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 30 white college 4 5 0 1
3 0 2 2948 24 white highschool 1 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 30 white college 5 5 0 2
4 0 1 3345 32 white highschool 1 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 36 white highschool 8 9 0 1
5 0 2 2863 32 white highschool 1 3 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 36 white highschool 9 9 0 2
6 0 2 2098 31 white morethancollege 1 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 36 white highschool 10 11 0 1
7 0 2 1985 31 white morethancollege 1 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 36 white highschool 11 11 0 2
8 0 1 2126 24 white highschool 1 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 22 white highschool 12 13 0 1
9 0 1 1985 24 white highschool 1 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 22 white highschool 13 13 0 2

a) Why is this data well-suited for matching?

Note: No coding is required to answer part (a).

b) Matching for ATE

In [ ]:
 

c) Generalizability from Counterfactual Twins to Singletons

In [6]:
# Load the singleton-birth records from the working directory.
# NOTE(review): the preview shows 'Unnamed: 0' and 'Unnamed: 0.1' columns that
# look like row indices written out by earlier to_csv calls — confirm they
# carry no information before relying on positional column access.
singletons = pd.read_csv('singletons.csv')

# Preview the first five rows (the default for head()).
singletons.head(n=5)
Out[6]:
Unnamed: 0 Unnamed: 0.1 mort csex dbirwt dmage mrace dmeduc dmar dlivord mpcb anemia cardiac lung diabetes herpes hydra hemo chyper phyper eclamp incervix pre4000 preterm renal rh uterine othermr tobacco alcohol dfage frace dfeduc infant_id term m_race_black m_race_other m_race_white m_edu_college m_edu_elementary m_edu_highschool m_edu_morethancollege m_edu_noedu
0 0 0 0 1 2977 29 white highschool 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 22 white highschool 0 0 0 0 1 0 0 1 0 0
1 1 1 0 2 3912 25 white college 1 2 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 25 white college 1 1 0 0 1 1 0 0 0 0
2 2 2 0 1 3317 36 white morethancollege 1 3 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 33 white college 2 0 0 0 1 0 0 0 1 0
3 3 3 0 2 2963 30 white college 1 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 31 white college 3 0 0 0 1 1 0 0 0 0
4 4 4 0 2 3572 25 white highschool 1 3 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 26 white highschool 4 2 0 0 1 0 0 1 0 0
In [ ]:
 

Q2: Does Smoking During Pregnancy Cause Low Birth Weight?

In [7]:
# Column roles for the Q2 analysis on the `singletons` frame:
#   X - covariate column names, T - treatment column name, O - outcome column name.
# The order of X is kept exactly as originally specified.
X = [
    # maternal background columns (presumably age, marital status and birth
    # order — confirm against the data dictionary)
    'dmage', 'dmar', 'dlivord',
    # condition / risk-factor columns; they appear as 0/1 flags in the
    # displayed rows
    'anemia', 'cardiac', 'lung', 'diabetes', 'herpes',
    'hydra', 'hemo', 'chyper', 'phyper',
    'eclamp', 'incervix', 'pre4000', 'preterm',
    'renal', 'rh', 'uterine', 'othermr',
    'alcohol',
    # indicator columns matching the mrace categories
    'm_race_black', 'm_race_other', 'm_race_white',
    # indicator columns matching the dmeduc categories
    # NOTE(review): every level of both indicator groups is listed; if these
    # are full one-hot encodings, consider dropping a reference level before
    # fitting an unregularized linear model with an intercept.
    'm_edu_college', 'm_edu_elementary', 'm_edu_highschool',
    'm_edu_morethancollege', 'm_edu_noedu',
]

# Treatment (tobacco use) and outcome (birth weight) column names.
T, O = 'tobacco', 'dbirwt'

a) Naive Difference in Cohorts

In [ ]:
 

b) Covariate Adjustment

In [ ]:
 

c) Propensity Score Re-Weighting

In [ ]: