# double checking that we are in the correct directory
!pwd

/Users/annaschoeny/Desktop/TU/Senior/Sem1/DataScience/marisalong.github.io


# importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


# Read in the longitudinal data and take a peek
df_alhz_long = pd.read_csv('./data/oasis_longitudinal.csv')
df_alhz_long.head()


# Dataframe 1: Visit 1
df_long1 = df_alhz_long.loc[df_alhz_long['Visit'] == 1]
df_long1 = df_long1.rename(columns={'Visit': "Visit1"})

# Dataframe 2: Visit 2
df_long2 = df_alhz_long.loc[df_alhz_long['Visit'] == 2]
df_long2 = df_long2.rename(columns={'Visit': "Visit2"})

# Dataframe 3: Visit 3
df_long3 = df_alhz_long.loc[df_alhz_long['Visit'] == 3]
df_long3 = df_long3.rename(columns={'Visit': "Visit3"})

# Dataframe 4: holds the individuals keys and which visits they have had
cols = ['Subject ID', 'Visit1', 'Visit2', 'Visit3']
df_long_ids = df_long1.merge(df_long2, on = "Subject ID", how = "outer")
df_long_ids = df_long_ids.merge(df_long3, on = "Subject ID", how = "outer")
df_long_ids = df_long_ids[cols]
df_long_ids.set_index('Subject ID', inplace=True)


# Drop the Visits from Dataframes 1-3

df_long1.drop(columns=['Visit1'], inplace=True)
df_long2.drop(columns=['Visit2'], inplace=True)
df_long3.drop(columns=['Visit3'], inplace=True)

# Change the visit columns to booleans in the ID dataframe
df_long_ids['Visit1'].replace(1, True, inplace=True)
df_long_ids['Visit2'].replace(2.0, True, inplace=True)
df_long_ids['Visit3'].replace(3.0, True, inplace=True)

# Change np.nan to False in the ID dataframe
df_long_ids.fillna(False, inplace=True)


# Tables created above displayed

display(df_long_ids)
display(df_long1)
display(df_long2)
display(df_long3)


df_alhz_long.dtypes

Subject ID     object
MRI ID         object
Group          object
Visit           int64
MR Delay        int64
M/F            object
Hand           object
Age             int64
EDUC            int64
SES           float64
MMSE          float64
CDR           float64
eTIV            int64
nWBV          float64
ASF           float64
dtype: object


# Looking at what type of patients make up this dataset
# Everyone had a first visit but for various reasons people did not 
# necessarily have a second or third visit 
print("Classifications for first visit:")
display(df_long1['Group'].value_counts())
print("Classifications for second visit:")
display(df_long2['Group'].value_counts())
print("Classifications for third visit:")
display(df_long3['Group'].value_counts())

Classifications for first visit:

Nondemented    72
Demented       64
Converted      14
Name: Group, dtype: int64

Classifications for second visit:

Nondemented    70
Demented       62
Converted      12
Name: Group, dtype: int64

Classifications for third visit:

Nondemented    34
Demented       16
Converted       8
Name: Group, dtype: int64


# Read in the cross-sectional data and take a peek
df_alhz_cross = pd.read_csv('./data/oasis_cross-sectional.csv')
df_alhz_cross.head()


df_alhz_cross.dtypes

ID        object
M/F       object
Hand      object
Age        int64
Educ     float64
SES      float64
MMSE     float64
CDR      float64
eTIV       int64
nWBV     float64
ASF      float64
Delay    float64
dtype: object


#Create a Group variable for cross-sectional data
df_alhz_cross['Group'] = df_alhz_cross['CDR'].map({
    0.0: 'Nondemented',
    0.5: 'Demented',
    1.0: 'Demented',
    1.5:'Demented',
    2.0:'Demented'
})

# Look at how many individuals fall into each category
df_alhz_cross['Group'].value_counts()

Nondemented    135
Demented       100
Name: Group, dtype: int64


# Read in the data 
df_meta_complete = pd.read_csv('./data/sea-ad_cohort_donor_metadata_082222.csv')


# Get a sense of all the columns available in the data
df_meta_complete.columns

Index(['Donor ID', 'Primary Study Name', 'Secondary Study Name',
       'Age at Death', 'Sex', 'Race (choice=White)',
       'Race (choice=Black/ African American)', 'Race (choice=Asian)',
       'Race (choice=American Indian/ Alaska Native)',
       'Race (choice=Native Hawaiian or Pacific Islander)',
       'Race (choice=Unknown or unreported)', 'Race (choice=Other)',
       'specify other race', 'Hispanic/Latino', 'Highest level of education',
       'Years of education', 'APOE4 Status', 'Cognitive Status',
       'Age of onset cognitive symptoms', 'Age of Dementia diagnosis',
       'Known head injury', 'Have they had neuroimaging',
       'Consensus Clinical Dx (choice=Alzheimers disease)',
       'Consensus Clinical Dx (choice=Alzheimers Possible/ Probable)',
       'Consensus Clinical Dx (choice=Ataxia)',
       'Consensus Clinical Dx (choice=Corticobasal Degeneration)',
       'Consensus Clinical Dx (choice=Control)',
       'Consensus Clinical Dx (choice=Dementia with Lewy Bodies/ Lewy Body Disease)',
       'Consensus Clinical Dx (choice=Frontotemporal lobar degeneration)',
       'Consensus Clinical Dx (choice=Huntingtons disease)',
       'Consensus Clinical Dx (choice=Motor Neuron disease)',
       'Consensus Clinical Dx (choice=Multiple System Atrophy)',
       'Consensus Clinical Dx (choice=Parkinsons disease)',
       'Consensus Clinical Dx (choice=Parkinsons Cognitive Impairment - no dementia)',
       'Consensus Clinical Dx (choice=Parkinsons Disease Dementia)',
       'Consensus Clinical Dx (choice=Prion)',
       'Consensus Clinical Dx (choice=Progressive Supranuclear Palsy)',
       'Consensus Clinical Dx (choice=Taupathy)',
       'Consensus Clinical Dx (choice=Vascular Dementia)',
       'Consensus Clinical Dx (choice=Unknown)',
       'Consensus Clinical Dx (choice=Other)',
       'If other Consensus dx, describe', 'Last CASI Score',
       'Interval from last CASI in months', 'Last MMSE Score',
       'Interval from last MMSE in months', 'Last MOCA Score',
       'Interval from last MOCA in months', 'PMI', 'Rapid Frozen Tissue Type',
       'Ex Vivo Imaging', 'Fresh Brain Weight', 'Brain pH',
       'Overall AD neuropathological Change', 'Thal', 'Braak', 'CERAD score',
       'Overall CAA Score', 'Highest Lewy Body Disease',
       'Total Microinfarcts (not observed grossly)',
       'Total microinfarcts in screening sections', 'Atherosclerosis',
       'Arteriolosclerosis', 'LATE', 'RIN'],
      dtype='object')


# Making a dataframe with only the columns we want to examine at this point
df_meta = df_meta_complete[['Donor ID', 'Age at Death', 'Sex', 'Race (choice=White)', 'Race (choice=Black/ African American)', 'Race (choice=Asian)', 'Race (choice=American Indian/ Alaska Native)', 'Race (choice=Native Hawaiian or Pacific Islander)', 'Race (choice=Unknown or unreported)', 'Race (choice=Other)','Hispanic/Latino', 'Highest level of education', 'Years of education', 'Cognitive Status', 'Age of onset cognitive symptoms', 'Age of Dementia diagnosis', 'Known head injury', 'Fresh Brain Weight', 'Brain pH', 'Last MMSE Score']]


# Want to clean up the Race columns into one simple column
df = pd.DataFrame()
df["Race (choice=White)"] = df_meta["Race (choice=White)"].map({
    'Checked': 1, 
    'Unchecked': 0
})
df["Race (choice=Black/ African American)"] = df_meta["Race (choice=Black/ African American)"].map({
    'Checked': 1, 
    'Unchecked': 0
})
df["Race (choice=Asian)"] = df_meta["Race (choice=Asian)"].map({
    'Checked': 1, 
    'Unchecked': 0
})
df["Race (choice=American Indian/ Alaska Native)"] = df_meta["Race (choice=American Indian/ Alaska Native)"].map({
    'Checked': 1, 
    'Unchecked': 0
})
df["Race (choice=Native Hawaiian or Pacific Islander)"] = df_meta["Race (choice=Native Hawaiian or Pacific Islander)"].map({
    'Checked': 1, 
    'Unchecked': 0
})
df["Race (choice=Unknown or unreported)"] = df_meta["Race (choice=Unknown or unreported)"].map({
    'Checked': 1, 
    'Unchecked': 0
})
df["Race (choice=Other)"] = df_meta["Race (choice=Other)"].map({
    'Checked': 1, 
    'Unchecked': 0
})

race_series = df.idxmax(axis=1)
df_meta = df_meta.merge(race_series.rename("Race"), left_index=True, right_index=True)


df_meta = df_meta.drop(['Race (choice=White)', 'Race (choice=Black/ African American)', "Race (choice=Asian)", "Race (choice=American Indian/ Alaska Native)", "Race (choice=Native Hawaiian or Pacific Islander)", "Race (choice=Unknown or unreported)", "Race (choice=Other)"], axis=1)

df_meta["Race"] = df_meta["Race"].map({
    'Race (choice=White)': "White", 
    'Race (choice=Black/ African American)': "Black/African American", 
    'Race (choice=Asian)':'Asian',
    'Race (choice=American Indian/ Alaska Native)':"American Indian/Alaska Native",
    'Race (choice=Native Hawaiian or Pacific Islander)': 'Native Hawaiian/Pacific Islander',  
    'Race (choice=Unknown or unreported)': "Unknown/Unreported",      
    'Race (choice=Other)': "Mixed" # Determined this to be mixed given the the column "specify other race"
})

# Taking peak at reformatted data
display(df_meta.head(5))

# Checking the datatypes
df_meta.dtypes

Donor ID                            object
Age at Death                        object
Sex                                 object
Hispanic/Latino                     object
Highest level of education          object
Years of education                   int64
Cognitive Status                    object
Age of onset cognitive symptoms     object
Age of Dementia diagnosis           object
Known head injury                   object
Fresh Brain Weight                  object
Brain pH                           float64
Last MMSE Score                    float64
Race                                object
dtype: object


# Average Education level for demented individuals
df_alhz_long[df_alhz_long.Group == "Demented"].EDUC.mean()

13.67123287671233


# Average Education level for nondemented individuals
df_alhz_long[df_alhz_long.Group == "Nondemented"].EDUC.mean()

15.142105263157895


# Average Education level for converted individuals
df_alhz_long[df_alhz_long.Group == "Converted"].EDUC.mean()

15.45945945945946


df_alhz_long["EDUC"].corr(df_alhz_long["SES"])

-0.7226472777909835


print(df_long1["EDUC"].corr(df_long1["SES"] == 1.0))
print(df_long1["EDUC"].corr(df_long1["SES"] == 2.0))
print(df_long1["EDUC"].corr(df_long1["SES"] == 3.0))
print(df_long1["EDUC"].corr(df_long1["SES"] == 4.0))
print(df_long1["EDUC"].corr(df_long1["SES"] == 5.0))

0.5358661020777975
0.17930705895287138
-0.08966304117986083
-0.4537338326642282
-0.32575762345123993


# Creating a new dataframe to hold these age-differences

#get start and end dates for each proceeding
gb = df_alhz_long.groupby("Subject ID",sort=False)["Age"]
ages = gb.apply(min).to_frame().merge(gb.apply(max), on="Subject ID").rename(
    columns={"Age_x": "Age of First Visit", "Age_y": "Age of Last Visit"})
ages['Difference'] = (ages["Age of Last Visit"] - ages["Age of First Visit"])
ages


# Looking at how long people were involved in the study
difference = ages["Difference"].value_counts()
difference.plot.bar(xlabel="Years from First to Last Visit", ylabel = "Patient Count", title="Length of Patient Involvement in Longitudinal Study", alpha=.5)

<AxesSubplot:title={'center':'Length of Patient Involvement in Longitudinal Study'}, xlabel='Years from First to Last Visit', ylabel='Patient Count'>


# Filtering data to get just the first visit
first_visit = df_alhz_long[df_alhz_long.Visit == 1]

#Take mean of age of filtered data
first_visit.Age.mean()

75.44666666666667


# Take standard deviation of age of filtered data
first_visit.Age.std()

7.545421000584566


# Creating new dataframe with ages in accending order
age_accending_dementia = df_alhz_long.sort_values('Age')
age_accending_dementia = age_accending_dementia[age_accending_dementia.Group == "Demented"]
age_accending_dementia.groupby("M/F").Age.plot.hist(alpha=.5, density=True, legend=True, xlabel='Age of Indv. with Dementia', title='Males v Females with Dementia by Age (Longitudinal)')

M/F
F    AxesSubplot(0.125,0.125;0.775x0.755)
M    AxesSubplot(0.125,0.125;0.775x0.755)
Name: Age, dtype: object


# Average Education level for demented individuals
df_alhz_cross[df_alhz_cross.Group == "Demented"].Educ.mean()

2.82


# Average Education level for nondemented individuals
df_alhz_cross[df_alhz_cross.Group == "Nondemented"].Educ.mean()

3.4444444444444446


df_alhz_cross["Educ"].corr(df_alhz_cross["SES"])

-0.7423610355426756


#Take mean of age
df_alhz_cross.Age.mean()

51.357798165137616


# Take standard deviation of age
df_alhz_cross.Age.std()

25.269862268101562


# Creating new dataframe with ages in accending order
age_accending_cross = df_alhz_cross.sort_values('Age')
age_accending = age_accending_cross[age_accending_cross.Group == "Demented"]
age_accending.groupby("M/F").Age.plot.hist(alpha=.5, density=True, legend=True)
plt.title('Males v Females with Dementia by Age (Cross-Sectional)')
plt.show()


 df_meta.Sex.value_counts()

Female    51
Male      33
Name: Sex, dtype: int64


df_meta.Race.value_counts()

White    81
Asian     3
Name: Race, dtype: int64


# Distribution of patients with and without dementia
df_meta["Cognitive Status"].value_counts()

No dementia    42
Dementia       42
Name: Cognitive Status, dtype: int64


# ...and now grouped by sex
print("Female:")
print(df_meta[df_meta["Sex"] == "Female"]["Cognitive Status"].value_counts())
print("\nMale:")
print(df_meta[df_meta["Sex"] == "Male"]["Cognitive Status"].value_counts())

Female:
Dementia       27
No dementia    24
Name: Cognitive Status, dtype: int64

Male:
No dementia    18
Dementia       15
Name: Cognitive Status, dtype: int64


df_meta.groupby("Cognitive Status")['Brain pH'].plot.hist(alpha=.5, density=True, legend=True)
plt.title('Brain pH for Subjects With and Without Dementia')
plt.xlabel=('pH')
plt.show()


df_meta_nodementia = df_meta[df_meta["Cognitive Status"] != "Dementia"]
df_meta_dementia = df_meta[df_meta["Cognitive Status"] == "Dementia"]

df_meta_nodementia.groupby("Sex")['Brain pH'].plot.hist(alpha=.5, density=True, legend=True, xlabel = "Brain pH", title = "Brain pH by Sex for Individuals without Dementia", bins=30, range=[4.0, 8.0])

Sex
Female    AxesSubplot(0.125,0.125;0.775x0.755)
Male      AxesSubplot(0.125,0.125;0.775x0.755)
Name: Brain pH, dtype: object


# now fow dementia
df_meta_dementia.groupby("Sex")['Brain pH'].plot.hist(alpha=.5, density=True, legend=True, xlabel = "Brain pH", title = "Brain pH by Sex for Individuals with Dementia", bins=30, range=[4.0, 8.0])

Sex
Female    AxesSubplot(0.125,0.125;0.775x0.755)
Male      AxesSubplot(0.125,0.125;0.775x0.755)
Name: Brain pH, dtype: object


print("Highest Education Level")
print(df_meta["Highest level of education"].value_counts())
print("\nYears of Education")
print(df_meta["Years of education"].value_counts())

Highest Education Level
Graduate (PhD/Masters)       25
Bachelors                    22
High School                  18
Trade School/ Tech School    15
Professional                  4
Name: Highest level of education, dtype: int64

Years of Education
16    17
18    13
12    12
21    10
15     8
14     8
17     7
13     4
20     3
19     2
Name: Years of education, dtype: int64


# Average Education level for demented individuals
print("The average number of years of education for individuals with dementia is", round(df_meta[df_meta["Cognitive Status"] == "Dementia"]["Years of education"].mean(), 2))

# Average Education level for non-demented individuals
print("The average number of years of education for individuals without dementia is", round(df_meta[df_meta["Cognitive Status"] == "No dementia"]["Years of education"].mean(), 2))

The average number of years of education for individuals with dementia is 16.62
The average number of years of education for individuals without dementia is 15.79


# Average Education level for females
print("The average years of education for females is", round(df_meta[df_meta["Sex"] == "Female"]["Years of education"].mean(), 2))

# Average Education level for males
print("The average years of education for males is", round(df_meta[df_meta["Sex"] == "Male"]["Years of education"].mean(), 2))

The average years of education for females is 15.98
The average years of education for males is 16.55


# Average Education level for females
print("The average years of education for females with dementia is", end=' ')
print(round(df_meta[(df_meta["Sex"] == "Female") & (df_meta["Cognitive Status"] == "Dementia")]["Years of education"].mean(),2))
print("The average years of education for females without dementia is", end=' ')
print(round(df_meta[(df_meta["Sex"] == "Female") & (df_meta["Cognitive Status"] == "No dementia")]["Years of education"].mean(),2))

# Average Education level for males
print("The average years of education for males with dementia is", end=' ')
print(round(df_meta[(df_meta["Sex"] == "Male") & (df_meta["Cognitive Status"] == "Dementia")]["Years of education"].mean(),2))
print("The average years of education for males without dementia is", end=' ')
print(round(df_meta[(df_meta["Sex"] == "Male") & (df_meta["Cognitive Status"] == "No dementia")]["Years of education"].mean(),2))

The average years of education for females with dementia is 16.44
The average years of education for females without dementia is 15.46
The average years of education for males with dementia is 16.93
The average years of education for males without dementia is 16.22


# Correlation between education and dementia

# create a numeric version of dementia column for this analysis

df_meta["cog_status_num"] = df_meta["Cognitive Status"].map({
    "Dementia": 1,
    "No dementia": 0
})


df_alhz_cross['cog_status_num'] = df_alhz_cross['Group'].map({
    'Nondemented': 0,
    'Demented': 1
})

df_alhz_long['cog_status_num'] = df_alhz_long['Group'].map({
    'Nondemented': 0,
    'Converted': 1,
    'Demented': 1
})

# correlation
print("Correlation between years of education and dementia from the meta data:")
print(df_meta["Years of education"].corr(df_meta["cog_status_num"]))
print("Correlation between years of education and dementia from oasis study...")
print("based on the cross-sectional data:", df_alhz_cross["Educ"].corr(df_alhz_cross["cog_status_num"]))
print("based on the longitudinal data:", df_alhz_long["EDUC"].corr(df_alhz_long["cog_status_num"]))

Correlation between years of education and dementia from the meta data:
0.15100381378815506
Correlation between years of education and dementia from oasis study...
based on the cross-sectional data: -0.23591049426349353
based on the longitudinal data: -0.19306010020826195


df_meta_dummy = pd.concat([df_meta, pd.get_dummies(df_meta["Sex"])], axis=1)
# TO DO : make a correlation matrix between certain variables and dementia
variables = ["cog_status_num", 
             "Last MMSE Score", 
             "Female",
             "Male",
             "Years of education"]
df_meta_dummy[variables].corr()


# Average Education level for females
print("The average MMSE for females with dementia is", end=' ')
female_d_mmse = df_meta[(df_meta["Sex"] == "Female") & (df_meta["Cognitive Status"] == "Dementia")]["Last MMSE Score"]
print(round(female_d_mmse.mean(),2))
print("The average MMSE for females without dementia is", end=' ')
female_nd_mmse = df_meta[(df_meta["Sex"] == "Female") & (df_meta["Cognitive Status"] == "No dementia")]["Last MMSE Score"]
print(round(female_nd_mmse.mean(),2))


# Average Education level for males
print("The average MMSE for males with dementia is", end=' ')
male_d_mmse = df_meta[(df_meta["Sex"] == "Male") & (df_meta["Cognitive Status"] == "Dementia")]["Last MMSE Score"]
print(round(male_d_mmse.mean(),2))
print("The average MSSE for males without dementia is", end=' ')
male_nd_mmse = df_meta[(df_meta["Sex"] == "Male") & (df_meta["Cognitive Status"] == "No dementia")]["Last MMSE Score"]
print(round(male_nd_mmse.mean(),2))

The average MMSE for females with dementia is 21.52
The average MMSE for females without dementia is 27.52
The average MMSE for males with dementia is 23.93
The average MSSE for males without dementia is 25.89


# Now, adding in the other datasets
df_alhz_cross["MMSE"].dropna(inplace=True)
df_alhz_long["MMSE"].dropna(inplace=True)

female_d_mmse_cross = df_alhz_cross[(df_alhz_cross["M/F"] == "F") & (df_alhz_cross["Group"] == "Demented")]["MMSE"]
female_nd_mmse_cross = df_alhz_cross[(df_alhz_cross["M/F"] == "F") & (df_alhz_cross["Group"] == "Nondemented")]["MMSE"]
female_d_mmse_long = df_alhz_long[(df_alhz_long["M/F"] == "F") & ((df_alhz_long["Group"] == "Demented") | (df_alhz_long["Group"] == "Converted"))]["MMSE"]
female_nd_mmse_long = df_alhz_long[(df_alhz_long["M/F"] == "F") & (df_alhz_long["Group"] == "Nondemented")]["MMSE"]

male_d_mmse_cross = df_alhz_cross[(df_alhz_cross["M/F"] == "M") & (df_alhz_cross["Group"] == "Demented")]["MMSE"]
male_nd_mmse_cross = df_alhz_cross[(df_alhz_cross["M/F"] == "M") & (df_alhz_cross["Group"] == "Nondemented")]["MMSE"]
male_d_mmse_long = df_alhz_long[(df_alhz_long["M/F"] == "M") & ((df_alhz_long["Group"] == "Demented") | (df_alhz_long["Group"] == "Converted"))]["MMSE"]
male_nd_mmse_long = df_alhz_long[(df_alhz_long["M/F"] == "M") & (df_alhz_long["Group"] == "Nondemented")]["MMSE"]

sum_female_d = pd.concat([female_d_mmse_long,female_d_mmse])
sum_female_d = pd.concat([sum_female_d, female_d_mmse_cross])
sum_female_nd = pd.concat([female_nd_mmse_long, female_nd_mmse])
sum_female_nd = pd.concat([sum_female_nd, female_nd_mmse_cross])
sum_male_d = pd.concat([male_d_mmse_long, male_d_mmse])
sum_male_d = pd.concat([sum_male_d, male_d_mmse_cross])
sum_male_nd = pd.concat([male_nd_mmse_long,male_nd_mmse])
sum_male_nd = pd.concat([sum_male_nd, male_nd_mmse_cross])

print("The overall average MMSE for females with dementia is", end=' ')
print(round(sum_female_d.mean(),2))
print("The overall average MMSE for females without dementia is", end=' ')
print(round(sum_female_nd.mean(),2))
print("The overall average MMSE for males with dementia is", end=' ')
print(round(sum_male_d.mean(),2))
print("The overall average MMSE for males without dementia is", end=' ')
print(round(sum_male_nd.mean(),2))

The overall average MMSE for females with dementia is 24.33
The overall average MMSE for females without dementia is 29.09
The overall average MMSE for males with dementia is 25.05
The overall average MMSE for males without dementia is 28.51


df_meta["Age of Dementia diagnosis"] = df_meta["Age of Dementia diagnosis"].replace(['90+'], '90')


# Drop the rows where there is no age of dementia diagnosis 
df_temp = df_meta[df_meta["Age of Dementia diagnosis"] != 0]
df_temp.plot.scatter(x="Years of education", y="Age of Dementia diagnosis")

<AxesSubplot:xlabel='Years of education', ylabel='Age of Dementia diagnosis'>


# All of the imports necessary for this modeling section
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import seaborn as sns


# Connecting the datasets

# Creating a cog status num for df_long1
df_long1['cog_status_num'] = df_alhz_long['Group'].map({
    'Nondemented': 0,
    'Converted': 1,
    'Demented': 1
})

# Creating temporary DFs with just the variables we want to look at
cols = ['Subject ID', 'cog_status_num', 'M/F','Age', 'EDUC', 'SES', 'MMSE', 'CDR', 'eTIV', 
                    'nWBV', 'ASF']
temp_long = df_long1[cols]
cols = ['ID', 'cog_status_num', 'M/F', 'Age', 'Educ', 'SES', 'CDR', 'eTIV', 
                    'nWBV', 'ASF']
temp_cross = df_alhz_cross[cols]
cols = ['Donor ID', 'cog_status_num', 'Sex', 'Age of onset cognitive symptoms', 'Brain pH', 'Last MMSE Score', 'Highest level of education', 'Years of education']
temp_meta = df_meta[cols]


# Rename the column names to be consistent
# Note: EDUC_Y = Years of education, EDUC_L = level of education
temp_long = temp_long.rename(columns = {'Subject ID':'ID',
                           'M/F': 'Sex',
                           'EDUC': 'EDUC_Y'})
temp_cross = temp_cross.rename(columns = {'M/F': 'Sex',
                           'Educ': 'EDUC_L'})
temp_meta = temp_meta.rename(columns = {'Donor ID': 'ID',
                           'Last MMSE Score': 'MMSE',
                           'Highest level of education': 'EDUC_L',
                           'Years of education': 'EDUC_Y'})


# Changing the columns to contain the same type
temp_meta['Sex'] = temp_meta['Sex'].map({
    'Male': 'M',
    'Female': 'F'
})


# Append cross and long,and the drop duplicate data

alz_model_df = pd.concat([temp_long, temp_cross])
alz_model_df = alz_model_df.drop_duplicates(subset=['Age', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF'])


# Appending all of them
alz_model_df = pd.concat([alz_model_df, temp_meta])

# Adding a binary sex column
alz_model_df['Sex_binary'] = alz_model_df['Sex'].map({
    'M': 1,
    'F': 0
})

# Adding a classification column for cognitive status
alz_model_df['cog_status'] = alz_model_df['cog_status_num'].map({
    1: 'Dementia',
    0: 'No dementia'
})
alz_model_df.head(5)


# Drop rows where the cog_status_num is unknown since that is what we are predicting.
alz_model_df.dropna(subset=['cog_status'], inplace=True)


alz_model_df['cog_status'].value_counts()

#Plot training data with color representing cognitive status
colors = alz_model_df['cog_status'].map({
    "Dementia": "red",
    "No dementia": "green"
})

alz_model_df.plot.scatter(
    x = "MMSE", y = "EDUC_Y", c=colors, 
    alpha=.3
)

<AxesSubplot:xlabel='MMSE', ylabel='EDUC_Y'>


plt.figure(figsize=(16, 6))
heatmap = sns.heatmap(alz_model_df.corr(), vmin=-1, vmax=1, annot=True)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12);


# define features
features = ["Sex_binary", "EDUC_Y", "SES", "MMSE", "eTIV", "nWBV", "ASF", "Brain pH"]

nan = np.nan
X = alz_model_df[features]
imputer = KNNImputer(n_neighbors=2, weights="uniform")
X = imputer.fit_transform(X)
df_X = pd.DataFrame(X, columns = features)

# Define training data
#X_train_dict = dict(enumerate(X.flatten(), 1))
X_train_dict = df_X.to_dict(orient="records")
y_train = alz_model_df["cog_status_num"]

# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)
#X_new = vec.transform(X_new_dict)

# Standardize the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)

# Fit the 9-nearest neighbors model
model = KNeighborsClassifier(n_neighbors=9)
model.fit(X_train_sc, y_train)

KNeighborsClassifier(n_neighbors=9)


y_train_pred = model.predict(X_train_sc)
accuracy = accuracy_score(y_train, y_train_pred)
precision = precision_score(y_train, y_train_pred, pos_label=1)
recall = recall_score(y_train, y_train_pred, pos_label=1)
f1_score =  2*((precision*recall)/(precision+recall))
print("accuracy:", accuracy, "\nprecision:", precision, "\nrecall:", recall, "\nf1:", f1_score)

accuracy: 0.7270788912579957 
precision: 0.7110091743119266 
recall: 0.7045454545454546 
f1: 0.7077625570776256


def model_testing_k(features, Y_var, scoring_type, k):
    # define the training data
    X_train_dict = alz_model_df[features].to_dict(orient="records")
    y_train = Y_var

    # convert categorical variables to dummy variables
    vec = DictVectorizer(sparse=False)
    vec.fit(X_train_dict)
    X_train = vec.transform(X_train_dict)

    # specify the pipeline
    nan = np.nan
    imputer = KNNImputer(n_neighbors=2, weights="uniform")
    scaler = StandardScaler()
    model = KNeighborsRegressor(n_neighbors=k)
    

    # Create pipeline
    pipeline = Pipeline([
        ("imputer", imputer),
        ("scaler", scaler),
        ("model", model)
    ])

    # calculate evaluation metric using cross validation
    return  cross_val_score(pipeline, X_train, y_train, 
                    cv=10, scoring=scoring_type)


features = ["Sex_binary", "EDUC_Y", "SES", "MMSE", "eTIV", "nWBV", "ASF", "Brain pH"]
X_train_dict = alz_model_df[features].to_dict(orient="records")
y_train = alz_model_df["cog_status_num"]

vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train_sc = vec.transform(X_train_dict)

nan = np.nan
imputer = KNNImputer(n_neighbors=2, weights="uniform")
X_train_sc = imputer.fit_transform(X_train_sc)

nmses = []
k_opts = [i for i in range(1, 51, 1)]
for k in k_opts:
    nmse = -np.mean(model_testing_k(features, y_train, 'neg_mean_squared_error', k))
    nmses.append(nmse)


fig, ax = plt.subplots(1,1,figsize=(5,5))

ax.plot(k_opts, nmses, marker="o", linestyle="-",markersize=15, color="steelblue", markeredgecolor="white", alpha=0.8)

ax.set_ylabel("NMSE", fontsize=14)
ax.set_xlabel("K", fontsize=14)
plt.grid()
plt.show()


nmses_series = pd.Series(nmses)
min_nmse = nmses_series.min()
idxmin_nmse = nmses_series.idxmin()
(min_nmse, idxmin_nmse)

(0.20452670429874079, 13)


# feature testing using multiple scoring metrics
def feature_selection(features):
    X_train_dict = alz_model_df[features].to_dict(orient="records")
    y_train = alz_model_df["cog_status"]

    # convert categorical variables to dummy variables
    vec = DictVectorizer(sparse=False)
    vec.fit(X_train_dict)
    X_train = vec.transform(X_train_dict)
    
    # specify the pipeline
    model = KNeighborsClassifier(n_neighbors=13)

    # Create pipeline
    pipeline = Pipeline([
        ("imputer", imputer),
        ("scaler", scaler),
        ("model", model)
    ])
    
    scoring = {'acc': 'accuracy',
           'prec': 'precision',
           'rec': 'recall',
            'f1': 'f1'}
    
    
    is_demented_train = (y_train == "Dementia")
    cv_results = cross_validate(pipeline, X_train, is_demented_train, 
                cv=10, scoring=scoring)
    
    array_scores = np.mean(list(cv_results.values()),1)[2:]
    print("Accuracy: ", array_scores[0], "\nPrecision: ", array_scores[1], "\nRecall: ", array_scores[2], "\nF1 Score: ", array_scores[3])


features1 = ["Sex_binary", "EDUC_Y", "SES", "MMSE", "eTIV", "nWBV", "ASF", "Brain pH"]
feature_selection(features1)

Accuracy:  0.6629509713228492 
Precision:  0.6536152543109065 
Recall:  0.6363636363636364 
F1 Score:  0.6343387875302768


features2 = ["Sex_binary", "EDUC_Y", "MMSE", "eTIV", "nWBV", "ASF", "Brain pH"]
feature_selection(features2)

Accuracy:  0.6629972247918594 
Precision:  0.6425536728389052 
Recall:  0.6590909090909091 
F1 Score:  0.6411227449736051


features3 = ["EDUC_Y", "SES", "MMSE", "eTIV", "nWBV", "ASF", "Brain pH"]
feature_selection(features3)

Accuracy:  0.6779833487511564 
Precision:  0.6823222034748625 
Recall:  0.65 
F1 Score:  0.6406647699073049


features4 = ["EDUC_Y", "MMSE", "eTIV", "nWBV", "ASF", "Brain pH"]
feature_selection(features4)

Accuracy:  0.6609158186864015 
Precision:  0.6674306735521128 
Recall:  0.6318181818181817 
F1 Score:  0.6247648950358135


features5 = ["EDUC_Y", "SES", "MMSE", "eTIV", "ASF", "Brain pH"]
feature_selection(features5)

Accuracy:  0.5842738205365403 
Precision:  0.550197143499775 
Recall:  0.4818181818181818 
F1 Score:  0.5041567449740483


features6 = ["EDUC_Y", "SES", "eTIV", "nWBV", "ASF", "Brain pH"]
feature_selection(features6)

Accuracy:  0.5945420906567993 
Precision:  0.5762638092638093 
Recall:  0.5727272727272728 
F1 Score:  0.5617140823649163


features7 = ["SES", "MMSE", "eTIV", "nWBV", "ASF", "Brain pH"]
feature_selection(features7)

Accuracy:  0.6672987974098057 
Precision:  0.6576358826358828 
Recall:  0.6454545454545454 
F1 Score:  0.6430863293014604


features8 = ["MMSE", "eTIV", "nWBV", "ASF", "Brain pH"]
feature_selection(features8)

Accuracy:  0.6800647548566141 
Precision:  0.6660677577782841 
Recall:  0.6590909090909091 
F1 Score:  0.6536202532339482


# define features
features = ["EDUC_Y", "SES", "MMSE", "eTIV", "nWBV", "ASF", "Brain pH"]

nan = np.nan
X = alz_model_df[features]
imputer = KNNImputer(n_neighbors=2, weights="uniform")
X = imputer.fit_transform(X)
df_X = pd.DataFrame(X, columns = features)


# Define training data
X_train_dict = df_X.to_dict(orient="records")
y_train = alz_model_df["cog_status_num"]

# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)

# Standardize the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)

# Fit the 13-nearest neighbors model
model = KNeighborsClassifier(n_neighbors=13)
model.fit(X_train_sc, y_train)

KNeighborsClassifier(n_neighbors=13)


y_train_pred = model.predict(X_train_sc)
accuracy = accuracy_score(y_train, y_train_pred)
precision = precision_score(y_train, y_train_pred, pos_label=1)
recall = recall_score(y_train, y_train_pred, pos_label=1)
f1_score =  2*((precision*recall)/(precision+recall))
print("accuracy:", accuracy, "\nprecision:", precision, "\nrecall:", recall, "\nf1:", f1_score)

accuracy: 0.746268656716418 
precision: 0.7186147186147186 
recall: 0.7545454545454545 
f1: 0.7361419068736141


finalfeatures = ["EDUC_Y", "SES", "MMSE", "eTIV", "nWBV", "ASF", "Brain pH"]
feature_selection(finalfeatures)

Accuracy:  0.6779833487511564 
Precision:  0.6823222034748625 
Recall:  0.65 
F1 Score:  0.6406647699073049

Abbreviation	Patient Feature	Feature Definition
EDUC_Y	Years of Education	Simple count of the years in formal education.
EDUC_L	Level of Education	Education codes correspond to the following levels of education: 1: less than high school grad., 2: high school grad., 3: some college, 4: college grad., 5: beyond college.
SES	Socioeconomic Status	Socioeconomic status as assessed by the Hollingshead Index of Social Position and classified into categories from 1 (highest status) to 5 (lowest status) (Hollingshead, 1957)
MMSE	Mini Mental State Examination	Mini-Mental State Examination score (range is from 0 = worst to 30 = best) (Folstein, Folstein, & McHugh, 1975)
CDR	Clinical Dementia Rating	Clinical Dementia Rating (0 = no dementia, 0.5 = very mild AD, 1 = mild AD, 2 = moderate AD) (Morris, 1993)
ASF	Atlas Scaling Factor	Atlas scaling factor (unitless). Computed scaling factor that transforms native-space brain and skull to the atlas target (i.e., the determinant of the transform matrix) (Buckner et al., 2004)
eTIV	Estimated Total Intracranial Volume	Estimated total intracranial volume (cm3) (Buckner et al., 2004)
NWBV	Normalized Whole Brain Volume	Normalized whole-brain volume, expressed as a percent of all voxels in the atlas-masked image that are labeled as gray or white matter by the automated tissue segmentation process (Fotenos et al., 2005)

	Subject ID	MRI ID	Group	MR Delay	M/F	Hand	Age	EDUC	SES	MMSE	CDR	eTIV	nWBV	ASF
4	OAS2_0002	OAS2_0002_MR3	Demented	1895	M	R	80	12	NaN	22.0	0.5	1698	0.701	1.034
9	OAS2_0005	OAS2_0005_MR3	Nondemented	1603	M	R	85	12	4.0	30.0	0.0	1699	0.705	1.033
11	OAS2_0007	OAS2_0007_MR3	Demented	518	M	R	73	16	NaN	27.0	1.0	1365	0.727	1.286
21	OAS2_0012	OAS2_0012_MR3	Nondemented	1598	F	R	83	16	2.0	29.0	0.0	1323	0.718	1.327
24	OAS2_0013	OAS2_0013_MR3	Nondemented	1456	F	R	85	12	4.0	29.0	0.0	1225	0.710	1.433
30	OAS2_0017	OAS2_0017_MR3	Nondemented	617	M	R	81	12	3.0	27.0	0.5	1814	0.759	0.968
34	OAS2_0018	OAS2_0018_MR3	Converted	489	F	R	88	14	1.0	29.0	0.0	1398	0.713	1.255
38	OAS2_0020	OAS2_0020_MR3	Converted	1563	M	R	84	20	1.0	26.0	0.5	1597	0.666	1.099
49	OAS2_0027	OAS2_0027_MR3	Nondemented	1234	F	R	73	12	3.0	30.0	0.0	1358	0.775	1.293
59	OAS2_0031	OAS2_0031_MR3	Converted	1588	F	R	91	12	3.0	28.0	0.5	1463	0.696	1.199
64	OAS2_0034	OAS2_0034_MR3	Nondemented	1287	F	R	82	16	1.0	30.0	0.0	1460	0.695	1.202
69	OAS2_0036	OAS2_0036_MR3	Nondemented	713	F	R	70	13	4.0	30.0	0.0	1361	0.783	1.290
74	OAS2_0037	OAS2_0037_MR3	Demented	2029	M	R	88	12	4.0	26.0	0.5	1483	0.709	1.184
80	OAS2_0040	OAS2_0040_MR3	Demented	1204	M	R	88	6	4.0	23.0	0.5	1348	0.713	1.302
83	OAS2_0041	OAS2_0041_MR3	Converted	1331	F	R	75	16	1.0	28.0	0.5	1314	0.760	1.335
90	OAS2_0044	OAS2_0044_MR3	Demented	866	M	R	71	14	4.0	22.0	1.0	1332	0.679	1.317
99	OAS2_0048	OAS2_0048_MR3	Demented	647	M	R	68	16	1.0	19.0	1.0	1712	0.691	1.025
104	OAS2_0049	OAS2_0049_MR3	Nondemented	687	F	R	71	16	3.0	30.0	0.0	1503	0.788	1.168
109	OAS2_0051	OAS2_0051_MR3	Nondemented	1526	F	R	97	23	1.0	30.0	0.0	1483	0.689	1.184
122	OAS2_0057	OAS2_0057_MR3	Nondemented	1340	F	R	85	12	2.0	30.0	0.0	1580	0.739	1.111
125	OAS2_0058	OAS2_0058_MR3	Demented	764	M	R	80	14	3.0	29.0	0.5	1324	0.695	1.326
130	OAS2_0061	OAS2_0061_MR3	Nondemented	1651	M	R	72	18	1.0	30.0	0.0	1681	0.729	1.044
133	OAS2_0062	OAS2_0062_MR3	Nondemented	1351	F	R	83	18	2.0	29.0	0.0	1667	0.688	1.053
138	OAS2_0064	OAS2_0064_MR3	Demented	1282	F	R	82	8	5.0	18.0	0.5	1464	0.682	1.199
143	OAS2_0067	OAS2_0067_MR3	Nondemented	1438	M	R	71	12	4.0	29.0	0.0	1455	0.724	1.206
151	OAS2_0070	OAS2_0070_MR3	Nondemented	1415	M	R	84	17	1.0	29.0	0.0	1707	0.717	1.028
158	OAS2_0073	OAS2_0073_MR3	Nondemented	1705	F	R	75	14	3.0	28.0	0.0	1507	0.782	1.164
165	OAS2_0076	OAS2_0076_MR3	Nondemented	1663	F	R	71	18	2.0	30.0	0.0	1520	0.718	1.155
170	OAS2_0078	OAS2_0078_MR3	Nondemented	1019	M	R	92	16	1.0	30.0	0.0	1662	0.682	1.056
173	OAS2_0079	OAS2_0079_MR3	Demented	1435	F	R	73	12	4.0	16.0	1.0	1478	0.696	1.188
176	OAS2_0080	OAS2_0080_MR3	Demented	1209	M	R	69	15	2.0	28.0	0.5	1546	0.724	1.135
188	OAS2_0089	OAS2_0089_MR3	Demented	563	M	R	72	12	2.0	27.0	1.0	1432	0.684	1.226
191	OAS2_0090	OAS2_0090_MR3	Nondemented	1345	M	R	76	18	2.0	30.0	0.0	1550	0.758	1.133
200	OAS2_0095	OAS2_0095_MR3	Nondemented	1412	M	R	74	18	1.0	29.0	0.0	1814	0.679	0.967
211	OAS2_0100	OAS2_0100_MR3	Nondemented	1752	F	R	82	11	4.0	30.0	0.0	1590	0.760	1.104
214	OAS2_0101	OAS2_0101_MR3	Nondemented	1631	F	R	76	18	2.0	30.0	0.0	1379	0.757	1.273
217	OAS2_0102	OAS2_0102_MR3	Demented	1387	M	R	86	15	3.0	30.0	0.5	1498	0.681	1.171
220	OAS2_0103	OAS2_0103_MR3	Converted	2002	F	R	75	16	1.0	30.0	0.5	1419	0.731	1.236
243	OAS2_0117	OAS2_0117_MR3	Nondemented	1345	M	R	76	20	2.0	30.0	0.0	1823	0.739	0.963
249	OAS2_0119	OAS2_0119_MR3	Nondemented	1713	F	R	85	15	2.0	30.0	0.0	1488	0.741	1.180
260	OAS2_0126	OAS2_0126_MR3	Nondemented	1192	F	R	77	12	3.0	29.0	0.0	1344	0.740	1.306
263	OAS2_0127	OAS2_0127_MR3	Converted	1042	M	R	81	18	1.0	29.0	0.5	1647	0.717	1.066
270	OAS2_0129	OAS2_0129_MR3	Nondemented	1591	F	R	82	18	1.0	29.0	0.0	1442	0.644	1.217
274	OAS2_0133	OAS2_0133_MR3	Converted	1006	F	R	81	12	3.0	28.0	0.5	1495	0.687	1.174
287	OAS2_0140	OAS2_0140_MR3	Demented	1655	F	R	81	16	3.0	25.0	0.5	1396	0.687	1.257
294	OAS2_0143	OAS2_0143_MR3	Nondemented	1553	F	R	93	18	2.0	29.0	0.0	1744	0.723	1.006
303	OAS2_0147	OAS2_0147_MR3	Nondemented	1204	F	R	80	13	2.0	28.0	0.0	1337	0.762	1.313
311	OAS2_0152	OAS2_0152_MR3	Nondemented	1329	F	R	69	18	2.0	29.0	0.0	1202	0.770	1.461
326	OAS2_0161	OAS2_0161_MR3	Nondemented	1033	M	R	80	16	1.0	29.0	0.0	1830	0.724	0.959
337	OAS2_0171	OAS2_0171_MR3	Nondemented	1695	M	R	81	16	3.0	30.0	0.0	1836	0.744	0.956
342	OAS2_0174	OAS2_0174_MR3	Nondemented	1555	M	R	64	12	4.0	30.0	0.0	1370	0.794	1.281
345	OAS2_0175	OAS2_0175_MR3	Demented	1343	M	R	73	16	4.0	28.0	0.5	1803	0.731	0.973
348	OAS2_0176	OAS2_0176_MR3	Converted	1631	M	R	89	16	2.0	30.0	0.5	1408	0.679	1.246
353	OAS2_0178	OAS2_0178_MR3	Nondemented	1447	F	R	93	14	2.0	30.0	0.0	1488	0.735	1.179
358	OAS2_0181	OAS2_0181_MR3	Demented	1107	F	R	77	12	NaN	NaN	1.0	1159	0.733	1.515
363	OAS2_0183	OAS2_0183_MR3	Nondemented	732	F	R	68	13	2.0	30.0	0.0	1506	0.740	1.165
369	OAS2_0185	OAS2_0185_MR3	Demented	2297	M	R	86	16	1.0	26.0	0.5	1688	0.675	1.040
372	OAS2_0186	OAS2_0186_MR3	Nondemented	1608	F	R	65	13	2.0	30.0	0.0	1333	0.801	1.317

	ID	M/F	Hand	Age	Educ	SES	MMSE	CDR	eTIV	nWBV	ASF	Delay
0	OAS1_0001_MR1	F	R	74	2.0	3.0	29.0	0.0	1344	0.743	1.306	NaN
1	OAS1_0002_MR1	F	R	55	4.0	1.0	29.0	0.0	1147	0.810	1.531	NaN
2	OAS1_0003_MR1	F	R	73	4.0	3.0	27.0	0.5	1454	0.708	1.207	NaN
3	OAS1_0004_MR1	M	R	28	NaN	NaN	NaN	NaN	1588	0.803	1.105	NaN
4	OAS1_0005_MR1	M	R	18	NaN	NaN	NaN	NaN	1737	0.848	1.010	NaN

	cog_status_num	Last MMSE Score	Female	Male	Years of education
cog_status_num	1.000000	-0.498078	0.073127	-0.073127	0.151004
Last MMSE Score	-0.498078	1.000000	-0.070174	0.070174	-0.054593
Female	0.073127	-0.070174	1.000000	-1.000000	-0.100013
Male	-0.073127	0.070174	-1.000000	1.000000	0.100013
Years of education	0.151004	-0.054593	-0.100013	0.100013	1.000000

	ID	cog_status_num	Sex	Age	EDUC_Y	SES	MMSE	CDR	eTIV	nWBV	ASF	EDUC_L	Age of onset cognitive symptoms	Brain pH	Sex_binary	cog_status
0	OAS2_0001	0.0	M	87.0	14.0	2.0	27.0	0.0	1987.0	0.696	0.883	NaN	NaN	NaN	1	No dementia
2	OAS2_0002	1.0	M	75.0	12.0	NaN	23.0	0.5	1678.0	0.736	1.046	NaN	NaN	NaN	1	Dementia
5	OAS2_0004	0.0	F	88.0	18.0	3.0	28.0	0.0	1215.0	0.710	1.444	NaN	NaN	NaN	0	No dementia
7	OAS2_0005	0.0	M	80.0	12.0	4.0	28.0	0.0	1689.0	0.712	1.039	NaN	NaN	NaN	1	No dementia
10	OAS2_0007	1.0	M	71.0	16.0	NaN	28.0	0.5	1357.0	0.748	1.293	NaN	NaN	NaN	1	Dementia

Analysis of Alzheimer's Disease Features

Marisa Long and Anna Schoeny

Notebook Contents¶

Introduction¶

Discussion of Data Sources¶

Dataset Vocabulary and Abbreviations¶

Data Limitations¶

Challenges with MRI Dataset:¶

Challenges with Metadata Dataset:¶

Extract, Transform, and Load Data (ETL)¶

Dataset 1: MRI and Alzheimer's¶

Longitudinal Data:¶

Cross-Sectional Data:¶

Dataset 2: Metadata¶

Exploratory Data Analysis (EDA)¶

Questions for Dataset 1 (Longitudinal):¶

What is the average level of education (variable 'EDUC') of demented vs non-demented vs converted individuals?¶

Over how many years did the study track each individual?¶

How old were people in this dataset when they had their first MRI scan for dementia?¶

How does the number of females vs males with dementia compare in this dataset?¶

Questions for Dataset 1 (Cross Sectional):¶

What is the average level of education (variable 'EDUC') of demented vs non-demented vs converted individuals?¶

How old were people in this dataset when they had their MRI to scan for dementia?¶

How does the number of females vs males with dementia compare in this dataset?¶

Dataset 2 EDA¶

Dataset 2: Metadata¶

Distribution of Dementia by Age and Sex¶

Brain pH for Dementia v. No dementia¶

Examining the Relationship between Education and Dementia¶

Looking at MMSE (Mini Mental State Examination) Scores for Males/Females With/Without Dementia¶

Age of Diagnosis and Years of Education¶

Data Analysis and Modeling¶

Preparation for model-building¶

K Value Selection:¶

Feature Selection:¶

Insights and Final Thoughts¶

References¶

	Subject ID	MRI ID	Group	Visit	MR Delay	M/F	Hand	Age	EDUC	SES	MMSE	CDR	eTIV	nWBV	ASF
0	OAS2_0001	OAS2_0001_MR1	Nondemented	1	0	M	R	87	14	2.0	27.0	0.0	1987	0.696	0.883
1	OAS2_0001	OAS2_0001_MR2	Nondemented	2	457	M	R	88	14	2.0	30.0	0.0	2004	0.681	0.876
2	OAS2_0002	OAS2_0002_MR1	Demented	1	0	M	R	75	12	NaN	23.0	0.5	1678	0.736	1.046
3	OAS2_0002	OAS2_0002_MR2	Demented	2	560	M	R	76	12	NaN	28.0	0.5	1738	0.713	1.010
4	OAS2_0002	OAS2_0002_MR3	Demented	3	1895	M	R	80	12	NaN	22.0	0.5	1698	0.701	1.034

	Visit1	Visit2	Visit3
Subject ID
OAS2_0001	True	True	False
OAS2_0002	True	True	True
OAS2_0004	True	True	False
OAS2_0005	True	True	True
OAS2_0007	True	False	True
...	...	...	...
OAS2_0182	True	True	False
OAS2_0183	True	True	True
OAS2_0184	True	True	False
OAS2_0185	True	True	True
OAS2_0186	True	True	True

	Donor ID	Age at Death	Sex	Hispanic/Latino	Highest level of education	Years of education	Cognitive Status	Age of onset cognitive symptoms	Age of Dementia diagnosis	Known head injury	Fresh Brain Weight	Brain pH	Last MMSE Score	Race
0	H19.33.004	80	Female	No	Bachelors	17	No dementia	NaN	NaN	NaN	1035.00	7.0	25.0	White
1	H20.33.001	82	Male	No	Bachelors	16	No dementia	NaN	NaN	NaN	1338.00	6.8	28.0	White
2	H20.33.002	90+	Female	No	High School	12	No dementia	NaN	NaN	NaN	1078.00	7.3	33.0	White
3	H20.33.004	86	Male	No	Trade School/ Tech School	15	Dementia	80	81	No	1261.00	6.7	25.0	White
4	H20.33.005	90+	Female	No	High School	12	No dementia	NaN	NaN	NaN	1003.00	6.8	29.0	White

Analysis of Alzheimer's Disease Features

Marisa Long and Anna Schoeny

Notebook Contents¶

Introduction¶

Discussion of Data Sources¶

Dataset Vocabulary and Abbreviations¶

Data Limitations¶

Challenges with MRI Dataset:¶

Challenges with Metadata Dataset:¶

Extract, Transform, and Load Data (ETL)¶

Dataset 1: MRI and Alzheimer's¶

Longitudinal Data:¶

Cross-Sectional Data:¶

Dataset 2: Metadata¶

Exploratory Data Analysis (EDA)¶

Questions for Dataset 1 (Longitudinal):¶

What is the average level of education (variable 'EDUC') of demented vs non-demented vs converted individuals?¶

How are education and socioeconomic status related in this sample?¶

Over how many years did the study track each individual?¶

How old were people in this dataset when they had their first MRI scan for dementia?¶

How does the number of females vs males with dementia compare in this dataset?¶

Questions for Dataset 1 (Cross Sectional):¶

What is the average level of education (variable 'EDUC') of demented vs non-demented vs converted individuals?¶

How are education and socioeconomic status related in this sample?¶

How old were people in this dataset when they had their MRI to scan for dementia?¶

How does the number of females vs males with dementia compare in this dataset?¶

Dataset 2 EDA¶

Dataset 2: Metadata¶

Distribution of Dementia by Age and Sex¶

Brain pH for Dementia v. No dementia¶

Examining the Relationship between Education and Dementia¶

Looking at MMSE (Mini Mental State Examination) Scores for Males/Females With/Without Dementia¶

Age of Diagnosis and Years of Education¶

Data Analysis and Modeling¶

Preparation for model-building¶

K Value Selection:¶

Feature Selection:¶

Insights and Final Thoughts¶

References¶