# tnsmortalitymosaic.py

# -*- coding: utf-8 -*-
"""TNsMortalityMosaic.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1YiYQB3RN9PWjktbFfRg07xqcxUgF2_HW

# Cleaning and Preprocessing
"""

## This notebook examines mortality patterns in Tennessee counties from 2019 to 2021 to
## identify factors that influence variation in mortality rates, using several predictive models
## (KNN, linear regression, logistic regression, decision trees, etc.). The analysis is not exhaustive
## and the models are far from optimal; as next steps we plan to change the target variable, train on
## features with different correlations to life expectancy, and rerun the models.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, confusion_matrix, classification_report, mean_absolute_error
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import tensorflow as tf
from tensorflow import keras
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris, make_classification

# NOTE: the original line here was "pip install PyDrive". That is a shell /
# notebook command, not Python, and makes the whole file a SyntaxError when it
# is run as a script. PyDrive is not referenced anywhere in the visible code;
# install it from a shell if it is needed.

# Load one CSV per year from the project's GitHub repository. The three file
# names differ only in their year prefix.
github_url = 'https://raw.githubusercontent.com/marymorkos/tnhealth/main/2019healthrankingstn.csv'
tnhealth2019 = pd.read_csv(github_url)

github_url = 'https://raw.githubusercontent.com/marymorkos/tnhealth/main/2020healthrankingstn.csv'
tnhealth2020 = pd.read_csv(github_url)

github_url = 'https://raw.githubusercontent.com/marymorkos/tnhealth/main/2021healthrankingstn.csv'
tnhealth2021 = pd.read_csv(github_url)

import os

# Rename the misspelled 'Longtitude' header wherever it occurs; frames that do
# not have that column are returned unchanged by rename().
header_fix = {'Longtitude': 'Longitude'}
tnhealth2019 = tnhealth2019.rename(columns=header_fix)
tnhealth2020 = tnhealth2020.rename(columns=header_fix)
tnhealth2021 = tnhealth2021.rename(columns=header_fix)

# Notebook-style previews (no visible output when run as a script).
tnhealth2019.head()
tnhealth2020.head()
tnhealth2021.head()

# Year label -> frame, used by the column comparisons that follow.
dfs = {
    '2019': tnhealth2019,
    '2020': tnhealth2020,
    '2021': tnhealth2021,
}

def find_uncommon_columns(df1, df2):
    """Return the column labels of *df1* that do not appear in *df2*.

    The result is a list of unique labels in the order they occur in
    ``df1.columns``. The previous set-difference implementation returned the
    same labels in an arbitrary order, which made the printed comparisons
    differ from run to run.

    Args:
        df1: DataFrame whose columns are being checked.
        df2: DataFrame providing the columns to exclude.

    Returns:
        list: labels present in ``df1.columns`` but absent from ``df2.columns``.
    """
    other_columns = set(df2.columns)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return [col for col in dict.fromkeys(df1.columns) if col not in other_columns]

# One single-entry dict per ordered pair of distinct years:
# {"<year1> vs <year2>": [columns in year1 that year2 lacks]}.
uncommon_columns = [
    {f"{year1} vs {year2}": find_uncommon_columns(df1, df2)}
    for year1, df1 in dfs.items()
    for year2, df2 in dfs.items()
    if year1 != year2
]

for comparison in uncommon_columns:
    # Each dict holds exactly one entry, so this inner loop runs once.
    for comparison_name, missing_columns in comparison.items():
        print("Uncommon columns in", comparison_name, ":", missing_columns)

# Columns shared by every yearly frame.
common_columns = set.intersection(*(set(frame.columns) for frame in dfs.values()))

# Identifier / location columns that are kept even if a year lacks them.
columns_to_preserve = ['FIPS', 'State', 'County', 'Longitude', 'Latitude']

# Record, per year, which columns get removed, and remove them in place.
columns_to_drop = []
for year, df in dfs.items():
    cols_to_drop = [
        col for col in df.columns
        if not (col in common_columns or col in columns_to_preserve)
    ]
    columns_to_drop.append({year: cols_to_drop})
    df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

for year, df in dfs.items():
    print(f"DataFrame for {year} after dropping uncommon columns:")
    print(df.head())

# Notebook-style previews (no visible output when run as a script).
tnhealth2019.head()
tnhealth2020.head()
tnhealth2021.head()

yearly_frames = {
    'tnhealth2019': tnhealth2019,
    'tnhealth2020': tnhealth2020,
    'tnhealth2021': tnhealth2021,
}

# Remove every column whose name contains "95% CI" or "Z-Score", in place.
for frame in yearly_frames.values():
    derived_columns = [
        name for name in frame.columns
        if "95% CI" in name or "Z-Score" in name
    ]
    frame.drop(columns=derived_columns, inplace=True)

for position, (frame_name, frame) in enumerate(yearly_frames.items()):
    # Every report after the first is preceded by a blank line.
    separator = "" if position == 0 else "\n"
    print(f"{separator}DataFrame {frame_name} after dropping columns containing '95% CI' or 'Z-Score':")
    print(frame.head())

# Fill missing numeric values with that column's mean, separately per year.
for yearly_frame in (tnhealth2019, tnhealth2020, tnhealth2021):
    numeric_means = yearly_frame.select_dtypes(include=[np.number]).mean()
    yearly_frame.fillna(value=numeric_means, inplace=True)

for yearly_frame in (tnhealth2019, tnhealth2020, tnhealth2021):
    print(yearly_frame.head())
    # The original printed a blank line between the frames, not after the last.
    if yearly_frame is not tnhealth2021:
        pass

# Stack the three years into one table with a fresh 0..n-1 index.
concatenated_df = pd.concat(
    [tnhealth2019, tnhealth2020, tnhealth2021],
    ignore_index=True,
)

concatenated_df.head()

concatenated_df.tail()

"""# More Cleaning"""

# Identifier / location / year columns that are excluded from modelling.
categorical_columns = ['FIPS', 'State', 'County', 'Longitude', 'Latitude', 'Year']
concatenated_df.drop(columns=categorical_columns, inplace=True)

print("Concatenated DataFrame after dropping categorical columns:")
print(concatenated_df.head())

# Discard any column that still contains at least one NaN.
concatenated_df.dropna(axis='columns', inplace=True)

print("Concatenated DataFrame after dropping columns with NaN values:")
print(concatenated_df.head())

concatenated_df.head()

# Columns in which the literal string 'Tennessee' appears in at least one row.
has_tennessee = (concatenated_df == 'Tennessee').any(axis=0)
columns_with_tennessee = concatenated_df.columns[has_tennessee]

print("Columns containing 'Tennessee':")
for column in columns_with_tennessee:
    print(column)

# Dump the distinct values of every remaining column for manual inspection.
for column in concatenated_df.columns:
    print(f"Unique values in column '{column}':")
    print(concatenated_df[column].unique())
    print()

# Keep only numeric columns for the models below.
concatenateddf_numeric = concatenated_df.select_dtypes(include=['number'])

print(concatenateddf_numeric)

"""# Model Training"""

# Target is 'Life Expectancy'; every other numeric column is a predictor.
y = concatenateddf_numeric['Life Expectancy']
X = concatenateddf_numeric.drop(columns=['Life Expectancy'])

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

"""# Linear Model"""

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Same 80/20 split and seed as the model-training setup above.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Ordinary least squares on all numeric predictors.
model = LinearRegression()
model.fit(X_train, y_train)

predictions = model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

import matplotlib.pyplot as plt

# Predicted vs. true values; the dashed diagonal marks a perfect prediction.
diagonal = [y_test.min(), y_test.max()]
plt.scatter(y_test, predictions, color='blue')
plt.plot(diagonal, diagonal, color='red', linestyle='--')
plt.title('Linear Regression Model of Life Expectancy')
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.show()

"""# Nonlinear Model - Polynomial"""

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

degree = 2

# Expand the predictors to all polynomial terms up to `degree`; the expansion
# is fitted on the training split only and then applied to the test split.
polynomial_features = PolynomialFeatures(degree=degree)
X_train_poly = polynomial_features.fit_transform(X_train)
X_test_poly = polynomial_features.transform(X_test)

# A linear model on the expanded features.
model = LinearRegression()
model.fit(X_train_poly, y_train)

predictions = model.predict(X_test_poly)

mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

import matplotlib.pyplot as plt

# Predicted vs. true values; the dashed diagonal marks a perfect prediction.
diagonal = [y_test.min(), y_test.max()]
plt.scatter(y_test, predictions, color='blue')
plt.plot(diagonal, diagonal, color='red', linestyle='--')
plt.title('Polynomial Regression Model')
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.show()

"""# Decision Tree (numerical)"""

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Regression tree with scikit-learn's default hyper-parameters.
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

# Score on the held-out split.
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

"""# Nonlinear Model: Gaussian Processes


"""

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import mean_squared_error

# Gaussian-process regression with a default RBF kernel.
kernel = RBF()
model = GaussianProcessRegressor(kernel=kernel)
model.fit(X_train, y_train)

# return_std=True also yields the per-point predictive standard deviation.
predictions, std_dev = model.predict(X_test, return_std=True)

mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

"""# Neural Net"""

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Two hidden ReLU layers of 64 units feeding a single linear output unit.
n_features = X_train.shape[1]
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(n_features,)))
model.add(Dense(64, activation='relu'))
model.add(Dense(1))  # Output layer with 1 neuron for regression

model.compile(optimizer='adam', loss='mean_squared_error')

# 20% of the training rows are set aside by Keras for validation.
model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
)

# With only a loss compiled, evaluate() returns that loss (MSE) on the test split.
mse = model.evaluate(X_test, y_test)
print("Mean Squared Error:", mse)

import matplotlib.pyplot as plt

# Keras returns shape (n, 1); flatten to 1-D for plotting.
predictions = model.predict(X_test).flatten()

diagonal = [y_test.min(), y_test.max()]
plt.scatter(y_test, predictions, color='blue')
plt.plot(diagonal, diagonal, color='red', linestyle='--')
plt.title('Neural Network Regression Model')
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.show()

"""# Support Vector Machine"""

from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Support-vector regression with an RBF kernel.
model = SVR(kernel='rbf')
model.fit(X_train, y_train)

predictions = model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

import matplotlib.pyplot as plt

# The original drew the predictions and the diagonal twice in different
# colours and set labels without ever calling plt.legend(). Draw each series
# once and show the legend, matching the KNN / boosting / forest plots.
plt.scatter(y_test, y_test, color='blue', label='True Values')
plt.scatter(y_test, predictions, color='red', label='Predicted Values')
plt.plot(
    [y_test.min(), y_test.max()],
    [y_test.min(), y_test.max()],
    color='green',
    linestyle='--',
    label='Perfect Predictions',
)
plt.title('Support Vector Machine Regression Model')
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.legend()
plt.show()

"""# K-Nearest Neighbors (KNN)


"""

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# k-nearest-neighbours regression using the 5 closest training rows.
model = KNeighborsRegressor(n_neighbors=5)
model.fit(X_train, y_train)

predictions = model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

import matplotlib.pyplot as plt

# True values on the diagonal (blue), predictions (red), ideal line (green).
diagonal = [y_test.min(), y_test.max()]
plt.scatter(y_test, y_test, color='blue', label='True Values')
plt.scatter(y_test, predictions, color='red', label='Predicted Values')
plt.plot(diagonal, diagonal, color='green', linestyle='--', label='Perfect Predictions')

plt.title('True vs. Predicted Values')
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.legend()
plt.show()

"""# Gradient Boosting"""

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Gradient-boosted trees with scikit-learn's default hyper-parameters.
model = GradientBoostingRegressor()
model.fit(X_train, y_train)

predictions = model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

import matplotlib.pyplot as plt

# True values on the diagonal (blue), predictions (red), ideal line (green).
diagonal = [y_test.min(), y_test.max()]
plt.scatter(y_test, y_test, color='blue', label='True Values')
plt.scatter(y_test, predictions, color='red', label='Predicted Values')
plt.plot(diagonal, diagonal, color='green', linestyle='--', label='Perfect Predictions')

plt.title('True vs. Predicted Values')
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.legend()
plt.show()

"""# Random Forest"""

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Random forest with scikit-learn's default hyper-parameters.
model = RandomForestRegressor()
model.fit(X_train, y_train)

predictions = model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

import matplotlib.pyplot as plt

# True values on the diagonal (blue), predictions (red), ideal line (green).
diagonal = [y_test.min(), y_test.max()]
plt.scatter(y_test, y_test, color='blue', label='True Values')
plt.scatter(y_test, predictions, color='red', label='Predicted Values')
plt.plot(diagonal, diagonal, color='green', linestyle='--', label='Perfect Predictions')

plt.title('True vs. Predicted Values (Random Forest)')
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.legend()
plt.show()

"""# PCA"""

from sklearn.decomposition import PCA

# Project the full predictor matrix X onto its first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

print("Transformed Data after PCA:")
print(X_pca)

"""# Logistic Regression (attempt)"""

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd

concatenated_df.head()

# Re-download the raw yearly files so this section starts from untouched data.
yearly_urls = [
    'https://raw.githubusercontent.com/marymorkos/tnhealth/main/2019healthrankingstn.csv',
    'https://raw.githubusercontent.com/marymorkos/tnhealth/main/2020healthrankingstn.csv',
    'https://raw.githubusercontent.com/marymorkos/tnhealth/main/2021healthrankingstn.csv',
]
tnhealth20191, tnhealth20201, tnhealth20211 = [pd.read_csv(url) for url in yearly_urls]
github_url = yearly_urls[-1]

concatenated1_df = pd.concat(
    [tnhealth20191, tnhealth20201, tnhealth20211],
    ignore_index=True,
)

# Rows without a target value or a county name cannot be used here.
concatenated1_df.dropna(subset=['Life Expectancy', 'County'], inplace=True)

# Replace county names with integer codes.
label_encoder = LabelEncoder()
concatenated1_df['County'] = label_encoder.fit_transform(concatenated1_df['County'])

# The encoded county is the only predictor in this attempt.
X = concatenated1_df[['County']]
y = concatenated1_df['Life Expectancy']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Disabled attempt. NOTE(review): presumably left commented out because
# LogisticRegression is a classifier while y is the continuous
# 'Life Expectancy' column -- confirm before re-enabling.
## logistic_model = LogisticRegression()
## logistic_model.fit(X_train, y_train)

concatenated1_df.columns

# Column dtypes of the cleaned table (concatenated_df, not concatenated1_df).
variable_data_types = concatenated_df.dtypes
print(variable_data_types)

numerical_columns = concatenated_df.select_dtypes(include=['int', 'float']).columns
categorical_columns = concatenated_df.select_dtypes(include=['object']).columns

print("Numerical Columns:")
print(numerical_columns)

print("Categorical Columns:")
print(categorical_columns)

"""# Bootstrap"""

import pandas as pd
import numpy as np

# Bootstrap the mean of the life-expectancy column: resample the rows with
# replacement many times and record the mean of each resample.
life_expectancy_column = 'Life Expectancy'

num_bootstrap_samples = 1000

bootstrap_estimates = []
for _ in range(num_bootstrap_samples):
    bootstrap_sample = concatenated_df.sample(n=len(concatenated_df), replace=True)
    # These two statements must run once per resample. In the original they
    # sat outside the loop, so only the last resample's mean was recorded and
    # the percentile interval below collapsed to a single value.
    statistic_value = bootstrap_sample[life_expectancy_column].mean()
    bootstrap_estimates.append(statistic_value)

# Percentile interval covering the central 95% of the resampled means.
confidence_interval = np.percentile(bootstrap_estimates, [2.5, 97.5])

print("Bootstrap Estimate of", life_expectancy_column + ":", np.mean(bootstrap_estimates))
print("95% Confidence Interval:", confidence_interval)

import matplotlib.pyplot as plt

# Histogram of the bootstrap means, with the overall mean and the two ends of
# the percentile interval marked as dashed vertical lines.
estimate_mean = np.mean(bootstrap_estimates)
lower_bound = confidence_interval[0]
upper_bound = confidence_interval[1]

plt.hist(bootstrap_estimates, bins=30, edgecolor='black', alpha=0.7)
plt.axvline(estimate_mean, color='red', linestyle='dashed', linewidth=1.5, label='Mean')
plt.axvline(lower_bound, color='green', linestyle='dashed', linewidth=1.5, label='95% CI')
plt.axvline(upper_bound, color='green', linestyle='dashed', linewidth=1.5)
plt.xlabel('Life Expectancy')
plt.ylabel('Frequency')
plt.title('Bootstrap Estimate of Life Expectancy')
plt.legend()
plt.show()

"""# Map and Misc"""

concatenated1_df.head()

# Numeric-only view of the raw concatenated table.
numeric1_df = concatenated1_df.select_dtypes(include='number')

# Display the DataFrame with only numeric columns
print(numeric1_df)

# Integer-typed columns only.
integer_df = concatenated1_df.select_dtypes(include='integer')

# Display the DataFrame with only integer columns
print(integer_df)

numeric_df = concatenated1_df.select_dtypes(include='number')

# Mean of each numeric column that has at least one missing value.
columns_with_nan = numeric_df.isna().any()
nan_mean = numeric_df.loc[:, columns_with_nan].mean()

print(nan_mean)

numeric1_df.head()

# NOTE: the original line here was "pip install folium". That is a shell /
# notebook command and a SyntaxError in a plain .py file; install folium from
# a shell before running the map section.

# Hard-coded MSE values for three of the models. They are literals, not read
# from the fitted models in this script, so they must be updated by hand.
models_mse = {
    "Linear Regression": 0.9537812205407389,
    "Gradient Boosting": 0.44564624263030017,
    "Random Forest": 0.5509994912280721
}

# Hard-coded summary lines that are printed and also written to a text file.
# NOTE(review): the recorded confidence interval has zero width, which looks
# like an artefact rather than a real interval -- re-run the bootstrap and
# update these figures.
additional_info = [
    "Permutation Test Results for Geographic_Group:",
    "Group Rural/Low Homeownership Mean Life Expectancy: 74.88867924528302",
    "Group Urban/High Homeownership Mean Life Expectancy: 75.17999999999999",
    "Observed Test Statistic: -0.2913207547169776",
    "P-value: 0.488",
    "Bootstrap Estimate of Life Expectancy: 74.85333333333332",
    "95% Confidence Interval: [74.85333333 74.85333333]"
]

# Print the models and their MSE values. The loop variables are deliberately
# not called `model` / `mse`, so the fitted model and last score from the
# sections above are not overwritten.
print("Model MSE values:")
for model_name, recorded_mse in models_mse.items():
    print(f"{model_name}: {recorded_mse}")

# Additional information, separated from the MSE list by a blank line.
print()
for item in additional_info:
    print(item)

import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
bars = plt.bar(models_mse.keys(), models_mse.values(), color='skyblue')
plt.xlabel('Model')
plt.ylabel('MSE')
plt.title('Model MSE Values')
plt.xticks(rotation=45, ha='right')

# Annotate each bar with its value rounded to 4 decimals.
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height, round(height, 4),
             ha='center', va='bottom')

plt.savefig('model_mse_plot.png', bbox_inches='tight')

with open('additional_info.txt', 'w') as f:
    f.writelines(f"{item}\n" for item in additional_info)

# Fresh copies of the raw yearly files for the map section; the earlier frames
# have had their location columns removed.
yearly_urls = [
    'https://raw.githubusercontent.com/marymorkos/tnhealth/main/2019healthrankingstn.csv',
    'https://raw.githubusercontent.com/marymorkos/tnhealth/main/2020healthrankingstn.csv',
    'https://raw.githubusercontent.com/marymorkos/tnhealth/main/2021healthrankingstn.csv',
]
tnhealth20192, tnhealth20202, tnhealth20212 = [pd.read_csv(url) for url in yearly_urls]
github_url = yearly_urls[-1]

# Rename the misspelled 'Longtitude' header wherever it occurs.
header_fix = {'Longtitude': 'Longitude'}
tnhealth20192 = tnhealth20192.rename(columns=header_fix)
tnhealth20202 = tnhealth20202.rename(columns=header_fix)
tnhealth20212 = tnhealth20212.rename(columns=header_fix)

concatenated_df2 = pd.concat(
    [tnhealth20192, tnhealth20202, tnhealth20212],
    ignore_index=True,
)

tnhealth20192.head()

# NOTE: the original line here was "!pip install folium". That is IPython
# shell syntax and a SyntaxError in a plain .py file; install folium from a
# shell before running this section.
import folium

from folium.plugins import HeatMap

concatenated_df = pd.concat([tnhealth20192, tnhealth20202, tnhealth20212], ignore_index=True)

# Mean life expectancy per county across the three years, mapped back onto
# every row so each row carries its county's average.
county_life_expectancy = concatenated_df.groupby('County')['Life Expectancy'].mean()
life_expectancy = concatenated_df['County'].map(county_life_expectancy)

# Keep only rows that can actually be drawn: folium raises on NaN coordinates,
# and a NaN life expectancy cannot be coloured or weighted.
plottable = (
    concatenated_df['Latitude'].notna()
    & concatenated_df['Longitude'].notna()
    & life_expectancy.notna()
)
latitude = concatenated_df.loc[plottable, 'Latitude']
longitude = concatenated_df.loc[plottable, 'Longitude']
county_name = concatenated_df.loc[plottable, 'County']
life_expectancy = life_expectancy[plottable]

# The gradient stops and colour thresholds (0.4 / 0.65 / 1) are fractions, so
# life expectancy has to be rescaled to [0, 1] first. The original compared
# the raw values (around 75 years) with 0.4 and 0.65, which made every marker
# red.
life_min = life_expectancy.min()
life_span = life_expectancy.max() - life_min
if life_span > 0:
    life_scaled = (life_expectancy - life_min) / life_span
else:
    # All counties share one value: treat them all as the top of the scale.
    life_scaled = pd.Series(1.0, index=life_expectancy.index)

map_center = [latitude.mean(), longitude.mean()]
mymap = folium.Map(location=map_center, zoom_start=7)

# Heat layer weighted by the rescaled life expectancy.
heat_data = [[lat, lon, weight] for lat, lon, weight in zip(latitude, longitude, life_scaled)]
heatmap = HeatMap(heat_data, radius=15, blur=20, gradient={0.4: 'blue', 0.65: 'green', 1: 'red'})
mymap.add_child(heatmap)

# One marker per row: colour from the rescaled value, tooltip in real years.
for lat, lon, county, life, scaled in zip(latitude, longitude, county_name, life_expectancy, life_scaled):
    color = 'blue' if scaled <= 0.4 else 'green' if scaled <= 0.65 else 'red'
    folium.CircleMarker(
        location=[lat, lon],
        radius=5,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.7,
        tooltip=f"{county}<br>Average Life Expectancy: {life:.2f}",
    ).add_to(mymap)

mymap

"""# Correlation Matrix"""

# Reload the raw yearly files; earlier sections modified the previous frames
# in place.
yearly_urls = [
    'https://raw.githubusercontent.com/marymorkos/tnhealth/main/2019healthrankingstn.csv',
    'https://raw.githubusercontent.com/marymorkos/tnhealth/main/2020healthrankingstn.csv',
    'https://raw.githubusercontent.com/marymorkos/tnhealth/main/2021healthrankingstn.csv',
]
tnhealth2019, tnhealth2020, tnhealth2021 = [pd.read_csv(url) for url in yearly_urls]
github_url = yearly_urls[-1]

yearly_frames = [tnhealth2019, tnhealth2020, tnhealth2021]
concatenated_df = pd.concat(yearly_frames, ignore_index=True)

# Restrict the stacked table to columns present in all three years.
common_columns = set.intersection(*(set(frame.columns) for frame in yearly_frames))
concatenated_df = concatenated_df[list(common_columns)]

# Fill gaps in text columns with each column's most frequent value.
categorical_columns = concatenated_df.select_dtypes(include='object').columns
categorical_block = concatenated_df[categorical_columns]
most_frequent = categorical_block.mode().iloc[0]
concatenated_df[categorical_columns] = categorical_block.fillna(most_frequent)

# Remove every column whose name contains 'Z-Score' or 'CI'.
columns_to_drop = [
    name for name in concatenated_df.columns
    if 'Z-Score' in name or 'CI' in name
]
concatenated_df = concatenated_df.drop(columns=columns_to_drop)

numeric_df = concatenated_df.select_dtypes(include='number')

# Calculate correlation matrix
correlation_matrix = numeric_df.corr()

# Plot heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")
plt.show()

# Columns shown in the focused correlation heatmap. 'Life Expectancy' comes
# first because the plot is titled as a correlation matrix *with* life
# expectancy; the original selection left the target out, so the figure
# showed only feature-to-feature correlations.
selected_columns = [
    'Life Expectancy',
    '80th Percentile Income',
    '20th Percentile Income',
    '% Some College',
    '% Asian',
    'Dentist Rate',
    '# Non-Hispanic White',
    '% Vaccinated',
    '# Rural',
    '# Homeowners',
    'Labor Force',
    '# Primary Care Physicians',
    '# Associations',
    '% Excessive Drinking',
    '# Dentists',
    'Population: Demographics'
]

selected_df = concatenated_df[selected_columns]

# Pairwise correlations among the target and the selected features.
correlation_matrix = selected_df.corr()

plt.figure(figsize=(12, 10))

sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")

plt.title('Correlation Matrix with Life Expectancy')

plt.show()

"""# 3D Plot"""

import plotly.graph_objs as go
from plotly.subplots import make_subplots
import pandas as pd

# Stack the three yearly frames (the original per-frame index is kept).
combined_df = pd.concat([tnhealth2019, tnhealth2020, tnhealth2021])

# Columns listed in each point's hover box, in display order. Each one is
# rendered as "<column name>: <value>" on its own line under the county name.
hover_fields = [
    'Life Expectancy',
    '80th Percentile Income',
    '20th Percentile Income',
    '% Some College',
    '% Asian',
    '# Non-Hispanic White',
    '% Vaccinated',
    '# Rural',
    '# Homeowners',
    'Labor Force',
    '# Associations',
]

hover_text = 'County: ' + combined_df['County']
for field in hover_fields:
    hover_text = hover_text + '<br>' + field + ': ' + combined_df[field].astype(str)

# Axes: % Vaccinated, # Associations, % Some College; colour: life expectancy.
scatter = go.Scatter3d(
    x=combined_df['% Vaccinated'],
    y=combined_df['# Associations'],
    z=combined_df['% Some College'],
    mode='markers',
    marker=dict(
        size=5,
        color=combined_df['Life Expectancy'],
        colorscale='Viridis',
        opacity=0.8,
        colorbar=dict(
            title='Life Expectancy',  # Colorbar title
            tickfont=dict(size=10)  # Font size of the colorbar tick labels
        )
    ),
    hoverinfo='text',
    text=hover_text,
    name='Data'
)

fig = go.Figure(data=[scatter])

fig.update_layout(
    title='Interactive 3D Scatter Plot',
    scene=dict(
        xaxis=dict(title='% Vaccinated'),
        yaxis=dict(title='# Associations'),
        zaxis=dict(title='% Some College')
    )
)

fig.show()
# Written once; the original wrote the same file twice in a row.
fig.write_html('3d_plot.html')

"""# Feature Selection"""

import pandas as pd

# Reload the raw yearly files for the feature-selection section.
github_url = 'https://raw.githubusercontent.com/marymorkos/tnhealth/main/2019healthrankingstn.csv'
tnhealth2019 = pd.read_csv(github_url)

github_url = 'https://raw.githubusercontent.com/marymorkos/tnhealth/main/2020healthrankingstn.csv'
tnhealth2020 = pd.read_csv(github_url)

github_url = 'https://raw.githubusercontent.com/marymorkos/tnhealth/main/2021healthrankingstn.csv'
tnhealth2021 = pd.read_csv(github_url)

# Rename the misspelled 'Longtitude' header wherever it occurs.
tnhealth2019 = tnhealth2019.rename(columns={'Longtitude': 'Longitude'})
tnhealth2020 = tnhealth2020.rename(columns={'Longtitude': 'Longitude'})
tnhealth2021 = tnhealth2021.rename(columns={'Longtitude': 'Longitude'})

yearly_frames = [tnhealth2019, tnhealth2020, tnhealth2021]

# Columns present in the previously cleaned concatenated_df AND in every year.
common_columns = set(concatenated_df.columns)
for yearly_frame in yearly_frames:
    common_columns = common_columns.intersection(yearly_frame.columns)

keywords_to_drop = ['CI', 'Z-Score']

for yearly_frame in yearly_frames:
    # Keep only the shared columns.
    non_common_columns = set(yearly_frame.columns) - common_columns
    yearly_frame.drop(columns=list(non_common_columns), inplace=True)

    # Mean-impute numeric gaps within the year.
    numeric_columns = yearly_frame.select_dtypes(include=['number']).columns
    yearly_frame[numeric_columns] = yearly_frame[numeric_columns].fillna(
        yearly_frame[numeric_columns].mean()
    )

    # Drop any column that still has missing values.
    nan_columns = yearly_frame.columns[yearly_frame.isna().any()].tolist()
    yearly_frame.drop(columns=nan_columns, inplace=True)

    # Drop interval / z-score columns in every year. The original ran this
    # step once after the loop, on the leaked loop variable, so it only
    # touched the 2021 frame.
    columns_to_drop = [
        col for col in yearly_frame.columns
        if any(keyword in col for keyword in keywords_to_drop)
    ]
    yearly_frame.drop(columns=columns_to_drop, inplace=True)

# `df` feeds the feature selection that follows. It used to be whatever frame
# the last loop left behind (2021 only); build it explicitly from all three
# prepared years. join='inner' keeps only columns that survived in every
# year, so the stacked table has no NaN introduced by the concat.
df = pd.concat(yearly_frames, join='inner', ignore_index=True)

df.head()

import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression

# Predictors are every numeric column of `df` except the target.
target_column = 'Life Expectancy'
numeric_columns = df.select_dtypes(include=['number']).columns
y = df[target_column]
X = df[numeric_columns].drop(columns=[target_column])  # Features

# Univariate selection: keep the k predictors with the highest F-statistic
# against the target.
k = 5  # Number of features to select
selector = SelectKBest(score_func=f_regression, k=k)
X_selected = selector.fit_transform(X, y)

# Positions, then names, of the columns that were kept.
selected_indices = selector.get_support(indices=True)
selected_features = X.columns[selected_indices]

print("Selected Features:", selected_features)

import matplotlib.pyplot as plt

# Scores of the kept features only.
feature_scores = selector.scores_[selected_indices]

plt.figure(figsize=(10, 6))
plt.bar(selected_features, feature_scores, color='skyblue')
plt.title('Feature Scores')
plt.xlabel('Features')
plt.ylabel('Score')
plt.xticks(rotation=45, ha='right')
plt.show()