# Import the necessary packages for data analysis
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn import metrics


# Load the survey responses (mental_health_tech_2014.csv) into a dataframe
# and preview the first rows.
survey_csv_path = r'/home/jovyan/notebooks/mental_health_tech_2014.csv'
df1 = pd.read_csv(survey_csv_path)
df1.head()


# Report how much of the survey data is missing.
is_NaN = df1.isnull()
# Keep only the rows that contain at least one null value.
rows_with_NaN = df1[is_NaN.any(axis=1)]
# `DataFrame.count` is a method, so printing it uncalled only shows its repr;
# report the number of affected rows, then the null count of every column.
print(f"Rows with at least one NaN: {len(rows_with_NaN)} of {len(df1)}")
print(is_NaN.sum())

<bound method DataFrame.count of                 Timestamp  Age  Gender         Country state self_employed  \
0     2014-08-27 11:29:31   37  female   United States    IL           NaN   
1     2014-08-27 11:29:37   44    male   United States    IN           NaN   
2     2014-08-27 11:29:44   32    male          Canada   NaN           NaN   
3     2014-08-27 11:29:46   31    male  United Kingdom   NaN           NaN   
4     2014-08-27 11:30:22   31    male   United States    TX           NaN   
...                   ...  ...     ...             ...   ...           ...   
1254  2015-09-12 11:17:21   26    male  United Kingdom   NaN            No   
1255  2015-09-26 01:07:35   32    male   United States    IL            No   
1256  2015-11-07 12:36:58   34    male   United States    CA            No   
1257  2015-11-30 21:25:06   46  female   United States    NC            No   
1258  2016-02-01 23:04:31   25    male   United States    IL            No   

     family_history treatment work_interfere    no_employees  ...  \
0                No       Yes          Often            6-25  ...   
1                No        No         Rarely  More than 1000  ...   
2                No        No         Rarely            6-25  ...   
3               Yes       Yes          Often          26-100  ...   
4                No        No          Never         100-500  ...   
...             ...       ...            ...             ...  ...   
1254             No       Yes            NaN          26-100  ...   
1255            Yes       Yes          Often          26-100  ...   
1256            Yes       Yes      Sometimes  More than 1000  ...   
1257             No        No            NaN         100-500  ...   
1258            Yes       Yes      Sometimes          26-100  ...   

                   leave mental_health_consequence phys_health_consequence  \
0          Somewhat easy                        No                      No   
1             Don't know                     Maybe                      No   
2     Somewhat difficult                        No                      No   
3     Somewhat difficult                       Yes                     Yes   
4             Don't know                        No                      No   
...                  ...                       ...                     ...   
1254       Somewhat easy                        No                      No   
1255  Somewhat difficult                        No                      No   
1256  Somewhat difficult                       Yes                     Yes   
1257          Don't know                       Yes                      No   
1258          Don't know                     Maybe                      No   

         coworkers    supervisor mental_health_interview  \
0     Some of them           Yes                      No   
1               No            No                      No   
2              Yes           Yes                     Yes   
3     Some of them            No                   Maybe   
4     Some of them           Yes                     Yes   
...            ...           ...                     ...   
1254  Some of them  Some of them                      No   
1255  Some of them           Yes                      No   
1256            No            No                      No   
1257            No            No                      No   
1258  Some of them            No                      No   

     phys_health_interview mental_vs_physical obs_consequence comments  
0                    Maybe                Yes              No      NaN  
1                       No         Don't know              No      NaN  
2                      Yes                 No              No      NaN  
3                    Maybe                 No             Yes      NaN  
4                      Yes         Don't know              No      NaN  
...                    ...                ...             ...      ...  
1254                    No         Don't know              No      NaN  
1255                    No                Yes              No      NaN  
1256                    No                 No              No      NaN  
1257                    No                 No              No      NaN  
1258                    No         Don't know              No      NaN  

[1173 rows x 27 columns]>


# Keep only respondents who answered 'No' to self_employed and whose Country
# is 'United States'. Rows where self_employed is NaN fail the comparison and
# are dropped as well.
is_employee = df1['self_employed'] == 'No'
is_us_based = df1['Country'] == 'United States'
df1 = df1[is_employee & is_us_based]

# Remove the columns that are not used in the analysis below. The two filter
# columns are constant after the step above, so they go too.
unused_columns = [
    'comments',
    'phys_health_interview',
    'mental_health_interview',
    'state',
    'self_employed',
    'Timestamp',
    'Country',
]
df1.drop(unused_columns, axis=1, inplace=True)


# Integer-encode the categorical survey answers. Series.map leaves any answer
# that is missing from its lookup table as NaN.
yes_no = {'No': 0, 'Yes': 1}
yes_no_dont_know = {'No': 0, 'Yes': 1, "Don't know": 0}
yes_no_maybe = {'No': 0, 'Yes': 1, 'Maybe': 2}
yes_no_some = {'No': 0, 'Yes': 1, 'Some of them': 2}

answer_codes = {
    'Gender': {
        'Male': 0, 'male': 0, 'm': 0, 'M': 0,
        'f': 1, 'F': 1, 'female': 1, 'Female': 1,
        'Other': 2, 'other': 2,
    },
    'family_history': yes_no,
    'treatment': yes_no,
    'remote_work': yes_no,
    'tech_company': yes_no,
    'benefits': yes_no_dont_know,
    'care_options': {'No': 0, 'Yes': 1, 'Not sure': 0},
    'wellness_program': yes_no_dont_know,
    'seek_help': yes_no_dont_know,
    'anonymity': yes_no_dont_know,
    'leave': {
        'Very easy': 0,
        'Somewhat easy': 1,
        'Somewhat difficult': 2,
        'Very difficult': 3,
        "Don't know": 0,
    },
    'mental_health_consequence': yes_no_maybe,
    'phys_health_consequence': yes_no_maybe,
    'coworkers': yes_no_some,
    'supervisor': yes_no_some,
    'mental_vs_physical': yes_no_dont_know,
    'obs_consequence': yes_no,
    'work_interfere': {'Often': 0, 'Sometimes': 1, 'Rarely': 2, 'Never': 3, 'NA': 4},
}

# Unanswered work_interfere entries get the placeholder 'NA' first so that
# they are encoded as 4 instead of staying NaN.
df1['work_interfere'] = df1['work_interfere'].fillna(value='NA')

for column_name, codes in answer_codes.items():
    df1[column_name] = df1[column_name].map(codes)


# Add a 0/1 Mental_Health flag: 0 when work_interfere carries code 4 (the
# placeholder for an unanswered question), 1 otherwise. An answered
# work_interfere question is used here as the indicator that the respondent
# has a mental health condition.
answered_work_interfere = df1['work_interfere'] != 4
df1['Mental_Health'] = answered_work_interfere.astype(int)


# Preview the encoded dataframe.
df1.head()


# Split the respondents on the Mental_Health flag. Each half is an independent
# copy, so later edits to one do not touch df1 or the other half.
flagged_rows = df1['Mental_Health'] == 1
unflagged_rows = df1['Mental_Health'] == 0

# Respondents flagged as having a mental health condition
dfMHealth = df1.loc[flagged_rows].copy()

# Respondents flagged as not having a mental health condition
dfNoMHealth = df1.loc[unflagged_rows].copy()


# Heatmap of the pairwise correlations between the numeric attributes of the
# respondents flagged with a mental health condition.
# Only numeric columns are passed to corr(): columns that are still text
# (e.g. no_employees, which is not encoded in the visible preprocessing) make
# DataFrame.corr() raise on pandas >= 2.0, where non-numeric columns are no
# longer dropped silently.
numeric_attributes = dfMHealth.select_dtypes(include='number')
plt.figure(figsize=(15, 15))
sns.heatmap(numeric_attributes.corr(), annot=True, cmap='Reds')
plt.show()


# Split the respondents with a mental health condition by employer benefits.
# NOTE: the variable names are kept as-is because later cells reference them,
# but they read backwards: dfBenefits is the group WITHOUT benefits.
# Respondents coded benefits == 0 (answered "No" or "Don't know")
dfBenefits = dfMHealth[dfMHealth['benefits'] == 0]

# Respondents coded benefits == 1 (answered "Yes")
dfNBenefits = dfMHealth[dfMHealth['benefits'] == 1]

# One pie chart per group showing how often respondents feel their mental
# health interferes with their work. work_interfere codes 0..3 correspond to
# the labels below, in order. Each title matches the group it is drawn from.
labels = ['Often', 'Sometimes', 'Rarely', 'Never']
colors = sns.color_palette('flare')[0:5]
for group, title in (
    (dfBenefits, 'Work Interference by Mental Health From Employees Not Receiving Benefits'),
    (dfNBenefits, 'Work Interference by Mental Health From Employees Receiving Benefits'),
):
    data = [len(group[group['work_interfere'] == code]) for code in range(4)]
    plt.pie(data, labels=labels, colors=colors, autopct='%.0f%%')
    plt.title(title)
    plt.show()


# Independent two-sample t-test comparing the encoded work_interfere scores of
# the benefits == 0 group against the benefits == 1 group.
benefits_ttest = stats.ttest_ind(
    dfBenefits['work_interfere'],
    dfNBenefits['work_interfere'],
)
print(benefits_ttest)

Ttest_indResult(statistic=-0.3169607944450189, pvalue=0.7513949525643654)


# Split the respondents with a mental health condition by whether they know
# of mental health care options provided by their employer.
# NOTE: the variable names are kept as-is because later cells reference them,
# but they read backwards: dfCare is the group WITHOUT known care options.
# Respondents coded care_options == 0 (answered "No" or "Not sure")
dfCare = dfMHealth[dfMHealth['care_options'] == 0]

# Respondents coded care_options == 1 (answered "Yes")
dfNCare = dfMHealth[dfMHealth['care_options'] == 1]

# One pie chart per group showing how often respondents feel their mental
# health interferes with their work. work_interfere codes 0..3 correspond to
# the labels below, in order. Each title matches the group it is drawn from.
labels = ['Often', 'Sometimes', 'Rarely', 'Never']
colors = sns.color_palette('flare')[0:5]
for group, title in (
    (dfCare, 'Work Interference by Mental Health From Employees With Unknown Mental Health Care Options'),
    (dfNCare, 'Work Interference by Mental Health From Employees With Known Mental Health Care Options'),
):
    data = [len(group[group['work_interfere'] == code]) for code in range(4)]
    plt.pie(data, labels=labels, colors=colors, autopct='%.0f%%')
    plt.title(title)
    plt.show()


# Independent two-sample t-test comparing the encoded work_interfere scores of
# the care_options == 0 group against the care_options == 1 group.
care_ttest = stats.ttest_ind(
    dfCare['work_interfere'],
    dfNCare['work_interfere'],
)
print(care_ttest)

Ttest_indResult(statistic=1.7198200503223497, pvalue=0.08603391717192009)


# Split the respondents with a mental health condition by whether they know
# of an employer wellness program.
# NOTE: the variable names are kept as-is because later cells reference them,
# but they read backwards: dfWellness is the group WITHOUT a known program.
# Respondents coded wellness_program == 0 (answered "No" or "Don't know")
dfWellness = dfMHealth[dfMHealth['wellness_program'] == 0]

# Respondents coded wellness_program == 1 (answered "Yes")
dfNWellness = dfMHealth[dfMHealth['wellness_program'] == 1]

# One pie chart per group showing how often respondents feel their mental
# health interferes with their work. work_interfere codes 0..3 correspond to
# the labels below, in order. Each title matches the group it is drawn from.
labels = ['Often', 'Sometimes', 'Rarely', 'Never']
colors = sns.color_palette('flare')[0:5]
for group, title in (
    (dfWellness, 'Work Interference by Mental Health From Employees With Unknown Wellness Program'),
    (dfNWellness, 'Work Interference by Mental Health From Employees With Known Wellness Program'),
):
    data = [len(group[group['work_interfere'] == code]) for code in range(4)]
    plt.pie(data, labels=labels, colors=colors, autopct='%.0f%%')
    plt.title(title)
    plt.show()


# Independent two-sample t-test comparing the encoded work_interfere scores of
# the wellness_program == 0 group against the wellness_program == 1 group.
wellness_ttest = stats.ttest_ind(
    dfWellness['work_interfere'],
    dfNWellness['work_interfere'],
)
print(wellness_ttest)

Ttest_indResult(statistic=-1.894436439895522, pvalue=0.058697193222362706)


# Split the respondents with a mental health condition by whether their
# employer provides resources to learn about mental health and seek help.
# NOTE: the variable names are kept as-is because later cells reference them,
# but they read backwards: dfHelp is the group WITHOUT such resources.
# Respondents coded seek_help == 0 (answered "No" or "Don't know")
dfHelp = dfMHealth[dfMHealth['seek_help'] == 0]

# Respondents coded seek_help == 1 (answered "Yes")
dfNHelp = dfMHealth[dfMHealth['seek_help'] == 1]

# One pie chart per group showing how often respondents feel their mental
# health interferes with their work. work_interfere codes 0..3 correspond to
# the labels below, in order. Each title matches the group it is drawn from.
labels = ['Often', 'Sometimes', 'Rarely', 'Never']
colors = sns.color_palette('flare')[0:5]
for group, title in (
    (dfHelp, 'Work Interference by Mental Health From Employees With No Resources to Seek Help'),
    (dfNHelp, 'Work Interference by Mental Health From Employees With Resources to Seek Help'),
):
    data = [len(group[group['work_interfere'] == code]) for code in range(4)]
    plt.pie(data, labels=labels, colors=colors, autopct='%.0f%%')
    plt.title(title)
    plt.show()


# Independent two-sample t-test comparing the encoded work_interfere scores of
# the seek_help == 0 group against the seek_help == 1 group.
help_ttest = stats.ttest_ind(
    dfHelp['work_interfere'],
    dfNHelp['work_interfere'],
)
print(help_ttest)

Ttest_indResult(statistic=-1.520402718856143, pvalue=0.12899078065210312)


# Model the encoded work_interfere answer from two employer-support flags,
# using only respondents flagged with a mental health condition.
dfLRegression = dfMHealth[['wellness_program', 'care_options']]
X, y = dfLRegression, dfMHealth['work_interfere']

# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Fit a logistic regression on the training rows and predict the held-out
# rows. fit() returns the estimator itself, so `reg` and `logreg` are the
# same fitted object.
logreg = LogisticRegression()
reg = logreg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# Ordinary least squares on the full (unsplit) data with an intercept column
# added, printed as a summary of how "wellness_program" and "care_options"
# relate to "work_interfere".
X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
ols_summary = est2.summary()
print(ols_summary)

                            OLS Regression Results                            
==============================================================================
Dep. Variable:         work_interfere   R-squared:                       0.016
Model:                            OLS   Adj. R-squared:                  0.012
Method:                 Least Squares   F-statistic:                     4.393
Date:                Sun, 24 Oct 2021   Prob (F-statistic):             0.0128
Time:                        05:18:22   Log-Likelihood:                -746.32
No. Observations:                 546   AIC:                             1499.
Df Residuals:                     543   BIC:                             1512.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                1.5002      0.057     26.199      0.000       1.388       1.613
wellness_program     0.2367      0.098      2.409      0.016       0.044       0.430
care_options        -0.1919      0.084     -2.274      0.023      -0.358      -0.026
==============================================================================
Omnibus:                       79.087   Durbin-Watson:                   2.101
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               28.584
Skew:                           0.331   Prob(JB):                     6.21e-07
Kurtosis:                       2.096   Cond. No.                         2.96
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.


# Confusion matrix of the held-out predictions against the actual answers.
# The target (work_interfere among respondents with a condition) has four
# encoded classes, 0..3. The labels are passed explicitly so the matrix is
# always 4x4, even if a class is absent from both y_test and y_pred.
class_codes = [0, 1, 2, 3]
class_names = ['Often', 'Sometimes', 'Rarely', 'Never']
cnf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=class_codes)

# Show the confusion matrix as a heatmap for better visualization. The row and
# column names of the DataFrame become the tick labels, so no manual tick
# setup is needed (sns.heatmap replaces ticks set beforehand anyway).
fig, ax = plt.subplots()
sns.heatmap(
    pd.DataFrame(cnf_matrix, index=class_names, columns=class_names),
    annot=True,
    cmap='Reds',
    fmt='g',
    ax=ax,
)
ax.xaxis.set_label_position("top")
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual Response')
plt.xlabel('Predicted Response')
# Lay out after the title and axis labels exist so they are accounted for.
plt.tight_layout()
plt.show()

Text(0.5, 257.44, 'Predicted Response')


# Share of held-out rows whose predicted work_interfere class equals the
# actual class.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.44525547445255476

The Influence of Tech Work Environments on Mental Health

Authors: Pearl Hwang, Xue Qiu, Ronak Thakur, Tony Yao

Introduction

Data Overview

Initialization of Files

Tidying & Cleaning the Data

Data Analysis and Visualization

Conclusion and Moving Forward

	Timestamp	Age	Gender	Country	state	self_employed	family_history	treatment	work_interfere	no_employees	...	leave	mental_health_consequence	phys_health_consequence	coworkers	supervisor	mental_health_interview	phys_health_interview	mental_vs_physical	obs_consequence	comments
0	2014-08-27 11:29:31	37	female	United States	IL	NaN	No	Yes	Often	6-25	...	Somewhat easy	No	No	Some of them	Yes	No	Maybe	Yes	No	NaN
1	2014-08-27 11:29:37	44	male	United States	IN	NaN	No	No	Rarely	More than 1000	...	Don't know	Maybe	No	No	No	No	No	Don't know	No	NaN
2	2014-08-27 11:29:44	32	male	Canada	NaN	NaN	No	No	Rarely	6-25	...	Somewhat difficult	No	No	Yes	Yes	Yes	Yes	No	No	NaN
3	2014-08-27 11:29:46	31	male	United Kingdom	NaN	NaN	Yes	Yes	Often	26-100	...	Somewhat difficult	Yes	Yes	Some of them	No	Maybe	Maybe	No	Yes	NaN
4	2014-08-27 11:30:22	31	male	United States	TX	NaN	No	No	Never	100-500	...	Don't know	No	No	Some of them	Yes	Yes	Yes	Don't know	No	NaN