from warnings import filterwarnings
filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Load the raw event log. The previews recorded further down show three
# columns: Patient-Uid, Date and Incident.
df = pd.read_parquet('train.parquet')
# Preview the first rows.
df.head()

# (rows, columns) of the raw data, before any cleaning.
df.shape

(3220868, 3)

# How many rows are exact repeats of an earlier row?
df.duplicated().sum()

35571

# How many index labels occur more than once?
df.index.duplicated().sum()

166693

# Remove the repeated rows and renumber the index so every row has a unique
# label again.
df = df.drop_duplicates().reset_index(drop=True)
df.shape

(3185297, 3)

# Keep only the TARGET DRUG events. The explicit .copy() gives positive_set
# its own data, so later column drops, sorts and column assignments modify
# positive_set itself instead of a boolean-mask slice of df (chained
# assignment, which pandas reports as SettingWithCopyWarning and which is
# easy to miss here because warnings are globally silenced).
positive_set = df[df['Incident'] == 'TARGET DRUG'].copy()
print(positive_set.shape)
positive_set.head()

(67218, 3)

# Every remaining row is a TARGET DRUG event, so the Incident column is
# constant and can go.
positive_set = positive_set.drop(columns='Incident')

# Order events per patient and chronologically, so that a row-to-row
# difference inside a patient is the gap between consecutive prescriptions.
positive_set = positive_set.sort_values(['Patient-Uid', 'Date'])

# Gap to the same patient's previous TARGET DRUG event. The first event of
# each patient has no predecessor and therefore gets a missing value.
positive_set['TimeInterval'] = positive_set.groupby('Patient-Uid')['Date'].diff()

# Work on an independent frame that only holds rows with a defined gap.
prescription_patterns = positive_set.dropna(subset=['TimeInterval']).copy()

gaps = prescription_patterns['TimeInterval']
print(gaps.min(), gaps.max())

1 days 00:00:00 1219 days 00:00:00

# Earliest and latest event date across the whole (de-duplicated) dataset.
print(*df['Date'].agg(['min', 'max']))

2015-04-07 00:00:00 2020-09-03 00:00:00

# Earliest and latest TARGET DRUG event date.
print(*positive_set['Date'].agg(['min', 'max']))

2017-02-22 00:00:00 2020-09-03 00:00:00

# Single clustering feature: the gap between consecutive prescriptions,
# expressed explicitly as a float number of days. Passing the raw
# timedelta64[ns] values instead makes KMeans work on implicitly cast
# nanosecond counts, which is why inertia values recorded from such a run are
# of the order of 1e35 (nanoseconds squared) and impossible to interpret.
# With this conversion the inertia is reported in days squared.
X = (prescription_patterns['TimeInterval'] / pd.Timedelta(days=1)).to_numpy().reshape(-1, 1)

k_values = range(1, 11)
inertia_scores = []
cluster_mapping = {}

# Elbow method: fit one model per candidate k and record its inertia
# (within-cluster sum of squared distances).
for k in k_values:
    # init defaults to k-means++; n_init='auto' then performs a single run.
    kmeans = KMeans(n_clusters=k, n_init='auto', random_state=42)
    kmeans.fit(X)

    inertia_score = kmeans.inertia_
    inertia_scores.append(inertia_score)

    cluster_mapping[f'{k} Cluster' if k == 1 else f'{k} Clusters'] = inertia_score

print(cluster_mapping)

# Inertia versus k; the "elbow" of this curve suggests the number of clusters.
sns.set_theme()
plt.figure(figsize=(10, 7))
plt.plot(k_values, inertia_scores, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia Score')
plt.title('Elbow Method: Inertia Scores')
plt.show()

{'1 Cluster': 8.596245577653992e+35, '2 Clusters': 4.237133518141459e+35, '3 Clusters': 2.4248190479612254e+35, '4 Clusters': 1.3773206859526056e+35, '5 Clusters': 1.165582461295304e+35, '6 Clusters': 7.009182021866456e+34, '7 Clusters': 5.067819584613085e+34, '8 Clusters': 4.035829757754878e+34, '9 Clusters': 3.079275106145406e+34, '10 Clusters': 2.6283973130400936e+34}

# Final model with 4 clusters. n_init='auto' and random_state=42 match the
# settings used in the elbow search, so this is the same k=4 model whose
# inertia was evaluated there (without n_init, scikit-learn 1.2/1.3 would run
# a different number of initialisations and emit a FutureWarning).
kmeans = KMeans(n_clusters=4, n_init='auto', random_state=42)

# KMeans labels start at 0; shift them so clusters are numbered from 1 for
# readability.
prescription_patterns['Cluster'] = kmeans.fit_predict(X) + 1

# Share of prescription gaps that falls into each cluster, in percent.
prescription_patterns['Cluster'].value_counts().plot.pie(autopct='%.2f')

<Axes: ylabel='Cluster'>

prescription_patterns.head()

# Segregating the clusters into separate dataframes (labels 1 to 4) so each
# of them can be visualized with ease...

cluster_1, cluster_2, cluster_3, cluster_4 = (
    prescription_patterns[prescription_patterns['Cluster'] == label]
    for label in range(1, 5)
)

def _monthly_counts(cluster):
    """Return the number of rows of `cluster` per month of its 'Date' column."""
    return cluster.groupby(pd.Grouper(key='Date', freq='M')).size()


# Prescriptions per calendar month, separately for every cluster.
cluster_1_counts = _monthly_counts(cluster_1)
cluster_2_counts = _monthly_counts(cluster_2)
cluster_3_counts = _monthly_counts(cluster_3)
cluster_4_counts = _monthly_counts(cluster_4)

plt.figure(figsize=(10, 8))

# One line per cluster, drawn in label order so the legend reads 1..4.
all_counts = (cluster_1_counts, cluster_2_counts, cluster_3_counts, cluster_4_counts)
for cluster_label, monthly in enumerate(all_counts, start=1):
    plt.plot(monthly.index, monthly.values, label=f'Cluster {cluster_label}')

plt.xlabel('Period')
plt.ylabel('Number of Prescriptions')
plt.title('Number of Prescriptions overall as time progresses')
plt.legend()

plt.show()

def _with_interval_month(cluster):
    """Return a new frame equal to `cluster` plus an integer 'Month' column.

    'Month' is the prescription gap ('TimeInterval') converted from days to
    whole months, truncating the fraction. 30.44 is used as the length of a
    month in days (presumably 365.25 / 12, rounded).

    `assign` builds a new DataFrame, so the column is never written into a
    filtered slice of another frame (chained assignment).
    """
    months = (cluster['TimeInterval'].dt.days / 30.44).astype(int)
    return cluster.assign(Month=months)


def _average_prescriptions_by_month(cluster):
    """Return the mean number of prescriptions per patient for each 'Month'.

    Rows are first counted per (Month, Patient-Uid) pair; those per-patient
    counts are then averaged within each Month. The result has the columns
    'Month' and 'Average Prescriptions'.
    """
    per_patient = cluster.groupby(['Month', 'Patient-Uid']).size()
    return per_patient.groupby('Month').mean().reset_index(name='Average Prescriptions')


cluster_1 = _with_interval_month(cluster_1)
cluster_2 = _with_interval_month(cluster_2)
cluster_3 = _with_interval_month(cluster_3)
cluster_4 = _with_interval_month(cluster_4)

# Calculating the avg no. of prescriptions per unique patient for each month in each cluster

cluster_1_prescription = _average_prescriptions_by_month(cluster_1)
cluster_2_prescription = _average_prescriptions_by_month(cluster_2)
cluster_3_prescription = _average_prescriptions_by_month(cluster_3)
cluster_4_prescription = _average_prescriptions_by_month(cluster_4)

def _average_prescriptions_figure(cluster_prescription, cluster_no, error_y):
    """Build the 'Average Prescriptions' line chart for one cluster.

    Parameters:
        cluster_prescription: frame with the columns 'Month' and
            'Average Prescriptions'.
        cluster_no: cluster number shown in the chart title.
        error_y: array of error-bar heights, one per row of
            `cluster_prescription`.

    Returns:
        The plotly Figure (not yet shown).
    """
    figure = go.Figure()
    figure.add_trace(go.Scatter(
        x=cluster_prescription['Month'],
        y=cluster_prescription['Average Prescriptions'],
        error_y=dict(type='data', array=error_y, visible=True),
        mode='markers+lines',
        marker={'size': 16}
    ))

    figure.update_layout(
        title=f'Average Prescriptions for Cluster {cluster_no}',
        xaxis_title='Month',
        yaxis_title='Average Prescriptions'
    )
    return figure


# One chart per cluster, in label order. The module-level names std, error_y
# and fig are rebound on every pass, so after the loop they hold the values
# for cluster 4.
for cluster_no, cluster_prescription in enumerate(
    (
        cluster_1_prescription,
        cluster_2_prescription,
        cluster_3_prescription,
        cluster_4_prescription,
    ),
    start=1,
):
    # standard deviation calculation for errorbar: the spread of the monthly
    # averages, applied as the same error-bar height to every point.
    std = cluster_prescription['Average Prescriptions'].std()
    error_y = np.full(len(cluster_prescription), std)

    fig = _average_prescriptions_figure(cluster_prescription, cluster_no, error_y)
    fig.show()

	Patient-Uid	Date	Incident
0	a0db1e73-1c7c-11ec-ae39-16262ee38c7f	2019-03-09	PRIMARY_DIAGNOSIS
1	a0dc93f2-1c7c-11ec-9cd2-16262ee38c7f	2015-05-16	PRIMARY_DIAGNOSIS
3	a0dc94c6-1c7c-11ec-a3a0-16262ee38c7f	2018-01-30	SYMPTOM_TYPE_0
4	a0dc950b-1c7c-11ec-b6ec-16262ee38c7f	2015-04-22	DRUG_TYPE_0
8	a0dc9543-1c7c-11ec-bb63-16262ee38c7f	2016-06-18	DRUG_TYPE_1

	Patient-Uid	Date	Incident
2065342	a0eb742b-1c7c-11ec-8f61-16262ee38c7f	2020-04-09	TARGET DRUG
2065362	a0edaf09-1c7c-11ec-a360-16262ee38c7f	2018-06-12	TARGET DRUG
2065502	a0e9fa0e-1c7c-11ec-8dc7-16262ee38c7f	2019-06-11	TARGET DRUG
2065613	a0ecc615-1c7c-11ec-aa31-16262ee38c7f	2019-11-15	TARGET DRUG
2065618	a0ea612f-1c7c-11ec-8cf0-16262ee38c7f	2020-03-18	TARGET DRUG

	Patient-Uid	Date	TimeInterval	Cluster
2094649	a0e9c384-1c7c-11ec-81a0-16262ee38c7f	2020-08-05	28 days	3
2164002	a0e9c384-1c7c-11ec-81a0-16262ee38c7f	2020-09-02	28 days	3
2637552	a0e9c3b3-1c7c-11ec-ae8e-16262ee38c7f	2018-05-17	23 days	3
3171058	a0e9c3b3-1c7c-11ec-ae8e-16262ee38c7f	2018-06-13	27 days	3
2375328	a0e9c3b3-1c7c-11ec-ae8e-16262ee38c7f	2018-08-07	55 days	1

Patient Segmentation Based on Prescription Patterns¶

Importing Necessary Libraries:¶

Reading the data, Understanding the data and Addressing basic inconsistencies if any...¶

Pulling just the positive set out of original dataset...¶

Engineering new feature...¶

Elbow Plot to find ideal no. of clusters:¶

Clustering:¶

Instant observation:¶

Some Segregation and Grouping before we get into the visualization part...¶

Some first line of visualization before we dig deeper into the clusters...¶

Inferences from above visualization:¶

Creating new feature - Month from TimeInterval Column...¶

Visualizations explaining the pattern in each cluster with respect to average prescriptions every month:¶

Insights gained:¶

Insights gained:¶

Insights gained:¶

Insights gained:¶

Overall Summary:¶