


# https://www.globalsino.com/ICs/
# Detecting Anomalies in Wafer Manufacture

import time
import numpy as np  
import pandas as pd 
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from scipy.stats.mstats import winsorize
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.metrics import ConfusionMatrixDisplay

from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

import os

# Locations of the training and test CSV files.
trianFile = r"C:\GlobalSino2\ICs\datasets\Detect_Anomalies_Wafer_Train.csv"
testFile = r"C:\GlobalSino2\ICs\datasets\Detect_Anomalies_Wafer_Test.csv"

train_data = pd.read_csv(trianFile)
test_data = pd.read_csv(testFile)

# Combine the two files: training rows first, test rows after, with a fresh
# 0..n-1 index.
all_data = pd.concat([train_data, test_data], ignore_index=True)

# Names of the columns that contain at least one missing value.  The list is
# only collected here; no rows or columns are removed because of it.
null_counts = all_data.isnull().sum()
missing_value_feature_all_data = [
    column_name
    for column_name, null_count in null_counts.items()
    if null_count != 0
]

# Separate frame without the three named features and the label column;
# all_data itself is left untouched.
all_df = all_data.drop(columns=['feature_1', 'feature_2', 'feature_3', 'Class'])

# Heat map of all_df, saved to disk and then shown.
f, ax = plt.subplots(figsize=(10, 10))
heatmap_axes = sns.heatmap(all_df, ax=ax)
fig = heatmap_axes.figure
fig.savefig(r"C:\GlobalSino2\ICs\images2\4035xyz.png")
fig.show()

# =================

# Histogram of the training labels.  It is drawn on a figure of its own:
# without an explicit target, seaborn plots onto the current axes, i.e. the
# axes of whichever figure was created last, and the saved image would then
# contain that earlier plot as well.
fig_train, ax_train = plt.subplots()
sns.histplot(train_data['Class'], ax=ax_train)
fig_train.savefig(r"C:\GlobalSino2\ICs\images2\4035fig_train.png")

# Histograms of the three named features over all rows, one panel each.
f_a, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 4))
for feature_name, panel in zip(('feature_1', 'feature_2', 'feature_3'),
                               (ax1, ax2, ax3)):
    sns.histplot(data=all_data[feature_name], ax=panel)

# The three panels share one figure, so a single save writes all of them.
f_a.savefig(r"C:\GlobalSino2\ICs\images2\4035fig_ax1.png")

# =================

# Box plot of each named feature over all rows, labelled with its name.
f_c, (ax1_c, ax2_c, ax3_c) = plt.subplots(1, 3, figsize=(20, 4))
for feature_name, panel in zip(('feature_1', 'feature_2', 'feature_3'),
                               (ax1_c, ax2_c, ax3_c)):
    box_axes = sns.boxplot(data=all_data[feature_name], ax=panel)
    box_axes.set(ylabel=feature_name)

f_c.savefig(r"C:\GlobalSino2\ICs\images2\4035f_c.png")

# =================

# Box plot of each named feature in the training data, grouped by class.
f_d, (ax1_d, ax2_d, ax3_d) = plt.subplots(1, 3, figsize=(20, 4))
train_classes = train_data['Class']
for feature_name, panel in zip(('feature_1', 'feature_2', 'feature_3'),
                               (ax1_d, ax2_d, ax3_d)):
    sns.boxplot(x=train_classes, y=train_data[feature_name], ax=panel)

f_d.savefig(r"C:\GlobalSino2\ICs\images2\4035f_d.png")

# =================

# Winsorize feature_1 and feature_3: the lowest and highest 5 % of the values
# are replaced by the nearest value that is kept.  The result is assigned
# back explicitly.  Passing a DataFrame column with inplace=True only changes
# the frame when the array SciPy receives happens to share memory with it,
# which pandas does not guarantee, so the clipping could silently be lost.
all_data_copy = all_data.copy()
for feature_name in ('feature_1', 'feature_3'):
    clipped = winsorize(all_data_copy[feature_name].to_numpy(), limits=0.05)
    # winsorize returns a masked array; store its plain data in the frame.
    all_data_copy[feature_name] = np.ma.getdata(clipped)

# Box plots after winsorizing (feature_2 is shown unchanged for comparison).
f_e, (ax1_e, ax2_e, ax3_e) = plt.subplots(1, 3, figsize=(20, 4))
sns.boxplot(data=all_data_copy['feature_1'], ax=ax1_e)
sns.boxplot(data=all_data_copy['feature_2'], ax=ax2_e)
sns.boxplot(data=all_data_copy['feature_3'], ax=ax3_e)

f_e.savefig(r"C:\GlobalSino2\ICs\images2\4035f_e.png")

# =================
# Scale the three named features: feature_1 and feature_3 are standardized,
# feature_2 is robust-scaled.
# NOTE(review): the scalers are fitted on all_data_copy, which holds the
# training and the test rows together.

scaler_1 = MinMaxScaler(copy=False)  # created but not used below
scaler_2 = RobustScaler(copy=False)
scaler_3 = StandardScaler(copy=False)

standardized_columns = ['feature_1', 'feature_3']
all_data_copy[standardized_columns] = scaler_3.fit_transform(
    all_data_copy[standardized_columns]
)
all_data_copy[['feature_2']] = scaler_2.fit_transform(all_data_copy[['feature_2']])

# Box plots of the scaled features.
f_f, (ax1_f, ax2_f, ax3_f) = plt.subplots(1, 3, figsize=(20, 4))
for feature_name, panel in zip(('feature_1', 'feature_2', 'feature_3'),
                               (ax1_f, ax2_f, ax3_f)):
    sns.boxplot(data=all_data_copy[feature_name], ax=panel)

f_f.savefig(r"C:\GlobalSino2\ICs\images2\4035f_f.png")

# =================

# Box plot of each processed feature in all_data_copy, grouped by class.
f_g, (ax1_g, ax2_g, ax3_g) = plt.subplots(1, 3, figsize=(20, 4))
class_column = all_data_copy['Class']
for feature_name, panel in zip(('feature_1', 'feature_2', 'feature_3'),
                               (ax1_g, ax2_g, ax3_g)):
    sns.boxplot(x=class_column, y=all_data_copy[feature_name], ax=panel)

f_g.savefig(r"C:\GlobalSino2\ICs\images2\4035f_g.png")

# Separate the train and test rows again.  all_data was built by placing
# train_data above test_data with a fresh index, so the first len(train_data)
# rows are the training rows.  Taking the split point from the frame instead
# of a hard-coded row count keeps the split right if the CSV size changes.
# all_data_test is not used below.
n_train_rows = len(train_data)
all_data_train = all_data_copy.iloc[:n_train_rows]
all_data_test = all_data_copy.iloc[n_train_rows:].drop(columns=['Class'])
print("all_data_train", all_data_train)
print("all_data_test", all_data_test)

print(all_data_train.shape)
print(all_data_test.shape)

# Class histogram of the training rows (noted as imbalanced by the original
# author).  It gets its own figure; without an explicit target it would be
# drawn on top of the last panel of the most recently created figure.
f_class, ax_class = plt.subplots()
sns.histplot(all_data_train['Class'], ax=ax_class)

def over_sample_train_test(x, y):
    """Randomly over-sample the data, then split it 80/20.

    Args:
        x: Feature table.
        y: Class labels belonging to the rows of ``x``.

    Returns:
        The tuple ``(x_train, x_val, y_train, y_val)`` obtained by passing
        the resampled data to ``train_test_split`` with ``test_size=0.2``
        and ``random_state=1``.
    """
    # fit_resample fits the sampler itself, so the separate fit() call that
    # used to precede it only repeated the same work.
    sampler = RandomOverSampler(random_state=10)
    x_res, y_res = sampler.fit_resample(x, y)

    # NOTE(review): resampling happens before the split, so copies of the
    # same original row can land in both the training and the validation
    # part, which presumably makes validation scores optimistic.  Left as is
    # to keep the returned data unchanged.
    x_train, x_val, y_train, y_val = train_test_split(
        x_res, y_res, test_size=0.2, random_state=1
    )
    return x_train, x_val, y_train, y_val

# Label vector and feature table taken from the training rows.
y = all_data_train['Class']
x = all_data_train.drop(columns=['Class'])
x_train, x_val, y_train, y_val = over_sample_train_test(x, y)

# Class histograms of the training part (left) and validation part (right).
f_h, (ax1_h, ax2_h) = plt.subplots(1, 2, figsize=(10, 4))
sns.histplot(y_train, ax=ax1_h)
val_hist_axes = sns.histplot(y_val, ax=ax2_h)
val_hist_axes.set(title='Title of Plot')
f_h.savefig(r"C:\GlobalSino2\ICs\images2\4035f_h.png")
f_h.show()

# Logistic regression

# Value passed as C, which scikit-learn defines as the inverse of the
# regularization strength.
regulation_parameter = 0.005
lr = LogisticRegression(
    solver='liblinear',
    C=regulation_parameter,
    random_state=42,
    max_iter=1000,
)

def apply_model(model, x_train, x_val, y_train, y_val):
    """Fit ``model``, print its scores and save a validation confusion matrix.

    Args:
        model: Estimator providing ``fit``, ``predict`` and ``score``.
        x_train: Training features.
        x_val: Validation features.
        y_train: Training labels.
        y_val: Validation labels.

    Prints the train and validation scores and a classification report for
    the validation predictions, then writes the confusion-matrix figure to a
    fixed PNG path.  Returns nothing.
    """
    print('Logistic Regression')
    model.fit(x_train, y_train)
    val_predictions = model.predict(x_val)

    train_score = model.score(x_train, y_train)
    val_score = model.score(x_val, y_val)
    print('')
    print('Train Score:  ', train_score)
    print('Validation Score:   ', val_score)
    print('')
    print(classification_report(y_val, val_predictions))

    # Draw the validation confusion matrix, then save the current figure.
    ConfusionMatrixDisplay.from_estimator(model, x_val, y_val)
    plt.savefig(r"C:\GlobalSino2\ICs\images2\4035img_pred_label.png")

apply_model(lr, x_train, x_val, y_train, y_val)

# Fitted coefficients as a table whose columns carry the feature names.
coefs = pd.DataFrame(lr.coef_, columns=x_train.columns)
print(coefs.head())

# Bar plot of the coefficients.
f_i, ax = plt.subplots(1, 1, figsize=(20, 4))
sns.barplot(data=coefs, ax=ax)
f_i.savefig(r"C:\GlobalSino2\ICs\images2\4035f_i.png")


# Feature reduction: logistic regression with an L1 penalty.
lasso = LogisticRegression(
    solver='liblinear',
    penalty='l1',
    C=0.05,
    random_state=42,
    max_iter=1000,
)


def apply_model(model, x_train, x_val, y_train, y_val):
    """Fit ``model``, print its scores and save a validation confusion matrix.

    Args:
        model: Estimator providing ``fit``, ``predict`` and ``score``.
        x_train: Training features.
        x_val: Validation features.
        y_train: Training labels.
        y_val: Validation labels.

    Prints the train and validation scores and a classification report for
    the validation predictions, then writes the confusion-matrix figure to a
    fixed PNG path.  Returns nothing.
    """
    print('Logistic Regression')
    model.fit(x_train, y_train)
    y_pred = model.predict(x_val)
    print('')
    print('Train Score:  ', model.score(x_train, y_train))
    print('Validation Score:   ', model.score(x_val, y_val))
    print('')
    print(classification_report(y_val, y_pred))

    # The matrix is drawn once, through ConfusionMatrixDisplay.  The earlier
    # extra call to plot_confusion_matrix produced a duplicate figure and
    # fails on scikit-learn 1.2 and later, where that function no longer
    # exists.
    ConfusionMatrixDisplay.from_estimator(model, x_val, y_val)
    plt.savefig(r"C:\GlobalSino2\ICs\images2\4035img_pred_label_b.png")
    
apply_model(lasso, x_train, x_val, y_train, y_val)


# Coefficients of the L1 model; keep only the columns that have a non-zero
# entry.
lasso_coefs = pd.DataFrame(lasso.coef_, columns=x_train.columns)
has_nonzero_coef = (lasso_coefs != 0).any(axis=0)
imp_lasso_coefs = lasso_coefs.loc[:, has_nonzero_coef]

print(lasso_coefs)

# Bar plot of the retained coefficients.
f_j, ax = plt.subplots(1, 1, figsize=(20, 4))
sns.barplot(data=imp_lasso_coefs, ax=ax)
f_j.savefig(r"C:\GlobalSino2\ICs\images2\4035f_j.png")

# Correlation matrix of the retained features.  Its upper triangle is passed
# as the heat-map mask, which hides those cells in the plot.
temp = all_data_copy[imp_lasso_coefs.columns].corr()
half_temp = np.triu(temp)

f_k, ax = plt.subplots(1, 1, figsize=(20, 4))
dataplot = sns.heatmap(
    temp,
    cmap="YlGnBu",
    mask=half_temp,
    annot=True,
    ax=ax,
)
f_k.savefig(r"C:\GlobalSino2\ICs\images2\4035f_k.png")

# Pattern recognition based on PCA

# A fractional n_components makes PCA keep as many components as are needed
# to explain that share of the variance (here 95 %).
pca = PCA(n_components=0.95)
pca.fit(x)
reduced = pca.transform(x)

fig, ax = plt.subplots(figsize=(10, 4))
xi = np.arange(1, reduced.shape[1] + 1, step=1)
print(xi)
# Stored under its own name so that the label vector `y` is not overwritten.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_) * 100
print(cumulative_variance)

# Cumulative explained variance against the number of components kept.
ax.set_ylim(0.0, 101)
ax.set_xlabel('number of principal components')
ax.set_ylabel('cumulative explained variance (%)')
ax.scatter(xi, cumulative_variance)
plt.show()


# Training features extended by the first two principal components.
df_pca = x.copy()
pc_0, pc_1 = 'pc_0', 'pc_1'
df_pca[pc_0] = reduced[:, 0]
df_pca[pc_1] = reduced[:, 1]
df_pca_label = all_data_train.Class


# Scatter plot of the first two components, coloured by class.
f_l, ax = plt.subplots(1, 1, figsize=(8, 8))
sns.scatterplot(x=pc_0, y=pc_1, data=df_pca, hue=df_pca_label, ax=ax)
f_l.show()










