# Uses the Kaggle "Airline Passenger Satisfaction" dataset (train.csv only)
# https://www.kaggle.com/datasets/teejmahal20/airline-passenger-satisfaction
# Builds a DNN that classifies whether a customer is satisfied.
import pandas as pd

# Load the training split.
df = pd.read_csv('train.csv')
# df = df[:5000]  # uncomment to iterate on a small sample during development
# 'Unnamed: 0' is a leftover CSV index; 'id' carries no predictive signal.
# (axis argument dropped: it is redundant when columns= is given)
df = df.drop(columns=['Unnamed: 0', 'id'])
# Inspect missing values before imputation.
print(df.isnull().sum())
# Impute missing arrival delays with the column mean.
from sklearn.impute import SimpleImputer
mean_imputer = SimpleImputer(strategy = 'mean')
df['Arrival Delay in Minutes'] = mean_imputer.fit_transform(df[['Arrival Delay in Minutes']])
print(df.isnull().sum())
print(df.info())
# Cast the object-dtype categorical columns to str explicitly.
cols = ['satisfaction', 'Gender', 'Customer Type', 'Type of Travel', 'Class']
df[cols] = df[cols].astype('str')
# Binarize the target: 'neutral or dissatisfied' -> 0, 'satisfied' -> 1.
# Explicit assignment instead of df['satisfaction'].replace(..., inplace=True):
# in-place replace on a column selection is chained assignment, deprecated in
# pandas 2.x (FutureWarning) and not guaranteed to write back through a copy.
# map() also surfaces unexpected labels loudly as NaN instead of passing them through.
df['satisfaction'] = df['satisfaction'].map({'neutral or dissatisfied': 0, 'satisfied': 1})
# Ordinal-encode cabin class in its natural order: Eco=0, Eco Plus=1, Business=2.
print(df[:5].to_string())
categories = pd.Categorical(df['Class'], categories = ['Eco', 'Eco Plus', 'Business'], ordered = True)
labels, unique = pd.factorize(categories, sort = True)
df['Class'] = labels
print(df[:5].to_string())
# One-hot encode the remaining nominal categoricals.
cat_cols = ['Gender', 'Customer Type', 'Type of Travel']
df = pd.get_dummies(df, columns = cat_cols)
print(df[:5].to_string())
print(df.dtypes)
from sklearn.model_selection import train_test_split

# Separate the target from the feature matrix.
y = df.satisfaction
X = df.drop('satisfaction', axis = 1)
# Stratified 80/20 split keeps the class ratio identical in both subsets.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size = 0.2, random_state = 42, stratify=y)
print('train shape', X_train.shape, y_train.shape)
print('test shape', X_val.shape, y_val.shape)
# Scale every feature into [0, 1]. The scaler is fitted on the training
# set only, then applied to validation, to avoid data leakage.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import random
import numpy as np

# Seed every RNG source (TF, NumPy, stdlib) for reproducible runs.
tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)
initializer = tf.keras.initializers.GlorotUniform(seed = 42)

# Binary classifier: widening ReLU stack with dropout regularization.
# The input width is derived from the data (X_train.shape[1]) instead of the
# previously hard-coded 25, so the model stays valid if the upstream feature
# engineering (e.g. one-hot columns) changes.
model = Sequential()
model.add(Dense(32, activation = 'relu', input_shape = (X_train.shape[1],), kernel_initializer = initializer))
model.add(Dense(64, activation = 'relu'))
model.add(Dense(128, activation = 'relu'))
model.add(Dropout(0.3))
model.add(Dense(256, activation = 'relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation = 'sigmoid'))  # sigmoid output: P(satisfied)
print(model.summary())

model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
# Stop training once val_loss has not improved for 10 epochs, then roll back
# to the best epoch's weights.
# monitor : metric watched for stopping; val_loss or val_accuracy are typical
# min_delta : minimum change that counts as an improvement
# patience : how many epochs to wait for an improvement before stopping
# verbose : logging verbosity (0, 1, or 2)
# restore_best_weights : restore the best weights seen (default keeps the last)
es = EarlyStopping(monitor = 'val_loss', min_delta = 0, patience = 10, verbose = 1, restore_best_weights = True)
history = model.fit(X_train, y_train, epochs = 100, batch_size = 128, verbose = 1, validation_data = (X_val, y_val), callbacks = [es])
import matplotlib.pyplot as plt

# Plot train/validation curves for each tracked metric, one figure apiece:
# accuracy first, then loss — same sequence of calls as plotting them by hand.
for metric, title, loc in (
    ('accuracy', 'Model Accuracy', 'lower right'),
    ('loss', 'Model Loss', 'upper right'),
):
    plt.plot(history.history[metric])
    plt.plot(history.history[f'val_{metric}'])
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(metric.capitalize())
    plt.legend(['Train', 'Validation'], loc = loc)
    plt.show()
# (blog-export residue removed here: "카테고리 없음" / "댓글" — "Uncategorized" /
# "Comments" footer text from the hosting platform; it was not Python and made
# the script a SyntaxError.)