본문 바로가기
카테고리 없음

[Keras] DNN을 이용한 항공권 만족도 판별

by 공부하는우니 2023. 10. 25.
# kaggle Airline Passenger Satisfaction dataset 활용 (train.csv만)
# https://www.kaggle.com/datasets/teejmahal20/airline-passenger-satisfaction
# DNN으로 고객의 만족 여부를 분류하는 모델 구현

import pandas as pd

# Load the Kaggle "Airline Passenger Satisfaction" training split.
# NOTE(review): assumes train.csv sits in the working directory.
df = pd.read_csv('train.csv')
# Drop the CSV's positional index column and the passenger id — neither
# carries predictive signal. (`axis=1` was redundant next to `columns=`.)
df = df.drop(columns=['Unnamed: 0', 'id'])

# Show the per-column missing-value counts before imputation.
print(df.isnull().sum())

# Replace the missing arrival delays with the column mean.
# NOTE(review): the imputer is fitted on the full dataset before the
# train/validation split, so validation rows influence the mean — confirm
# this mild leakage is acceptable for this experiment.
from sklearn.impute import SimpleImputer
mean_imputer = SimpleImputer(strategy='mean')
delay_col = df[['Arrival Delay in Minutes']]
df['Arrival Delay in Minutes'] = mean_imputer.fit_transform(delay_col)
print(df.isnull().sum())


print(df.info())
# Cast every object-typed categorical column to plain strings so the
# encoders below see a uniform dtype.
cols = ['satisfaction', 'Gender', 'Customer Type', 'Type of Travel', 'Class']
for col in cols:
    df[col] = df[col].astype('str')


# Encode the binary target: 0 = 'neutral or dissatisfied', 1 = 'satisfied'.
# The previous chained `replace(..., inplace=True)` on a column selection is
# deprecated chained assignment in pandas 2.x — assign the result back instead.
df['satisfaction'] = df['satisfaction'].replace(
    ['neutral or dissatisfied', 'satisfied'], [0, 1])

# Ordinal encoding: map 'Class' to 0, 1, 2 preserving the cabin order
# Eco < Eco Plus < Business.
print(df[:5].to_string())
categories = pd.Categorical(
    df['Class'], categories=['Eco', 'Eco Plus', 'Business'], ordered=True)
labels, unique = pd.factorize(categories, sort=True)
df['Class'] = labels
print(df[:5].to_string())

# One-hot encode the remaining nominal (unordered) categorical columns.
nominal_cols = ['Gender', 'Customer Type', 'Type of Travel']
df = pd.get_dummies(df, columns=nominal_cols)
print(df[:5].to_string())
print(df.dtypes)

from sklearn.model_selection import train_test_split

# Separate the target from the features.
y = df['satisfaction']
X = df.drop('satisfaction', axis=1)

# Hold out 20% for validation; stratify on y so both splits keep the
# original class balance, and fix random_state for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
print('train shape', X_train.shape, y_train.shape)
# The second split is validation data, not test data — label it as such.
print('val shape', X_val.shape, y_val.shape)

# Scale every feature into [0, 1]. Fit on the training split only so no
# validation statistics leak into the scaler.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

import random

import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import BatchNormalization, Dense, Dropout, Input
from tensorflow.keras.models import Model, Sequential

# Seed every RNG in play (TensorFlow, NumPy, stdlib) for repeatable runs.
tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

# Seeded Glorot/Xavier initializer for deterministic starting weights.
initializer = tf.keras.initializers.GlorotUniform(seed=42)

# Binary-classification DNN: a widening ReLU stack with dropout on the
# wider layers and a sigmoid output head.
model = Sequential()
# Derive the input width from the data instead of hard-coding 25, so the
# model stays correct if the feature set changes upstream.
model.add(Dense(32, activation='relu', input_shape=(X_train.shape[1],),
                kernel_initializer=initializer))
model.add(Dense(64, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  # outputs P(satisfied)
# summary() prints itself and returns None; wrapping it in print() emitted
# a stray "None" line.
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])

# Stop training once val_loss has not improved for 10 consecutive epochs,
# then roll the weights back to the best epoch seen.
#   monitor              : quantity to watch (typically val_loss / val_accuracy)
#   min_delta            : smallest change counted as an improvement
#   patience             : epochs to wait for an improvement before stopping
#   verbose              : logging detail level (0, 1, 2)
#   restore_best_weights : keep the best weights instead of the last ones
es = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=10,
    verbose=1,
    restore_best_weights=True,
)
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=128,
    verbose=1,
    validation_data=(X_val, y_val),
    callbacks=[es],
)

import matplotlib.pyplot as plt

def _plot_history(metric, title, ylabel, legend_loc):
    """Plot the train/validation curve pair for one fit-history metric."""
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(ylabel)
    plt.legend(['Train', 'Validation'], loc=legend_loc)
    plt.show()

_plot_history('accuracy', 'Model Accuracy', 'Accuracy', 'lower right')
_plot_history('loss', 'Model Loss', 'Loss', 'upper right')

댓글