# kaggle Airline Passenger Satisfaction dataset 활용 (train.csv만)
# https://www.kaggle.com/datasets/teejmahal20/airline-passenger-satisfaction
# rfc를 이용한 고객의 만족 여부를 분류하는 모델 구현
import pandas as pd
# Load the Kaggle training split and keep only its first 5000 rows.
cdf = pd.read_csv('train.csv')
cdf = cdf.iloc[:5000]
# Drop the two columns that are not used as features (presumably a saved
# index and a passenger id -- confirm against the CSV). `columns=` already
# selects the axis, so the former extra `axis=1` was redundant.
cdf = cdf.drop(columns=['Unnamed: 0', 'id'])
# pd.set_option('display.max_columns', None)
print(cdf.head(1))

# Class balance of the target: printed as text and drawn as a bar chart
# (the chart is displayed when plt.show() is called later in the script).
satisfaction_counts = cdf['satisfaction'].value_counts()
print(satisfaction_counts)
satisfaction_counts.plot(kind='bar')

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
cdf.info()
# Remove rows with missing values (the original note names
# 'Arrival Delay in Minutes' as the column that has them).
# Same effect as cdf.dropna(axis=0, inplace=True); axis=0 is the default.
cdf = cdf.dropna()
cdf.info()
import matplotlib.pyplot as plt
# plt.show()
# Separate the target column from the feature columns.
y = cdf['satisfaction']
x = cdf.drop(columns='satisfaction')
print('x shape', x.shape)
print('y shape', y.shape)

# One-hot encode the four categorical feature columns; every category keeps
# its own indicator column (drop_first=False).
x_gd = pd.get_dummies(
    x,
    columns=['Gender', 'Customer Type', 'Type of Travel', 'Class'],
    drop_first=False,
)
# info() prints its own report and returns None; print(x_gd.info()) used to
# emit an extra "None" line.
x_gd.info()

# Train/test split: 80/20, stratified on the target so both parts keep the
# same class proportions; fixed seed for reproducibility.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x_gd,
    y,
    stratify=y,
    test_size=0.2,
    random_state=2023,
)
# Label-encode the target.
from sklearn.preprocessing import LabelEncoder

# Learn the class -> integer rule (0 / 1) from the training labels only,
# then apply that same rule to the training and the test labels.
le = LabelEncoder().fit(y_train)
le_y_train, le_y_test = map(le.transform, (y_train, y_test))

# Show the encoded labels, the learned classes, and the round trip back to
# the original string labels.
print('data after labelling', le_y_train)
print(le_y_train.shape)
print('classes', le.classes_)
print('decoding', le.inverse_transform(le_y_train))
from sklearn.ensemble import RandomForestClassifier

# Random forest: 50 trees capped at depth 10, no feature subsampling limit
# (max_features=None), all available cores (n_jobs=-1), fixed seed.
rfc = RandomForestClassifier(
    n_estimators=50,
    max_depth=10,
    max_features=None,
    n_jobs=-1,
    random_state=1,
)
rfc.fit(x_train, le_y_train)

# Mean accuracy on the training split and on the held-out split.
acc_train, acc_test = (
    rfc.score(x_train, le_y_train),
    rfc.score(x_test, le_y_test),
)
print('acc', acc_train, acc_test)
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    confusion_matrix,
)

y_pred = rfc.predict(x_test)

# Compare against the label-encoded ground truth (le_y_test) so that both
# the true and the predicted labels are expressed as 0 / 1.
cm = confusion_matrix(le_y_test, y_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
disp.plot()

print(classification_report(le_y_test, y_pred))
# Display every figure created so far.
plt.show()
# kaggle Airline Passenger Satisfaction dataset 활용 (train.csv만)
# https://www.kaggle.com/datasets/teejmahal20/airline-passenger-satisfaction
# rfc를 이용한 고객의 만족 여부를 분류하는 모델 구현
import pandas as pd
# NOTE(review): from here on the script repeats the pipeline above verbatim
# (same file, same seeds), so everything is computed and shown a second
# time -- looks like a paste artifact; consider deleting this second copy.

# Load the Kaggle training split and keep only its first 5000 rows.
cdf = pd.read_csv('train.csv')
cdf = cdf.iloc[:5000]
# Drop the two columns that are not used as features (presumably a saved
# index and a passenger id -- confirm against the CSV). `columns=` already
# selects the axis, so the former extra `axis=1` was redundant.
cdf = cdf.drop(columns=['Unnamed: 0', 'id'])
# pd.set_option('display.max_columns', None)
print(cdf.head(1))

# Class balance of the target: printed as text and drawn as a bar chart
# (the chart is displayed when plt.show() is called later in the script).
satisfaction_counts = cdf['satisfaction'].value_counts()
print(satisfaction_counts)
satisfaction_counts.plot(kind='bar')

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
cdf.info()
# Remove rows with missing values (the original note names
# 'Arrival Delay in Minutes' as the column that has them).
# Same effect as cdf.dropna(axis=0, inplace=True); axis=0 is the default.
cdf = cdf.dropna()
cdf.info()
import matplotlib.pyplot as plt
# plt.show()
# Separate the target column from the feature columns.
y = cdf['satisfaction']
x = cdf.drop(columns='satisfaction')
print('x shape', x.shape)
print('y shape', y.shape)

# One-hot encode the four categorical feature columns; every category keeps
# its own indicator column (drop_first=False).
x_gd = pd.get_dummies(
    x,
    columns=['Gender', 'Customer Type', 'Type of Travel', 'Class'],
    drop_first=False,
)
# info() prints its own report and returns None; print(x_gd.info()) used to
# emit an extra "None" line.
x_gd.info()

# Train/test split: 80/20, stratified on the target so both parts keep the
# same class proportions; fixed seed for reproducibility.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x_gd,
    y,
    stratify=y,
    test_size=0.2,
    random_state=2023,
)
# Label-encode the target.
from sklearn.preprocessing import LabelEncoder

# Learn the class -> integer rule (0 / 1) from the training labels only,
# then apply that same rule to the training and the test labels.
le = LabelEncoder().fit(y_train)
le_y_train, le_y_test = map(le.transform, (y_train, y_test))

# Show the encoded labels, the learned classes, and the round trip back to
# the original string labels.
print('data after labelling', le_y_train)
print(le_y_train.shape)
print('classes', le.classes_)
print('decoding', le.inverse_transform(le_y_train))
from sklearn.ensemble import RandomForestClassifier

# Random forest: 50 trees capped at depth 10, no feature subsampling limit
# (max_features=None), all available cores (n_jobs=-1), fixed seed.
rfc = RandomForestClassifier(
    n_estimators=50,
    max_depth=10,
    max_features=None,
    n_jobs=-1,
    random_state=1,
)
rfc.fit(x_train, le_y_train)

# Mean accuracy on the training split and on the held-out split.
acc_train, acc_test = (
    rfc.score(x_train, le_y_train),
    rfc.score(x_test, le_y_test),
)
print('acc', acc_train, acc_test)
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    confusion_matrix,
)

y_pred = rfc.predict(x_test)

# Compare against the label-encoded ground truth (le_y_test) so that both
# the true and the predicted labels are expressed as 0 / 1.
cm = confusion_matrix(le_y_test, y_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
disp.plot()

print(classification_report(le_y_test, y_pred))
# Display every figure created so far. Nothing may follow this call that is
# not valid Python: a bare word here would be evaluated as an undefined name.
plt.show()